Python response = urllib.request.urlopen(url) not working with a specific website

I am trying to get some news from the website https://www.onvista.de, using code I previously used on other websites, but in this case it doesn't work. Can anyone please tell me why?
import urllib.request, urllib.error, urllib.parse, datetime, os
url = 'https://www.onvista.de'
response = urllib.request.urlopen(url)
webContent = response.read()
print(type(webContent))
Here is what I get back:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 542, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 502, in _call_chain
result = func(*args)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 1397, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 1358, in do_open
r = h.getresponse()
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\http\client.py", line 1344, in getresponse
response.begin()
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\http\client.py", line 307, in begin
version, status, reason = self._read_status()
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\http\client.py", line 276, in _read_status
raise RemoteDisconnected("Remote end closed connection without"
http.client.RemoteDisconnected: Remote end closed connection without response
>>> response = urllib.request.urlopen('https://www.onvista.de')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 542, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 502, in _call_chain
result = func(*args)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 1397, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 1358, in do_open
r = h.getresponse()
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\http\client.py", line 1344, in getresponse
response.begin()
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\http\client.py", line 307, in begin
version, status, reason = self._read_status()
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\http\client.py", line 276, in _read_status
raise RemoteDisconnected("Remote end closed connection without"
http.client.RemoteDisconnected: Remote end closed connection without response
Thank you for any advice. I am using Python 3.8 on Windows 10.

You should put headers in the request:
import requests
url = 'https://www.onvista.de'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:101.0) Gecko/20100101 Firefox/101.0'}
response = requests.get(url, headers=headers) # <Response [200]>
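If the request succeeds, the page content is available on the response object; for example:
html = response.text          # decoded HTML as a str
raw = response.content        # raw bytes, like urlopen().read()
print(response.status_code)   # 200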

The website seems to block user agents that are identified as bots. You can set a custom user agent so the server will accept your request.
req = urllib.request.Request(
    url,
    data=None,
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
    }
)
...
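From there the request is opened the same way as before; a minimal sketch completing the example (the timeout is an addition here, not part of the original answer):
response = urllib.request.urlopen(req, timeout=30)
webContent = response.read()  # raw bytes of the page
print(type(webContent))       # <class 'bytes'>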

Related

Python URL request to finance website hangs [duplicate]

This question was closed as a duplicate of: Sending "User-agent" using Requests library in Python.
I am struggling to get reliable dividend data for my website, so I have the following script to hit nasdaq.com:
import requests
dividends = "https://www.nasdaq.com/market-activity/stocks/hd/dividend-history"
response = requests.get(dividends)
The script just hangs and does nothing; the traceback when you cancel it looks like this:
^CTraceback (most recent call last):
File "/home/cchilders/projects/stocks_backend/scripts/get_dividends_nasdaq_dot_com.py", line 5, in <module>
response = requests.get(dividends)
File "/home/cchilders/.local/lib/python3.10/site-packages/requests/api.py", line 73, in get
return request("get", url, params=params, **kwargs)
File "/home/cchilders/.local/lib/python3.10/site-packages/requests/api.py", line 59, in request
return session.request(method=method, url=url, **kwargs)
File "/home/cchilders/.local/lib/python3.10/site-packages/requests/sessions.py", line 587, in request
resp = self.send(prep, **send_kwargs)
File "/home/cchilders/.local/lib/python3.10/site-packages/requests/sessions.py", line 701, in send
r = adapter.send(request, **kwargs)
File "/home/cchilders/.local/lib/python3.10/site-packages/requests/adapters.py", line 489, in send
resp = conn.urlopen(
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 699, in urlopen
httplib_response = self._make_request(
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 445, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 440, in _make_request
httplib_response = conn.getresponse()
File "/usr/lib/python3.10/http/client.py", line 1374, in getresponse
response.begin()
File "/usr/lib/python3.10/http/client.py", line 318, in begin
version, status, reason = self._read_status()
File "/usr/lib/python3.10/http/client.py", line 279, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/usr/lib/python3.10/socket.py", line 705, in readinto
return self._sock.recv_into(b)
File "/usr/lib/python3.10/ssl.py", line 1273, in recv_into
return self.read(nbytes, buffer)
File "/usr/lib/python3.10/ssl.py", line 1129, in read
return self._sslobj.read(len, buffer)
KeyboardInterrupt
A test script works and shows a 200 response:
response = requests.get("https://www.google.com")
print(response)
Does this mean that the site has blocked the requests module and other libraries from connecting, or is there something else I can do? I cannot find dividend data going back more than a few years from any site except this one.
The site has blocked some user-agents. Try using the user-agent of a browser:
import requests
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0'
}
dividends = "https://www.nasdaq.com/market-activity/stocks/hd/dividend-history"
response = requests.get(dividends, headers=headers)
print(response)
Result:
<Response [200]>
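As an aside, passing a timeout keeps a blocked request from hanging indefinitely, as it did in the question; a sketch (the 10-second value and the error handling are assumptions, not part of the answer above):
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0'
}
dividends = "https://www.nasdaq.com/market-activity/stocks/hd/dividend-history"
try:
    response = requests.get(dividends, headers=headers, timeout=10)
    response.raise_for_status()  # raise on 4xx/5xx status codes
except requests.exceptions.RequestException as e:
    print("Request failed:", e)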

urllib urlopen [WinError 10054]: An existing connection was forcibly closed

I am trying to scrape some photos from different websites for my coding class.
I am using Beautiful Soup and urllib to do this.
Here is my code:
import json
import time
from urllib.request import urlopen, Request
from urllib.request import urlretrieve
import urllib.request
from bs4 import BeautifulSoup
import os
import re
site = "https://www.hollisterco.com/shop/us/guys-new-arrivals"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"}
req = Request(url=site, headers=headers)
html = urlopen(req, timeout=30)
bs = BeautifulSoup(html, 'html.parser')
images = bs.find_all('img', {'src':re.compile('.jpg')})
count = 26
for image in images:
    try:
        # print(image["src"])
        urlretrieve(image["src"], str(count) + ".jpg")
        count += 1
    except:
        continue
This code works for some websites; www.vineyardvines.com worked just fine, but it doesn't work for www.hollisterco.com. What can I do to fix this? Here is the error I am getting for hollisterco.com:
Traceback (most recent call last):
File "C:/Users/momin/PycharmProjects/scraper/scraper.py", line 22, in <module>
html = urlopen(req, timeout=30).read().decode()
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 543, in _open
'_open', req)
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 1360, in https_open
context=self._context, check_hostname=self._check_hostname)
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 1320, in do_open
r = h.getresponse()
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 1336, in getresponse
response.begin()
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 306, in begin
version, status, reason = self._read_status()
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 267, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\socket.py", line 589, in readinto
return self._sock.recv_into(b)
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\ssl.py", line 1071, in recv_into
return self.read(nbytes, buffer)
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\ssl.py", line 929, in read
return self._sslobj.read(len, buffer)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
Okay, I figured out a solution.
Here is my advice:
Use Selenium or requests to fetch the HTML content.
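A minimal sketch of that advice using requests, keeping the question's URL, headers, and selector logic (the timeout and the error handling are assumptions added here):
import re
import requests
from bs4 import BeautifulSoup

site = "https://www.hollisterco.com/shop/us/guys-new-arrivals"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"}

response = requests.get(site, headers=headers, timeout=30)
bs = BeautifulSoup(response.text, 'html.parser')

count = 26
for image in bs.find_all('img', {'src': re.compile('.jpg')}):
    try:
        # fetch each image with the same browser-like headers
        data = requests.get(image["src"], headers=headers, timeout=30).content
        with open(str(count) + ".jpg", "wb") as f:
            f.write(data)
        count += 1
    except requests.exceptions.RequestException:
        continue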

requests.get() is getting stuck

Hello, I am trying to scrape some data from a website and requests.get() is getting caught up on something.
here is my code:
page_url = front_end+str(i)+'/'
page = requests.get(page_url)
I want page_url to be a string, because I am just building a URL. If I stop the code, or it runs too long, I get something like:
File "/usr/local/lib/python3.6/site-packages/urllib3/connectionpool.py",
line 377, in _make_request
httplib_response = conn.getresponse(buffering=True)
TypeError: getresponse() got an unexpected keyword argument 'buffering'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "main.py", line 24, in <module>
page = requests.get(page_url)
File "/usr/local/lib/python3.6/site-packages/requests/api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "/usr/local/lib/python3.6/site-packages/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/local/lib/python3.6/site-packages/requests/sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "/usr/local/lib/python3.6/site-packages/requests/sessions.py", line 646, in send
r = adapter.send(request, **kwargs)
File "/usr/local/lib/python3.6/site-packages/requests/adapters.py", line 449, in send
timeout=timeout
File "/usr/local/lib/python3.6/site-packages/urllib3/connectionpool.py", line 600, in urlopen
chunked=chunked)
File "/usr/local/lib/python3.6/site-packages/urllib3/connectionpool.py", line 380, in _make_request
httplib_response = conn.getresponse()
File "/usr/local/lib/python3.6/http/client.py", line 1331, in getresponse
response.begin()
File "/usr/local/lib/python3.6/http/client.py", line 297, inbegin
version, status, reason = self._read_status()
File "/usr/local/lib/python3.6/http/client.py", line 258, in_read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/usr/local/lib/python3.6/socket.py", line 586, in readinto
return self._sock.recv_into(b)
File "/usr/local/lib/python3.6/ssl.py", line 1002, in recv_into
return self.read(nbytes, buffer)
File "/usr/local/lib/python3.6/ssl.py", line 865, in read
return self._sslobj.read(len, buffer)
File "/usr/local/lib/python3.6/ssl.py", line 625, in read
v = self._sslobj.read(len, buffer)
I do not understand what "TypeError: getresponse() got an unexpected keyword argument 'buffering'" means or how to fix it.
Answer:
Sometimes requests made with requests.get() get blocked by the server, so the solution is to make the server think the request is coming from a web browser. (The 'buffering' TypeError in your traceback is a red herring: urllib3 first tries conn.getresponse(buffering=True) and falls back when that raises TypeError; the real problem is that the request hangs until you interrupt it.)
Example:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',}
page = requests.get("https://example.com", headers=headers)
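Since the question builds page URLs in a loop, a requests.Session keeps the headers in one place and reuses the connection; a sketch (front_end and the page range are hypothetical stand-ins for the asker's values):
import requests

front_end = "https://example.com/page/"  # hypothetical; the real base URL is in the asker's code
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}

with requests.Session() as session:
    session.headers.update(headers)
    for i in range(1, 4):
        page = session.get(front_end + str(i) + '/', timeout=10)
        print(page.status_code)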

Python Requests Get not Working (GCloud Debian 4.9.110)

I have a simple GET request I'd like to make using Python's requests library.
import requests
u_a = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"
url = 'http://stats.nba.com/stats/playergamelogs?DateFrom=&DateTo=&GameSegment=&LastNGames=0&LeagueID=00&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=Totals&Period=0&PlayerID=202391&PlusMinus=N&Rank=N&Season=2014-15&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&VsConference=&VsDivision='
response = requests.get(url, headers={"USER-AGENT":u_a})
The requests.get call hangs and I have to force the script to exit. However, I am able to make the same call on my local macOS and Ubuntu machines. I can also copy/paste the URL into my local computer's browsers and view the resulting JSON. Is the problem with the user agent?
Edit (Added stacktrace):
Traceback (most recent call last):
File "prune_simulation.py", line 336, in <module>
main()
File "prune_simulation.py", line 44, in main
response = requests.get(url, headers={"USER-AGENT":u_a})
File "/home/nole/.local/lib/python2.7/site-packages/requests/api.py", line 72, in get
return request('get', url, params=params, **kwargs)
File "/home/nole/.local/lib/python2.7/site-packages/requests/api.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "/home/nole/.local/lib/python2.7/site-packages/requests/sessions.py", line 512, in request
resp = self.send(prep, **send_kwargs)
File "/home/nole/.local/lib/python2.7/site-packages/requests/sessions.py", line 622, in send
r = adapter.send(request, **kwargs)
File "/home/nole/.local/lib/python2.7/site-packages/requests/adapters.py", line 445, in send
timeout=timeout
File "/home/nole/.local/lib/python2.7/site-packages/urllib3/connectionpool.py", line 600, in urlopen
chunked=chunked)
File "/home/nole/.local/lib/python2.7/site-packages/urllib3/connectionpool.py", line 377, in _make_request
httplib_response = conn.getresponse(buffering=True)
File "/usr/lib/python2.7/httplib.py", line 1121, in getresponse
response.begin()
File "/usr/lib/python2.7/httplib.py", line 438, in begin
version, status, reason = self._read_status()
File "/usr/lib/python2.7/httplib.py", line 394, in _read_status
line = self.fp.readline(_MAXLINE + 1)
File "/usr/lib/python2.7/socket.py", line 480, in readline
data = self._sock.recv(self._rbufsize)
KeyboardInterrupt
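One way to tell a slow response from a server that never answers is to set a timeout so the call fails fast instead of needing a manual interrupt; a sketch reusing the question's URL and user agent (the timeout value is an assumption):
import requests

u_a = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"
url = 'http://stats.nba.com/stats/playergamelogs?DateFrom=&DateTo=&GameSegment=&LastNGames=0&LeagueID=00&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=Totals&Period=0&PlayerID=202391&PlusMinus=N&Rank=N&Season=2014-15&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&VsConference=&VsDivision='

try:
    response = requests.get(url, headers={"User-Agent": u_a}, timeout=10)
    print(response.status_code)
except requests.exceptions.Timeout:
    print("Timed out: the server accepted the connection but never responded")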

How to read an image url with Unicode characters in Python 2 with urllib?

Requirements: Python 2.7 and no external libraries like Requests or BeautifulSoup :(
I get the error in the traceback below from retrieveUrl when I call it with this URL:
u'http://%E7%9F%A5%E3%81%A3%E5%BE%97%E8%A2%8B.biz/wp-content/uploads/2016/10/104743-300x225.jpg'
As you can see, my server already gives me that URL nice and percent-encoded, but it still blows up.
import os
import urllib2

def retrieveUrl(url):
    req = urllib2.Request(url, None, {'User-Agent': 'Mozilla/5.0 (compatible; Anki)'})
    filecontents = urllib2.urlopen(req).read()
    path = unicode(urllib2.unquote(url.encode("utf8")), "utf8")
    filename, file_extension = os.path.splitext(path)
    return filename, file_extension, filecontents
Error Traceback
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Python27\lib\urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 431, in open
response = self._open(req, data)
File "C:\Python27\lib\urllib2.py", line 449, in _open
'_open', req)
File "C:\Python27\lib\urllib2.py", line 409, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 1227, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "C:\Python27\lib\urllib2.py", line 1194, in do_open
h.request(req.get_method(), req.get_selector(), req.data, headers)
File "C:\Python27\lib\httplib.py", line 1057, in request
self._send_request(method, url, body, headers)
File "C:\Python27\lib\httplib.py", line 1097, in _send_request
self.endheaders(body)
File "C:\Python27\lib\httplib.py", line 1053, in endheaders
self._send_output(message_body)
File "C:\Python27\lib\httplib.py", line 897, in _send_output
self.send(msg)
File "C:\Python27\lib\httplib.py", line 859, in send
self.connect()
File "C:\Python27\lib\httplib.py", line 836, in connect
self.timeout, self.source_address)
File "C:\Python27\lib\socket.py", line 557, in create_connection
for res in getaddrinfo(host, port, 0, SOCK_STREAM):
File "C:\Python27\lib\encodings\idna.py", line 164, in encode
result.append(ToASCII(label))
File "C:\Python27\lib\encodings\idna.py", line 76, in ToASCII
label = nameprep(label)
File "C:\Python27\lib\encodings\idna.py", line 38, in nameprep
raise UnicodeError("Invalid character %r" % c)
UnicodeError: Invalid character u'\x9f'
I haven't even managed to figure out what character u'\x9f' is.
Any ideas how I can fix that function to get the filecontents?
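One observation on that traceback: u'\x9f' is the middle byte of the UTF-8 encoding of 知 (E7 9F A5), the first character of the hostname, which suggests the URL that actually reached urlopen was UTF-8 text misdecoded as Latin-1 rather than the percent-encoded form shown above. Under that assumption, a Python 2 sketch that repairs the string and encodes the host for DNS (repair_and_encode is a hypothetical helper, not part of the original code):
# -*- coding: utf-8 -*-
import urllib
import urlparse

def repair_and_encode(url):
    # Assumption: url is a unicode string of UTF-8 bytes misdecoded as Latin-1.
    fixed = url.encode('latin-1').decode('utf-8')
    parts = urlparse.urlsplit(fixed)
    host = parts.hostname.encode('idna')             # punycode the hostname (ignores any port)
    path = urllib.quote(parts.path.encode('utf-8'))  # percent-encode non-ASCII path bytes
    return urlparse.urlunsplit((parts.scheme, host, path, parts.query, parts.fragment))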
