I am trying to scrape some Photos from different websites for my coding class.
I am using Beautiful Soup and urllib to do this.
Here is my code
import json
import time
from urllib.request import urlopen, Request
from urllib.request import urlretrieve
import urllib.request
import urllib.parse
import urllib.error
from bs4 import BeautifulSoup
import os
import re

site = "https://www.hollisterco.com/shop/us/guys-new-arrivals"
# Browser-like User-Agent: some servers reset connections from the default
# "Python-urllib" agent, which matches the ConnectionResetError seen below.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"}

req = Request(url=site, headers=headers)
html = urlopen(req, timeout=30)
bs = BeautifulSoup(html, 'html.parser')

# r'\.jpg' — the dot must be escaped; an unescaped '.' matches any character,
# so the original pattern also matched e.g. "ajpg" anywhere in the src.
images = bs.find_all('img', {'src': re.compile(r'\.jpg')})

count = 26
for image in images:
    # Resolve relative and protocol-relative src values ("//cdn...", "/img/...")
    # against the page URL; urlretrieve would otherwise fail on them.
    image_url = urllib.parse.urljoin(site, image["src"])
    try:
        # urlretrieve sends the default Python User-Agent and accepts no
        # headers, so download through a Request carrying the browser UA.
        with urlopen(Request(image_url, headers=headers), timeout=30) as resp:
            with open(str(count) + ".jpg", "wb") as out:
                out.write(resp.read())
        count += 1
    except (urllib.error.URLError, OSError, KeyError):
        # Skip images that fail to download. A bare `except:` would also
        # swallow KeyboardInterrupt and SystemExit.
        continue
This code works for some websites — www.vineyardvines.com, for example, worked just fine — but it doesn't work for www.hollisterco.com. What can I do to fix this? Here is the error I am getting for hollisterco.com:
Traceback (most recent call last):
File "C:/Users/momin/PycharmProjects/scraper/scraper.py", line 22, in <module>
html = urlopen(req, timeout=30).read().decode()
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 543, in _open
'_open', req)
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 1360, in https_open
context=self._context, check_hostname=self._check_hostname)
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 1320, in do_open
r = h.getresponse()
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 1336, in getresponse
response.begin()
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 306, in begin
version, status, reason = self._read_status()
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 267, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\socket.py", line 589, in readinto
return self._sock.recv_into(b)
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\ssl.py", line 1071, in recv_into
return self.read(nbytes, buffer)
File "C:\Users\momin\AppData\Local\Programs\Python\Python37\lib\ssl.py", line 929, in read
return self._sslobj.read(len, buffer)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
Okay, I figured out a solution.
Here is my advice:
Use selenium or requests for getting the content HTML data.
Related
This question already has answers here:
Sending "User-agent" using Requests library in Python
(3 answers)
Closed 6 days ago.
I am struggling to get reliable dividend data for my website so I have the following script to hit nasdaq.com:
import requests
dividends = "https://www.nasdaq.com/market-activity/stocks/hd/dividend-history"
# NOTE(review): no User-Agent header and no timeout — per the traceback below,
# this call blocks in recv_into until interrupted with Ctrl-C.
response = requests.get(dividends)
The script just hangs and does nothing, the traceback when you cancel it looks like this:
^CTraceback (most recent call last):
File "/home/cchilders/projects/stocks_backend/scripts/get_dividends_nasdaq_dot_com.py", line 5, in <module>
response = requests.get(dividends)
File "/home/cchilders/.local/lib/python3.10/site-packages/requests/api.py", line 73, in get
return request("get", url, params=params, **kwargs)
File "/home/cchilders/.local/lib/python3.10/site-packages/requests/api.py", line 59, in request
return session.request(method=method, url=url, **kwargs)
File "/home/cchilders/.local/lib/python3.10/site-packages/requests/sessions.py", line 587, in request
resp = self.send(prep, **send_kwargs)
File "/home/cchilders/.local/lib/python3.10/site-packages/requests/sessions.py", line 701, in send
r = adapter.send(request, **kwargs)
File "/home/cchilders/.local/lib/python3.10/site-packages/requests/adapters.py", line 489, in send
resp = conn.urlopen(
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 699, in urlopen
httplib_response = self._make_request(
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 445, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 440, in _make_request
httplib_response = conn.getresponse()
File "/usr/lib/python3.10/http/client.py", line 1374, in getresponse
response.begin()
File "/usr/lib/python3.10/http/client.py", line 318, in begin
version, status, reason = self._read_status()
File "/usr/lib/python3.10/http/client.py", line 279, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/usr/lib/python3.10/socket.py", line 705, in readinto
return self._sock.recv_into(b)
File "/usr/lib/python3.10/ssl.py", line 1273, in recv_into
return self.read(nbytes, buffer)
File "/usr/lib/python3.10/ssl.py", line 1129, in read
return self._sslobj.read(len, buffer)
KeyboardInterrupt
The test script works and shows 200 response:
response = requests.get("https://www.google.com")
print(response)
Does this mean that the site has blocked the requests module and other libraries from connecting, or is there something else I can do? I cannot find dividend data going back more than a few years from any site except this one.
The site has blocked some user-agents. Try using the user-agent of a browser:
import requests

# The site never responds to the default "python-requests" User-Agent, so the
# call blocks forever; a browser-style UA gets an immediate 200.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0'
}
dividends = "https://www.nasdaq.com/market-activity/stocks/hd/dividend-history"
# timeout guards against the request hanging indefinitely if the server still
# withholds its response — the exact failure mode described above.
response = requests.get(dividends, headers=headers, timeout=30)
print(response)
Result:
<Response [200]>
I am trying to get some news from this website https://www.onvista.de, using my code I previously used on other websites, but in this case it doesn't work. Can anyone please tell me why?
import urllib.request, urllib.error, urllib.parse, datetime, os
url = 'https://www.onvista.de'
# NOTE(review): urlopen sends the default "Python-urllib/3.8" User-Agent; the
# traceback below shows the server dropping the connection
# (RemoteDisconnected) — presumably UA-based bot blocking; confirm by setting
# a browser User-Agent as in the answers.
response = urllib.request.urlopen(url)
webContent = response.read()
print(type(webContent))
Here is what I get back:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 542, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 502, in _call_chain
result = func(*args)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 1397, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 1358, in do_open
r = h.getresponse()
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\http\client.py", line 1344, in getresponse
response.begin()
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\http\client.py", line 307, in begin
version, status, reason = self._read_status()
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\http\client.py", line 276, in _read_status
raise RemoteDisconnected("Remote end closed connection without"
http.client.RemoteDisconnected: Remote end closed connection without response
>>> response = urllib.request.urlopen('https://www.onvista.de')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 542, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 502, in _call_chain
result = func(*args)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 1397, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\urllib\request.py", line 1358, in do_open
r = h.getresponse()
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\http\client.py", line 1344, in getresponse
response.begin()
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\http\client.py", line 307, in begin
version, status, reason = self._read_status()
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\http\client.py", line 276, in _read_status
raise RemoteDisconnected("Remote end closed connection without"
http.client.RemoteDisconnected: Remote end closed connection without response
Thank you for any advice; I am using Python 3.8 on Windows 10.
You should put headers in a request.
import requests

url = 'https://www.onvista.de'
# Browser-style User-Agent: the server closes the connection on the default
# python-requests agent (RemoteDisconnected above).
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:101.0) Gecko/20100101 Firefox/101.0'}
# timeout prevents an indefinite hang if the server never answers.
response = requests.get(url, headers=headers, timeout=30)  # <Response [200]>
The website seems to block user agents that are identified as bots. You can set a custom user agent so the server will accept your request.
# Build a urllib Request with an explicit browser User-Agent so the server
# accepts it instead of rejecting the default "Python-urllib" agent.
req = urllib.request.Request(
url,
data=None,
headers={
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}
)
...
Hello I am trying to scrape some data from a website and request.get() is getting caught up on something.
here is my code:
page_url = front_end+str(i)+'/'
# NOTE(review): no User-Agent header and no timeout — the answer below
# suggests the server is stalling requests from the default agent.
page = requests.get(page_url)
so I want it to be a string, because I am just entering an url and if I stop the code or it runs too long I get something like:
File "/usr/local/lib/python3.6/site-packages/urllib3/connectionpool.py",
line 377, in _make_request
httplib_response = conn.getresponse(buffering=True)
TypeError: getresponse() got an unexpected keyword argument 'buffering'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "main.py", line 24, in <module>
page = requests.get(page_url)
File "/usr/local/lib/python3.6/site-packages/requests/api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "/usr/local/lib/python3.6/site-packages/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/local/lib/python3.6/site-packages/requests/sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "/usr/local/lib/python3.6/site-packages/requests/sessions.py", line 646, in send
r = adapter.send(request, **kwargs)
File "/usr/local/lib/python3.6/site-packages/requests/adapters.py", line 449, in send
timeout=timeout
File "/usr/local/lib/python3.6/site-packages/urllib3/connectionpool.py", line 600, in urlopen
chunked=chunked)
File "/usr/local/lib/python3.6/site-packages/urllib3/connectionpool.py", line 380, in _make_request
httplib_response = conn.getresponse()
File "/usr/local/lib/python3.6/http/client.py", line 1331, in getresponse
response.begin()
File "/usr/local/lib/python3.6/http/client.py", line 297, inbegin
version, status, reason = self._read_status()
File "/usr/local/lib/python3.6/http/client.py", line 258, in_read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/usr/local/lib/python3.6/socket.py", line 586, in readinto
return self._sock.recv_into(b)
File "/usr/local/lib/python3.6/ssl.py", line 1002, in recv_into
return self.read(nbytes, buffer)
File "/usr/local/lib/python3.6/ssl.py", line 865, in read
return self._sslobj.read(len, buffer)
File "/usr/local/lib/python3.6/ssl.py", line 625, in read
v = self._sslobj.read(len, buffer)
I do not understand what the TypeError: getresponse() got an unexpected keyword argument 'buffering' means or how to fix it.
Answer:
Sometimes requests from requests.get() gets blocked by server, so solution is to make the server think the request is coming from a web browser.
Example:
# Present a browser-style User-Agent so the server treats the request as
# coming from a web browser rather than a script.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',}
# timeout keeps the call from hanging forever on an unresponsive server.
page = requests.get("https://example.com", headers=headers, timeout=30)
I want to download images from an URL link which has a random component in it, so i have generated a code to do the same, but i'm getting an error -
Code:
import urllib.request
import random
random_number=random.randint(500,600)
# BUG (explained in the answer below): the extra single quotes inside the
# double-quoted literal make the URL scheme literally 'https, so urllib
# raises "unknown url type: 'https".
url_image="'https://csgostash.com/img/skins/s"+str(random_number)+"fn.png'"
image=urllib.request.urlretrieve(url_image, 'skin.png')
Error:
Traceback (most recent call last):
File "C:/Users/luke/Desktop/scraper/test image download/cs test.py", line 8, in <module>
image=urllib.request.urlretrieve(url_image, 'skin.png')
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 187, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 162, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 465, in open
response = self._open(req, data)
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 488, in _open
'unknown_open', req)
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 443, in _call_chain
result = func(*args)
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1310, in unknown_open
raise URLError('unknown url type: %s' % type)
urllib.error.URLError: <urlopen error unknown url type: 'https>
First, url_image has odd syntax: the string literal contains an extra pair of single quotes.
url_image="https://csgostash.com/img/skins/s"+str(random_number)+"fn.png"
If you fix this, you will get a 403 — that is protection against bots. Use a user agent:
import urllib.request
import random

random_number = random.randint(500, 600)
# No stray quotes inside the literal: the scheme must be exactly "https".
url_image = "https://csgostash.com/img/skins/s" + str(random_number) + "fn.png"

# The site answers 403 to the default Python User-Agent, so send a browser one.
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
headers = {'User-Agent': user_agent}
req = urllib.request.Request(url_image, None, headers)
print(url_image)

# Context manager ensures the connection is closed even if read() raises.
with urllib.request.urlopen(req) as response:
    the_page = response.read()
print(the_page)
Edit: of course, you may save it to a file:
# Persist the downloaded bytes; "wb" because the payload is binary PNG data.
with open('skin.png', 'wb') as f:
    f.write(the_page)
Check out this project using requests.
Hi!
I tried to open a web page that opens normally in a browser, but Python just throws an error and refuses to work.
import urllib.request, urllib.error
# NOTE(review): fails with HTTP 400 (traceback below); the answers indicate
# the server rejects the default Python-urllib User-Agent — set a browser UA.
f = urllib.request.urlopen('http://www.booking.com/reviewlist.html?cc1=tr;pagename=sapphire')
And another way
import urllib.request, urllib.error

opener = urllib.request.build_opener()
# The URL must be one unbroken string literal — the original paste split it
# across two lines, which is a SyntaxError.
f = opener.open('http://www.booking.com/reviewlist.html?cc1=tr;pagename=sapphire')
Both options give one type of error:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Python34\lib\urllib\request.py", line 461, in open
response = meth(req, response)
File "C:\Python34\lib\urllib\request.py", line 571, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python34\lib\urllib\request.py", line 493, in error
result = self._call_chain(*args)
File "C:\Python34\lib\urllib\request.py", line 433, in _call_chain
result = func(*args)
File "C:\Python34\lib\urllib\request.py", line 676, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "C:\Python34\lib\urllib\request.py", line 461, in open
response = meth(req, response)
File "C:\Python34\lib\urllib\request.py", line 571, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python34\lib\urllib\request.py", line 499, in error
return self._call_chain(*args)
File "C:\Python34\lib\urllib\request.py", line 433, in _call_chain
result = func(*args)
File "C:\Python34\lib\urllib\request.py", line 579, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 400: Bad Request
Any ideas?
They are probably blocking the fact that it isn't coming from a browser. You probably need a valid User-Agent header or something.
Using requests, this works:
import requests

# Browser-style User-Agent: the server answers 400 Bad Request to the
# default Python agent (see the traceback above).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'
}
# timeout keeps the call from hanging on an unresponsive server.
r = requests.get('http://www.booking.com/reviewlist.html?cc1=tr;pagename=sapphire', headers=headers, timeout=30)
# Python 3 print calls — the original used Python 2 `print r` statements,
# and split `headers =` from its dict across a newline (both SyntaxErrors).
print(r)
print(r.headers)
This URL seems to be doing user agent string checking. If I adjust my user agent string in Firefox to Python-urllib/2.7, it fails with the Bad Request you are seeing.
As you are using urllib, you can adjust the User Agent following this tutorial
from urllib.request import FancyURLopener

# NOTE: FancyURLopener is deprecated since Python 3.3; for new code prefer
# urllib.request.Request with an explicit headers dict (as shown above).
class MyOpener(FancyURLopener):
    # Overriding `version` changes the User-Agent string urllib sends.
    # The original paste lost this line's indentation, a SyntaxError.
    version = 'My new User-Agent'  # Set this to a string you want for your user agent

myopener = MyOpener()
page = myopener.open('http://www.booking.com/reviewlist.html?cc1=tr;pagename=sapphire')