python scraping max retries exceeded with url - python

I am trying to scrape a website (~3000 pages) once a day. My code for the requests is:
#Return soup of page of Carsales ads
def getsoup(pagelimit,offset):
    url = "http://www.carsales.com.au/cars/results?q=%28Service%3D%5BCarsales%5D%26%28%28%28SiloType%3D%5BDemo%20and%20near%20new%20cars%5D%7CSiloType%3D%5BDealer%20used%20cars%5D%29%7CSiloType%3D%5BDemo%20and%20near%20new%20cars%5D%29%7CSiloType%3D%5BPrivate%20seller%20cars%5D%29%29&cpw=1&sortby={0}&limit={1}&offset={2}".format('Price',str(pagelimit),str(offset))
    #Sortby options: LastUpdated,Price,
    r = requests.get(url, headers)
    soup = BeautifulSoup(r.text, "html5lib") #"html.parser"
    totalpages = int(soup.find("div", class_="pagination").text.split(' of ',1)[1].split('\n', 1)[0])
    currentpage = int(soup.find("div", class_="pagination").text.split('Page ',1)[1].split(' of', 1)[0])
    return (soup, totalpages, currentpage)
adscrape = []
#Run through all search result pages, appending ads to adscrape
while currentpage < totalpages:
    soup, totalpages, currentpage = getsoup(pagelimit,offset)
    print 'Page: {0} of {1}. Offset is {2}.'.format(currentpage,totalpages,offset)
    adscrape.extend(getpageads(soup,offset))
    offset = offset+pagelimit
    # sleep(1)
I have previously run it successfully with no sleep() call to rate-limit the requests. Now, however, I get errors midway through execution, and it does this whether or not sleep(1) is active in the code:
...
Page: 1523 of 2956. Offset is 91320.
Page: 1524 of 2966. Offset is 91380.
Page: 1525 of 2956. Offset is 91440.
Traceback (most recent call last):
File "D:\Google Drive\pythoning\carsales\carsales_v2.py", line 82, in <module>
soup, totalpages, currentpage = getsoup(pagelimit,offset)
File "D:\Google Drive\pythoning\carsales\carsales_v2.py", line 28, in getsoup
r = requests.get(url, headers)
File "C:\Python27\lib\site-packages\requests\api.py", line 69, in get
return request('get', url, params=params, **kwargs)
File "C:\Python27\lib\site-packages\requests\api.py", line 50, in request
response = session.request(method=method, url=url, **kwargs)
File "C:\Python27\lib\site-packages\requests\sessions.py", line 468, in request
resp = self.send(prep, **send_kwargs)
File "C:\Python27\lib\site-packages\requests\sessions.py", line 576, in send
r = adapter.send(request, **kwargs)
File "C:\Python27\lib\site-packages\requests\adapters.py", line 423, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='www.carsales.com.au', port=80): Max retries exceeded with url: /cars/results?q=%28Service%3D%5BCarsales%5D%26%28%28%28SiloType%3D%5BDemo%20and%20near%20new%20cars%5D%7CSiloType%3D%5BDealer%20used%20cars%5D%29%7CSiloType%3D%5BDemo%20and%20near%20new%20cars%5D%29%7CSiloType%3D%5BPrivate%20seller%20cars%5D%29%29&cpw=1&sortby=Price&limit=60&offset=91500&user-agent=Mozilla%2F5.0 (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x000000001A097710>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed',))
<<< Process finished. (Exit code 1)
I'm assuming this is due to hitting the server with too many requests in a short time. If so, how can I avoid it without making the script take hours and hours to run? What is the normal practice when scraping websites to work around this issue?
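A common mitigation (not from this thread, just a sketch) is to reuse a single requests.Session with an automatic retry/backoff policy, and to catch ConnectionError yourself so a transient DNS or network hiccup does not kill a 3000-page run. get_with_backoff below is a hypothetical helper and the retry numbers are illustrative:

import time
import requests
from requests.adapters import HTTPAdapter
# On older requests builds the Retry class lives at requests.packages.urllib3.util.retry
from urllib3.util.retry import Retry

session = requests.Session()  # one session reuses TCP connections across all pages
retry = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount("http://", HTTPAdapter(max_retries=retry))
session.mount("https://", HTTPAdapter(max_retries=retry))

def get_with_backoff(url, headers=None, attempts=3, pause=5):
    """Fetch a URL, sleeping and retrying when the connection itself fails."""
    for attempt in range(attempts):
        try:
            return session.get(url, headers=headers, timeout=30)
        except requests.exceptions.ConnectionError:
            if attempt == attempts - 1:
                raise
            time.sleep(pause * (attempt + 1))  # wait a little longer each time

With a pattern like this the fixed sleep(1) between pages can usually be dropped; the script only slows down when the server or the network actually pushes back.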

Related

Max retries exceeded with URL (Caused by NewConnectionError)

I am trying to create code that scrapes and downloads specific files from archive.org. When I run the program, I run into this error.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\ROMS\Gamecube\main.py", line 16, in <module>
response = requests.get(DOMAIN + file_link)
File "C:\Users\cycle\AppData\Local\Programs\Python\Python38-32\lib\site-packages\requests\api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\cycle\AppData\Local\Programs\Python\Python38-32\lib\site-packages\requests\api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\cycle\AppData\Local\Programs\Python\Python38-32\lib\site-packages\requests\sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\cycle\AppData\Local\Programs\Python\Python38-32\lib\site-packages\requests\sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "C:\Users\cycle\AppData\Local\Programs\Python\Python38-32\lib\site-packages\requests\adapters.py", line 516, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='archive.org007%20-%20agent%20under%20fire%20%28usa%29.nkit.gcz', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x043979B8>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
This is my code:
from bs4 import BeautifulSoup as bs
import requests

DOMAIN = 'https://archive.org'
URL = 'https://archive.org/download/GCRedumpNKitPart1'
FILETYPE = '%28USA%29.nkit.gcz'

def get_soup(url):
    return bs(requests.get(url).text, 'html.parser')

for link in get_soup(URL).find_all('a'):
    file_link = link.get('href')
    if FILETYPE in file_link:
        print(file_link)
        with open(link.text, 'wb') as file:
            response = requests.get(DOMAIN + file_link)
            file.write(response.content)
You simply forgot the / after https://archive.org, so you build incorrect URLs.
Add / at the end of the domain:
DOMAIN = 'https://archive.org/'
or add the / when you build the URL:
response = requests.get(DOMAIN + '/' + file_link)
or use urllib.parse.urljoin() to build the URLs:
import urllib.parse
response = requests.get(urllib.parse.urljoin(DOMAIN, file_link))
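For illustration only (the hrefs below are hypothetical, not taken from the question), urljoin inserts the missing separator and resolves relative or absolute paths correctly, which plain string concatenation does not:

from urllib.parse import urljoin

print(urljoin('https://archive.org', 'download/some-file.nkit.gcz'))
# https://archive.org/download/some-file.nkit.gcz
print(urljoin('https://archive.org/download/GCRedumpNKitPart1', '/download/other-file.nkit.gcz'))
# https://archive.org/download/other-file.nkit.gcz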

Python requests: using proxy but get connect 'Connection aborted'

My VPN works well and can reach Google (I am connecting from China).
The SOCKS5 port of my VPN is 1080.
But when I run the following code, I get an error:
import requests
headers = {'user-agent': ''}
proxies = {"http": "socks5://127.0.0.1:1080",'https': 'socks5://127.0.0.1:1080'}
# url = 'https://www.baidu.com/'
url = 'https://www.google.com/search?q=python' #
res = requests.get(url, headers=headers, proxies=proxies)
print("res.status_code:\n",res.status_code)
If I remove proxies=proxies and change the URL to Baidu, it works:
...
url = 'https://www.baidu.com/'
# url = 'https://www.google.com/search?q=python'
res = requests.get(url, headers=headers)
print("res.status_code:\n",res.status_code)
The error from the snippet that uses the proxy:
Traceback (most recent call last):
File "Try.py", line 17, in <module>
res = requests.get(url, headers=headers, proxies=proxies)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/requests-2.22.0-py3.7.egg/requests/api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/requests-2.22.0-py3.7.egg/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/requests-2.22.0-py3.7.egg/requests/sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/requests-2.22.0-py3.7.egg/requests/sessions.py", line 646, in send
r = adapter.send(request, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/requests-2.22.0-py3.7.egg/requests/adapters.py", line 498, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', OSError(0, 'Error'))
These observations contradict each other: the VPN itself can reach Google, yet the request through the proxy fails. I don't really know where the problem is. I'd be extremely grateful if someone can help.
Solved by substituting socks5 with socks5h in the proxy URLs.
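A minimal sketch of the fix: with the socks5h:// scheme, hostname resolution is done by the proxy instead of locally (SOCKS support requires the PySocks extra, i.e. pip install requests[socks]):

import requests

# socks5h:// = let the SOCKS proxy resolve DNS; socks5:// resolves locally.
proxies = {
    "http": "socks5h://127.0.0.1:1080",
    "https": "socks5h://127.0.0.1:1080",
}
res = requests.get("https://www.google.com/search?q=python",
                   headers={"user-agent": ""},
                   proxies=proxies)
print("res.status_code:", res.status_code)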

Max retries exceeded with URL Error on using requests.get()

I'm trying to write a script in Python to send HTTP GET requests to automatically generated URLs and record each response code and elapsed time. The URLs need not necessarily be valid; 400 responses are acceptable too.
script1.py
import sys
import requests

str1="http://www.googl"
str3=".com"
str2='a'
for x in range(0, 8):
    y = chr(ord(str2)+x)
    str_s=str1+y+str3
    r=requests.get(str_s)
    print(str_s, r.status_code, r.elapsed.total_seconds())
Error:
File "script1.py", line 12, in <module><br>
r=requests.get(str_s)<br>
File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 72, in get<br>
return request('get', url, params=params, **kwargs)<br>
File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 58, in request<br>
return session.request(method=method, url=url, **kwargs)<br>
File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 508, in request<br>
resp = self.send(prep, **send_kwargs)<br>
File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 618, in send<br>
r = adapter.send(request, **kwargs)<br>
File "/usr/local/lib/python2.7/dist-packages/requests/adapters.py", line 508, in send<br>
raise ConnectionError(e, request=request)<br>
requests.exceptions.ConnectionError: HTTPConnectionPool(host='www.googla.com', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc44c891e50>: Failed to establish a new connection: [Errno -2] Name or service not known',))
I just want to see the time taken to receive the response for each request.
Only one request has to be sent per URL.
The response code does not matter.
I guess you want to get something like this:
import sys
import requests

str1="http://www.googl"
str3=".com"
str2='a'
for x in range(0, 8):
    y = chr(ord(str2)+x)
    str_s=str1+y+str3
    print('Connecting to ' + str_s)
    try:
        r = requests.get(str_s)
        print(str_s, r.status_code, r.elapsed.total_seconds())
    except requests.ConnectionError as e:
        print(" Failed to open url")
In this case, the try...except lets you catch the exception that requests.get() raises and handle it gracefully.
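Because r.elapsed only exists when a response actually arrives, one possible variation (a sketch, not part of the original answer) is to time the attempt yourself so that failed lookups also report a duration:

import time
import requests

def timed_get(url):
    """Return (status_code, seconds); status_code is None when the request fails."""
    start = time.time()
    try:
        r = requests.get(url, timeout=10)
        return r.status_code, time.time() - start
    except requests.ConnectionError:
        return None, time.time() - start

status, seconds = timed_get("http://www.googla.com")
print("http://www.googla.com", status, round(seconds, 3))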

SSLError: request module cannot connect via https

What am I missing?
Note: I've also tried using the urllib module.
import requests
import sys
import time
import random

headers = {"User-Agent": "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/25.0"}
url = "HTTP LINK TO YOUTUBE VIDEO"
views = 10
videoMins = 3
videoSec = 33
refreshRate = videoMins * 60 + videoSec

proxy_list = [
    {"http":"49.156.37.30:65309"}, {"http":"160.202.42.106:8080"},
    {"http":"218.248.73.193:808"}, {"http":"195.246.57.154:8080"},
    {"http":"80.161.30.156:80"}, {"http":"122.228.25.97:8101"},
    {"http":"165.84.167.54:8080"}, {"https":"178.140.216.229:8080"},
    {"https":"46.37.193.74:3128"}, {"https":"5.1.27.124:53281"},
    {"https":"196.202.194.127:62225"}, {"https":"194.243.194.51:8080"},
    {"https":"206.132.165.246:8080"}, {"https":"92.247.127.177:3128"}]
proxies = random.choice(proxy_list)

while True:
    for view in range(views): # to loop in the number of allocated views
        s = requests.Session()
        s.get(url, headers=headers, proxies=proxies, stream=True, timeout=refreshRate)
        s.close()
        time.sleep(60) # time between loops so we appear real
    sys.exit()
Here's the traceback error I got:
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "pytest.py", line 24, in <module>
s.get(url, headers=headers, proxies=proxies, stream=True,
timeout=refreshRate)
File "C:\Python\lib\site-packages\requests\sessions.py", line 521, in get
return self.request('GET', url, **kwargs)
File "C:\Python\lib\site-packages\requests\sessions.py", line 508, in
request
resp = self.send(prep, **send_kwargs)
File "C:\Python\lib\site-packages\requests\sessions.py", line 640, in send
history = [resp for resp in gen] if allow_redirects else []
File "C:\Python\lib\site-packages\requests\sessions.py", line 640, in
<listcomp>
history = [resp for resp in gen] if allow_redirects else []
File "C:\Python\lib\site-packages\requests\sessions.py", line 218, in
resolve_redirects
**adapter_kwargs
File "C:\Python\lib\site-packages\requests\sessions.py", line 618, in send
r = adapter.send(request, **kwargs)
File "C:\Python\lib\site-packages\requests\adapters.py", line 506, in send
raise SSLError(e, request=request)
requests.exceptions.SSLError: HTTPSConnectionPool(host='www.youtube.com',
port=443): Max retries exceed
ch?v=dHUP25DkKWo (Caused by SSLError(SSLError("bad handshake:
SysCallError(-1, 'Unexpected EOF')",),))
I suspect YouTube is behind the max retries, but it's confusing because I'm connecting via random proxies. If that's the case, maybe the proxies aren't working, or no HTTPS connection was made at all.
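One way to narrow that down (a sketch, not an answer from the thread) is to test each proxy individually with a short timeout, catching both SSL and connection errors. Note also that the proxies dict keys are scheme-based: entries that only define "http" are simply not used for an https:// URL like YouTube.

import requests

def proxy_works(proxy, test_url="https://www.youtube.com", timeout=10):
    """Return True if a GET through this proxy succeeds, False otherwise."""
    try:
        return requests.get(test_url, proxies=proxy, timeout=timeout).ok
    except (requests.exceptions.SSLError,
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout):
        return False

working = [p for p in proxy_list if proxy_works(p)]  # proxy_list as defined above
print("usable proxies:", working)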

How to move on if the error occur in response on python in beautiful Soup

I have made a web crawler that takes thousands of URLs from a text file and then crawls the data on each webpage.
Now that there are many URLs, some of them are broken,
and it gives me this error:
Traceback (most recent call last):
File "C:/Users/khize_000/PycharmProjects/untitled3/new.py", line 57, in <module>
crawl_data("http://www.foasdasdasdasdodily.com/r/126e7649cc-sweetssssie-pies-mac-and-cheese-recipe-by-the-dr-oz-show")
File "C:/Users/khize_000/PycharmProjects/untitled3/new.py", line 18, in crawl_data
data = requests.get(url)
File "C:\Python27\lib\site-packages\requests\api.py", line 67, in get
return request('get', url, params=params, **kwargs)
File "C:\Python27\lib\site-packages\requests\api.py", line 53, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Python27\lib\site-packages\requests\sessions.py", line 468, in request
resp = self.send(prep, **send_kwargs)
File "C:\Python27\lib\site-packages\requests\sessions.py", line 576, in send
r = adapter.send(request, **kwargs)
File "C:\Python27\lib\site-packages\requests\adapters.py", line 437, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='www.foasdasdasdasdodily.com', port=80): Max retries exceeded with url: /r/126e7649cc-sweetssssie-pies-mac-and-cheese-recipe-by-the-dr-oz-show (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x0310FCB0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed',))
Here's my code:
def crawl_data(url):
    global connectString
    data = requests.get(url)
    response = str( data )
    if response != "<Response [200]>":
        return
    soup = BeautifulSoup(data.text,"lxml")
    titledb = soup.h1.string
But it still gives me the same exception.
I simply want it to ignore the URLs that return no response
and move on to the next URL.
You need to learn about exception handling. The easiest way to ignore these errors is to surround the code that processes a single URL with a try-except construct, making your code read something like:
try:
    <process a single URL>
except requests.exceptions.ConnectionError:
    pass
This means that if the specified exception occurs, your program will just execute the pass (do nothing) statement and move on to the next URL.
Use try-except:
def crawl_data(url):
    global connectString
    try:
        data = requests.get(url)
    except requests.exceptions.ConnectionError:
        return
    response = str( data )
    soup = BeautifulSoup(data.text,"lxml")
    titledb = soup.h1.string
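For completeness, a hypothetical driver loop (urls.txt is an assumed filename, not from the question) that feeds crawl_data() so broken hosts are skipped and the run continues:

# Read one URL per line; crawl_data() returns early on connection errors.
with open("urls.txt") as f:
    for line in f:
        url = line.strip()
        if url:
            crawl_data(url)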
