Python: improvements for rotating proxy crawler

I wrote a piece of code to rotate proxies for a multithreaded crawler, but it doesn't look very good and I want to see what I can improve.
What I had in mind:
1) Make a number of requests (a random range) with a proxy, then change it.
2) If blocked, change the proxy (remove it from the proxies list) and retry.
3) If an HTTP error occurs, retry with the same proxy.
4) If a proxy error occurs, change the proxy (remove it from the proxies list) and retry.
Usually it works pretty decently, though I see some problems that may appear:
1) The make_request function calls itself, which in some cases may lead to an infinite loop.
2) Proxy errors are not handled properly.
Here is my code:
import requests
import threading
import random
import time
import logging
import os
class Crawler():

    def __init__(self):
        self.user_agents = []
        with open('user_agents.txt', 'r') as inpt:
            for line in inpt:
                if line.strip():
                    self.user_agents.append(line.strip())
        self.proxies = []
        with open('proxies.txt', 'r') as inpt:
            for line in inpt:
                if not line.strip():
                    continue
                self.proxies.append({"http": ''.join(["http://", line.strip()]),
                                     "https": ''.join(["https://", line.strip()])})
        self.headers = {'User-agent': random.choice(self.user_agents)}
        self.session = requests.Session()
        self.counter = 0
        self.current_proxy = None
        self.lock = threading.Lock()
        self.set_proxy()
    def make_request(self, method, url, **kwargs):
        """Request a page and return its content

        #method - string, POST or GET
        #url - string
        #return: string, HTML page source
                 or bytes for binary files
        """
        # make only 10 to 20 requests using a proxy
        with self.lock:
            if self.counter > random.randrange(10, 20):
                self.set_proxy()
            else:
                self.counter += 1
        try:
            if method == 'GET':
                if kwargs.get('download', False):
                    req = self.session.get(url,
                                           headers=self.headers,
                                           stream=True, verify=False)
                    return req.raw
                req = self.session.get(url,
                                       headers=self.headers,
                                       verify=False,
                                       **kwargs)
            else:
                req = self.session.post(url,
                                        headers=self.headers,
                                        verify=False,
                                        **kwargs)
            if req.status_code == 407:
                logging.exception('make_request[Proxy Authentication]')
                os._exit(1)
            if req.encoding not in ['utf8', 'utf-8', None]:
                html = req.content.decode(req.encoding)
            else:
                html = req.text
            if 'Access Denied' in html:
                # website's error message. proxy blocked
                with self.lock:
                    self.set_proxy(remove_proxy=True)
                time.sleep(1)
                return self.make_request(method, url, **kwargs)
            else:
                return html
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 403:
                # access forbidden. proxy blocked
                with self.lock:
                    self.set_proxy(remove_proxy=True)
                time.sleep(1)
                return self.make_request(method, url, **kwargs)
            elif e.response.status_code == 404:
                logging.exception(' '.join([
                    'make_request[HTTPError]',
                    url, str(e)]))
                return
            elif e.response.status_code == 429:
                # too many requests. proxy blocked
                with self.lock:
                    self.set_proxy(remove_proxy=True)
                time.sleep(1)
                return self.make_request(method, url, **kwargs)
            else:
                logging.exception(' '.join([
                    'make_request[unknown HTTPError]',
                    url, str(e)]))
                return None
        except requests.exceptions.InvalidURL as e:
            logging.exception(' '.join([
                'make_request[InvalidURL]',
                url, str(e)]))
            return None
        except requests.exceptions.Timeout:
            time.sleep(1)
            return self.make_request(method, url, **kwargs)
        except requests.exceptions.ConnectionError as e:
            # Connection refused
            if '403 Forbidden' in str(e):
                logging.exception('make_requests[403 forbidden]')
                os._exit(1)
            with self.lock:
                self.set_proxy()
            time.sleep(1)
            return self.make_request(method, url, **kwargs)
        except Exception as e:
            logging.exception(' '.join([
                'make_request[unknown Exception]',
                url, str(e)]))
            return None
    def set_proxy(self, remove_proxy=False):
        """Get a random proxy from the list"""
        if remove_proxy:
            try:
                self.proxies.remove(self.current_proxy)
            except:
                pass
        while True:
            if self.proxies:
                proxy = random.choice(self.proxies)
                if not self.is_alive(proxy):
                    continue
                self.current_proxy = proxy
                self.session = requests.Session()
                self.session.proxies = self.current_proxy
                self.headers = {'User-agent': random.choice(self.user_agents)}
                self.counter = 0
                break
            else:
                logging.exception('EMPTY PROXY LIST')
                os._exit(1)
                break
    def is_alive(self, proxy):
        """Check if a proxy is alive or not

        #proxy - dict
        #return: True if alive, False otherwise
        """
        try:
            requests.get('http://www.google.com',
                         proxies=proxy, timeout=5)
            return True
        except:
            return False
Thanks
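For problem 1, the recursive retries can be replaced with a capped loop so the call depth can never grow without bound. Here is a minimal sketch of the pattern (the function and parameter names are assumptions, not part of the class above; rotate_proxy stands in for set_proxy):

import time
import requests

def request_with_bounded_retries(session, url, rotate_proxy, max_retries=5):
    # Loop instead of recursion: at most max_retries attempts, then give up.
    for attempt in range(max_retries):
        try:
            resp = session.get(url, timeout=10)
            resp.raise_for_status()
            return resp.text
        except (requests.exceptions.Timeout,
                requests.exceptions.ConnectionError):
            rotate_proxy()      # e.g. switch to the next proxy before retrying
            time.sleep(1)       # small back-off between attempts
    return None                 # explicit failure after max_retries attempts

The same structure fits inside make_request itself; the point is that the attempt counter, not the call stack, bounds the retries.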

Related

How to use Funcy "print_durations" in grequests function?

I've never used this type of module or function. I want to know how I can time and print the time taken to find the "response 200" link.
##################### START MODUL PARSE #######################

def my_except_handler(request, exception):
    request.url

def check_for_errors(response, *args, **kwargs):
    try:
        response
    except response.exceptions.RequestException as e:
        pass
    except response.exceptions.HTTPError as e:
        pass
    except response.ReadTimeout as e:
        pass
    except response.ConnectionError as e:
        pass
    except response.ConnectTimeout:
        pass

def do_parse(response, *args, **kwargs):
    url = response.url
    if response.status_code == 200:
        response.request.url
        url_parse = response.request.url
        response.text
        try:
            if "<font color=#25ff00>" in response.text:
                print("STOP SCAN PARSE Time SCAN HERE NEED PRINT TIME")
        except Exception as e:
            pass

def get_urls_file(site):
    urls = []
    config_file = "Configurations/package.json"
    config = config_file = json.loads(open(f'{config_file}').read())
    par_s = config['parse']
    for x in par_s:
        urls.append(f'{site}{x}')

def get_data_file(urls):
    actions_list = []
    for url in urls:
        action_item = grequests.get(url, headers=headers, timeout=6, stream=True, allow_redirects=False, hooks={'response': [do_parse, check_for_errors]})
        actions_list.append(action_item)
    grequests.map(actions_list, size=30, exception_handler=my_except_handler)

get_data_file(urls)
Also, I have this example:
from funcy import print_durations

@print_durations()
def myfunc(n=0):
    for i in range(n):
        pass

myfunc(123)
myfunc(123456789)
But if I try to put "@print_durations()" like here:
@print_durations()
def do_parse(response, *args, **kwargs):
I still get the print output for the whole file:
0.00 ns in do_env(<Response [500]>, timeout=6, verify=True, proxies=OrderedDict(), stream=False, cert=None)
0.00 ns in do_env(<Response [500]>, timeout=6, verify=True, proxies=OrderedDict(), stream=False, cert=None)
..... ..... .....
187.51 ms in do_parse(<Response [200]>, timeout=6, verify=True, proxies=OrderedDict(), stream=False, cert=None)
but I need to print the time just here:
print(f"STOP SCAN PARSE Time SCAN ("HERE NEED PRINT TIME")
Thanks for helping me.
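If the goal is only to print how long the scan ran before that one match, a plain timer is enough and the funcy decorator is not strictly needed. A small sketch (scan_started is an assumed module-level variable set just before grequests.map runs):

import time

scan_started = time.perf_counter()   # set this right before grequests.map(...)

def do_parse(response, *args, **kwargs):
    if response.status_code == 200 and "<font color=#25ff00>" in response.text:
        elapsed = time.perf_counter() - scan_started
        print(f"STOP SCAN PARSE, found after {elapsed:.2f}s: {response.url}")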

Why am I getting UnboundLocalError?

import requests
from bs4 import BeautifulSoup
from random import choice
def get_proxy():
    url = "https://free-proxy-list.net/"
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'lxml')
    return {'https': choice(list(map(lambda x: x[0] + ':' + x[1], list(zip(map(lambda x: x.text, soup.findAll('td')[::8]), map(lambda x: x.text, soup.findAll('td')[1::8]))))))}

def proxy_request(request_type, url, **kwargs):
    while 1:
        try:
            proxy = get_proxy()
            print("using proxy: {}".format(proxy))
            r = requests.request(request_type, url, proxies=proxy, timeout=7, **kwargs)
            num = soup.find_all('h1')
            print(num.text())
            break
        except:
            pass
        return r

r = proxy_request('get', "https://en.wikipedia.org/wiki/Main_Page")
print(r.text())
and I am getting this error.
The problem is your return is in your while loop. When you break, you skip return, therefore r is None:
def proxy_request(request_type, url, **kwargs):
    while 1:
        try:
            proxy = get_proxy()
            print("using proxy: {}".format(proxy))
            r = requests.request(request_type, url, proxies=proxy, timeout=7, **kwargs)
            num = soup.find_all('h1')
            print(num.text())
            break
        except:
            pass
    return num  # return it here, outside of the while loop
You'll also want to consider the case where num might not be found, and add a way to break out so you avoid an infinite loop.
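A bounded version of that loop might look like the sketch below (max_tries and the final return of None are assumptions, not part of the answer above; get_proxy comes from the question):

import requests

def proxy_request(request_type, url, max_tries=5, **kwargs):
    result = None
    for attempt in range(max_tries):        # bounded instead of while 1
        try:
            proxy = get_proxy()
            print("using proxy: {}".format(proxy))
            result = requests.request(request_type, url, proxies=proxy,
                                      timeout=7, **kwargs)
            break                           # success: stop retrying
        except requests.RequestException:
            continue                        # bad proxy: try the next one
    return result                           # None if every attempt failed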

Accessing a class's variable in Python

I understand that this is a duplicate, but I haven't had that "ah-ha" moment where I understand HOW to access a class's variable. In this code, I am crawling a website from a list of thousands of pages. Those jobs are submitted via concurrent.futures.
I want to be able to return the value of "results". I've used self.results within def __init__(self, url_list, threads) and I can't seem to pull that variable when I try print(example.results).
If self.results is returning a value, but example.results isn't pulling it from if __name__ == '__main__':, how can I access it? I know I've done something wrong, but I don't know what it is.
from concurrent.futures import ThreadPoolExecutor
from proxy_def import *
import requests
from bs4 import BeautifulSoup
from parsers import *

site = 0

class ConcurrentListCrawler(object):

    def __init__(self, url_list, threads):
        self.urls = url_list
        self.results = {}
        self.max_threads = threads

    def __make_request(self, url):
        try:
            r = requests.get(url=url, timeout=20)
            r.raise_for_status()
            print(countit(), r.url)
        except requests.exceptions.Timeout:
            r = requests.get(url=url, timeout=60)
        except requests.exceptions.ConnectionError:
            r = requests.get(url=url, timeout=60)
        except requests.exceptions.RequestException as e:
            raise e
        return r.url, r.text

    def __parse_results(self, url, html):
        try:
            print(url)
            trip_data = restaurant_parse(url)
        except Exception as e:
            raise e
        if trip_data:
            print('here we go')
            self.results = trip_data
            #print(self.results)
        return self.results

    def wrapper(self, url):
        url, html = self.__make_request(url)
        self.__parse_results(url, html)

    def run_script(self):
        with ThreadPoolExecutor(max_workers=min(len(self.urls), self.max_threads)) as Executor:
            jobs = [Executor.submit(self.wrapper, u) for u in self.urls]

if __name__ == '__main__':
    listo = loadit()
    print(listo)
    print(len(listo))
    example = ConcurrentListCrawler(listo, 10)
    example.run_script()
    print(example.results)
Any pointers would be greatly appreciated.
I believe one of your methods is not returning the results.
Make the following change.
def wrapper(self, url):
    url, html = self.__make_request(url)
    return self.__parse_results(url, html)
After this, I suggest you use self.results as a dictionary, as it was declared.
In the __parse_results(..) method, add trip_data to self.results as follows, instead of assigning it.
def __parse_results(self, url, html):
    try:
        print(url)
        trip_data = restaurant_parse(url)
    except Exception as e:
        raise e
    if trip_data:
        print('here we go')
        self.results[url] = trip_data
        #print(self.results)
    return self.results
When you add to self.results this way, it retains the older values, and you avoid replacing them by reassignment.
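One more point, since several worker threads can call __parse_results at the same time: guarding the shared dict with a lock is a cheap precaution. A sketch (the _lock attribute and _store_result helper are assumptions, not part of the answer above):

import threading

class ConcurrentListCrawler(object):

    def __init__(self, url_list, threads):
        self.urls = url_list
        self.results = {}
        self.max_threads = threads
        self._lock = threading.Lock()       # protects self.results

    def _store_result(self, url, trip_data):
        # Called from worker threads; serialize writes to the shared dict.
        with self._lock:
            self.results[url] = trip_data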
The issue was that I submitted all the jobs at once through a list. I was unable to pull the variable from the class with print(example.results) because that part of the code isn't reached until all jobs are complete. I was able to resolve it by getting rid of the class (even though the title of this posting suggests the class was the issue).
from concurrent.futures import ThreadPoolExecutor
import concurrent
from proxy_def import *
import requests
from bs4 import BeautifulSoup
from parsers import *

site = 0

def load_url(url):
    try:
        print(countit(), url)
        trip_data = restaurant_parse(url)
        return trip_data
    except Exception as e:
        raise e

if __name__ == '__main__':
    URLs = loadit()
    #print(URLs)
    #print(len(URLs))
    with ThreadPoolExecutor(max_workers=10) as executor:
        # start the load operations and mark each future with its URL
        future_to_url = {executor.submit(load_url, url): url for url in URLs}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                print('this is data', data)
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
Here, I can pull the dictionary by grabbing data.
Thanks for the help, everyone.

HTTP client async calls with delay

I'm using the Tornado httpclient.HTTPRequest library to send async requests, but I need to add a delay between requests.
This means, let's say I configure RPS (requests per second) = 5; then I send a request every 0.2 seconds, but asynchronously. How can I send the requests asynchronously without waiting for each request's response?
This is my code:
def process_campaign(self, campaign_instance):
    ioloop.IOLoop.current().run_sync(lambda: start_campaign(campaign_instance))

@gen.coroutine
def start_campaign(campaign_instance):
    ...
    while True:
        try:
            log.info("start_campaign() Requests in Queue: {}".format(len(web_requests)))
            web_request = web_requests.pop()
            time.sleep(delay)
            headers = {'Content-Type': 'application/json'}
            request = httpclient.HTTPRequest(auth_username=settings.api_account,
                                             auth_password=settings.api_password,
                                             url=settings.api_url,
                                             body=json.dumps(web_request),
                                             headers=headers,
                                             request_timeout=15,
                                             method="POST")
            response = yield http_client.fetch(request)
        except httpclient.HTTPError, e:
            log.exception("start_campaign() " + str(e))
        except IndexError:
            log.info('start_campaign() Campaign web requests completed. Errors {}'.format(api_errors))
            break
But it seems to wait for the HTTP response before proceeding.
You can try:
class WebRequest(RequestHandler):

    def __init__(self, web_request):
        self.delay = 0
        self.web_request = web_request

    @asynchronous
    def post(self):
        IOLoop.instance().add_timeout(self.delay, self._process)

    @gen.coroutine
    def _process(self):
        try:
            http_client = httpclient.AsyncHTTPClient()
            log.info("start_campaign() Web request: {}".format(self.web_request))
            headers = {'Content-Type': 'application/json'}
            request = httpclient.HTTPRequest(auth_username=settings.api_account,
                                             auth_password=settings.api_password,
                                             url=settings.api_url,
                                             body=json.dumps(self.web_request),
                                             headers=headers,
                                             request_timeout=15,
                                             method="POST")
            response = yield http_client.fetch(request)
        except Exception, exception:
            log.exception(exception)
Re-use your while loop:
while True:
    try:
        web_request = web_requests.pop()
        time.sleep(delay)
        client = WebRequest(web_request)
        client.post()
    except IndexError:
        break
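One caveat with both snippets: time.sleep blocks Tornado's IOLoop, so nothing else runs during the delay. A sketch of a non-blocking variant (assuming Tornado 4.x-style coroutines; web_requests, delay and settings come from the question, and the key point is starting each fetch without yielding it immediately):

import json
from tornado import gen, httpclient

@gen.coroutine
def start_campaign(web_requests, delay):
    http_client = httpclient.AsyncHTTPClient()
    futures = []
    while web_requests:
        body = web_requests.pop()
        request = httpclient.HTTPRequest(url=settings.api_url,
                                         body=json.dumps(body),
                                         method="POST",
                                         request_timeout=15)
        # Start the fetch but do not wait for it, so requests overlap.
        futures.append(http_client.fetch(request))
        yield gen.sleep(delay)              # non-blocking pause between requests
    responses = yield futures               # wait for all responses at the end
    raise gen.Return(responses)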

How do I simulate connection errors and request timeouts in python unit tests

Suppose my Django/Flask application pulls in information from APIs; how can I test that connection exceptions are caught and handled properly?
So for example here is a function that calls an API:
import requests

def call_the_api():
    url = 'http://httpbin.org/get'
    try:
        req = requests.get(url)
        if req.json().get('errors'):
            logger.warn("API error response")
            return {'request_error': 'api_error_response'}
    except requests.exceptions.ConnectionError:
        logger.warn('ConnectionError')
        return {'request_error': 'ConnectionTimeout'}
    except requests.exceptions.Timeout:
        logger.warn('API request timed out')
        return {'request_error': 'Timeout'}
    except Exception, ex:
        logger.warn("API request Exception: %s", ex)
        return {'request_error': ex}
    else:
        return req.json()
For testing responses from the API I found mock to be very useful.
def mock_get_request():
    response = requests.get.return_value
    json_file = 'sample_response.json'
    json_file_path = os.path.join(os.path.dirname(__file__), json_file)
    with open(json_file_path, 'r') as f:
        response.content = response.text = f.read()
    response.status_code = 200
    response.encoding = 'utf-8'
    response.json = lambda: json.loads(response.content.decode(response.encoding))
    response.url = u'%s' % args[0]
    return response

class TestSuitabilityFunctions(TestCase):

    def test_call_the_api(self):
        requests.get = MagicMock(side_effect=mock_get_request)
        resp = call_the_api()
        self.assertEqual(resp.get('url'), "http://httpbin.org/get")
So my question is how would I go about simulating a connection timeout or error?
Untested code but...
def connection_error():
    raise requests.exceptions.ConnectionError

class TestSuitabilityFunctions(TestCase):

    @patch.object(module_that_youre_testing, "requests")
    def test_connection_error(self, mock_requests):
        mock_requests.get = MagicMock(side_effect=connection_error)
        with self.assertRaises(requests.exceptions.ConnectionError) as cm:
            resp = call_the_api()
        exception = cm.exception
        self.assertEqual(resp, {'request_error': 'ConnectionTimeout'})
... or similar should do the trick. Off the top of my head I can't remember how assertRaises interacts with errors that are caught. Maybe you don't even need the assertRaises part.
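Since call_the_api catches the exceptions itself, a variant without assertRaises may be closer to what is being tested: set side_effect to the exception class and assert on the returned dict. A sketch using unittest.mock (the my_module name is a placeholder for wherever call_the_api lives):

from unittest import TestCase
from unittest.mock import patch
import requests

import my_module    # hypothetical module that defines call_the_api


class TestErrorHandling(TestCase):

    @patch("my_module.requests.get", side_effect=requests.exceptions.ConnectionError)
    def test_connection_error_is_handled(self, mock_get):
        # call_the_api should swallow the exception and return its error dict.
        self.assertEqual(my_module.call_the_api(),
                         {'request_error': 'ConnectionTimeout'})

    @patch("my_module.requests.get", side_effect=requests.exceptions.Timeout)
    def test_timeout_is_handled(self, mock_get):
        self.assertEqual(my_module.call_the_api(),
                         {'request_error': 'Timeout'})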
