I understand that this is a duplicate, but I havent had that "ah-ha" moment where I understand HOW to access the a classes variable. In this code, I am crawling a website from a list of thousands of pages. Those jobs are submitted via concurrent.futures.
I want to be able to return the value of "results". I've used self.results within def __init__(self, url_list, threads) and I cant seem to pull that variable when I try print(example.results.
If self.results is returning a value, but example.results isn't pulling it from if __name__ == '__main__':, how can you access that? I know I've done something wrong, but I don't know what it is.
from concurrent.futures import ThreadPoolExecutor
from proxy_def import *
import requests
from bs4 import BeautifulSoup
from parsers import *
site = 0
class ConcurrentListCrawler(object):
def __init__(self, url_list, threads):
self.urls = url_list
self.results = {}
self.max_threads = threads
def __make_request(self, url):
try:
r = requests.get(url=url, timeout=20)
r.raise_for_status()
print(countit(), r.url)
except requests.exceptions.Timeout:
r = requests.get(url=url, timeout=60)
except requests.exceptions.ConnectionError:
r = requests.get(url=url, timeout=60)
except requests.exceptions.RequestException as e:
raise e
return r.url, r.text
def __parse_results(self, url, html):
try:
print(url)
trip_data = restaurant_parse(url)
except Exception as e:
raise e
if trip_data:
print('here we go')
self.results = trip_data
#print(self.results)
return self.results
def wrapper(self, url):
url, html = self.__make_request(url)
self.__parse_results(url, html)
def run_script(self):
with ThreadPoolExecutor(max_workers=min(len(self.urls),self.max_threads)) as Executor:
jobs = [Executor.submit(self.wrapper, u) for u in self.urls]
if __name__ == '__main__':
listo = loadit()
print(listo)
print(len(listo))
example = ConcurrentListCrawler(listo, 10)
example.run_script()
print(example.results)
Any pointers would be greatly appreciated.
I believe one of your methods is not returning the results.
Make the following change.
def wrapper(self, url):
url, html = self.__make_request(url)
return self.__parse_results(url, html)
After this, I suggest you utilize the self.results as a dictionary, like it was declared.
In the method "__parse_results(..)", append trip_data to self.results as follows, instead of assigning.
def __parse_results(self, url, html):
try:
print(url)
trip_data = restaurant_parse(url)
except Exception as e:
raise e
if trip_data:
print('here we go')
self.results[url] = trip_data
#print(self.results)
return self.results
When you append to self.results, it would retain the older values and you may avoid replacing by reassignment.
The issue was that I submitted all the jobs at once through a list. I was unable to pull the variable from the class because print(example.results) because that part of the code isnt access until all jobs are complete. With that I was able to resolve by getting rid of the class (even though the title of this posting indicates that this is the issue).
from concurrent.futures import ThreadPoolExecutor
import concurrent
from proxy_def import *
import requests
from bs4 import BeautifulSoup
from parsers import *
site = 0
def load_url(url):
try:
print(countit(), url)
trip_data = restaurant_parse(url)
return trip_data
except Exception as e:
raise e
if __name__ == '__main__':
URLs = loadit()
#print(URLs)
#print(len(URLs))
with ThreadPoolExecutor(max_workers=10) as executor:
# start the load operations and mark each future with its URL
future_to_url = {executor.submit(load_url, url): url for url in URLs}
for future in concurrent.futures.as_completed(future_to_url):
url = future_to_url[future]
try:
data = future.result()
print('this is data', data)
except Exception as exc:
print('%r generated an exception: %s' % (url, exc))
Here, I can pull the dictionary by grabbing data.
Thanks for the help, everyone.
Related
When trying to send a function to the stream that parses the page and then executes the html.render, an error occurs:
Error: There is no current event loop in thread 'Thread-1 (take_proxy_us_spys_one_thread)
I started talking about a similar problem and realized that a friend here somehow managed to implement this. But I still get an error.
Here is my code which should be repeated all the time.
Help, please, to understand.
import urllib3
import requests
import time
from requests_html import HTMLSession
import threading
import fake_useragent
def take_proxy_us_spys_one(urls: list=[], header:dict = None,):
for url in urls:
try:
url_first = 'https://spys.one'
r = requests.get(url_first, headers=header)
cookies = r.cookies
session = HTMLSession()
r = session.post(url,
data={'xx00': '','xpp': '5','xf1': '0','xf2': '0','xf3': '0','xf4': '0', 'xf5': '0'},
headers=header,
cookies=cookies)
r.html.render(reload=False,)
print(str(r))
except Exception as exc:
print("Error: " + str(exc))
def take_proxy_us_spys_one_thread(event, sleeptime= 60, urls=[], lock = None):
while event.is_set():
try:
user = fake_useragent.UserAgent().random
header = {'User-Agent': user}
lock.acquire() if lock!=None else None
proxies_1 = take_proxy_us_spys_one(urls=urls, header=header)
lock.release() if lock != None else None
time.sleep(sleeptime)
except Exception as exc:
print("Error: " + str(exc))
time.sleep(sleeptime)
if __name__ == '__main__':
start_in_thread = True
urllib3.disable_warnings()
urls_spys_one = [
'https://spys.one/free-proxy-list/ALL/'
]
lock = threading.Lock()
event = threading.Event()
event.set()
t2 = threading.Thread(target=take_proxy_us_spys_one_thread, args=(event, 10, urls_spys_one, lock),).start()
I tried to implement the mechanism from here.
I've never used this type of module or function. I wanted to know how I can time and print the time taken to find the "response 200" link?
##################### START MODUL PARSE #######################
def my_except_handler(request, exception):
request.url
def check_for_errors(response, *args, **kwargs):
try:
response
except response.exceptions.RequestException as e:
pass
except response.exceptions.HTTPError as e:
pass
except response.ReadTimeout as e:
pass
except response.ConnectionError as e:
pass
except response.ConnectTimeout:
pass
def do_parse(response, *args, **kwargs):
url = response.url
if response.status_code == 200:
response.request.url
url_parse = response.request.url
response.text
try:
if "<font color=#25ff00>" in response.text:
print("STOP SCAN PARSE Time SCAN HERE NEED PRINT TIME")
except Exception as e:
pass
def get_urls_file(site):
urls = []
config_file = "Configurations/package.json"
config = config_file = json.loads(open(f'{config_file}').read())
par_s = config['parse']
for x in par_s:
urls.append(f'{site}{x}')
def get_data_file(urls):
actions_list = []
for url in urls:
action_item = grequests.get(url, headers=headers, timeout=6, stream=True, allow_redirects=False, hooks={'response': [do_parse, check_for_errors]})
actions_list.append(action_item)
grequests.map(actions_list,size=30, exception_handler=my_except_handler)
get_data_file(urls)
Also, i have this example
from funcy import print_durations
#print_durations()
def myfunc(n=0):
for i in range(n):
pass
myfunc(123)
myfunc(123456789)
but if i try to put " #print_durations() " like here:
#print_durations()
def do_parse(response, *args, **kwargs):
i get still the print for all file:
0.00 ns in do_env(<Response [500]>, timeout=6, verify=True, proxies=OrderedDict(), stream=False, cert=None)
0.00 ns in do_env(<Response [500]>, timeout=6, verify=True, proxies=OrderedDict(), stream=False, cert=None)
..... ..... .....
187.51 ms in do_parse(<Response [200]>, timeout=6, verify=True, proxies=OrderedDict(), stream=False, cert=None)
but i need to print the time just here:
print(f"STOP SCAN PARSE Time SCAN ("HERE NEED PRINT TIME")
Thanks for help me.
I have list of a lot of links and I want to use multiprocessing to speed the proccess, here is simplified version, I need it to be ordered like this:
I tried a lot of things, process, pool etc. I always had errors, I need to do it with 4 or 8 threads and make it ordered like this. Thank you for all help. Here is code:
from bs4 import BeautifulSoup
import requests
import time
links = ["http://www.tennisexplorer.com/match-detail/?id=1672704", "http://www.tennisexplorer.com/match-detail/?id=1699387", "http://www.tennisexplorer.com/match-detail/?id=1698990" "http://www.tennisexplorer.com/match-detail/?id=1696623", "http://www.tennisexplorer.com/match-detail/?id=1688719", "http://www.tennisexplorer.com/match-detail/?id=1686305"]
data = []
def essa(match, omega):
aaa = BeautifulSoup(requests.get(match).text, "lxml")
center = aaa.find("div", id="center")
p1_l = center.find_all("th", class_="plName")[0].find("a").get("href")
p2_l = center.find_all("th", class_="plName")[1].find("a").get("href")
return p1_l + " - " + p2_l + " - " + str(omega)
i = 1
start_time = time.clock()
for link in links:
data.append(essa(link, i))
i += 1
for d in data:
print(d)
print(time.clock() - start_time, "seconds")
Spawn several threads of the function and join them together:
from threading import Thread
def essa(match, omega):
aaa = BeautifulSoup(requests.get(match).text, "lxml")
center = aaa.find("div", id="center")
p1_l = center.find_all("th", class_="plName")[0].find("a").get("href")
p2_l = center.find_all("th", class_="plName")[1].find("a").get("href")
print p1_l + " - " + p2_l + " - " + str(omega)
if __name__ == '__main__':
threadlist = []
for index, url in enumerate(links):
t= Thread(target=essa,args=(url, index))
t.start()
threadlist.append(t)
for b in threadlist:
b.join()
You wont get them to print in order, for the simple reason that some http responses take longer than others.
As far I can understand you have the list of links and make requests concurrently to make the process faster. Here is the sample code for multithreading. I hope this will help you. Read the documentation for concurrent futures.
import concurrent.futures
import urllib.request
URLS = ['http://www.foxnews.com/',
'http://www.cnn.com/',
'http://europe.wsj.com/',
'http://www.bbc.co.uk/',
'http://some-made-up-domain.com/']
# Retrieve a single page and report the URL and contents
def load_url(url, timeout):
with urllib.request.urlopen(url, timeout=timeout) as conn:
return conn.read()
# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
# Start the load operations and mark each future with its URL
future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
for future in concurrent.futures.as_completed(future_to_url):
url = future_to_url[future]
try:
data = future.result()
except Exception as exc:
print('%r generated an exception: %s' % (url, exc))
else:
print('%r page is %d bytes' % (url, len(data)))
I'm trying to get some data from a web page. To speed up this process (they allow me to make 1000 requests per minute), I use ThreadPool.
Since there is a huge amount of data, the process is quite vulnerable to connection fails etc. so I try to log everything I can to be able to detect each mistake I did in code.
The problem is that program sometimes just stops without any exception (it acts like it is running but with no effect - I use PyCharm). I log catched exceptions everywhere I can but I can't see any exception in any log.
I assume that if there were a timeout reached, the exception would be raised and logged.
I've found out where the problem could be. Here is the code:
As a pool, I use: from multiprocessing.pool import ThreadPool as Pool
And lock: from threading import Lock
The download_category function is being used in loop.
def download_category(url):
# some code
#
# ...
log('Create pool...')
_pool = Pool(_workers_number)
with open('database/temp_produkty.txt') as f:
log('Spracovavanie produktov... vytvaranie vlakien...') # I see this in log
for url_product in f:
x = _pool.apply_async(process_product, args=(url_product.strip('\n'), url))
_pool.close()
_pool.join()
log('Presuvanie produktov z temp export do export.csv...') # I can't see this in log
temp_export_to_export_csv()
set_spracovanie_kategorie(url)
except Exception as e:
logging.exception('Got exception on download_one_category: {}'.format(url))
And process_product function:
def process_product(url, cat):
try:
data = get_product_data(url)
except:
log('{}: {} exception while getting product data... #') # I don't see this in log
return
try:
print_to_temp_export(data, cat) # I don't see this in log
except:
log('{}: {} exception while printing to csv... #') # I don't see this in log
raise
LOG function:
def log(text):
now = datetime.now().strftime('%d.%m.%Y %H:%M:%S')
_lock.acquire()
mLib.printToFile('logging/log.log', '{} -> {}'.format(now, text))
_lock.release()
I use logging module too. In this log, I see that probably 8 (number of workers) times request was sent but no answer hasn't been recieved.
EDIT1:
def get_product_data(url):
data = defaultdict(lambda: '-')
root = load_root(url)
try:
nazov = root.xpath('//h1[#itemprop="name"]/text()')[0]
except:
nazov = root.xpath('//h1/text()')[0]
under_block = root.xpath('//h2[#id="lowest-cost"]')
if len(under_block) < 1:
under_block = root.xpath('//h2[contains(text(),"Naj")]')
if len(under_block) < 1:
return False
data['nazov'] = nazov
data['url'] = url
blocks = under_block[0].xpath('./following-sibling::div[#class="shp"]/div[contains(#class,"shp")]')
i = 0
for block in blocks:
i += 1
data['dat{}_men'.format(i)] = eblock.xpath('.//a[#class="link"]/text()')[0]
del root
return data
LOAD ROOT:
class RedirectException(Exception):
pass
def load_url(url):
r = requests.get(url, allow_redirects=False)
if r.status_code == 301:
raise RedirectException
if r.status_code == 404:
if '-q-' in url:
url = url.replace('-q-','-')
mLib.printToFileWOEncoding('logging/neexistujuce.txt','Skusanie {} kategorie...'.format(url))
return load_url(url) # THIS IS NOT LOOPING
else:
mLib.printToFileWOEncoding('logging/neexistujuce.txt','{}'.format(url))
html = r.text
return html
def load_root(url):
try:
html = load_url(url)
except Exception as e:
logging.exception('load_root_exception')
raise
return etree.fromstring(html, etree.HTMLParser())
Suppose my django/flask application pulls in information from API's, how can I test that connection exceptions are caught and handled properly?
So for example here is a function that calls an API:
import requests
def call_the_api():
url = 'http://httpbin.org/get'
try:
req = requests.get(url)
if req.json().get('errors'):
logger.warn("API error response")
return {'request_error': 'api_error_response'}
except requests.exceptions.ConnectionError:
logger.warn('ConnectionError')
return {'request_error': 'ConnectionTimeout'}
except requests.exception.Timeout:
logger.warn('API request timed out')
return {'request_error': 'Timeout'}
except Exception, ex:
logger.warn("API request Exception: %s", ex)
return {'request_error': ex}
else:
return req.json()
For testing responses from the API I found mock to be very useful.
def mock_get_request():
response = requests.get.return_value
json_file = 'sample_response.json'
json_file_path = os.path.join(os.path.dirname(__file__), json_file)
with open(json_file_path, 'r') as f:
response.content = response.text = f.read()
response.status_code = 200
response.encoding = 'utf-8'
response.json = lambda: json.loads(response.content.decode(response.encoding))
response.url = u'%s' % args[0]
return response
class TestSuitabilityFunctions(TestCase):
def test_call_the_api(self):
requests.get = MagicMock(side_effect=mock_get_request)
resp = call_the_api()
self.assertEqual(resp.get('url'), "http://httpbin.org/get")
So my question is how would I go about simulating a connection timeout or error?
Untested code but...
def connection_error():
raise requests.exceptions.ConnectionError
class TestSuitabilityFunctions(TestCase):
#patch.object(module_that_youre_testing, "requests")
def test_connection_error(self, mock_requests):
mock_requests.get = MagicMock(side_effect=connection_error)
with self.assertRaises(requests.exceptions.ConnectionError) as cm:
resp = call_the_api()
exception = cm.exception
self.assertEqual(resp, {'request_error': 'ConnectionTimeout'})
... or similar should do the trick. Off the top of my head I can't remember how assertRaises interacts with errors that are caught. Maybe you don't even need the assertRaises part.