I'm trying to get some data from a web page. To speed this process up (the site allows me to make 1000 requests per minute), I use a ThreadPool.
Since there is a huge amount of data, the process is quite vulnerable to connection failures and the like, so I try to log everything I can in order to detect every mistake in my code.
The problem is that the program sometimes just stops without any exception (it acts as if it is still running, but has no effect; I use PyCharm). I catch and log exceptions everywhere I can, but I can't see any exception in any log.
I assumed that if a timeout were reached, an exception would be raised and logged.
I've found out where the problem could be. Here is the code:
As a pool, I use: from multiprocessing.pool import ThreadPool as Pool
And lock: from threading import Lock
The download_category function is called in a loop.
def download_category(url):
    # some code
    #
    # ...
    try:
        log('Create pool...')
        _pool = Pool(_workers_number)
        with open('database/temp_produkty.txt') as f:
            log('Spracovavanie produktov... vytvaranie vlakien...')  # ("Processing products... creating threads...") I see this in log
            for url_product in f:
                x = _pool.apply_async(process_product, args=(url_product.strip('\n'), url))
        _pool.close()
        _pool.join()
        log('Presuvanie produktov z temp export do export.csv...')  # ("Moving products from temp export to export.csv...") I can't see this in log
        temp_export_to_export_csv()
        set_spracovanie_kategorie(url)
    except Exception as e:
        logging.exception('Got exception on download_one_category: {}'.format(url))
And the process_product function:
def process_product(url, cat):
    try:
        data = get_product_data(url)
    except:
        log('{}: {} exception while getting product data... #')  # I don't see this in log
        return
    try:
        print_to_temp_export(data, cat)  # I don't see this in log
    except:
        log('{}: {} exception while printing to csv... #')  # I don't see this in log
        raise
LOG function:
def log(text):
    now = datetime.now().strftime('%d.%m.%Y %H:%M:%S')
    _lock.acquire()
    mLib.printToFile('logging/log.log', '{} -> {}'.format(now, text))
    _lock.release()
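Incidentally, the same helper can be written with a with statement, which guarantees the lock is released even if printToFile raises (just a sketch of the equivalent form; mLib.printToFile is my own helper):
import threading
from datetime import datetime

def log(text):
    now = datetime.now().strftime('%d.%m.%Y %H:%M:%S')
    with _lock:  # released automatically, even if printToFile raises
        mLib.printToFile('logging/log.log', '{} -> {}'.format(now, text))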
I use the logging module too. In that log I can see that the request was apparently sent 8 times (the number of workers), but no answer was ever received.
EDIT1:
def get_product_data(url):
    data = defaultdict(lambda: '-')
    root = load_root(url)
    try:
        nazov = root.xpath('//h1[@itemprop="name"]/text()')[0]
    except:
        nazov = root.xpath('//h1/text()')[0]
    under_block = root.xpath('//h2[@id="lowest-cost"]')
    if len(under_block) < 1:
        under_block = root.xpath('//h2[contains(text(),"Naj")]')
    if len(under_block) < 1:
        return False
    data['nazov'] = nazov
    data['url'] = url
    blocks = under_block[0].xpath('./following-sibling::div[@class="shp"]/div[contains(@class,"shp")]')
    i = 0
    for block in blocks:
        i += 1
        data['dat{}_men'.format(i)] = block.xpath('.//a[@class="link"]/text()')[0]
    del root
    return data
LOAD ROOT:
class RedirectException(Exception):
    pass


def load_url(url):
    r = requests.get(url, allow_redirects=False)  # note: no timeout is passed, so this call can block indefinitely
    if r.status_code == 301:
        raise RedirectException
    if r.status_code == 404:
        if '-q-' in url:
            url = url.replace('-q-', '-')
            mLib.printToFileWOEncoding('logging/neexistujuce.txt', 'Skusanie {} kategorie...'.format(url))  # ("Trying category {}...")
            return load_url(url)  # THIS IS NOT LOOPING
        else:
            mLib.printToFileWOEncoding('logging/neexistujuce.txt', '{}'.format(url))
    html = r.text
    return html


def load_root(url):
    try:
        html = load_url(url)
    except Exception as e:
        logging.exception('load_root_exception')
        raise
    return etree.fromstring(html, etree.HTMLParser())
Related
I need a script that works like a cPanel checker, with more than one URL, where the URLs are stored in a txt file.
usage: python script.py list.txt
format in file list.txt: https://demo.cpanel.net:2083|democom|DemoCoA5620
This is my code, but it doesn't work. Can someone help me?
Thanks.
import requests, sys
from multiprocessing.dummy import Pool as ThreadPool
from timeit import default_timer as timer  # timer() is used in Main() below

try:
    with open(sys.argv[1], 'r') as f:
        list_data = [line.strip() for line in f if line.strip()]
except IOError:
    pass

def cpanel(url):
    try:
        data = {'user': 'democom', 'pass': 'DemoCoA5620'}
        r = requests.post(url, data=data)
        if r.status_code == 200:
            print "login success"
        else:
            print "login failed"
    except:
        pass

def chekers(url):
    try:
        cpanel(url)
    except:
        pass

def Main():
    try:
        start = timer()
        pp = ThreadPool(25)
        pr = pp.map(chekers, list_data)
        print('Time: ' + str(timer() - start) + ' seconds')
    except:
        pass

if __name__ == '__main__':
    Main()
I fixed your code so that it returns an actual array of booleans indicating the success of the cpanel function for each URL.
from __future__ import print_function
import requests
from multiprocessing.pool import ThreadPool

try:
    list_data = ["https://demo.cpanel.net:2083|democom|DemoCoA5620",
                 "https://demo.cpanel.net:2083|UserDoesNotExist|WRONGPASSWORD",
                 ]
except IOError:
    pass

def cpanel(url):
    try:
        # try to split that url to get username / password
        try:
            url, username, password = url.split('|')
        except Exception as e:
            print("Url {} seems to have wrong format. Concrete error: {}".format(url, e))
            return False
        # build the correct url
        url += '/login/?login_only=1'
        # build post parameters
        params = {'user': username,
                  'pass': password}
        # make request
        r = requests.post(url, params)
        if r.status_code == 200:
            print("login for user {} success".format(username))
            return True
        else:
            print("login for user {} failed due to Status Code {} and message \"{}\"".format(username, r.status_code, r.reason))
            return False
    except Exception as e:
        print("Error occurred for url {}: {}".format(url, e))
        return False

def chekers(url):
    return cpanel(url)

def Main():
    try:
        # start = timer()
        pp = ThreadPool(1)
        pr = pp.map(chekers, list_data)
        print(pr)
        # print('Time: ' + str(timer() - start) + ' seconds')
    except:
        pass

if __name__ == '__main__':
    Main()
Output:
login for user democom success
login for user UserDoesNotExist failed due to Status Code 401 and message "Access Denied"
[True, False]
Be aware that I replaced your file read operation with some fixed URLs.
Since you use requests.post, I guess you actually want to POST something to those URLs. Your code does not do that. If you just want to send a request, use the requests.get method.
See the official documentation for the requests package for more details: https://2.python-requests.org/en/master/user/quickstart/#make-a-request
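For illustration, the two calls look roughly like this (a minimal sketch; the URLs are just placeholders):
import requests

# POST: send form data in the request body (what a login usually needs)
r = requests.post('https://example.com/login', data={'user': 'name', 'pass': 'secret'})

# GET: just fetch the resource, optionally with query parameters
r = requests.get('https://example.com/page', params={'q': 'search term'})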
Also note that
"but it doesn't work"
is NOT a question.
I understand that this is a duplicate, but I haven't had that "ah-ha" moment where I understand HOW to access a class's variable. In this code, I am crawling a website from a list of thousands of pages. Those jobs are submitted via concurrent.futures.
I want to be able to return the value of "results". I've set self.results within def __init__(self, url_list, threads), and I can't seem to pull that variable when I try print(example.results).
If self.results is getting a value, but example.results isn't pulling it from inside if __name__ == '__main__':, how can I access it? I know I've done something wrong, but I don't know what it is.
from concurrent.futures import ThreadPoolExecutor
from proxy_def import *
import requests
from bs4 import BeautifulSoup
from parsers import *

site = 0

class ConcurrentListCrawler(object):

    def __init__(self, url_list, threads):
        self.urls = url_list
        self.results = {}
        self.max_threads = threads

    def __make_request(self, url):
        try:
            r = requests.get(url=url, timeout=20)
            r.raise_for_status()
            print(countit(), r.url)
        except requests.exceptions.Timeout:
            r = requests.get(url=url, timeout=60)
        except requests.exceptions.ConnectionError:
            r = requests.get(url=url, timeout=60)
        except requests.exceptions.RequestException as e:
            raise e
        return r.url, r.text

    def __parse_results(self, url, html):
        try:
            print(url)
            trip_data = restaurant_parse(url)
        except Exception as e:
            raise e
        if trip_data:
            print('here we go')
            self.results = trip_data
            #print(self.results)
        return self.results

    def wrapper(self, url):
        url, html = self.__make_request(url)
        self.__parse_results(url, html)

    def run_script(self):
        with ThreadPoolExecutor(max_workers=min(len(self.urls), self.max_threads)) as Executor:
            jobs = [Executor.submit(self.wrapper, u) for u in self.urls]

if __name__ == '__main__':
    listo = loadit()
    print(listo)
    print(len(listo))
    example = ConcurrentListCrawler(listo, 10)
    example.run_script()
    print(example.results)
Any pointers would be greatly appreciated.
I believe one of your methods is not returning the results.
Make the following change.
def wrapper(self, url):
    url, html = self.__make_request(url)
    return self.__parse_results(url, html)
After this, I suggest you use self.results as a dictionary, as it was declared.
In the method __parse_results(..), add trip_data to self.results as follows, instead of assigning it:
def __parse_results(self, url, html):
    try:
        print(url)
        trip_data = restaurant_parse(url)
    except Exception as e:
        raise e
    if trip_data:
        print('here we go')
        self.results[url] = trip_data
        #print(self.results)
    return self.results
When you add entries to self.results this way, it retains the older values, and you avoid replacing them by reassignment.
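With that change, usage would look something like the sketch below (it reuses loadit and run_script from the question; run_script only returns once the ThreadPoolExecutor with-block has waited for all submitted jobs):
example = ConcurrentListCrawler(loadit(), 10)
example.run_script()                      # blocks until every submitted job has finished
for url, trip_data in example.results.items():
    print(url, trip_data)                 # one entry per successfully parsed URL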
The issue was that I submitted all the jobs at once through a list. I was unable to pull the variable from the class with print(example.results) because that part of the code isn't reached until all jobs are complete. I was able to resolve this by getting rid of the class (even though the title of this posting indicates that the class was the issue).
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
from proxy_def import *
import requests
from bs4 import BeautifulSoup
from parsers import *

site = 0

def load_url(url):
    try:
        print(countit(), url)
        trip_data = restaurant_parse(url)
        return trip_data
    except Exception as e:
        raise e

if __name__ == '__main__':
    URLs = loadit()
    #print(URLs)
    #print(len(URLs))
    with ThreadPoolExecutor(max_workers=10) as executor:
        # start the load operations and mark each future with its URL
        future_to_url = {executor.submit(load_url, url): url for url in URLs}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                print('this is data', data)
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
Here, I can pull the dictionary by grabbing data.
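If you want everything in a single dictionary keyed by URL, the same loop can collect the results (just a sketch built on the code above):
results = {}
for future in concurrent.futures.as_completed(future_to_url):
    url = future_to_url[future]
    try:
        results[url] = future.result()    # trip_data returned by load_url
    except Exception as exc:
        print('%r generated an exception: %s' % (url, exc))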
Thanks for the help, everyone.
I wrote this crawler in Python; it dumps several parameters to a JSON output file based on the input list of domains.
I have this question:
Do I need to close the HTTP connection in each thread? The input data is about 5 million items. It processes at the beginning at a rate of about 50 iterations per second, but later, after some time, it drops to 1-2 per second and/or hangs (no kernel messages and no errors on stdout). Is this code-related or network-related? I suspect the software, since when I restart it, it starts again at a high rate (about 50 iterations per second).
Any tips on how to improve the code below are also welcome, especially to improve speed and crawling throughput.
Code in question:
import re                       # needed for re.search below
import sys                      # needed for sys.exit below
from threading import Thread    # needed for the worker threads below
import urllib2
import pprint
from tqdm import tqdm
import lxml.html
from Queue import Queue
from geoip import geolite2
import pycountry
from tld import get_tld

resfile = open("out.txt", 'a')

concurrent = 200

def doWork():
    while True:
        url = q.get()
        status = getStatus(url)
        doSomethingWithResult(status)
        q.task_done()

def getStatus(ourl):
    try:
        response = urllib2.urlopen("http://"+ourl)
        peer = response.fp._sock.fp._sock.getpeername()
        ip = peer[0]
        header = response.info()
        html = response.read()
        html_element = lxml.html.fromstring(html)
        generator = html_element.xpath("//meta[@name='generator']/@content")
        try:
            match = geolite2.lookup(ip)
            if match is not None:
                country = match.country
                try:
                    c = pycountry.countries.lookup(country)
                    country = c.name
                except:
                    country = ""
        except:
            country = ""
        try:
            res = get_tld("http://www"+ourl, as_object=True)
            tld = res.suffix
        except:
            tld = ""
        try:
            match = re.search(r'[\w\.-]+@[\w\.-]+', html)
            email = match.group(0)
        except:
            email = ""
        try:
            item = generator[0]
            val = "{ \"Domain\":\"http://"+ourl.rstrip()+"\",\"IP:\""+ip+"\"," + "\"Server\":"+ "\""+str(header.getheader("Server")).replace("None","")+"\",\"PoweredBy\":" + "\""+str(header.getheader("X-Powered-By")).replace("None","")+"\""+",\"MetaGenerator\":\""+item+"\",\"Email\":\""+email+"\",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        except:
            val = "{ \"Domain\":\"http://"+ourl.rstrip()+"\",\"IP:\""+ip+"\"," + "\"Server\":"+ "\""+str(header.getheader("Server")).replace("None","")+"\",\"PoweredBy\":" + "\""+str(header.getheader("X-Powered-By")).replace("None","")+"\""+",\"MetaGenerator\":\"\",\"Email\":\""+email+"\",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        return val
    except Exception as e:
        #print "error"+str(e)
        pass

def doSomethingWithResult(status):
    if status:
        resfile.write(str(status)+"\n")

q = Queue(concurrent * 2)
for i in range(concurrent):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()

try:
    for url in tqdm(open('list.txt')):
        q.put(url.strip())
        status = open("status.txt", 'w')
        status.write(str(url.strip()))
    q.join()
except KeyboardInterrupt:
    sys.exit(1)
Update 1:
Closing the socket and file descriptor makes it work better: it no longer seems to hang after some time. Performance is about 50 req/s on a home laptop and about 100 req/s on a VPS.
from threading import Thread
import httplib, sys
import re                       # needed for re.search below
import urllib2
import pprint
from tqdm import tqdm
import lxml.html
from Queue import Queue
from geoip import geolite2
import pycountry
from tld import get_tld
import json

resfile = open("out.txt", 'a')

concurrent = 200

def doWork():
    while True:
        url = q.get()
        status = getStatus(url)
        doSomethingWithResult(status)
        q.task_done()

def getStatus(ourl):
    try:
        response = urllib2.urlopen("http://"+ourl)
        realsock = response.fp._sock.fp._sock
        peer = response.fp._sock.fp._sock.getpeername()
        ip = peer[0]
        header = response.info()
        html = response.read()
        realsock.close()
        response.close()
        html_element = lxml.html.fromstring(html)
        generator = html_element.xpath("//meta[@name='generator']/@content")
        try:
            match = geolite2.lookup(ip)
            if match is not None:
                country = match.country
                try:
                    c = pycountry.countries.lookup(country)
                    country = c.name
                except:
                    country = ""
        except:
            country = ""
        try:
            res = get_tld("http://www"+ourl, as_object=True)
            tld = res.suffix
        except:
            tld = ""
        try:
            match = re.search(r'[\w\.-]+@[\w\.-]+', html)
            email = match.group(0)
        except:
            email = ""
        try:
            item = generator[0]
            val = "{ \"Domain\":"+json.dumps("http://"+ourl.rstrip())+",\"IP\":\""+ip+"\",\"Server\":"+json.dumps(str(header.getheader("Server")).replace("None",""))+",\"PoweredBy\":" +json.dumps(str(header.getheader("X-Powered-By")).replace("None",""))+",\"MetaGenerator\":"+json.dumps(item)+",\"Email\":"+json.dumps(email)+",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        except:
            val = "{ \"Domain\":"+json.dumps("http://"+ourl.rstrip())+",\"IP\":\""+ip+"\"," + "\"Server\":"+json.dumps(str(header.getheader("Server")).replace("None",""))+",\"PoweredBy\":" +json.dumps(str(header.getheader("X-Powered-By")).replace("None",""))+",\"MetaGenerator\":\"\",\"Email\":"+json.dumps(email)+",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        return val
    except Exception as e:
        print "error"+str(e)
        pass

def doSomethingWithResult(status):
    if status:
        resfile.write(str(status)+"\n")

q = Queue(concurrent * 2)
for i in range(concurrent):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()

try:
    for url in tqdm(open('list.txt')):
        q.put(url.strip())
        status = open("status.txt", 'w')
        status.write(str(url.strip()))
    q.join()
except KeyboardInterrupt:
    sys.exit(1)
The handles will be automatically garbage collected, but you will be better off closing them yourself, especially as you are doing this in a tight loop.
You also asked for suggestions for improvement. A big one would be to stop using urllib2 and start using requests instead.
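A rough sketch of what the download part could look like with requests (only the fetching is shown; requests does not expose the peer address directly, so the geoip part would need the IP from a separate lookup, e.g. socket.gethostbyname):
import socket
import requests

def fetch(ourl):
    # requests pools and closes connections for you; headers is a case-insensitive dict
    r = requests.get("http://" + ourl.rstrip(), timeout=10)
    try:
        server = r.headers.get("Server", "")
        powered_by = r.headers.get("X-Powered-By", "")
        html = r.text
    finally:
        r.close()  # explicit close is cheap and keeps a tight loop tidy
    ip = socket.gethostbyname(ourl.rstrip())  # approximation of the peer address
    return ip, server, powered_by, html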
There are many possible reasons why your crawling rate drops.
1.) Take care not to crawl too much data from the same domain. Some web servers are configured to allow only one connection per IP address in parallel.
2.) Try to send randomized browser-like HTTP headers (user-agent, referrer, ...) to avoid triggering the web server's scraping protection, if any is set (see the sketch after this list).
3.) Use a mature parallel HTTP library, like pycurl (it has MultiCurl) or requests (grequests). They perform faster for sure.
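For points 2 and 3, a rough sketch with requests: a Session reuses connections, and the headers are randomized per request (the user-agent strings are placeholders; with many worker threads, one Session per thread is the safer choice):
import random
import requests

USER_AGENTS = [
    # keep a longer, up-to-date list in practice
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0",
]

session = requests.Session()  # keeps a connection pool, so repeated requests are cheaper

def fetch(url):
    headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept-Language": "en-US,en;q=0.9",
    }
    return session.get(url, headers=headers, timeout=10)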
I've read many answers; however, I have not found a proper solution.
The problem: I'm reading mixed/replace HTTP streams that will not expire or end by default.
You can try it yourself using curl:
curl http://agent.mtconnect.org/sample\?interval\=0
So now I'm using Python threads and requests to read data from multiple streams.
import requests
import uuid
from threading import Thread

tasks = ['http://agent.mtconnect.org/sample?interval=5000',
         'http://agent.mtconnect.org/sample?interval=10000']

thread_id = []

def http_handler(thread_id, url, flag):
    print 'Starting task %s' % thread_id
    try:
        requests_stream = requests.get(url, stream=True, timeout=2)
        for line in requests_stream.iter_lines():
            if line:
                print line
                if flag and line.endswith('</MTConnectStreams>'):
                    # Wait until XML message end is reached to receive the full message
                    break
    except requests.exceptions.RequestException as e:
        print('error: ', e)
    except BaseException as e:
        print e

if __name__ == '__main__':
    for task in tasks:
        uid = str(uuid.uuid4())
        thread_id.append(uid)
        t = Thread(target=http_handler, args=(uid, task, False), name=uid)
        t.start()
    print thread_id
    # Wait Time X or until user is doing something
    # Send flag = to desired thread to indicate the loop should stop after reaching the end.
Any suggestions? What is the best solution? I don't want to kill the thread, because I would like to read to the end and get a full XML message.
I found a solution using the threading module and threading.Event. It may not be the best solution, but it works fine for now.
import logging
import threading
import time
import uuid

import requests

logging.basicConfig(level=logging.DEBUG, format='(%(threadName)-10s) %(message)s', )

tasks = ['http://agent.mtconnect.org/sample?interval=5000',
         'http://agent.mtconnect.org/sample?interval=10000']

d = dict()

def http_handler(e, url):
    logging.debug('wait_for_event starting')
    message_buffer = []
    filter_namespace = True
    try:
        requests_stream = requests.get(url, stream=True, timeout=2)
        for line in requests_stream.iter_lines():
            if line:
                message_buffer.append(line)
                if e.isSet() and line.endswith('</MTConnectStreams>'):
                    logging.debug(len(message_buffer))
                    break
    except requests.exceptions.RequestException as e:
        print('error: ', e)
    except BaseException as e:
        print e

if __name__ == '__main__':
    logging.debug('Waiting before calling Event.set()')
    for task in tasks:
        uid = str(uuid.uuid4())
        e = threading.Event()
        d[uid] = {"stop_event": e}
        t = threading.Thread(name=uid,
                             target=http_handler,
                             args=(e, task))
        t.start()

    logging.debug('Waiting 3 seconds before calling Event.set()')
    for key in d:
        time.sleep(3)
        logging.debug(threading.enumerate())
        logging.debug(d[key])
        d[key]['stop_event'].set()
    logging.debug('bye')
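If you want to wait explicitly for each stream to read up to the closing </MTConnectStreams> tag, rather than relying on the interpreter waiting for the non-daemon threads at exit, you can join the workers after setting the events (a small optional addition, not part of the code above):
# wait for every worker to finish its current XML document
for t in threading.enumerate():
    if t is not threading.current_thread():
        t.join()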
I wrote a little tool that gathers data from Facebook using the API. The tool uses the multiprocessing, queue and httplib modules. Here is a part of the code:
main process:
def extract_and_save(args):
    put_queue = JoinableQueue()
    get_queue = Queue()
    for index in range(args.number_of_processes):
        process_name = u"facebook_worker-%s" % index
        grabber = FacebookGrabber(get_queue=put_queue, put_queue=get_queue, name=process_name)
        grabber.start()

    friend_list = get_user_friends(args.default_user_id, ["id"])
    for index, friend_id in enumerate(friend_list):
        put_queue.put(friend_id)

    put_queue.join()

    if not get_queue.empty():
        ... save to database ...
    else:
        logger.info(u"There is no data to save")
worker process:
class FacebookGrabber(Process):
    def __init__(self, *args, **kwargs):
        self.connection = httplib.HTTPSConnection("graph.facebook.com", timeout=2)
        self.get_queue = kwargs.pop("get_queue")
        self.put_queue = kwargs.pop("put_queue")
        super(FacebookGrabber, self).__init__(*args, **kwargs)
        self.daemon = True

    def run(self):
        while True:
            friend_id = self.get_queue.get(block=True)
            try:
                friend_obj = self.get_friend_obj(friend_id)
            except Exception, e:
                logger.info(u"Friend id %s: facebook responded with an error (%s)", friend_id, e)
            else:
                if friend_obj:
                    self.put_queue.put(friend_obj)
            self.get_queue.task_done()
common code:
def get_json_from_facebook(connection, url, kwargs=None):
    url_parts = list(urlparse.urlparse(url))
    query = dict(urlparse.parse_qsl(url_parts[4]))
    if kwargs:
        query.update(kwargs)
    url_parts[4] = urllib.urlencode(query)
    url = urlparse.urlunparse(url_parts)
    try:
        connection.request("GET", url)
    except Exception, e:
        print "<<<", e
    response = connection.getresponse()
    data = json.load(response)
    return data
This code works perfectly on Ubuntu. But when I tried to run it on Windows 7, I got the message "There is no data to save". The problem is here:
try:
    connection.request("GET", url)
except Exception, e:
    print "<<<", e
I get the following error: <<< a float is required
Does anybody know how to fix this problem?
Python version: 2.7.5
One of the "gotcha's" that occasionally happens with socket timeout values is that most operating systems expect them as floats. I believe this has been accounted for with later versions of the linux kernel.
Try changing:
self.connection = httplib.HTTPSConnection("graph.facebook.com", timeout=2)
to:
self.connection = httplib.HTTPSConnection("graph.facebook.com", timeout=2.0)
That's 2 seconds, by the way. The default is typically 5 seconds, so 2 might be a little low.