Python Multi-threaded App does not terminate - python

This my code which basically just takes a list of 94,000+ URLs, and collects the http_status codes for them:
#!/usr/bin/python3
import threading
from queue import Queue
import urllib.request
import urllib.parse
from http.client import HTTPConnection
import socket
import http.client
#import httplib
url_input = open("urls_prod_sort.txt", "r").read()
urls = url_input[:url_input.rfind('\n')].split('\n')
#urls = urls[:100]
url_502 = []
url_logs = []
url_502_lock = threading.Lock()
print_lock = threading.Lock()
def sendRequest(url_u, http_method = 'GET', data = None):
use_proxy = "http://xxxxxxxx:8080"
proxies = {"http": use_proxy}
proxy = urllib.request.ProxyHandler(proxies)
handler = urllib.request.HTTPHandler()
url = "http://" + url_u
with print_lock:
print(url)
opener = urllib.request.build_opener(proxy,handler)
urllib.request.install_opener(opener)
request = urllib.request.Request(url,data)
request.add_header("User-agent","| MSIE |")
request.get_method = lambda: http_method
try:
response = urllib.request.urlopen(request)
response_code = response.code
except urllib.error.HTTPError as error:
response_code = error.code
except urllib.error.URLError as e2:
response_code = 701
except socket.timeout as e3:
response_code = 702
except socket.error as e4:
response_code = 703
except http.client.IncompleteRead as e:
response_code = 700
if response_code == 502:
with url_502_lock:
#url_502.append(url)
url_502_file = open("url_502_file.txt", "a")
url_502_file.write(url + "\n")
url_502_file.close()
with print_lock:
#url_logs.append(url + "," + str(response_code))
url_all_logs_file = open("url_all_logs.csv", "a")
url_all_logs_file.write(url + "," + str(response_code) + '\n')
url_all_logs_file.close()
#print (url + "," + str(response_code))
#print (response_code)
return response_code
def worker():
while True:
url = q.get()
if url == ":::::"
break
else:
sendRequest(url)
q.task_done()
#======================================
q = Queue()
for threads in range(1000):
t = threading.Thread(target = worker)
t.daemon = True
t.start()
for url in urls:
q.put(url)
q.put(":::::")
q.join()
However, the program never seems to terminate (even tho the URLs have all been iteratred through) which forces me to ctrl-c the program - and then I get the following error:
Traceback (most recent call last):
File "./url_sc_checker.py", line 120, in <module>
q.join()
File "/usr/lib/python3.2/queue.py", line 82, in join
self.all_tasks_done.wait()
File "/usr/lib/python3.2/threading.py", line 235, in wait
waiter.acquire()
KeyboardInterrupt

The reason that your program doesn't terminate is simple, your worker creates an infinite loop:
def worker():
while True:
...
You need to either throw an exception, break, or have a terminating condition in your while statement. Otherwise your program would remain trying to get the next job from the queue, without knowing that there will never be the next job.
A common way to do this is to put a sentinel value in your queue, when checking out a job from the queue, the worker checks if it is the sentinel value and breaks out the loop.
Another way is to have a global condition variable that you check in the while condition. When the job producer have pushed all items to the queue, the job producer joins the queue, and when all jobs are done, the job producer unblocks and terminates the threads our processes.
Another possible reason why your process doesn't terminate is if your sendRequest produces an unexpected exception, then the thread terminates and you'll be left with some jobs that are never marked as done.

Related

How to use html.render inside a thread?

When trying to send a function to the stream that parses the page and then executes the html.render, an error occurs:
Error: There is no current event loop in thread 'Thread-1 (take_proxy_us_spys_one_thread)
I started talking about a similar problem and realized that a friend here somehow managed to implement this. But I still get an error.
Here is my code which should be repeated all the time.
Help, please, to understand.
import urllib3
import requests
import time
from requests_html import HTMLSession
import threading
import fake_useragent
def take_proxy_us_spys_one(urls: list=[], header:dict = None,):
for url in urls:
try:
url_first = 'https://spys.one'
r = requests.get(url_first, headers=header)
cookies = r.cookies
session = HTMLSession()
r = session.post(url,
data={'xx00': '','xpp': '5','xf1': '0','xf2': '0','xf3': '0','xf4': '0', 'xf5': '0'},
headers=header,
cookies=cookies)
r.html.render(reload=False,)
print(str(r))
except Exception as exc:
print("Error: " + str(exc))
def take_proxy_us_spys_one_thread(event, sleeptime= 60, urls=[], lock = None):
while event.is_set():
try:
user = fake_useragent.UserAgent().random
header = {'User-Agent': user}
lock.acquire() if lock!=None else None
proxies_1 = take_proxy_us_spys_one(urls=urls, header=header)
lock.release() if lock != None else None
time.sleep(sleeptime)
except Exception as exc:
print("Error: " + str(exc))
time.sleep(sleeptime)
if __name__ == '__main__':
start_in_thread = True
urllib3.disable_warnings()
urls_spys_one = [
'https://spys.one/free-proxy-list/ALL/'
]
lock = threading.Lock()
event = threading.Event()
event.set()
t2 = threading.Thread(target=take_proxy_us_spys_one_thread, args=(event, 10, urls_spys_one, lock),).start()
I tried to implement the mechanism from here.

Python Multithreaded HTTP crawler - Closing connection and hanging of the program

Wrote this crawler in Python, it dumps several parameters to JSON output file based on the input list of domains.
Have this question:
Do I need to close the HTTP connection in each thread? Input data is ca. 5 Million items. It process at the beginning at a rate ca. 50 iterations per second, but later after some time it drops to 1-2 per second and/or hangs (no kernel messages and no errors on stdout)? Can this be code or is network limiting related? I suspect software since when I restart it, it starts again with high rate (ca. 50 iteration per second)
Any tips how to improve the code below are also welcome, especially improve on speed and crawling throughput.
Code in questions:
import urllib2
import pprint
from tqdm import tqdm
import lxml.html
from Queue import Queue
from geoip import geolite2
import pycountry
from tld import get_tld
resfile = open("out.txt",'a')
concurrent = 200
def doWork():
while True:
url = q.get()
status = getStatus(url)
doSomethingWithResult(status)
q.task_done()
def getStatus(ourl):
try:
response = urllib2.urlopen("http://"+ourl)
peer = response.fp._sock.fp._sock.getpeername()
ip = peer[0]
header = response.info()
html = response.read()
html_element = lxml.html.fromstring(html)
generator = html_element.xpath("//meta[#name='generator']/#content")
try:
match = geolite2.lookup(ip)
if match is not None:
country= match.country
try:
c=pycountry.countries.lookup(country)
country=c.name
except:
country=""
except:
country=""
try:
res=get_tld("http://www"+ourl, as_object=True)
tld=res.suffix
except:
tld=""
try:
match = re.search(r'[\w\.-]+#[\w\.-]+', html)
email=match.group(0)
except:
email=""
try:
item= generator[0]
val = "{ \"Domain\":\"http://"+ourl.rstrip()+"\",\"IP:\""+ip+"\"," + "\"Server\":"+ "\""+str(header.getheader("Server")).replace("None","")+"\",\"PoweredBy\":" + "\""+str(header.getheader("X-Powered-By")).replace("None","")+"\""+",\"MetaGenerator\":\""+item+"\",\"Email\":\""+email+"\",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
except:
val = "{ \"Domain\":\"http://"+ourl.rstrip()+"\",\"IP:\""+ip+"\"," + "\"Server\":"+ "\""+str(header.getheader("Server")).replace("None","")+"\",\"PoweredBy\":" + "\""+str(header.getheader("X-Powered-By")).replace("None","")+"\""+",\"MetaGenerator\":\"\",\"Email\":\""+email+"\",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
return val
except Exception as e:
#print "error"+str(e)
pass
def doSomethingWithResult(status):
if status:
resfile.write(str(status)+"\n")
q = Queue(concurrent * 2)
for i in range(concurrent):
t = Thread(target=doWork)
t.daemon = True
t.start()
try:
for url in tqdm(open('list.txt')):
q.put(url.strip())
status = open("status.txt",'w')
status.write(str(url.strip()))
q.join()
except KeyboardInterrupt:
sys.exit(1)
Update 1:
Closing the Socket and FileDescriptor makes it work better, does not seem to hang anymore after some time. Performance is 50 reqs/sec on home laptop and ca 100 req/sec on a VPS
from threading import Thread
import httplib, sys
import urllib2
import pprint
from tqdm import tqdm
import lxml.html
from Queue import Queue
from geoip import geolite2
import pycountry
from tld import get_tld
import json
resfile = open("out.txt",'a')
concurrent = 200
def doWork():
while True:
url = q.get()
status = getStatus(url)
doSomethingWithResult(status)
q.task_done()
def getStatus(ourl):
try:
response = urllib2.urlopen("http://"+ourl)
realsock = response.fp._sock.fp._sock
peer = response.fp._sock.fp._sock.getpeername()
ip = peer[0]
header = response.info()
html = response.read()
realsock.close()
response.close()
html_element = lxml.html.fromstring(html)
generator = html_element.xpath("//meta[#name='generator']/#content")
try:
match = geolite2.lookup(ip)
if match is not None:
country= match.country
try:
c=pycountry.countries.lookup(country)
country=c.name
except:
country=""
except:
country=""
try:
res=get_tld("http://www"+ourl, as_object=True)
tld=res.suffix
except:
tld=""
try:
match = re.search(r'[\w\.-]+#[\w\.-]+', html)
email=match.group(0)
except:
email=""
try:
item= generator[0]
val = "{ \"Domain\":"+json.dumps("http://"+ourl.rstrip())+",\"IP\":\""+ip+"\",\"Server\":"+json.dumps(str(header.getheader("Server")).replace("None",""))+",\"PoweredBy\":" +json.dumps(str(header.getheader("X-Powered-By")).replace("None",""))+",\"MetaGenerator\":"+json.dumps(item)+",\"Email\":"+json.dumps(email)+",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
except:
val = "{ \"Domain\":"+json.dumps("http://"+ourl.rstrip())+",\"IP\":\""+ip+"\"," + "\"Server\":"+json.dumps(str(header.getheader("Server")).replace("None",""))+",\"PoweredBy\":" +json.dumps(str(header.getheader("X-Powered-By")).replace("None",""))+",\"MetaGenerator\":\"\",\"Email\":"+json.dumps(email)+",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
return val
except Exception as e:
print "error"+str(e)
pass
def doSomethingWithResult(status):
if status:
resfile.write(str(status)+"\n")
q = Queue(concurrent * 2)
for i in range(concurrent):
t = Thread(target=doWork)
t.daemon = True
t.start()
try:
for url in tqdm(open('list.txt')):
q.put(url.strip())
status = open("status.txt",'w')
status.write(str(url.strip()))
q.join()
except KeyboardInterrupt:
sys.exit(1)
The handles will be automatically garbage collected, but, you will be better off closing the handles yourself, especially as you are doing this in a tight loop.
You also asked for suggestions for improvement. A big one would be to stop using urllib2 and start using requests instead.
There are many possible options, why your crawling rate drops.
1.) Take care not to crawl to much data from the same domain. Some web servers are configured just to allow one connection per IP address in parallel.
2.) Try to send randomized browser-like http headers (user-agent, referrer, ...) to prevent web server scraping protection, if set.
3.) Use a mature http (parallel) library, like pycurl (has MultiCurl) or requests (grequests). They perform faster for sure.

Any possible way to speed up the processing of this?

I have a list of 80 usernames right now and I have my script check if each username exists or not. However it takes a little longer than I like so I was wondering if there is anything I can do to speed up how long it takes to check if each username exists or not.
# ------------------------------
# Mass Kik Username Checker
# Script Made by: Ski
# ------------------------------
import requests, threading
def check(username):
try:
req = requests.get("http://kik.me/"+username, allow_redirects=False).status_code
if req == 302:
return False
if req == 200:
return True
except Exception as e:
print e
exit()
def _loadList(filename):
item_list = []
for item in str(open(filename, "r").read()).split("\n"):
item_list.append(item)
return item_list
def _thread(items):
global _usernames
for username in _usernames[items[0]:items[1]]:
exists = check(username)
if exists:
print username+" exists\n"
if not exists:
print username+" doesn't exist\n"
if __name__ == '__main__':
_usernames = _loadList("usernames.txt")
thread1 = threading.Thread(target=_thread, args=([0, 20], )).start()
thread2 = threading.Thread(target=_thread, args=([20, 40], )).start()
thread3 = threading.Thread(target=_thread, args=([40, 60], )).start()
thread4 = threading.Thread(target=_thread, args=([60, 80], )).start()
Try out Python 3.x Pool of threads. You can define how many workers will perform the request. Using more (ex. 32) than 4, would speed-up your code dramatically.
import requests
from concurrent.futures import ThreadPoolExecutor
NUM_OF_WORKERS=32
def check(username):
try:
req = requests.get("http://kik.me/"+username, allow_redirects=False).status_code
if req == 302:
print(username, " does not exist.")
if req == 200:
print(username, "exists.")
except Exception as error:
print(error)
usernames = _loadList(filename)
with ThreadPoolExecutor(max_workers=NUM_OF_WORKERS) as pool:
pool.map(check, usernames)
This makes your code way more readable as well.
EDIT: noticed now the Python 2.7 tag.
Python 2 has a Pool of threads which is available under multiprocessing module. Unfortunately it's not documented as no tests were made available.
import requests
from multiprocessing.pool import ThreadPool
NUM_OF_WORKERS=32
def check(username):
try:
req = requests.get("http://kik.me/"+username, allow_redirects=False).status_code
if req == 302:
print(username, " does not exist.")
if req == 200:
print(username, "exists.")
except Exception as error:
print(error)
usernames = _loadList(filename)
pool = ThreadPool(processes=NUM_OF_WORKERS)
pool.map_async(check, usernames)
pool.close()
pool.join()
If you want a better Pool of Threads for Python 2, you can try the Pebble module

Python, send a stop notification to a blocking loop within a thread

I've read many answers, however I have not found a proper solution.
The problem, I'm reading mixed/replace HTTP streams that will not expire or end by default.
You can try it by yourself using curl:
curl http://agent.mtconnect.org/sample\?interval\=0
So, now I'm using Python threads and requests to read data from multiple streams.
import requests
import uuid
from threading import Thread
tasks = ['http://agent.mtconnect.org/sample?interval=5000',
'http://agent.mtconnect.org/sample?interval=10000']
thread_id = []
def http_handler(thread_id, url, flag):
print 'Starting task %s' % thread_id
try:
requests_stream = requests.get(url, stream=True, timeout=2)
for line in requests_stream.iter_lines():
if line:
print line
if flag and line.endswith('</MTConnectStreams>'):
# Wait until XML message end is reached to receive the full message
break
except requests.exceptions.RequestException as e:
print('error: ', e)
except BaseException as e:
print e
if __name__ == '__main__':
for task in tasks:
uid = str(uuid.uuid4())
thread_id.append(uid)
t = Thread(target=http_handler, args=(uid, task, False), name=uid)
t.start()
print thread_id
# Wait Time X or until user is doing something
# Send flag = to desired thread to indicate the loop should stop after reaching the end.
Any suggestions? What is the best solution? I don't want to kill the thread because I would like to read the ending to have a full XML message.
I found a solution by using threading module and threading.events. Maybe not the best solution, but it works fine currently.
import logging
import threading
import time
import uuid
import requests
logging.basicConfig(level=logging.DEBUG, format='(%(threadName)-10s) %(message)s', )
tasks = ['http://agent.mtconnect.org/sample?interval=5000',
'http://agent.mtconnect.org/sample?interval=10000']
d = dict()
def http_handler(e, url):
logging.debug('wait_for_event starting')
message_buffer = []
filter_namespace = True
try:
requests_stream = requests.get(url, stream=True, timeout=2)
for line in requests_stream.iter_lines():
if line:
message_buffer.append(line)
if e.isSet() and line.endswith('</MTConnectStreams>'):
logging.debug(len(message_buffer))
break
except requests.exceptions.RequestException as e:
print('error: ', e)
except BaseException as e:
print e
if __name__ == '__main__':
logging.debug('Waiting before calling Event.set()')
for task in tasks:
uid = str(uuid.uuid4())
e = threading.Event()
d[uid] = {"stop_event": e}
t = threading.Event(uid)
t = threading.Thread(name=uid,
target=http_handler,
args=(e, task))
t.start()
logging.debug('Waiting 3 seconds before calling Event.set()')
for key in d:
time.sleep(3)
logging.debug(threading.enumerate())
logging.debug(d[key])
d[key]['stop_event'].set()
logging.debug('bye')

Python multiprocess Process never terminates

My routine below takes a list of urllib2.Requests and spawns a new process per request and fires them off. The purpose is for asynchronous speed, so it's all fire-and-forget (no response needed). The issue is that the processes spawned in the code below never terminate. So after a few of these the box wilL OOM. Context: Django web app. Any help?
MP_CONCURRENT = int(multiprocessing.cpu_count()) * 2
if MP_CONCURRENT < 2: MP_CONCURRENT = 2
MPQ = multiprocessing.JoinableQueue(MP_CONCURRENT)
def request_manager(req_list):
try:
# put request list in the queue
for req in req_list:
MPQ.put(req)
# call processes on queue
worker = multiprocessing.Process(target=process_request, args=(MPQ,))
worker.daemon = True
worker.start()
# move on after queue is empty
MPQ.join()
except Exception, e:
logging.error(traceback.print_exc())
# prcoess requests in queue
def process_request(MPQ):
try:
while True:
req = MPQ.get()
dr = urllib2.urlopen(req)
MPQ.task_done()
except Exception, e:
logging.error(traceback.print_exc())
Maybe i am not right, but
MP_CONCURRENT = int(multiprocessing.cpu_count()) * 2
if MP_CONCURRENT < 2: MP_CONCURRENT = 2
MPQ = multiprocessing.JoinableQueue(MP_CONCURRENT)
def request_manager(req_list):
try:
# put request list in the queue
pool=[]
for req in req_list:
MPQ.put(req)
# call processes on queue
worker = multiprocessing.Process(target=process_request, args=(MPQ,))
worker.daemon = True
worker.start()
pool.append(worker)
# move on after queue is empty
MPQ.join()
# Close not needed processes
for p in pool: p.terminate()
except Exception, e:
logging.error(traceback.print_exc())
# prcoess requests in queue
def process_request(MPQ):
try:
while True:
req = MPQ.get()
dr = urllib2.urlopen(req)
MPQ.task_done()
except Exception, e:
logging.error(traceback.print_exc())
MP_CONCURRENT = int(multiprocessing.cpu_count()) * 2
if MP_CONCURRENT < 2: MP_CONCURRENT = 2
MPQ = multiprocessing.JoinableQueue(MP_CONCURRENT)
CHUNK_SIZE = 20 #number of requests sended to one process.
pool = multiprocessing.Pool(MP_CONCURRENT)
def request_manager(req_list):
try:
# put request list in the queue
responce=pool.map(process_request,req_list,CHUNK_SIZE) # function exits after all requests called and pool work ended
# OR
responce=pool.map_async(process_request,req_list,CHUNK_SIZE) #function request_manager exits after all requests passed to pool
except Exception, e:
logging.error(traceback.print_exc())
# prcoess requests in queue
def process_request(req):
dr = urllib2.urlopen(req)
This works ~5-10x faster then your code
Integrate side "brocker" to django (such as rabbitmq or something like it).
Ok after some fiddling (and a good night's sleep) I believe I've figured out the problem (and thank you Eri, you were the inspiration I needed). The main issue of the zombie processes was that I was not signaling back that the process was finished (and killing it) both of which I (naively) thought was happening automagically with multiprocess.
The code that worked:
# function that will be run through the pool
def process_request(req):
try:
dr = urllib2.urlopen(req, timeout=30)
except Exception, e:
logging.error(traceback.print_exc())
# process killer
def sig_end(r):
sys.exit()
# globals
MP_CONCURRENT = int(multiprocessing.cpu_count()) * 2
if MP_CONCURRENT < 2: MP_CONCURRENT = 2
CHUNK_SIZE = 20
POOL = multiprocessing.Pool(MP_CONCURRENT)
# pool initiator
def request_manager(req_list):
try:
resp = POOL.map_async(process_request, req_list, CHUNK_SIZE, callback=sig_end)
except Exception, e:
logging.error(traceback.print_exc())
A couple of notes:
1) The function that will be hit by "map_async" ("process_request" in this example) must be defined first (and before the global declarations).
2) There is probably a more graceful way to exit the process (suggestions welcome).
3) Using pool in this example really was best (thanks again Eri) due to the "callback" feature which allows me to throw a signal right away.

Categories

Resources