No performance gain with python threading - python

I'm writing a parallel crawler using Python and I'm storing some information in Mongodb. After testing I realized that my code, even though is using threading, is not parallel. It make no difference whether I use a single thread or 10 or 50 threads. I can't figure out why.
EDIT: From what I can see most of the processing time is taken up by soup = BeautifulSoup(html). Could it be that this command can't get parallelized using threads?
from threading import Thread
import Queue
import urllib2
import re
from BeautifulSoup import *
from urlparse import urljoin
from pymongo import MongoClient
from urlparse import urlparse
import time
import hashlib
start_time = time.time()
level = 1
client = MongoClient()
db = client.crawler
visited = {}
def doWork():
while True:
try:
myUrl = q_start.get()
except:
continue
try:
c=urllib2.urlopen(myUrl)
except:
q_start.task_done()
continue
parsed_url = urlparse(myUrl)
html=c.read()
try:
soup = BeautifulSoup(html)
except:
q_start.task_done()
continue
txt = soup.prettify()
links = soup('a')
m = hashlib.md5(myUrl)
db.urls.insert(
{
"url":myUrl,
"HTML":txt,
"level":level,
"domain":parsed_url.netloc,
"md5":m.hexdigest()
}
)
for link in links:
if('href' in dict(link.attrs)):
url = urljoin(myUrl,link['href'])
if url.find("'")!=-1:
continue
url=url.split('#')[0]
if url[0:4] == 'http':
if url in visited:
continue
else:
visited[url]=True
q_new.put(url)
q_start.task_done()
q_start = Queue.Queue()
q_new = Queue.Queue()
for i in range(50):
t = Thread(target=doWork)
t.daemon = True
t.start()
q_start.put("http://google.com")
q_start.join()
for i in range(2,5):
print "Depth: "
print i
print time.time() - start_time
level += 1
print q_new.qsize()
q_aux = q_new
q_new = Queue.Queue()
while q_aux.empty() != True:
x = q_aux.get()
q_start.put(x)
q_start.join()
print "end"
print time.time() - start_time

Related

Python multiprocessing in for loop (requests and BeautifulSoup)

I have list of a lot of links and I want to use multiprocessing to speed the proccess, here is simplified version, I need it to be ordered like this:
I tried a lot of things, process, pool etc. I always had errors, I need to do it with 4 or 8 threads and make it ordered like this. Thank you for all help. Here is code:
from bs4 import BeautifulSoup
import requests
import time
links = ["http://www.tennisexplorer.com/match-detail/?id=1672704", "http://www.tennisexplorer.com/match-detail/?id=1699387", "http://www.tennisexplorer.com/match-detail/?id=1698990" "http://www.tennisexplorer.com/match-detail/?id=1696623", "http://www.tennisexplorer.com/match-detail/?id=1688719", "http://www.tennisexplorer.com/match-detail/?id=1686305"]
data = []
def essa(match, omega):
aaa = BeautifulSoup(requests.get(match).text, "lxml")
center = aaa.find("div", id="center")
p1_l = center.find_all("th", class_="plName")[0].find("a").get("href")
p2_l = center.find_all("th", class_="plName")[1].find("a").get("href")
return p1_l + " - " + p2_l + " - " + str(omega)
i = 1
start_time = time.clock()
for link in links:
data.append(essa(link, i))
i += 1
for d in data:
print(d)
print(time.clock() - start_time, "seconds")
Spawn several threads of the function and join them together:
from threading import Thread
def essa(match, omega):
aaa = BeautifulSoup(requests.get(match).text, "lxml")
center = aaa.find("div", id="center")
p1_l = center.find_all("th", class_="plName")[0].find("a").get("href")
p2_l = center.find_all("th", class_="plName")[1].find("a").get("href")
print p1_l + " - " + p2_l + " - " + str(omega)
if __name__ == '__main__':
threadlist = []
for index, url in enumerate(links):
t= Thread(target=essa,args=(url, index))
t.start()
threadlist.append(t)
for b in threadlist:
b.join()
You wont get them to print in order, for the simple reason that some http responses take longer than others.
As far I can understand you have the list of links and make requests concurrently to make the process faster. Here is the sample code for multithreading. I hope this will help you. Read the documentation for concurrent futures.
import concurrent.futures
import urllib.request
URLS = ['http://www.foxnews.com/',
'http://www.cnn.com/',
'http://europe.wsj.com/',
'http://www.bbc.co.uk/',
'http://some-made-up-domain.com/']
# Retrieve a single page and report the URL and contents
def load_url(url, timeout):
with urllib.request.urlopen(url, timeout=timeout) as conn:
return conn.read()
# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
# Start the load operations and mark each future with its URL
future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
for future in concurrent.futures.as_completed(future_to_url):
url = future_to_url[future]
try:
data = future.result()
except Exception as exc:
print('%r generated an exception: %s' % (url, exc))
else:
print('%r page is %d bytes' % (url, len(data)))

How to run script in multithreading or multiprocessing

This scripts takes 2 sec to complete but how to run it in many threads and complete in 50 ms
import urllib2
from threading import Thread
def btl_test(url):
page = urllib2.urlopen(url)
print page
url = ["http://google.com","http://example.com","http://yahoo.com","http://linkedin.com","http://orkut.com","http://quora.com","http://facebook.com","http://myspace.com","http://gmail.com","http://nltk.org","http://cyber.com"]
for i in url:
t = Thread(target = btl_test,args=(i,))
t.start()
How to put results in order as well?
from contextlib import closing # http://stackoverflow.com/a/25968716/968442
from multiprocessing.pool import Pool
with closing(Pool(len(url))) as pool:
pool.map(btl_test, url)
Should be the handy snippet. Regarding order you can assign a mapping using a tuple and print them accordingly.
Update:
As per this blog pool.map will return the output with the order preserved. Here is code which prints the list of tuples in (url, html_content) format without changing the order
urls = ["http://google.com","http://example.com","http://yahoo.com","http://linkedin.com","http://orkut.com","http://quora.com","http://facebook.com","http://myspace.com","http://gmail.com","http://nltk.org","http://cyber.com"]
def btl_test(url):
import urllib2
return url, urllib2.urlopen(url).read()
from contextlib import closing # http://stackoverflow.com/a/25968716/968442
from multiprocessing.pool import Pool
with closing(Pool(len(urls))) as pool:
result = pool.map(btl_test, urls)
print result
Try to use Queue() and enumerate to store order.
import threading
import requests
import Queue
class UrlReader(threading.Thread):
def __init__(self, queue, output):
super(UrlReader, self).__init__()
self.setDaemon = True
self.queue = queue
self.output = output
def run(self):
while True:
try:
target = self.queue.get(block=False)
data = requests.get(target[1])
print data.status_code
if data.status_code == 200:
self.queue.task_done()
self.output.put((data.url, target[0]), block=False)
else:
self.queue.task_done()
self.queue.put(target)
except Queue.Empty:
break
except requests.exceptions.ConnectionError:
self.queue.task_done()
self.queue.put(target)
def load(urlrange, num_threads):
mainqueue = Queue.Queue()
outq = Queue.Queue()
mythreads = []
for url in urlrange:
mainqueue.put(url)
for j in xrange(num_threads):
mythreads.append(UrlReader(mainqueue, outq))
mythreads[-1].start()
mainqueue.join()
for j in xrange(num_threads):
mythreads.append(UrlReader(mainqueue, outq))
mythreads[j].join()
return list(outq.__dict__['queue'])
urls = ["http://google.com","http://example.com","http://yahoo.com","http://linkedin.com","http://orkut.com","http://quora.com","http://facebook.com","http://myspace.com","http://gmail.com","http://nltk.org","http://cyber.com"]
print load(enumerate(urls), 10)
>>> [(6, 'http://facebook.com'), (9, 'http://nltk.org'), (0, 'http://google.com'), (1, 'http://example.com'), (2, 'http://yahoo.com'), (3, 'http://linkedin.com'), (4, 'http://orkut.com'), (5, 'http://quora.com'), (7, 'http://myspace.com'), (8, 'http://gmail.com'), (10, 'http://cyber.com')]
This works
from urlparse import urlparse
from multiprocessing.pool import Pool
import re
import urllib2
def btl_test(url):
page = urllib2.urlopen(url).read()
if (re.findall(r'<title>(.*?)<\/title>',page)):
page1 = (re.findall(r'<title>(.*?)<\/title>',page)[0])
print page1
url = ["http://google.com","http://example.com","http://yahoo.com","http://linkedin.com","http://facebook.com","http://orkut.com","http://oosing.com","http://pinterets.com","http://orkut.com","http://quora.com","http://facebook.com","http://myspace.com","http://gmail.com","http://nltk.org","http://cyber.com"]
#for i in url:
# print btl_test(i)
nprocs = 2 # nprocs is the number of processes to run
ParsePool = Pool(nprocs)
ParsePool.map(btl_test,url)
#ParsedURLS = ParsePool.map(btl_test,url)
#print ParsedURLS
Helps a lot

the efficiency compare between gevent and thread

Recently,I'm working on a gevent demo and I try to compare the efficiency between gevent and thread. Generally speakingļ¼Œthe gevent code should be more efficient than the thread code. But when I use time command to profile the program, I get the unusual result(my command is time python FILENAME.py 50 1000,the last two parameters means pool number or thread number,so I change the two number in the table below). The result shows that the thread is more efficient than the gevent code,so I want to know why this happen and what's wrong with my program? Thanks.
gevent VS thread
My code is below(The main idea is use thread or gevent to send multi HTTP request):
******This is the thread version code******
# _*_ coding: utf-8 _*_
import sys
reload(sys)
sys.setdefaultencoding("utf8")
import requests
import threading
import time
import urllib2
finished = 0
def GetUrl(pagenum):
url = 'http://opendata.baidu.com/zhaopin/s?p=mini&wd=%B0%D9%B6%C8&pn=' + \
str(pagenum*20) + '&rn=20'
return url
def setUrlSet():
for i in xrange(requestnum):
urlnum = i % 38
urlset.append(GetUrl(urlnum))
def GetResponse(pagenum):
try:
r = requests.get(urlset[pagenum])
except Exception, e:
print e
pass
def DigJobByPagenum(pagenum, requestnum):
init_num = pagenum
print '%d begin' % init_num
while pagenum < requestnum:
GetResponse(pagenum)
pagenum += threadnum
print '%d over' % init_num
def NormalThread(threadnum):
startime = time.time()
print "%s is running..." % threading.current_thread().name
threads = []
global finished, requestnum
for i in xrange(threadnum):
thread = threading.Thread(target=DigJobByPagenum, args=(i, requestnum))
threads.append(thread)
for t in threads:
t.daemon = True
t.start()
for t in threads:
t.join()
finished += 1
endtime = time.time()
print "%s is stop.The total time is %0.2f" % \
(threading.current_thread().name, (endtime - startime))
def GetAvageTime(array):
alltime = 0.0
for i in array:
alltime += i
avageTime = alltime/len(array)
return avageTime
if __name__ == '__main__':
threadnum = int(sys.argv[1])
requestnum = int(sys.argv[2])
print 'threadnum : %s,requestnum %s ' % (threadnum, requestnum)
originStartTime = time.time()
urlset = []
setUrlSet()
NormalThread(threadnum)
******This is the gevent verison code******
# _*_ coding: utf-8 _*_
import sys
reload(sys)
sys.setdefaultencoding("utf8")
from gevent import monkey
monkey.patch_all()
import gevent
from gevent import pool
import requests
import time
finished = 0
def GetUrl(pagenum):
url = 'http://opendata.baidu.com/zhaopin/s?p=mini&wd=%B0%D9%B6%C8&pn=' + \
str(pagenum*20) + '&rn=20'
return url
def setUrlSet():
for i in xrange(requestnum):
urlnum = i % 38
urlset.append(GetUrl(urlnum))
def GetResponse(url):
startime = time.time()
r = requests.get(url)
print url
endtime = time.time()
spendtime = endtime - startime
NormalSpendTime.append(spendtime)
global finished
finished += 1
print finished
def GetAvageTime(array):
alltime = 0.0
for i in array:
alltime += i
avageTime = alltime/len(array)
return avageTime
def RunAsyncJob():
jobpool = pool.Pool(concurrent)
for url in urlset:
jobpool.spawn(GetResponse, url)
jobpool.join()
endtime = time.time()
allSpendTime = endtime - originStartime
print 'Total spend time is %0.3f, total request num is %s within %s \
seconds' % (allSpendTime, finished, timeoutNum)
print 'Each request time is %0.3f' % (GetAvageTime(NormalSpendTime))
if __name__ == '__main__':
concurrent = int(sys.argv[1])
requestnum = int(sys.argv[2])
timeoutNum = 100
NormalSpendTime = []
urlset = []
urlActionList = []
setUrlSet()
originStartime = time.time()
RunAsyncJob()
Try
gevent.monkey.patch_all(httplib=True)
It seems that by default gevent does not patch httplib (have a look at http://www.gevent.org/gevent.monkey.html : httplib=False) so you are actually doing blocking requests and you lose all advantages of the asynchronous framework. Although I'm not sure whether requests uses httplib.
If that doesn't work, then have a look at this lib:
https://github.com/kennethreitz/grequests
Re: httplib=False
You are already using requests library to make web calls. It has gevent flavour called grequests:
https://github.com/kennethreitz/grequests
Overall I don't immediately see much reason to prefer one style of threading to the other, if your pool is so small. Of course real threads are relatively heavy (start with 8MB stack), but you have to take that into proportion to the size of your job.
My take, try both (done), verify you are doing both right (to do) and let numbers do the talking.

Multi-threaded Python Web Crawler Got Stuck

I'm writing a Python web crawler and I want to make it multi-threaded. Now I have finished the basic part, below is what it does:
a thread gets a url from the queue;
the thread extracts the links from the page, checks if the links exist in a pool (a set), and puts the new links to the queue and the pool;
the thread writes the url and the http response to a csv file.
But when I run the crawler, it always gets stuck eventually, not exiting properly. I have gone through the official document of Python but still have no clue.
Below is the code:
#!/usr/bin/env python
#!coding=utf-8
import requests, re, urlparse
import threading
from Queue import Queue
from bs4 import BeautifulSoup
#custom modules and files
from setting import config
class Page:
def __init__(self, url):
self.url = url
self.status = ""
self.rawdata = ""
self.error = False
r = ""
try:
r = requests.get(self.url, headers={'User-Agent': 'random spider'})
except requests.exceptions.RequestException as e:
self.status = e
self.error = True
else:
if not r.history:
self.status = r.status_code
else:
self.status = r.history[0]
self.rawdata = r
def outlinks(self):
self.outlinks = []
#links, contains URL, anchor text, nofollow
raw = self.rawdata.text.lower()
soup = BeautifulSoup(raw)
outlinks = soup.find_all('a', href=True)
for link in outlinks:
d = {"follow":"yes"}
d['url'] = urlparse.urljoin(self.url, link.get('href'))
d['anchortext'] = link.text
if link.get('rel'):
if "nofollow" in link.get('rel'):
d["follow"] = "no"
if d not in self.outlinks:
self.outlinks.append(d)
pool = Queue()
exist = set()
thread_num = 10
lock = threading.Lock()
output = open("final.csv", "a")
#the domain is the start point
domain = config["domain"]
pool.put(domain)
exist.add(domain)
def crawl():
while True:
p = Page(pool.get())
#write data to output file
lock.acquire()
output.write(p.url+" "+str(p.status)+"\n")
print "%s crawls %s" % (threading.currentThread().getName(), p.url)
lock.release()
if not p.error:
p.outlinks()
outlinks = p.outlinks
if urlparse.urlparse(p.url)[1] == urlparse.urlparse(domain)[1] :
for link in outlinks:
if link['url'] not in exist:
lock.acquire()
pool.put(link['url'])
exist.add(link['url'])
lock.release()
pool.task_done()
for i in range(thread_num):
t = threading.Thread(target = crawl)
t.start()
pool.join()
output.close()
Any help would be appreciated!
Thanks
Marcus
Your crawl function has an infinite while loop with no possible exit path.
The condition True always evaluates to True and the loop continues, as you say,
not exiting properly
Modify the crawl function's while loop to include a condition. For instance, when the number of links saved to the csv file exceeds a certain minimum number, then exit the while loop.
i.e.,
def crawl():
while len(exist) <= min_links:
...

Python Threading Error when executing a MSSQL statement

I have a producer, consumer application where the producer reads a database and puts the results into a website. On success, The consumer then gets the id of the transaction and updates the db.
The program runs as required until it attempts to execute the update. It fails sometimes with this error 'HY000', 'The driver did not supply an error!'
The code can comfortably write to file without any issues.
What could i do to fix this? We need to update the db.
Thanks
Notes...using python 2.7 with pyodbc on mssql 2008.
code below
#!/usr/bin/env python
from urlparse import urlparse
from threading import Thread
import httplib, sys
import Queue
import urllib2
import urllib
from time import localtime, strftime
from ConfigParser import SafeConfigParser
from functions import Functions
from pyodbcclass import db_mssql
now = strftime("%y-%m-%d-%H%M%S", localtime())
k = db_mssql()
thread_list = []
thread_list2 = []
getFromDBQueue = Queue.Queue()
updateDBQueue = Queue.Queue()
number_of_consumer_threads = 3
def putURL():
querySql = "select distinct top 3 id,url from tblURL where processed=0 order by id asc"
re = k.query2(querySql)
if re:
for r in re:
id = r.id
params = urllib.urlencode({'user': user, 'password': password})
ourl = urlini + "?%s" % params
urlplusid = {'url':ourl.strip(),'id':id}
getFromDBQueue.put(urlplusid)
def getURL(thread_id):
while 1:
try:
URL_toget = getFromDBQueue.get(block=False)
url2 = URL_toget['url']
msgid2 = URL_toget['id']
except Queue.Empty:
print "thread exiting, id: " + str(thread_id) + "++getFromDB++"
sys.exit()
status,url = getStatus(url2)
if status == 200:
updateDBQueue.put(msgid2)
print(status)
def updateDB(thread_id):
while 1:
try:
id2 = updateDBQueue.get(block=False)
if id2:
params = ['true',id2]
sqlupdate = "UPDATE tblURL SET processed=? WHERE id=?"
k.execute3(sqlupdate,params)
except Queue.Empty:
print "thread exiting, id: " + str(thread_id) + "**update**"
sys.exit()
# fill the queue with work and block until we are done filling the queue
producer_thread = Thread(target=putURL)
producer_thread.start()
producer_thread.join()
# we can now start consumers
for i in range(number_of_consumer_threads):
getfromDB = Thread(target=getURL, args=(i,))
getfromDB.start()
thread_list.append(getfromDB)
for i in range(number_of_consumer_threads):
update = Thread(target=updateDB, args=(i,))
update.start()
thread_list2.append(update)
for thread in thread_list:
thread.join()
for thread2 in thread_list2:
thread2.join()

Categories

Resources