How to run script in multithreading or multiprocessing - python

This script takes 2 seconds to complete. How can I run it in many threads so that it completes in about 50 ms?
import urllib2
from threading import Thread
def btl_test(url):
page = urllib2.urlopen(url)
print page
url = [
    "http://google.com",
    "http://example.com",
    "http://yahoo.com",
    "http://linkedin.com",
    "http://orkut.com",
    "http://quora.com",
    "http://facebook.com",
    "http://myspace.com",
    "http://gmail.com",
    "http://nltk.org",
    "http://cyber.com",
]
# Launch one thread per address; the threads are started but never joined.
for address in url:
    t = Thread(target=btl_test, args=(address,))
    t.start()
How to put results in order as well?

from contextlib import closing # http://stackoverflow.com/a/25968716/968442
from multiprocessing.pool import Pool
# One worker process per URL. The try/finally spells out what
# ``with closing(...)`` does: pool.close() runs even if map() raises.
pool = Pool(len(url))
try:
    pool.map(btl_test, url)
finally:
    pool.close()
This should be a handy snippet. Regarding order, you can pair each URL with its result in a tuple and print them accordingly.
Update:
As per this blog pool.map will return the output with the order preserved. Here is code which prints the list of tuples in (url, html_content) format without changing the order
# Addresses to fetch, in the order the results should be reported.
urls = [
    "http://google.com",
    "http://example.com",
    "http://yahoo.com",
    "http://linkedin.com",
    "http://orkut.com",
    "http://quora.com",
    "http://facebook.com",
    "http://myspace.com",
    "http://gmail.com",
    "http://nltk.org",
    "http://cyber.com",
]
def btl_test(url):
    """Download *url* and return a ``(url, body)`` tuple.

    The URL is returned alongside the body so the caller can tell which
    result belongs to which address.
    """
    import urllib2
    response = urllib2.urlopen(url)
    try:
        return url, response.read()
    finally:
        # Release the connection instead of leaving it to the garbage collector.
        response.close()
from contextlib import closing # http://stackoverflow.com/a/25968716/968442
from multiprocessing.pool import Pool
with closing(Pool(len(urls))) as pool:
result = pool.map(btl_test, urls)
print result

Try to use Queue() and enumerate to store order.
import threading
import requests
import Queue
class UrlReader(threading.Thread):
def __init__(self, queue, output):
super(UrlReader, self).__init__()
self.setDaemon = True
self.queue = queue
self.output = output
def run(self):
while True:
try:
target = self.queue.get(block=False)
data = requests.get(target[1])
print data.status_code
if data.status_code == 200:
self.queue.task_done()
self.output.put((data.url, target[0]), block=False)
else:
self.queue.task_done()
self.queue.put(target)
except Queue.Empty:
break
except requests.exceptions.ConnectionError:
self.queue.task_done()
self.queue.put(target)
def load(urlrange, num_threads):
    """Fetch every item of *urlrange* using *num_threads* UrlReader threads.

    Args:
        urlrange: iterable of work items, queued as-is for the readers
            (the caller passes ``enumerate(urls)``).
        num_threads: number of UrlReader threads to start.

    Returns:
        A list of everything the readers put on the output queue, in the
        order it was put there (completion order, not input order).
    """
    mainqueue = Queue.Queue()
    outq = Queue.Queue()
    for url in urlrange:
        mainqueue.put(url)
    mythreads = [UrlReader(mainqueue, outq) for _ in xrange(num_threads)]
    for reader in mythreads:
        reader.start()
    mainqueue.join()
    # Join only the threads that were started; no extra readers are created.
    for reader in mythreads:
        reader.join()
    # Drain through the public API instead of reaching into Queue internals.
    results = []
    while True:
        try:
            results.append(outq.get(block=False))
        except Queue.Empty:
            break
    return results
urls = ["http://google.com","http://example.com","http://yahoo.com","http://linkedin.com","http://orkut.com","http://quora.com","http://facebook.com","http://myspace.com","http://gmail.com","http://nltk.org","http://cyber.com"]
print load(enumerate(urls), 10)
>>> [(6, 'http://facebook.com'), (9, 'http://nltk.org'), (0, 'http://google.com'), (1, 'http://example.com'), (2, 'http://yahoo.com'), (3, 'http://linkedin.com'), (4, 'http://orkut.com'), (5, 'http://quora.com'), (7, 'http://myspace.com'), (8, 'http://gmail.com'), (10, 'http://cyber.com')]

This works
from urlparse import urlparse
from multiprocessing.pool import Pool
import re
import urllib2
def btl_test(url):
page = urllib2.urlopen(url).read()
if (re.findall(r'<title>(.*?)<\/title>',page)):
page1 = (re.findall(r'<title>(.*?)<\/title>',page)[0])
print page1
url = [
    "http://google.com",
    "http://example.com",
    "http://yahoo.com",
    "http://linkedin.com",
    "http://facebook.com",
    "http://orkut.com",
    "http://oosing.com",
    "http://pinterets.com",
    "http://orkut.com",
    "http://quora.com",
    "http://facebook.com",
    "http://myspace.com",
    "http://gmail.com",
    "http://nltk.org",
    "http://cyber.com",
]
# Serial alternative, kept for reference:
#for i in url:
# print btl_test(i)
nprocs = 2 # nprocs is the number of processes to run
ParsePool = Pool(processes=nprocs)
# btl_test prints each title itself, so the list map() returns is discarded.
ParsePool.map(btl_test, url)
#ParsedURLS = ParsePool.map(btl_test,url)
#print ParsedURLS
Helps a lot

Related

Multithread https requests in python

I'm trying to multi-thread web requests in python for web scraping. I want to send multiple requests to the same website using multi-threading, but the time it takes for the script to complete is the same whether or not I use multi-threading.
This is the code that I'm using:
import queue
import urllib.request
from threading import Thread
def perform_web_requests(addresses, no_workers):
    """Fetch every URL in *addresses* with a pool of worker threads.

    Args:
        addresses: iterable of URL strings.
        no_workers: number of worker threads to start.

    Returns:
        A list with the body (bytes) of every address that was fetched,
        in the same order as *addresses*. Addresses that no worker
        managed to fetch are left out.

    An exception raised while fetching is not caught: it ends that worker
    thread, and the remaining workers carry on with the queue.
    """
    addresses = list(addresses)
    # Slot i receives the body of addresses[i]; ``pending`` marks slots no
    # worker has filled, so they can be dropped from the result.
    pending = object()
    bodies = [pending] * len(addresses)

    class Worker(Thread):
        def __init__(self, request_queue):
            Thread.__init__(self)
            self.queue = request_queue

        def run(self):
            while True:
                item = self.queue.get()
                try:
                    if item is None:
                        break
                    index, address = item
                    request = urllib.request.Request(address)
                    # Close the response as soon as its body has been read.
                    with urllib.request.urlopen(request) as response:
                        bodies[index] = response.read()
                finally:
                    # Every get() is matched by a task_done(), sentinel included.
                    self.queue.task_done()

    # Create queue and add addresses, tagged with their position
    q = queue.Queue()
    for item in enumerate(addresses):
        q.put(item)

    # Workers keep working till they receive None. None is used rather
    # than "" so an empty address cannot be mistaken for the stop signal.
    for _ in range(no_workers):
        q.put(None)

    # Create workers and start them on the queue
    workers = [Worker(q) for _ in range(no_workers)]
    for worker in workers:
        worker.start()

    # Join workers to wait till they finished
    for worker in workers:
        worker.join()

    return [body for body in bodies if body is not pending]
urls = ['https://google.com']
# Repeat the whole fetch 100 times, printing the run number after each one.
for i in range(1, 101):
    results = perform_web_requests(urls, 50)
    print(i)
It appears urllib does not support multi-threading. Use urllib3:
https://github.com/urllib3/urllib3

Python multiprocessing in for loop (requests and BeautifulSoup)

I have a list of many links and I want to use multiprocessing to speed up the process. Here is a simplified version; I need the output to be ordered like this:
I tried a lot of things, process, pool etc. I always had errors, I need to do it with 4 or 8 threads and make it ordered like this. Thank you for all help. Here is code:
from bs4 import BeautifulSoup
import requests
import time
# Match-detail pages to scrape. Each entry needs its own trailing comma:
# without one, adjacent string literals are silently joined into one URL.
links = [
    "http://www.tennisexplorer.com/match-detail/?id=1672704",
    "http://www.tennisexplorer.com/match-detail/?id=1699387",
    "http://www.tennisexplorer.com/match-detail/?id=1698990",
    "http://www.tennisexplorer.com/match-detail/?id=1696623",
    "http://www.tennisexplorer.com/match-detail/?id=1688719",
    "http://www.tennisexplorer.com/match-detail/?id=1686305",
]
# Result lines, appended in the same order as ``links``.
data = []
def essa(match, omega):
    """Return ``"<player 1 href> - <player 2 href> - <omega>"`` for a match page.

    *match* is the page URL. *omega* is appended to the result via
    ``str()``; the caller uses it as a running number.
    """
    page = BeautifulSoup(requests.get(match).text, "lxml")
    player_cells = page.find("div", id="center").find_all("th", class_="plName")
    p1_l = player_cells[0].find("a").get("href")
    p2_l = player_cells[1].find("a").get("href")
    return p1_l + " - " + p2_l + " - " + str(omega)
# Time with time.time() (wall clock). time.clock() measured CPU time on
# Unix, which leaves out the time spent waiting on the network, and it was
# removed in Python 3.8.
start_time = time.time()
# Number the links from 1 while scraping them one after another.
for i, link in enumerate(links, 1):
    data.append(essa(link, i))
for d in data:
    print(d)
print(time.time() - start_time, "seconds")
Spawn several threads of the function and join them together:
from threading import Thread
def essa(match, omega):
aaa = BeautifulSoup(requests.get(match).text, "lxml")
center = aaa.find("div", id="center")
p1_l = center.find_all("th", class_="plName")[0].find("a").get("href")
p2_l = center.find_all("th", class_="plName")[1].find("a").get("href")
print p1_l + " - " + p2_l + " - " + str(omega)
if __name__ == '__main__':
    # Start one thread per link, tagging each with its position in ``links``.
    threadlist = []
    for position, address in enumerate(links):
        scraper = Thread(target=essa, args=(address, position))
        scraper.start()
        threadlist.append(scraper)
    # Wait for every thread before the program ends.
    for scraper in threadlist:
        scraper.join()
You won't get them to print in order, for the simple reason that some HTTP responses take longer than others.
As far as I can understand, you have a list of links and want to make the requests concurrently to speed up the process. Here is sample code for multithreading; I hope it helps. Read the documentation for concurrent.futures.
import concurrent.futures
import urllib.request
# Pages to download; the last entry is a made-up domain.
URLS = [
    'http://www.foxnews.com/',
    'http://www.cnn.com/',
    'http://europe.wsj.com/',
    'http://www.bbc.co.uk/',
    'http://some-made-up-domain.com/',
]
def load_url(url, timeout):
    """Retrieve a single page and return its contents.

    *timeout* is passed straight through to ``urllib.request.urlopen``.
    The connection is closed before returning, whether or not the read
    succeeds.
    """
    conn = urllib.request.urlopen(url, timeout=timeout)
    try:
        return conn.read()
    finally:
        conn.close()
# The with statement shuts the executor down once the block is left.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Submit one load per URL and remember which future belongs to which URL.
    future_to_url = {}
    for address in URLS:
        future_to_url[executor.submit(load_url, address, 60)] = address
    # Report each download as soon as it finishes, in completion order.
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
            continue
        print('%r page is %d bytes' % (url, len(data)))

Multi-threading in Python: Getting stuck at last thread

I have a strange situation and cannot figure it out after lots of hit-trials. I am using multi-threading (10) for reading urls (100) and it works fine in most cases but in some situation, it gets stuck at the last thread. I waited for it to see if it returns and it took a lot of time (1050 seconds) whereas the rest of the nine threads returned within 25 seconds. It shows something is wrong with my code but can't figure it out. Any ideas?
Note1: It happens for both daemon and non-daemon threads.
Note2: The number of URLs and thread changes. I tried a different number of URLs from 10-100 and various threads from 5-50.
Note3: The URLs are most of the time completely different.
import urllib2
import Queue
import threading
from goose import Goose
input_queue = Queue.Queue()
result_queue = Queue.Queue()
Thread Worker:
# Pseudo-code worker: takes URLs from input_queue without blocking and puts
# one value per URL on result_queue, stopping once Queue.Empty is raised.
def worker(input_queue, result_queue):
# NOTE(review): ``true`` is not a Python name; presumably ``True`` was meant.
queue_full = true
while queue_full:
try:
# get(False) does not block; it raises Queue.Empty when nothing is queued.
url = input_queue.get(False)
read a url using urllib2 and goose
process it
result_queue.put(updated value)
# NOTE(review): only Queue.Empty is handled. If the read/process step raises
# anything else, result_queue.put() is not reached for that URL, so a
# consumer expecting one result per URL would block - confirm against the
# real code.
except Queue.Empty:
queue_full = False
Main process:
# Queue every URL before any worker is started.
for pending_url in urls:
    input_queue.put(pending_url)
thread_count = 5
for _ in range(thread_count):
    t = threading.Thread(target=worker, args=(input_queue, result_queue))
    t.start()
# Collect exactly one result per submitted URL; each get() blocks until a
# result is available.
for url in urls:
    url = result_queue.get() # updates url
The process gets blocked at the last result_queue.get() call.
NOTE: I am more interested in what I am doing wrong here, in case someone can point that out? Because I tend to think that I wrote correct code but apparently that's not the case.
You can use ThreadPoolExecutor from concurrent.futures.
from concurrent.futures import ThreadPoolExecutor
# Upper bound on the number of threads the executor may run at once.
MAX_WORKERS = 50


def worker(url):
    """Fetch *url* with requests and return the response body."""
    return requests.get(url).content
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # executor.map() yields results in the same order as ``urls``.
    results = executor.map(worker, urls)
    for body in results:
        print(body)
For example, I use a list of numbers in place of the URLs:
import urllib2
import Queue
import threading
#from goose import Goose
# Work items go in through input_queue; computed values come out of
# result_queue.
input_queue = Queue.Queue()
result_queue = Queue.Queue()


def worker(input_queue, result_queue):
    """Drain *input_queue*, putting ``int(item) * 9`` on *result_queue*.

    Returns once the input queue has nothing left to hand out.
    """
    while True:
        try:
            url = input_queue.get(False)
        except Queue.Empty:
            return
        updated_value = int(url) * 9
        result_queue.put(updated_value)
urls = [1,2,3,4,5,6,7,8,9]
for url in urls:
input_queue.put(url)
thread_count = 5
for i in range(thread_count):
t = threading.Thread(target=worker, args= (input_queue, result_queue))
t.start()
t.join()
for url in urls:
try:
url = result_queue.get()
print url
except Queue.Empty:
pass
Output
9
18
27
36
45
54
63
72
81

No performance gain with python threading

I'm writing a parallel crawler using Python and I'm storing some information in MongoDB. After testing I realized that my code, even though it is using threading, is not parallel. It makes no difference whether I use a single thread or 10 or 50 threads. I can't figure out why.
EDIT: From what I can see most of the processing time is taken up by soup = BeautifulSoup(html). Could it be that this command can't get parallelized using threads?
from threading import Thread
import Queue
import urllib2
import re
from BeautifulSoup import *
from urlparse import urljoin
from pymongo import MongoClient
from urlparse import urlparse
import time
import hashlib
# Reference point for the elapsed-time printouts.
start_time = time.time()
# Crawl depth stored with each page; the main loop increments it per pass.
level = 1
# MongoClient() is called without arguments; pages are inserted into the
# ``urls`` collection of the ``crawler`` database.
client = MongoClient()
db = client.crawler
# URLs already queued, used as a set (url -> True).
visited = {}
def doWork():
while True:
try:
myUrl = q_start.get()
except:
continue
try:
c=urllib2.urlopen(myUrl)
except:
q_start.task_done()
continue
parsed_url = urlparse(myUrl)
html=c.read()
try:
soup = BeautifulSoup(html)
except:
q_start.task_done()
continue
txt = soup.prettify()
links = soup('a')
m = hashlib.md5(myUrl)
db.urls.insert(
{
"url":myUrl,
"HTML":txt,
"level":level,
"domain":parsed_url.netloc,
"md5":m.hexdigest()
}
)
for link in links:
if('href' in dict(link.attrs)):
url = urljoin(myUrl,link['href'])
if url.find("'")!=-1:
continue
url=url.split('#')[0]
if url[0:4] == 'http':
if url in visited:
continue
else:
visited[url]=True
q_new.put(url)
q_start.task_done()
q_start = Queue.Queue()
q_new = Queue.Queue()
for i in range(50):
t = Thread(target=doWork)
t.daemon = True
t.start()
q_start.put("http://google.com")
q_start.join()
for i in range(2,5):
print "Depth: "
print i
print time.time() - start_time
level += 1
print q_new.qsize()
q_aux = q_new
q_new = Queue.Queue()
while q_aux.empty() != True:
x = q_aux.get()
q_start.put(x)
q_start.join()
print "end"
print time.time() - start_time

Multi-threaded Python Web Crawler Got Stuck

I'm writing a Python web crawler and I want to make it multi-threaded. Now I have finished the basic part, below is what it does:
a thread gets a url from the queue;
the thread extracts the links from the page, checks if the links exist in a pool (a set), and puts the new links to the queue and the pool;
the thread writes the url and the http response to a csv file.
But when I run the crawler, it always gets stuck eventually, not exiting properly. I have gone through the official document of Python but still have no clue.
Below is the code:
#!/usr/bin/env python
#!coding=utf-8
import requests, re, urlparse
import threading
from Queue import Queue
from bs4 import BeautifulSoup
#custom modules and files
from setting import config
class Page:
    """One fetched URL: its status, raw response and outgoing links.

    After construction ``status`` holds the status code, or the first
    entry of the redirect history when there was one, or the exception if
    the request failed. ``error`` is True only in the failed case.
    """

    def __init__(self, url):
        self.url = url
        self.status = ""
        self.rawdata = ""
        self.error = False
        try:
            response = requests.get(self.url, headers={'User-Agent': 'random spider'})
        except requests.exceptions.RequestException as e:
            self.status = e
            self.error = True
            return
        self.status = response.history[0] if response.history else response.status_code
        self.rawdata = response

    def outlinks(self):
        """Collect the page's links into ``self.outlinks``.

        The list replaces this method on the instance, so it can only be
        called once per Page. Each entry is a dict with the keys
        ``follow`` ("yes"/"no"), ``url`` (absolute) and ``anchortext``;
        duplicates are dropped.
        """
        self.outlinks = []
        #links, contains URL, anchor text, nofollow
        raw = self.rawdata.text.lower()
        soup = BeautifulSoup(raw)
        for link in soup.find_all('a', href=True):
            rel = link.get('rel')
            d = {
                "follow": "no" if rel and "nofollow" in rel else "yes",
                'url': urlparse.urljoin(self.url, link.get('href')),
                'anchortext': link.text,
            }
            if d not in self.outlinks:
                self.outlinks.append(d)
# URLs waiting to be crawled.
pool = Queue()
# Every URL that has ever been queued, so nothing is queued twice.
exist = set()
# Number of crawler threads to start.
thread_num = 10
# Guards the output file and the queue/set updates done by the workers.
lock = threading.Lock()
# Results are appended to this file, one "url status" line per page.
output = open("final.csv", "a")
# The domain is the start point of the crawl.
domain = config["domain"]
pool.put(domain)
exist.add(domain)
def crawl():
while True:
p = Page(pool.get())
#write data to output file
lock.acquire()
output.write(p.url+" "+str(p.status)+"\n")
print "%s crawls %s" % (threading.currentThread().getName(), p.url)
lock.release()
if not p.error:
p.outlinks()
outlinks = p.outlinks
if urlparse.urlparse(p.url)[1] == urlparse.urlparse(domain)[1] :
for link in outlinks:
if link['url'] not in exist:
lock.acquire()
pool.put(link['url'])
exist.add(link['url'])
lock.release()
pool.task_done()
for i in range(thread_num):
    t = threading.Thread(target = crawl)
    # The workers loop forever. As daemon threads they no longer keep the
    # process alive once the main thread is done.
    t.daemon = True
    t.start()
# Blocks until task_done() has been called for every queued URL.
pool.join()
output.close()
Any help would be appreciated!
Thanks
Marcus
Your crawl function has an infinite while loop with no possible exit path.
The condition True always evaluates to True and the loop continues, as you say,
not exiting properly
Modify the crawl function's while loop to include a condition. For instance, when the number of links saved to the csv file exceeds a certain minimum number, then exit the while loop.
i.e.,
def crawl():
while len(exist) <= min_links:
...

Categories

Resources