Is there a way I can multithread the function to take 5 URL's from the list at a single time ? Please see my code below . its python 2.7
import requests, csv, time, json, threading
from lxml import html
from csv import DictWriter
All_links = ['http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.343097&longitude=-71.123046&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.398588&longitude=-71.24505&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.394319&longitude=-71.218049&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.365396&longitude=-71.23165&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.356719&longitude=-71.250479&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.385096&longitude=-71.208399&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.334146&longitude=-71.183298&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.374296&longitude=-71.182371&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA']
target = open('completedlinks.txt','ab')
def get_data(each):
each = each.strip('\n')
r = requests.get(each)
source = json.loads(r.content)
the_file = open("output.csv", "ab")
writer = DictWriter(the_file, source[1].keys())
writer.writeheader()
writer.writerows(source)
the_file.close()
target.write(each+'\n')
print each+"\n--------------------------"
for each in All_links:
try:
get_data(each)
except:
pass
Check out the multiprocessing package. It implements thread pools, which would accomplish this.
Update:
Adding something like this should work
from multiprocessing import Pool
def chunks(l, n):
""" Yield successive n-sized chunks from l. """
for i in xrange(0, len(l), n):
yield l[i:i+n]
def threadit(threads, links):
for part in chunks(links, threads):
pool = Pool(threads)
for link in part:
pool.apply_async(getdata, args=(link,))
pool.close()
pool.join()
threadit(5, All_links)
Related
import requests
import time
from lxml import html
def parse_site():
return str(memoryview(''.join([f'---! {link.text_content()} !---\n{parse_fandom(link.xpath(".//a/#href")[0])}\n' for link in
html.fromstring(requests.get('https://archiveofourown.org/media').content).xpath('//*[#class="actions"]')]).encode('utf-8'))[:-1], 'utf-8')
def parse_fandom(url):
return ''.join([' '.join(f'{item.text_content()} |—| {item.xpath(".//a/#href")[0]}'.split()) + '\n' for item in
html.fromstring(requests.get(f'https://archiveofourown.org{url}').content).xpath('//*[contains(#class, "tags")]//li')])
if __name__ == '__main__':
start_time = time.time()
with open('test.txt', 'w+', encoding='utf-8') as f:
f.write(parse_site())
print("--- %s seconds ---" % (time.time() - start_time))
I'm working on web scraping this site to collect fandom stats, but connecting to the site with requests.get() can take 1-3 seconds, bringing the whole program to a slow 18-22 seconds. Somehow, I want to make these requests on parallel threads, but modules like grequests need an allocated pool to do so, and I haven't figured out a way to create such a pool within list comprehension.
Order of the list doesn't matter to me, as long as there is a hierarchy between each category (parsed in parse_site()) and its child links (parse_fandom(url)). What I want to do is something like:
[parallel_parse_fandom(url), parallel_parse_fandom(url2), parallel_parse_fandom(url3)]
↓
[<All links within this fandom>, parallel_parse_fandom(url2), <All links within this fandom>]
↓
return [<All links within this fandom>, <All links within this fandom>, <All links within this fandom>]
Solution based on #Aditya's
import requests
import time
from lxml import html
from concurrent.futures import ThreadPoolExecutor, as_completed
def parse_site():
with ThreadPoolExecutor(max_workers=12) as executor:
results = []
for result in as_completed([executor.submit(parse_fandom, url) for url in [[link.text_content(), link.xpath(".//a/#href")[0]] for link in
html.fromstring(requests.get('https://archiveofourown.org/media').content).xpath('//*[#class="actions"]')]]):
results.append(result)
return str(memoryview(''.join(item.result() for item in results).encode('utf-8'))[:-1], 'utf-8')
def parse_fandom(data):
return f'---! {data[0]} !---\n' + ''.join([' '.join(f'{item.text_content()} |—| {item.xpath(".//a/#href")[0]}'.split()) + '\n' for item in
html.fromstring(requests.get(f'https://archiveofourown.org{data[1]}').content).xpath('//*[contains(#class, "tags")]//li')])
if __name__ == '__main__':
with open('test.txt', 'w', encoding='utf-8') as f:
f.write(parse_site())
You can try the below, It will easily allow you to make a lot of requests in parallel provided the server can handle it as well;
# it's just a wrapper around concurrent.futures ThreadPoolExecutor with a nice tqdm progress bar!
from tqdm.contrib.concurrent import thread_map
def chunk_list(lst, size):
"""
From SO only;
Yield successive n-sized chunks from list.
"""
for i in range(0, len(lst), size):
yield lst[i:i + size]
for idx, my_chunk in enumerate(chunk_list(huge_list, size=2**12)):
for response in thread_map(<which_func_to_call>, my_chunk, max_workers=your_cpu_cores+6)):
# which_func_to_call -> wrap the returned response json obj in this, etc
# do something with the response now..
# make sure to cache the chunk results as well
I want to be able to receive a list of content when posting data from an input. The input is in a text file which will be opened in python. To speed the process up I'd like to increase the number of threads that can be sent at once. How would I be able to do that, here's a rough idea of what I'm talking about:
import requests
userdata = open("data.txt", "r")
usercodes = [x.strip() for x in userdata]
for i in range(len(usercodes)):
thread_one = requests.post(url='https://test.com/input', params=usercodes[i])
thread_two = requests.post(url='https://test.com/input', params=usercodes[i+1])
thread_three = requests.post(url='https://test.com/input', params=usercodes[i+2])
I want all the threads to run at the same time as in here the program will carry out the requests one after the next.
import requests
from multiprocessing import Pool
def make_request(usercode):
requests.post(url='https://test.com/input', params=usercode)
if __name__ == '__main__':
userdata = open("data.txt", "r")
usercodes = [x.strip() for x in userdata]
with Pool(multiprocessing.cpu_count()) as p:
print(p.map(make_request, usercodes))
p.close()
With concurrent.futures.ThreadPoolExecutor:
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import requests
userdata = open("data.txt", "r")
usercodes = (x.strip() for x in userdata) # keep as generator
with ThreadPoolExecutor() as pool:
pool.map(partial(requests.post, 'https://test.com/input'), usercodes)
userdata.close() # closing the input file
Async is definitely your friend here.
from gevent import joinall, spawn, monkey
gevent.monkey.patch_all()
import requests
userdata = open("data.txt", "r")
usercodes = [x.strip() for x in userdata]
send_url = 'https://test.com/input'
threads = []
def send(usercode):
requests.post(url=send_url, params=usercode)
for code in usercodes:
threads.append(spawn(send, code))
joinall(threads)
I am making a web scraper to build a database. The site I plan to use has index pages each containing 50 links. The amount of pages to be parsed is estimated to be around 60K and up, this is why I want to implement multiprocessing.
Here is some pseudo-code of what I want to do:
def harvester(index):
main=dict()
....
links = foo.findAll ( 'a')
for link in links:
main.append(worker(link))
# or maybe something like: map_async(worker(link))
def worker(url):
''' this function gather the data from the given url'''
return dictionary
Now what I want to do with that is to have a certain number of worker function to gather data in parallel on different pages. This data would then be appended to a big dictionary located in harvester or written directly in a csv file by the worker function.
I'm wondering how I can implement parallelism. I have done a faire
amount of research on using gevent, threading and multiprocessing but
I am not sure how to implement it.
I am also not sure if appending data to a large dictionary or writing
directly in a csv using DictWriter will be stable with that many input at the same time.
Thanks
I propose you to split your work into separate workers which communicate via Queues.
Here you mostly have IO wait time (crawling, csv writing)
So you can do the following (not tested, just see the idea):
import threading
import Queue
class CsvWriter(threading.Thread):
def __init__(self, resultq):
super(CsvWriter, self).__init__()
self.resultq = resultq
self.writer = csv.DictWriter(open('results.csv', 'wb'))
def run(self):
done = False
while not done:
row = self.requltq.get()
if row != -1:
self.writer.writerow(row)
else:
done = True
class Crawler(threading.Thread):
def __init__(self, inputqueue, resultq):
super(Crawler, self).__init__()
self.iq = inputq
self.oq = resultq
def run(self):
done = False
while not done:
link = self.iq.get()
if link != -1:
result = self.extract_data(link)
self.oq.put(result)
else:
done = True
def extract_data(self, link):
# crawl and extract what you need and return a dict
pass
def main():
linkq = Queue.Queue()
for url in your_urls:
linkq.put(url)
resultq = Queue.Queue()
writer = CsvWriter(resultq)
writer.start()
crawlers = [Crawler(linkq, resultq) for _ in xrange(10)]
[c.start() for c in crawlers]
[linkq.put(-1) for _ in crawlers]
[c.join() for c in crawlers]
resultq.put(-1)
writer.join()
This code should work (fix possible typos) and make it to exit when all the urls are finished
I'm currently writing a script that reads reddit comments from a large file (5 gigs compressed, ~30 gigs of data being read). My script reads the comments, checks for some text, parses them, and sends them off to a Queue function (running in a seperate thread). No matter what I do, I always get a MemoryError on a specific iteration (number 8162735 if it matters in the slightest). And I can't seem to handle the error, Windows just keeps shutting down python when it hits. Here's my script:
import ujson
from tqdm import tqdm
import bz2
import json
import threading
import spacy
import Queue
import time
nlp = spacy.load('en')
def iter_comments(loc):
with bz2.BZ2File(loc) as file_:
for i, line in (enumerate(file_)):
yield ujson.loads(line)['body']
objects = iter_comments('RC_2015-01.bz2')
q = Queue.Queue()
f = open("reddit_dump.bin", 'wb')
def worker():
while True:
item = q.get()
f.write(item)
q.task_done()
for i in range(0, 2):
t = threading.Thread(target=worker)
t.daemon = True
t.start()
def finish_parse(comment):
global q
try:
comment_parse = nlp(unicode(comment))
comment_bytes = comment_parse.to_bytes()
q.put(comment_bytes)
except MemoryError:
print "MemoryError with comment {0}, waiting for Queue to empty".format(comment)
time.sleep(2)
except AssertionError:
print "AssertionError with comment {0}, skipping".format(comment)
for comment in tqdm(objects):
comment = str(comment.encode('ascii', 'ignore'))
if ">" in comment:
c_parse_thread = threading.Thread(target=finish_parse, args=(comment,))
c_parse_thread.start()
q.join()
f.close()
Does anybody know what I'm doing wrong?
Looks like its not in your code but may be in the data. Have you tried to skip that iteration?
x = 0
for comment in tqdm(objects):
x += 1
if x != 8162735
comment = str(comment.encode('ascii', 'ignore'))
if ">" in comment:
c_parse_thread = threading.Thread(target=finish_parse, args=(comment,))
c_parse_thread.start()
This source is just an exemple:
inputf = open('input', 'r')
outputf = open('output', 'a')
for x in inputf:
x = x.strip('\n')
result = urllib2.urlopen('http://test.com/'+x).getcode()
outputf.write(x+' - '+result+'\n')
I want to add threading to this to check a few URLs at the same time.
The user should everytime decide how many threads he want to use.
The order on the output is not important.
What is the best and most beautiful way for that?
I like multiprocessing.pool.ThreadPool (or multiprocessing.pool.Pool)
like:
from multiprocessing.pool import ThreadPool
n_threads = 5
pool = ThreadPool(processes=n_threads)
threads = [pool.apply_async(some_function, args=(arg1,)) for arg1 in args]
pool.close()
pool.join()
results = [result.get() for result in threads]