Making Parallel Requests From Within List Comprehension - python

import requests
import time
from lxml import html
def parse_site():
    return str(memoryview(''.join(
        [f'---! {link.text_content()} !---\n{parse_fandom(link.xpath(".//a/@href")[0])}\n' for link in
         html.fromstring(requests.get('https://archiveofourown.org/media').content).xpath('//*[@class="actions"]')]
    ).encode('utf-8'))[:-1], 'utf-8')

def parse_fandom(url):
    return ''.join(
        [' '.join(f'{item.text_content()} |—| {item.xpath(".//a/@href")[0]}'.split()) + '\n' for item in
         html.fromstring(requests.get(f'https://archiveofourown.org{url}').content).xpath('//*[contains(@class, "tags")]//li')]
    )
if __name__ == '__main__':
    start_time = time.time()
    with open('test.txt', 'w+', encoding='utf-8') as f:
        f.write(parse_site())
    print("--- %s seconds ---" % (time.time() - start_time))
I'm working on web scraping this site to collect fandom stats, but connecting to the site with requests.get() can take 1-3 seconds, bringing the whole program to a slow 18-22 seconds. Somehow, I want to make these requests on parallel threads, but modules like grequests need an allocated pool to do so, and I haven't figured out a way to create such a pool within a list comprehension.
The order of the list doesn't matter to me, as long as there is a hierarchy between each category (parsed in parse_site()) and its child links (parse_fandom(url)). What I want to do is something like:
[parallel_parse_fandom(url), parallel_parse_fandom(url2), parallel_parse_fandom(url3)]
↓
[<All links within this fandom>, parallel_parse_fandom(url2), <All links within this fandom>]
↓
return [<All links within this fandom>, <All links within this fandom>, <All links within this fandom>]
Solution based on @Aditya's answer:
import requests
import time
from lxml import html
from concurrent.futures import ThreadPoolExecutor, as_completed
def parse_site():
    with ThreadPoolExecutor(max_workers=12) as executor:
        results = []
        for result in as_completed([executor.submit(parse_fandom, url) for url in
                                    [[link.text_content(), link.xpath(".//a/@href")[0]] for link in
                                     html.fromstring(requests.get('https://archiveofourown.org/media').content).xpath('//*[@class="actions"]')]]):
            results.append(result)
        return str(memoryview(''.join(item.result() for item in results).encode('utf-8'))[:-1], 'utf-8')

def parse_fandom(data):
    return f'---! {data[0]} !---\n' + ''.join(
        [' '.join(f'{item.text_content()} |—| {item.xpath(".//a/@href")[0]}'.split()) + '\n' for item in
         html.fromstring(requests.get(f'https://archiveofourown.org{data[1]}').content).xpath('//*[contains(@class, "tags")]//li')]
    )

if __name__ == '__main__':
    with open('test.txt', 'w', encoding='utf-8') as f:
        f.write(parse_site())
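Since the hierarchy inside each category is produced by parse_fandom itself, executor.map is a slightly simpler alternative to submit/as_completed here; a minimal sketch of just parse_site (an alternative, not part of the accepted solution), keeping the same parse_fandom as above:
def parse_site():
    page = html.fromstring(requests.get('https://archiveofourown.org/media').content)
    categories = [[link.text_content(), link.xpath(".//a/@href")[0]]
                  for link in page.xpath('//*[@class="actions"]')]
    with ThreadPoolExecutor(max_workers=12) as executor:
        # map() runs parse_fandom on the worker threads and yields results in input order
        return ''.join(executor.map(parse_fandom, categories)).rstrip('\n')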

You can try the approach below. It makes it easy to run a lot of requests in parallel, provided the server can handle it as well:
# thread_map is just a wrapper around concurrent.futures.ThreadPoolExecutor with a nice tqdm progress bar!
from tqdm.contrib.concurrent import thread_map

def chunk_list(lst, size):
    """Yield successive size-sized chunks from lst (the usual SO recipe)."""
    for i in range(0, len(lst), size):
        yield lst[i:i + size]

for idx, my_chunk in enumerate(chunk_list(huge_list, size=2**12)):
    responses = thread_map(which_func_to_call, my_chunk, max_workers=your_cpu_cores + 6)
    # which_func_to_call -> the function that makes the request / parses the returned response JSON, etc.
    # do something with the responses now...
    # make sure to cache the chunk results as well
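For a concrete picture of how this slots together, here is a minimal, self-contained sketch; the fetch_json helper and the example URL list are hypothetical placeholders, not part of the original answer:
import requests
from tqdm.contrib.concurrent import thread_map

def fetch_json(url):
    """Hypothetical worker: fetch one URL and return its parsed JSON body."""
    return requests.get(url, timeout=10).json()

urls = [f'https://httpbin.org/get?page={i}' for i in range(100)]  # placeholder URL list

# thread_map drives a ThreadPoolExecutor and shows a progress bar while it runs
results = thread_map(fetch_json, urls, max_workers=16)
print(len(results), 'responses collected')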

Related

Python: Serial, Threading, Multiprocessing. Which is the fastest way?

I have code that needs to make HTTP requests (let's say 1000). So far I have approached it in three ways, testing with 50 HTTP requests. The results and code are below.
The fastest is the Threading approach; the issue is that I lose some data (from what I understood, due to the GIL). My questions are the following:
My understanding is that the correct approach in this case is to use multiprocessing. Is there any way I can improve the speed of that approach? Matching the Threading time would be great.
I would guess that the more links I have, the more time the Serial and Threading approaches would take, while the Multiprocessing approach would grow much more slowly. Do you have any source that would let me estimate how long the code would take to run with n links?
Serial - Time To Run around 10 seconds
def get_data(link, **kwargs):
    data = requests.get(link)
    if "queue" in kwargs and isinstance(kwargs["queue"], queue.Queue):
        kwargs["queue"].put(data)
    else:
        return data

links = [link_1, link_2, ..., link_n]
matrix = []
for link in links:
    matrix.append(get_data(link))
Threads - Time To Run around 0.8 seconds
def get_data_thread(links):
    q = queue.Queue()
    for link in links:
        data = threading.Thread(target=get_data, args=(link,), kwargs={"queue": q})
        data.start()
    data.join()
    return q

matrix = []
q = get_data_thread(links)
while not q.empty():
    matrix.append(q.get())
Multiprocessing - Time To Run around 5 seconds
def get_data_pool(links):
    p = mp.Pool()
    data = p.map(get_data, links)
    return data

if __name__ == "__main__":
    matrix = get_data_pool(links)
If I were to suggest anything, I would go with AIOHTTP. A sketch of the code:
import aiohttp
import asyncio

links = [link_1, link_2, ..., link_n]

async def fetch(session, link):
    async with session.get(link) as resp:
        return await resp.text()

async def main():
    # one session, all requests in flight concurrently
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, link) for link in links))

if __name__ == "__main__":
    matrix = asyncio.run(main())
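If you would rather stay with threads, a concurrent.futures.ThreadPoolExecutor collects every response without the data loss seen in the hand-rolled Thread/Queue version; a minimal sketch reusing the question's get_data (the get_data_threadpool name is introduced here, not part of the original post):
from concurrent.futures import ThreadPoolExecutor

def get_data_threadpool(links, workers=50):
    # map() keeps the results in the same order as the input links
    with ThreadPoolExecutor(max_workers=workers) as executor:
        return list(executor.map(get_data, links))

matrix = get_data_threadpool(links)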

Add Bittrex API Results from Loop into Dataframe

I am new to Python. By following code from others I have put together the script below. It prints the output from "GetLatestTick" by looping through each "pair" in the list under the __main__ section, and I can even perform calculations on the line being processed. How can I organize this output into a DataFrame for sorting and other manipulation?
from multiprocessing import Process, Lock
import requests, json
from tabulate import tabulate

def bittrex(lock, pair):
    lock.acquire()
    # print(pair)
    get_request_link = ('https://bittrex.com/Api/v2.0/pub/market/GetLatestTick?marketName='
                        + pair + '&tickInterval=thirtyMin')
    api = requests.get(get_request_link)
    data = json.loads(api.text)
    for i in data:
        if i == 'result':
            for h in data[i]:
                print(pair, h['O'], h['H'], h['L'], h['C'], h['V'], h['BV'],
                      (((h['O']) / (h['H'])) - 1), h['T'])
    lock.release()
    return data

def main():
    bitt_all = bittrex(lock, pair)  # This is still required

if __name__ == '__main__':
    lock = Lock()
    pair = ['BTC-MUE', 'USD-ETH', 'USDT-TRX', 'USDT-BTC']
    for pair in pair:
        Process(target=bittrex, args=(lock, pair)).start()
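One way to get this into a DataFrame is to have the worker return its rows instead of printing them, collect them with a multiprocessing Pool, and feed the combined list to pandas. A minimal sketch, where the fetch_rows helper and the column names are assumptions rather than part of the original code:
from multiprocessing import Pool
import requests, json
import pandas as pd

def fetch_rows(pair):
    """Hypothetical helper: return one dict per candle instead of printing."""
    url = ('https://bittrex.com/Api/v2.0/pub/market/GetLatestTick?marketName='
           + pair + '&tickInterval=thirtyMin')
    data = json.loads(requests.get(url).text)
    return [{'pair': pair, 'O': h['O'], 'H': h['H'], 'L': h['L'], 'C': h['C'],
             'V': h['V'], 'BV': h['BV'], 'ratio': (h['O'] / h['H']) - 1, 'T': h['T']}
            for h in data.get('result', [])]

if __name__ == '__main__':
    pairs = ['BTC-MUE', 'USD-ETH', 'USDT-TRX', 'USDT-BTC']
    with Pool(len(pairs)) as pool:
        # flatten the per-pair lists into one list of rows
        rows = [row for result in pool.map(fetch_rows, pairs) for row in result]
    df = pd.DataFrame(rows)
    print(df.sort_values('ratio'))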

Implementing multiprocessing in a loop scraper and appending the data

I am making a web scraper to build a database. The site I plan to use has index pages, each containing 50 links. The number of pages to be parsed is estimated at around 60K and up, which is why I want to implement multiprocessing.
Here is some pseudo-code of what I want to do:
def harvester(index):
    main = dict()
    ....
    links = foo.findAll('a')
    for link in links:
        main[link] = worker(link)
        # or maybe something like: pool.map_async(worker, links)

def worker(url):
    '''this function gathers the data from the given url'''
    return dictionary
What I want is to have a certain number of worker functions gathering data in parallel from different pages. That data would then be appended to a big dictionary located in harvester, or written directly to a CSV file by the worker function.
I'm wondering how I can implement this parallelism. I have done a fair amount of research on using gevent, threading and multiprocessing, but I am not sure how to implement it.
I am also not sure if appending data to a large dictionary, or writing directly to a CSV using DictWriter, will be stable with that many inputs arriving at the same time.
Thanks
I propose splitting your work into separate workers which communicate via queues.
Here you mostly have I/O wait time (crawling, CSV writing),
so you can do the following (not tested, just to show the idea):
import threading
import Queue
import csv

class CsvWriter(threading.Thread):
    def __init__(self, resultq):
        super(CsvWriter, self).__init__()
        self.resultq = resultq
        # DictWriter also needs the fieldnames your result dicts use
        self.writer = csv.DictWriter(open('results.csv', 'wb'), fieldnames=your_fieldnames)

    def run(self):
        done = False
        while not done:
            row = self.resultq.get()
            if row != -1:
                self.writer.writerow(row)
            else:
                done = True

class Crawler(threading.Thread):
    def __init__(self, inputq, resultq):
        super(Crawler, self).__init__()
        self.iq = inputq
        self.oq = resultq

    def run(self):
        done = False
        while not done:
            link = self.iq.get()
            if link != -1:
                result = self.extract_data(link)
                self.oq.put(result)
            else:
                done = True

    def extract_data(self, link):
        # crawl and extract what you need and return a dict
        pass

def main():
    linkq = Queue.Queue()
    for url in your_urls:
        linkq.put(url)
    resultq = Queue.Queue()
    writer = CsvWriter(resultq)
    writer.start()
    crawlers = [Crawler(linkq, resultq) for _ in xrange(10)]
    [c.start() for c in crawlers]
    [linkq.put(-1) for _ in crawlers]
    [c.join() for c in crawlers]
    resultq.put(-1)
    writer.join()
This code should work (fix possible typos); it exits cleanly once all the URLs are finished.

Python MemoryError with Queue and threading

I'm currently writing a script that reads Reddit comments from a large file (5 GB compressed, ~30 GB of data being read). My script reads the comments, checks for some text, parses them, and sends them off to a Queue function (running in a separate thread). No matter what I do, I always get a MemoryError on a specific iteration (number 8162735, if it matters in the slightest). I can't seem to handle the error either; Windows just keeps shutting down Python when it hits. Here's my script:
import ujson
from tqdm import tqdm
import bz2
import json
import threading
import spacy
import Queue
import time

nlp = spacy.load('en')

def iter_comments(loc):
    with bz2.BZ2File(loc) as file_:
        for i, line in enumerate(file_):
            yield ujson.loads(line)['body']

objects = iter_comments('RC_2015-01.bz2')
q = Queue.Queue()
f = open("reddit_dump.bin", 'wb')

def worker():
    while True:
        item = q.get()
        f.write(item)
        q.task_done()

for i in range(0, 2):
    t = threading.Thread(target=worker)
    t.daemon = True
    t.start()

def finish_parse(comment):
    global q
    try:
        comment_parse = nlp(unicode(comment))
        comment_bytes = comment_parse.to_bytes()
        q.put(comment_bytes)
    except MemoryError:
        print "MemoryError with comment {0}, waiting for Queue to empty".format(comment)
        time.sleep(2)
    except AssertionError:
        print "AssertionError with comment {0}, skipping".format(comment)

for comment in tqdm(objects):
    comment = str(comment.encode('ascii', 'ignore'))
    if ">" in comment:
        c_parse_thread = threading.Thread(target=finish_parse, args=(comment,))
        c_parse_thread.start()

q.join()
f.close()
Does anybody know what I'm doing wrong?
It looks like the problem is not in your code but may be in the data. Have you tried skipping that iteration?
x = 0
for comment in tqdm(objects):
    x += 1
    if x != 8162735:
        comment = str(comment.encode('ascii', 'ignore'))
        if ">" in comment:
            c_parse_thread = threading.Thread(target=finish_parse, args=(comment,))
            c_parse_thread.start()
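A separate point worth noting (an observation added here, not part of the original answer): the script spawns one thread per matching comment and its queue is unbounded, so parsed comments can pile up in memory faster than the writer threads drain them. Bounding the queue applies backpressure; a minimal sketch of that change, keeping the Python 2 style of the question:
import Queue

# A maxsize makes q.put() block once the writers fall behind,
# instead of letting parsed comments accumulate without limit.
q = Queue.Queue(maxsize=1000)

def finish_parse(comment):
    comment_parse = nlp(unicode(comment))
    q.put(comment_parse.to_bytes())  # blocks while the queue is full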

Multithreading python

Is there a way I can multithread the function to take 5 URLs from the list at a time? Please see my code below; it's Python 2.7.
import requests, csv, time, json, threading
from lxml import html
from csv import DictWriter
All_links = ['http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.343097&longitude=-71.123046&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.398588&longitude=-71.24505&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.394319&longitude=-71.218049&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.365396&longitude=-71.23165&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.356719&longitude=-71.250479&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.385096&longitude=-71.208399&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.334146&longitude=-71.183298&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA',
'http://www.clopaydoor.com/api/v1/dealerlocator/getdealers?latitude=42.374296&longitude=-71.182371&doorType=residential&isFirstSearch=true&isHomeDepot=false&isClopayDealer=true&radius=3000&country=USA']
target = open('completedlinks.txt', 'ab')

def get_data(each):
    each = each.strip('\n')
    r = requests.get(each)
    source = json.loads(r.content)
    the_file = open("output.csv", "ab")
    writer = DictWriter(the_file, source[1].keys())
    writer.writeheader()
    writer.writerows(source)
    the_file.close()
    target.write(each + '\n')
    print each + "\n--------------------------"

for each in All_links:
    try:
        get_data(each)
    except:
        pass
Check out the multiprocessing package. Its Pool class (and the thread-backed multiprocessing.dummy.Pool with the same API) would accomplish this.
Update:
Adding something like this should work:
from multiprocessing import Pool

def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in xrange(0, len(l), n):
        yield l[i:i + n]

def threadit(threads, links):
    for part in chunks(links, threads):
        pool = Pool(threads)
        for link in part:
            pool.apply_async(get_data, args=(link,))
        pool.close()
        pool.join()

threadit(5, All_links)
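Since this workload is I/O-bound, one alternative worth noting (an addition here, not something the original answer states) is multiprocessing.dummy, which exposes the same Pool API backed by threads rather than processes, avoiding the per-process startup and pickling overhead:
from multiprocessing.dummy import Pool as ThreadPool  # thread-backed Pool, same API

pool = ThreadPool(5)           # 5 URLs in flight at a time
pool.map(get_data, All_links)  # blocks until every link has been processed
pool.close()
pool.join()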
