As the title says, I want to make asynchronous API calls in apache beam using python.
Currently, I am calling the API inside a DoFn for each element in the Pcollection.
DoFn code
class textapi_call(beam.DoFn):
    """Look up one address per element through the "find place" text API.

    Each input element is an indexable row: index 0 is passed through as an
    identifier and indexes 3..7 are joined into the address that is queried.

    Yields one list per element: ``[element[0], address, str(results),
    time_taken]`` where ``time_taken`` is the wall-clock duration of the
    HTTP call plus JSON decoding, in seconds.
    """

    def __init__(self, api_key):
        self.api_key = api_key

    def setup(self):
        # One session per DoFn instance so HTTP connections are reused
        # across elements instead of being re-established for every call.
        self.session = requests.session()

    def teardown(self):
        # Release the pooled connections when the runner discards this
        # instance; getattr guards against teardown without a prior setup.
        session = getattr(self, "session", None)
        if session is not None:
            session.close()

    def process(self, element):
        # Address parts live in columns 3..7 of the parsed row.
        address = ", ".join(element[i] for i in range(3, 8))
        url = findplace_url(address, api_key=self.api_key)
        params = {
            "inputtype": "textquery",
            "fields": "name,place_id,geometry,type,icon,permanently_closed,business_status",
        }
        start = time.time()
        res = self.session.get(url, params=params)
        results = json.loads(res.content)
        time_taken = time.time() - start
        return [[element[0], address, str(results), time_taken]]
pipeline code:
with beam.Pipeline(options=pipeline_options) as p:
    # Read the input text file, skipping its single header line.
    lines = p | ReadFromText(input_file, skip_header_lines=1)
    # Turn each raw line into a parsed row.
    lines_list = lines | "to list" >> beam.Map(parse_csv)
    # One API request per parsed row.
    res = lines_list | "API calls" >> beam.ParDo(textapi_call(api_key))
How can I modify the code to make the API calls asynchronous or concurrent?
I could not find any examples relating to this in python.
I want to mainly improve the performance. Please let me know if there is another way to make the API calls faster in beam apart from horizontal scaling.
For something like this, where you're spending most of your time waiting for external services, there are a couple of ways to speed things up.
The easiest would be to use something like BatchElements followed by a DoFn that would process a whole batch in parallel. This works best when the service in question has a batch API, but one can do it manually as well, e.g.
class ProcessBatchDoFn(beam.DoFn):
    """Fetch the URLs of a whole batch of elements concurrently.

    Intended to follow ``beam.BatchElements()``: ``process`` receives a list
    of elements, issues all of their requests through a thread pool and
    yields one ``(element[0], url, str(response))`` tuple per element, in
    batch order.
    """

    # Upper bound on simultaneous requests issued by one DoFn instance.
    MAX_WORKERS = 16

    def setup(self):
        # NOTE(review): a single requests session is shared by all pool
        # threads, as in the original sketch - confirm this is acceptable
        # for the target service or switch to one session per thread.
        self.session = requests.session()
        self.executor = concurrent.futures.ThreadPoolExecutor(
            max_workers=self.MAX_WORKERS)

    def teardown(self):
        # Stop the worker threads and release pooled connections.
        executor = getattr(self, "executor", None)
        if executor is not None:
            executor.shutdown()
        session = getattr(self, "session", None)
        if session is not None:
            session.close()

    def process(self, batch_of_elements):
        urls = [element_to_url(e) for e in batch_of_elements]
        # Executor.map preserves input order, so results line up with the
        # elements and URLs they were requested for.
        responses = self.executor.map(self.session.get, urls)
        for element, url, result in zip(batch_of_elements, urls, responses):
            yield element[0], url, str(result)  # or whatever is needed downstream
used as
with beam.Pipeline(options=pipeline_options) as p:
    # Read the input text file, skipping its single header line.
    lines = p | ReadFromText(input_file, skip_header_lines=1)
    # Turn each raw line into a parsed row.
    lines_list = lines | "to list" >> beam.Map(parse_csv)
    # Group rows into batches, then hand each batch to the batch DoFn.
    res = lines_list | beam.BatchElements() | beam.ParDo(ProcessBatchDoFn())
Maybe using something like https://github.com/ross/requests-futures could make this simpler.
This will work well as long as the URLs all take about the same amount of time to fetch. Another option would be to do rolling requests, e.g.
class ExpensivePerElementDoFn(beam.DoFn):
    """Run ``fn`` on each element in a thread pool with rolling concurrency.

    Elements are submitted as they arrive; finished results are emitted as
    soon as they are noticed, and ``process`` blocks while more than
    ``MAX_CONCURRENT_REQUESTS`` calls are outstanding. Whatever is still
    running at the end of the bundle is drained in ``finish_bundle``.

    Every output is emitted as a ``WindowedValue`` carrying the timestamp
    and window of the element that produced it: Beam requires this for
    ``finish_bundle`` output, and it keeps results that complete during a
    later ``process`` call attributed to their own input element.
    """

    def setup(self):
        self.executor = concurrent.futures.ThreadPoolExecutor(
            max_workers=MAX_CONCURRENT_REQUESTS)

    def teardown(self):
        executor = getattr(self, "executor", None)
        if executor is not None:
            executor.shutdown()

    def start_bundle(self):
        # Entries are (future, timestamp, window) in submission order.
        self.pending = []

    def process(self, element,
                timestamp=beam.DoFn.TimestampParam,
                window=beam.DoFn.WindowParam):
        future = self.executor.submit(fn, element)
        self.pending.append((future, timestamp, window))
        yield from self.flush_done()
        # Back-pressure: do not accept more input while too many calls
        # are still in flight.
        while len(self.pending) > MAX_CONCURRENT_REQUESTS:
            time.sleep(0.1)
            yield from self.flush_done()

    @staticmethod
    def _as_output(entry):
        """Wrap a finished (future, timestamp, window) entry for emission."""
        from apache_beam.utils.windowed_value import WindowedValue

        future, timestamp, window = entry
        return WindowedValue(future.result(), timestamp, [window])

    def flush_done(self):
        """Yield and remove every pending entry whose future has completed."""
        done = [ix for ix, entry in enumerate(self.pending) if entry[0].done()]
        # Pop from the back so earlier indexes stay valid.
        for ix in reversed(done):
            yield self._as_output(self.pending.pop(ix))

    def finish_bundle(self):
        # Block on the remaining calls in submission order, then reset so
        # nothing is emitted twice.
        remaining, self.pending = self.pending, []
        for entry in remaining:
            yield self._as_output(entry)
Related
I have a json file with list of urls.
After reading the documentation, I figured multiprocessing.Pool is the best option for me.
I ran 10 URLs with multiprocessing.Pool(10) and expected the results to be pretty much instant, but it takes about 12 seconds to complete everything. I am not sure if I am using it correctly; my code is below.
def download_site(data, outputArr, proxyArr=None):
    """Fetch the cases for one last name and append the recent ones to outputArr.

    Args:
        data: mapping with at least ``'lastName'`` and, when ``proxyArr`` is
            not given, ``data['proxy']['http']``.
        outputArr: list-like collector (``append`` is the only method used);
            every case filed after 2018 is appended to it with an added
            ``'caseDetail'`` entry from ``getAttorneyData``.
        proxyArr: optional proxies mapping for requests; defaults to the
            HTTP proxy stored in ``data``.

    JSON decoding errors and connection errors raised while handling the
    response are reported with ``print`` and otherwise ignored.
    """
    # NOTE(review): the user-agent list is loaded but never used below, and
    # `headers` is not defined in this function - presumably a random user
    # agent was meant to be placed into `headers`; confirm.
    with open('user-agents.txt') as agents_file:
        userAgents = agents_file.read().split('\n')
    params = (
        ('Name', data['lastName']),
        ('type', 'P'),
    )
    url = 'someurl'
    if not proxyArr:
        proxyArr = {
            'http': data['proxy']['http']
        }
    # Both the session and the response are closed on every exit path.
    with requests.Session() as session:
        with session.get(url, params=params, proxies=proxyArr, headers=headers) as response:
            try:
                content = response.json()
                # NOTE(review): decoding twice only works if the service
                # returns a JSON-encoded string that itself contains JSON -
                # confirm against a real response.
                loadJson = json.loads(content)['nameBirthDetails']
                for case in loadJson:
                    dateFile = loadJson[case]['dateFiled']
                    year = int(dateFile.split('/')[-1])
                    if year > 2018:
                        profileDetailUrlParams = (
                            ('caseId', loadJson[case]['caseYear']),
                            ('caseType', 'WC'),
                            ('caseNumber', loadJson[case]['caseSeqNbr']),
                        )
                        loadJson[case]['caseDetail'] = getAttorneyData(profileDetailUrlParams, session, proxyArr)
                        outputArr.append(loadJson[case])
            except (requests.exceptions.ConnectionError, json.decoder.JSONDecodeError):
                print("Error Found JSON DECODE ERROR - passing for last name", data['lastName'])
            except simplejson.errors.JSONDecodeError:
                # A retry through a freshly generated proxy was sketched
                # here in the original but left disabled.
                print("Found Simple Json Error", data['lastName'])
def queueList(sites, workers=10):
    """Run download_site over ``sites`` in a process pool and return the results.

    Args:
        sites: iterable of per-site data mappings passed to download_site.
        workers: number of pool processes (defaults to the original 10).

    Returns:
        A plain list with everything the workers appended to the shared
        collector. It is copied out before the manager process shuts down.
    """
    with multiprocessing.Manager() as manager:
        outputArr = manager.list()
        functionMain = partial(download_site, outputArr=outputArr)
        pool = multiprocessing.Pool(workers)
        try:
            pool.map(functionMain, sites)
        finally:
            # Always reap the worker processes, even if a task raised.
            pool.terminate()
            pool.join()
        return list(outputArr)
if __name__ == "__main__":
    outputArr = []
    # Only the first ten entries are used for this timing run; the file is
    # closed as soon as it has been parsed.
    with open('lastNamesWithProxy.json') as names_file:
        fileData = json.load(names_file)[:10]
    start_time = time.time()
    queueList(fileData)
    duration = time.time() - start_time
    print(f"Downloaded {len(fileData)} in {duration} seconds")
The function download_site, is the function where I fetch a list via requests library - then for each item in the list, I make another requests aka function getAttorneyData
How can I further hone this to run faster? I have a high-end computer, so CPU shouldn't be an issue; I want to use it to its maximum potential.
My goal is to be able to spawn 10 workers and consume each worker with each request. So, 10 requests would really take me 1-2 seconds instead of 12, which is currently.
So here is my use case:
I read from a database rows containing information to make a complex SOAP call (I'm using zeep to do these calls).
One row from the database corresponds to a request to the service.
There can be up to 20 thousand lines, so I don't want to read everything in memory before making the calls.
I need to process the responses - when the
response is OK, I need to store some returned information back into
my database, and when there is an exception I need to process the
exception for that particular request/response pair.
I need also to capture some external information at the time of the request creation, so that I know where to store the response from the request. In my current code I'm using the delightful property of gather() that makes the results come in the same order.
I read the relevant PEPs and Python documentation but I'm still very confused, as there seems to be multiple ways to solve the same problem.
I also went through countless exercises on the web, but the examples are all trivial - it's either asyncio.sleep() or some webscraping with a finite list of urls.
The solution that I have come up so far kinda works - the asyncio.gather() method is very, very, useful, but I have not been able to 'feed' it from a generator. I'm currently just counting to an arbitrary size and then starting a .gather() operation. I've transcribed the code, with boring parts left out and I've tried to anonymise the code
I've tried solutions involving semaphores, queues, different event loops, but I'm failing every time. Ideally I'd like to be able to create Futures 'continuously' - I think I'm missing the logic of 'convert this awaitable call to a future'
I'd be grateful for any help!
import asyncio
from asyncio import Future
import zeep
from zeep.plugins import HistoryPlugin
# zeep history plugin; it is attached to the client built in export_data_async.
history = HistoryPlugin()
# Number of requests collected before a batch is awaited together.
max_concurrent_calls = 5
# When true, rows with an even id are sent with a deliberately faulty payload.
provoke_errors = True
# Send one SOAP request per database row, awaiting them in batches.
#
# Rows of `table` are read through `Db`; for each row a request is created and
# collected in `tasks` (with the row's sid in `sids`, same position). Whenever
# `max_concurrent_calls` requests have been collected they are awaited together
# with asyncio.gather; each response is written to `moveback_table`, each
# recognised fault is logged with the sid of the row that caused it.
#
# NOTE(review): indentation was lost in this listing, so the exact nesting of
# the statements below (in particular what is inside `with Db(...)`) has to be
# confirmed against the real file.
def export_data_async(db_variant: str, order_nrs: set):
st = time.time()
results = []
loop = asyncio.get_event_loop()
# Build the zeep client for a service plus the type factory for namespace 'ns2'.
def get_client1(service_name: str, system: Systems = Systems.ACME) -> Tuple[zeep.Client, zeep.client.Factory]:
client1 = zeep.Client(wsdl=system.wsdl_url(service_name=service_name),
transport=transport,
plugins=[history],
)
factory_ns2 = client1.type_factory(namespace='ns2')
return client1, factory_ns2
table = 'ZZZZ'
moveback_table = 'EEEEEE'
# NOTE(review): 'attribute3' is listed twice and moveback_dict is not used
# anywhere else in this function - confirm whether it is still needed.
moveback_dict = create_default_empty_ordered_dict('attribute1 attribute2 attribute3 attribute3')
client, factory = get_client1(service_name='ACMEServiceName')
if log.isEnabledFor(logging.DEBUG):
client.wsdl.dump()
zeep_log = logging.getLogger('zeep.transports')
zeep_log.setLevel(logging.DEBUG)
with Db(db_variant) as db:
db.open_db(CON_STRING[db_variant])
db.init_table_for_read(table, order_list=order_nrs)
counter_failures = 0
tasks = []
sids = []
results = []
# Done-callback of the gathered future: copies the batch outcome into `results`.
def handle_future(future: Future) -> None:
results.extend(future.result())
# Await the collected requests, persist or log each outcome, then reset the batch.
def process_tasks_concurrently() -> None:
nonlocal tasks, sids, counter_failures, results
# return_exceptions=True: failures come back as values instead of being raised.
futures = asyncio.gather(*tasks, return_exceptions=True)
futures.add_done_callback(handle_future)
loop.run_until_complete(futures)
# results[i] belongs to sids[i] because gather preserves the order of `tasks`.
for i, response_or_fault in enumerate(results):
# NOTE(review): this is an exact type match; any other exception returned by
# gather (including subclasses of these two) takes the else branch and is
# subscripted like a response - confirm whether isinstance() is intended.
if type(response_or_fault) in [zeep.exceptions.Fault, zeep.exceptions.TransportError]:
counter_failures += 1
log_webservice_fault(sid=sids[i], db=db, err=response_or_fault, object=table)
else:
db.write_dict_to_table(
moveback_table,
{'sid': sids[i],
'attribute1': response_or_fault['XXX']['XXX']['xxx'],
'attribute2': response_or_fault['XXX']['XXX']['XXXX']['XXX'],
'attribute3': response_or_fault['XXXX']['XXXX']['XXX'],
}
)
db.commit_db_con()
# Start the next batch from scratch.
# NOTE(review): `results` is nonlocal, so this also empties the list that the
# function returns at the end - the return value looks to be always [].
tasks = []
sids = []
results = []
return
for row in db.rows(table):
# Test hook: every row with an even id gets a faulty payload when enabled.
if int(row.id) % 2 == 0 and provoke_errors:
payload = faulty_message_payload(row=row,
factory=factory,
)
else:
payload = message_payload(row=row,
factory=factory,
)
# The call is only created here; it is awaited later by gather.
tasks.append(client.service.myRequest(
MessageHeader=factory.MessageHeader(**message_header_arguments(row=row)),
myRequestPayload=payload,
_soapheaders=[security_soap_header],
))
sids.append(row.sid)
if len(tasks) == max_concurrent_calls:
process_tasks_concurrently()
if tasks: # this is the remainder of len(db.rows) % max_concurrent_calls
process_tasks_concurrently()
loop.run_until_complete(transport.session.close())
db.execute_this_statement(statement=update_sql)
db.commit_db_con()
log.info(db.activity_log)
if counter_failures:
log.info(f"{table :<25} Count failed: {counter_failures}")
print("time async: %.2f" % (time.time() - st))
return results
Failed attempt with Queue: (blocks at await client.service)
# Event loop used by the __main__ guard of this attempt.
loop = asyncio.get_event_loop()
counter = 0
results = []
# Async generator that yields one (counter, row, payload) tuple per request.
# NOTE(review): the body is elided in this listing - `row` and `payload` are
# produced by code that is not shown.
async def payload_generator(db_variant: str, order_nrs: set):
# code that generates the data for the request
yield counter, row, payload
async def service_call_worker(queue, results):
    """Consume (counter, row, payload) items from ``queue`` forever.

    For every item one service call is awaited. The response - or, if the
    call fails, the exception instance - is appended to ``results``, in the
    same spirit as ``asyncio.gather(..., return_exceptions=True)``.

    ``queue.task_done()`` is called for every item no matter how the call
    ended; otherwise a single failing request would leave ``queue.join()``
    waiting forever. The worker only stops when its task is cancelled.
    """
    while True:
        counter, row, payload = await queue.get()
        try:
            response = await client.service.myServicename(
                MessageHeader=calculate_message_header(row=row),
                myPayload=payload,
                _soapheaders=[security_soap_header],
            )
        except asyncio.CancelledError:
            # Cancellation is how the caller stops the worker; never record
            # it as a request failure.
            raise
        except Exception as err:
            results.append(err)
        else:
            results.append(response)
            print(colorama.Fore.BLUE + f'after result returned {counter}')
        finally:
            # Here do the relevant processing of response or error
            queue.task_done()
async def main_with_q():
    """Feed generated payloads to a small pool of service-call workers.

    The queue is bounded to the number of workers, so the generator is only
    advanced as fast as requests are being consumed and the whole input is
    never held in memory.

    Returns:
        The list the workers appended their outcomes to.
    """
    n_workers = 3
    queue = asyncio.Queue(n_workers)
    p = payload_generator(DB_VARIANT, order_list_from_args())
    results = []
    workers = [asyncio.create_task(service_call_worker(queue, results))
               for _ in range(n_workers)]
    try:
        async for c in p:
            await queue.put(c)
        await queue.join()  # wait for all tasks to be processed
    finally:
        for worker in workers:
            worker.cancel()
        # Let the cancellations finish so no task is destroyed while pending.
        await asyncio.gather(*workers, return_exceptions=True)
    return results
if __name__ == '__main__':
    try:
        # Run the queue-based export, then close the transport's session.
        loop.run_until_complete(main_with_q())
        loop.run_until_complete(transport.session.close())
    finally:
        # The loop is closed whether or not the export succeeded.
        loop.close()
I'm just starting to use Asyncio and I'm trying to use it to parse a website.
I'm trying to parse 6 sections (self.signals) of the site, each section has N number of pages with tables on them, so essentially I'm trying to async the loop that calls what section, and async the pages in each section. This is what I have so far.
class FinViz():
    """Collect ticker symbols from several finviz screener signals.

    Every signal is a screener query whose result table is split into pages
    of PAGE_SIZE rows. All blocking downloads run in a thread pool so that
    signals and pages are fetched concurrently; the symbols found are
    appended to ``self.ticks``.
    """

    # Rows per screener result page.
    PAGE_SIZE = 20

    def __init__(self):
        self.url = 'https://finviz.com/screener.ashx?v=160&s='
        self.signals = {
            'Earnings_Before' : 'n_earningsbefore',
            'Earnings_After' : 'n_earningsafter',
            'Most_Active' : 'ta_mostactive',
            'Top_Gainers' : 'ta_topgainers',
            'Most_Volatile' : 'ta_mostvolatile',
            'News' : 'n_majornews',
            'Upgrade' : 'n_upgrades',
            'Unusual_Volume' : 'ta_unusualvolume'
        }
        self.ticks = []

    @classmethod
    def _page_count(cls, total):
        """Return how many pages are needed to show ``total`` rows."""
        return (total + cls.PAGE_SIZE - 1) // cls.PAGE_SIZE

    def _page_query(self, signal, i):
        """Return the query-string suffix for page ``i`` (0-based) of ``signal``."""
        query = self.signals[signal]
        if i == 0:
            return query
        return query + '&r={}'.format(i * self.PAGE_SIZE + 1)

    def _fetch_soup(self, query):
        """Download one screener page (blocking) and parse it."""
        return BeautifulSoup(urlopen(self.url + query, timeout = 3).read(), 'html5lib')

    def _parseTable(self, data):
        """Download page ``i`` of ``signal`` and record its ticker symbols.

        ``data`` is an ``(i, signal)`` tuple. Blocking; meant to be run in
        an executor thread.
        """
        i, signal = data
        soup = self._fetch_soup(self._page_query(signal, i))
        table = soup.find('div', {'id' : 'screener-content'}).find('table',
            {'width' : '100%', 'cellspacing': '1', 'cellpadding' : '3', 'border' : '0', 'bgcolor' : '#d3d3d3'})
        for row in table.findAll('tr'):
            # The ticker link sits in the second cell of a row.
            anchor = row.findAll('td')[1].find('a')
            if anchor:
                self.ticks.append(anchor.text)

    async def parseSignal(self, signal, executor=None):
        """Fetch every page of one signal concurrently.

        Args:
            signal: key of ``self.signals``.
            executor: thread pool to run the downloads in; when omitted a
                private 20-thread pool is used for this call.
        """
        if executor is None:
            with concurrent.futures.ThreadPoolExecutor(max_workers = 20) as own_executor:
                return await self.parseSignal(signal, own_executor)
        loop = asyncio.get_event_loop()
        try:
            # The first page is only needed for the total row count; it is
            # fetched off the event loop so other signals keep progressing.
            soup = await loop.run_in_executor(
                executor, self._fetch_soup, self.signals[signal])
            tot = int(soup.find('td', {'class' : 'count-text'}).text.split()[1])
            await asyncio.gather(*(
                loop.run_in_executor(executor, self._parseTable, (i, signal))
                for i in range(self._page_count(tot))))
        except URLError as err:
            # Best effort: an unreachable signal is skipped, but not silently.
            import logging

            logging.getLogger(__name__).warning(
                "skipping signal %s: %s", signal, err)

    async def getAll(self):
        """Fetch all signals concurrently, then print the collected tickers."""
        with concurrent.futures.ThreadPoolExecutor(max_workers = 20) as executor:
            await asyncio.gather(*(
                self.parseSignal(signal, executor) for signal in self.signals))
        print(self.ticks)
if __name__ == '__main__':
    # Build the scraper and drive its top-level coroutine to completion.
    x = FinViz()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(x.getAll())
This does do the job successfully, but it somehow does it slower than if I were to do the parsing without asyncio.
Any tips for an asynchronous noob?
Edit: Added full code
Remember that Python has a GIL, so threaded code will not speed up CPU-bound work such as parsing. To potentially speed things up, use a ProcessPoolExecutor; note, however, that you'll incur the following overhead:
1. pickling/unpickling the data sent to the sub-process worker
2. pickling/unpickling the result sent back to the main process
You can avoid 1. if you run in a fork-safe environment and store the data in a global variable.
You can also do things like share a memory-mapped file. Sharing raw strings/bytes is the fastest.
I'm using asyncio and aiohttp to make a async scraper.
For some reason, after I hit 150+ requests it starts slowing down. The first async
run, where I get the links, works fine. The second one is where the slowness happens: after about 200 requests it needs one minute per request. Any idea why? Am I using asyncio or aiohttp incorrectly?
Edit: I'm running this locally on a machine with 7 GB of RAM, so I don't think I'm running out of memory.
import aiohttp
import asyncio
import async_timeout
import re
from lxml import html
import timeit
from os import makedirs,chmod
basepath = ""
start = timeit.default_timer()
novel = ""
novel = re.sub(r"[^a-zA-Z0-9 ]+/", "", novel)
novel = re.sub(r" ", "-", novel)
# Filled by scrape_links: chapter label -> chapter URL.
novel_url = {}

# Upper bound on chapter pages downloaded at the same time. Starting every
# request at once is the likely reason the run degrades after a few hundred
# chapters; tune this to what the site tolerates.
MAX_CONCURRENT_REQUESTS = 10


async def get(*args, session=None, **kwargs):
    """GET a URL and return the response body as text.

    Positional and keyword arguments are forwarded to ``session.get``.
    When no session is passed, a temporary one is opened for this single
    request; pass a shared session to reuse its connection pool.
    """
    if session is None:
        async with aiohttp.ClientSession() as own_session:
            return await get(*args, session=own_session, **kwargs)
    # The context manager releases the connection back to the pool.
    async with session.get(*args, **kwargs) as response:
        return await response.text()


def scrape_links(page):
    """Extract chapter labels and links from the index page into novel_url."""
    url = html.fromstring(page)
    links = url.xpath("")
    chapter_count = url.xpath("")
    dictonaries = dict(zip(chapter_count, links))
    novel_url.update(dictonaries)


async def print_links(query, session=None):
    """Create the output directories for ``query`` and collect its chapter links."""
    # Makedirs and apply chmod
    makedirs('%s/%s' % ( basepath,query ),exist_ok=True)
    makedirs('%s/%s/img' % (basepath, query),exist_ok=True)
    chmod('%s/%s' % ( basepath,query ), 0o765)
    chmod('%s/%s/img/' % ( basepath,query ), 0o765)

    url = 'https://www.examplesite.org/' + query
    page = await get(url, session=session, compress=True)
    scrape_links(page)


def scrape_chapters(page, i):
    """Extract the title and body of one chapter page and print the title."""
    url = html.fromstring(page)
    title = url.xpath("")
    title = ''.join(title)
    title = re.sub(r"", "", title)
    chapter = url.xpath("")
    # Use this to join them insteed of looping though if it doesn't work in epub maker
    # chapter = '\n'.join(chapter)
    print(title)
    # file = open("%s/%s/%s-%s.html" % (basepath, novel, novel, i), 'w+')
    # file.write("<h1>%s</h1>" % title)
    # for x in chapter:
    #     file.write("\n<p>%s</p>" % x)
    # file.close()


async def print_chapters(query, session=None, limiter=None):
    """Download and process one chapter.

    Args:
        query: ``(chapter label, chapter URL)`` pair from novel_url.
        session: shared aiohttp session, if any.
        limiter: optional semaphore that caps concurrent downloads.
    """
    chapter = (str(query[0]))
    chapter_count = re.sub(r"CH ", "", chapter)
    if limiter is None:
        page = await get(query[1], session=session, compress=True)
    else:
        async with limiter:
            page = await get(query[1], session=session, compress=True)
    scrape_chapters(page, chapter_count)


async def main():
    """Collect the chapter links, then fetch all chapters with bounded concurrency."""
    limiter = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
    # One session for the whole run so connections are pooled and reused.
    async with aiohttp.ClientSession() as session:
        await print_links(novel, session=session)
        ##### now getting chapters from links array
        await asyncio.gather(*(
            print_chapters(d, session=session, limiter=limiter)
            for d in novel_url.items()))


asyncio.run(main())

stop = timeit.default_timer()
print("\n")
print(stop - start)
Could it be due to the limit on aiohttp.ClientSession connections?
https://docs.aiohttp.org/en/latest/http_request_lifecycle.html#how-to-use-the-clientsession
You could try passing a connector with a larger limit: https://docs.aiohttp.org/en/latest/client_advanced.html#limiting-connection-pool-size
If I run the script step by step it works perfectly, but when I use threading it misses 50-60% of the requests. I'm using Python with the mechanize module.
#setting up the browser
# Management endpoint that the form data is posted to after a job action is opened.
mySite = 'http://example.com/managament.php?'
# Form fields sent with the delete request.
postData = {'UserID' : '', 'Action':'Delete'}
# Job ids per user and per tab; one pair of lists exists for every user.
job_tab1_user1 = [1,2,3]
job_tab2_user1 = [4,5,6]
job_tab1_user2 = [7,8,9]
job_tab2_user2 = [10,12,13]
.... till user1000
#i want to point out that the lists are 100% different
def user1_jobs():
    """Open every job action of user 1 (both tabs) and post the form after each.

    NOTE(review): `browser` is a single module-level object; if it is shared
    by all threads, concurrent use is a plausible cause of the lost
    requests - confirm, or give each thread its own browser instance.
    NOTE(review): `Post_data` is not defined in the visible code (the
    visible name is `postData`) - confirm which one is meant.
    """
    for job_ids in (job_tab1_user1, job_tab2_user1):
        for i in job_ids:
            # Job ids are integers and must be converted before concatenation.
            browser.open("http://example.com/jobs.php?actions=" + str(i))
            browser.open(mySite, Post_data)
def user2_jobs():
    """Open every job action of user 2 (both tabs) and post the form after each.

    NOTE(review): `browser` is a single module-level object; if it is shared
    by all threads, concurrent use is a plausible cause of the lost
    requests - confirm, or give each thread its own browser instance.
    NOTE(review): `Post_data` is not defined in the visible code (the
    visible name is `postData`) - confirm which one is meant.
    """
    for job_ids in (job_tab1_user2, job_tab2_user2):
        for i in job_ids:
            # Job ids are integers and must be converted before concatenation.
            browser.open("http://example.com/jobs.php?actions=" + str(i))
            browser.open(mySite, Post_data)
... and so on till user 1000
And I call them in the end like this:
# One thread per user; each runs that user's job function without arguments.
t_user1 = threading.Thread(target=user1_jobs)
t_user2 = threading.Thread(target=user2_jobs)
for _user_thread in (t_user1, t_user2):
    _user_thread.start()
I have a similar script that sends about 200 requests per second, and all of them are processed. I also tried using time.sleep(2), but it still misses a lot.
Besides what is wrong with my script, my other question is whether there is a way to compact this code: I have 1000 users, so the script runs to thousands of lines. Thank you in advance.
from threading import *
# One inner list of job ids per worker thread.
submits = [[1,2,3], [3,4,5], [6,7,8]]


class worker(Thread):
    """Thread that opens each job id in ``data`` and posts the form after it.

    The thread starts itself at the end of ``__init__``, so creating an
    instance is enough to launch the work.

    Args:
        site: URL the form data is posted to.
        postdata: form fields sent with each post.
        data: iterable of job ids handled by this thread.
    """

    def __init__(self, site, postdata, data):
        Thread.__init__(self)
        self.data = data
        self.site = site
        self.postdata = postdata
        # Started last so that run() always sees the attributes above.
        self.start()

    def run(self):
        for i in self.data:
            browser.open("http://example.com/jobs.php?actions="+str(i))
            browser.open(self.site, self.postdata)


for obj in submits:
    # Each worker gets its own list of job ids, not the whole table.
    worker('http://example.com/managament.php?', {'UserID' : '', 'Action':'Delete'}, obj)
Since the OP asked for it, here's a condensed/compressed version of the code.
or:
# Launch 1000 workers, each handling the job ids 1, 2 and 3.
for index in range(1000):
    worker('http://example.com/managament.php?', {'UserID' : '', 'Action':'Delete'}, list(range(1, 4)))
This works if the data you want to send really is the same sequence of 3 consecutive integers (1, 2, 3) for every user.
Here is a full script that you can easily modify by changing the initial variables.
It creates a list dynamically and uses a generator to create the functions for each thread.
Currently it creates 1000 users, each with 2 tabs and 3 jobs.
# define your variables here
NUM_USERS = 1000
NUM_JOBS_PER_USER = 3
NUM_TABS_PER_USER = 2
URL_PART = "http://example.com/jobs.php?actions="

# populate our list of jobs
# the structure is like this: jobs[user][tab][job]
# Job ids are consecutive integers starting at 1, handed out user by user,
# tab by tab, job by job.
_job_ids = iter(range(1, NUM_USERS * NUM_TABS_PER_USER * NUM_JOBS_PER_USER + 1))
jobs = [[[next(_job_ids) for _ in range(NUM_JOBS_PER_USER)]
         for _ in range(NUM_TABS_PER_USER)]
        for _ in range(NUM_USERS)]
# create a generator that builds our thread functions
def generateFunctions(jobs):
    """Yield one zero-argument function per job in ``jobs[user][tab][job]``.

    Each yielded function opens its own job URL and then posts the form.
    The job id is bound when the function is created: a plain closure would
    read the loop variable only when the thread finally runs, by which time
    the generator may already have moved on to a later job.
    """
    for user in jobs:
        for tab in user:
            for job in tab:
                def f(job=job):
                    browser.open(URL_PART + str(job))
                    browser.open(mySite, Post_data)
                yield f
# create and start threads, add them to a list
# if we need to preserve handlers for later use
# Handles are kept so the threads can be joined or inspected later.
threads = []
for job_fn in generateFunctions(jobs):
    thr = threading.Thread(target=job_fn)
    thr.start()
    threads.append(thr)