Python asyncio and aiohttp slowing down after 150+ requests - python

I'm using asyncio and aiohttp to build an async scraper.
For some reason, after I hit about 150 requests it starts slowing down. The first async pass, where I collect the links, runs fine. The problem is in the second pass: after around 200 requests, a single request takes about a minute. Any idea why? Am I using asyncio or aiohttp incorrectly?
Edit: I'm running this locally on a machine with 7 GB of RAM, so I don't think I'm running out of memory.
import aiohttp
import asyncio
import async_timeout
import re
from lxml import html
import timeit
from os import makedirs, chmod

basepath = ""
start = timeit.default_timer()
novel = ""
novel = re.sub(r"[^a-zA-Z0-9 ]+/", "", novel)
novel = re.sub(r" ", "-", novel)
novel_url = {}

@asyncio.coroutine
def get(*args, **kwargs):
    response = yield from aiohttp.request('GET', *args, **kwargs)
    return (yield from response.text())

def scrape_links(page):
    url = html.fromstring(page)
    links = url.xpath("")
    chapter_count = url.xpath("")
    dictonaries = dict(zip(chapter_count, links))
    novel_url.update(dictonaries)

def print_links(query):
    # Makedirs and apply chmod
    makedirs('%s/%s' % (basepath, query), exist_ok=True)
    makedirs('%s/%s/img' % (basepath, query), exist_ok=True)
    chmod('%s/%s' % (basepath, query), 0o765)
    chmod('%s/%s/img/' % (basepath, query), 0o765)
    url = 'https://www.examplesite.org/' + query
    page = yield from get(url, compress=True)
    magnet = scrape_links(page)

loop = asyncio.get_event_loop()
f = asyncio.wait([print_links(novel)])
loop.run_until_complete(f)

##### now getting chapters from links array

def scrape_chapters(page, i):
    url = html.fromstring(page)
    title = url.xpath("")
    title = ''.join(title)
    title = re.sub(r"", "", title)
    chapter = url.xpath("")
    # Use this to join them instead of looping through if it doesn't work in epub maker
    # chapter = '\n'.join(chapter)
    print(title)
    # file = open("%s/%s/%s-%s.html" % (basepath, novel, novel, i), 'w+')
    # file.write("<h1>%s</h1>" % title)
    # for x in chapter:
    #     file.write("\n<p>%s</p>" % x)
    # file.close()

def print_chapters(query):
    chapter = (str(query[0]))
    chapter_count = re.sub(r"CH ", "", chapter)
    page = yield from get(query[1], compress=True)
    chapter = scrape_chapters(page, chapter_count)

loop = asyncio.get_event_loop()
f = asyncio.wait([print_chapters(d) for d in novel_url.items()])
loop.run_until_complete(f)

stop = timeit.default_timer()
print("\n")
print(stop - start)

Could it be due to the default connection limit on aiohttp.ClientSession?
https://docs.aiohttp.org/en/latest/http_request_lifecycle.html#how-to-use-the-clientsession
You may try passing a connector with a larger limit: https://docs.aiohttp.org/en/latest/client_advanced.html#limiting-connection-pool-size
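For reference, a minimal sketch of that suggestion, written with the modern async/await syntax rather than the question's yield from style: one shared ClientSession backed by a TCPConnector with a larger limit, plus a semaphore to cap in-flight requests. MAX_IN_FLIGHT and the commented usage line are placeholders to adapt.

import asyncio
import aiohttp

MAX_IN_FLIGHT = 50  # placeholder; tune to what the target site tolerates

async def fetch(session, sem, url):
    # The semaphore caps how many requests are in flight at once.
    async with sem:
        async with session.get(url) as response:
            return await response.text()

async def fetch_all(urls):
    sem = asyncio.Semaphore(MAX_IN_FLIGHT)
    connector = aiohttp.TCPConnector(limit=MAX_IN_FLIGHT)
    # One session reused for the whole run, instead of a new aiohttp.request() per URL.
    async with aiohttp.ClientSession(connector=connector) as session:
        return await asyncio.gather(*(fetch(session, sem, u) for u in urls))

# e.g. pages = loop.run_until_complete(fetch_all(list(novel_url.values())))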

Related

Threading NNTP, how? (Newbie here)

I can't wrap my head around how I could possibly rewrite my code to be multi-threaded.
The code I'm writing automatically archives every article in a list of existing newsgroups, but I want to make full use of my newsgroup plan and run up to 20 threads. I've never written threaded code before and my attempts were in vain.
Here's my code, excluding the username and password (but you can get a free account with a maximum of 5 threads at https://my.xsusenet.com if you really want to).
Please don't judge me too hard :(
import nntplib
import sys
import datetime
import os

basetime = datetime.datetime.today()
#daysback = int(sys.argv[1])
#date_list = [basetime - datetime.timedelta(days=x) for x in range(daysback)]

s = nntplib.NNTP('free.xsusenet.com', user='USERNAME', password='PASSWORD')  # I am only allowed 5 connections at a time, so try for 4.

groups = []
resp, groups_list_tuple = s.list()

def remove_non_ascii_2(string):
    return string.encode('ascii', errors='ignore').decode()

for g_tuple in groups_list_tuple:
    #print(g_tuple) # DEBUG_LINE
    # Parse group_list info
    group = g_tuple[0]
    last = g_tuple[1]
    first = g_tuple[2]
    flag = g_tuple[3]
    # Parse newsgroup info
    resp, count, first, last, name = s.group(group)
    for message_id in range(first, last):
        resp, number, mes_id = s.next()
        resp, info = s.article(mes_id)
        if os.path.exists('.\\' + group):
            pass
        else:
            os.mkdir('.\\' + group)
        print(f"Downloading: {message_id}")
        outfile = open('.\\' + group + '\\' + str(message_id), 'a', encoding="utf-8")
        for line in info.lines:
            outfile.write(remove_non_ascii_2(str(line)) + '\n')
        outfile.close()
I tried threading with a ThreadPoolExecutor to get it to use 20 threads, but it failed: it just repeated the same process on the same message id. The expected result was to download 20 different messages at a time.
Here's the code I tried with threading. Mind you, I went through 6-8 variations of it trying to get it to work; this was the last one before I gave up and asked here.
import nntplib
import sys
import datetime
import os
import concurrent.futures

basetime = datetime.datetime.today()
#daysback = int(sys.argv[1])
#date_list = [basetime - datetime.timedelta(days=x) for x in range(daysback)]

s = nntplib.NNTP('free.xsusenet.com', user='USERNAME', password='PASSWORD')  # I am only allowed 5 connections at a time, so try for 4.

groups = []
resp, groups_list_tuple = s.list()

def remove_non_ascii_2(string):
    return string.encode('ascii', errors='ignore').decode()

def download_nntp_file(mess_id):
    resp, count, first, last, name = s.group(group)
    message_id = range(first, last)
    resp, number, mes_id = s.next()
    resp, info = s.article(mes_id)
    if os.path.exists('.\\' + group):
        pass
    else:
        os.mkdir('.\\' + group)
    print(f"Downloading: {mess_id}")
    outfile = open('.\\' + group + '\\' + str(mess_id), 'a', encoding="utf-8")
    for line in info.lines:
        outfile.write(remove_non_ascii_2(str(line)) + '\n')
    outfile.close()

for g_tuple in groups_list_tuple:
    #print(g_tuple) # DEBUG_LINE
    # Parse group_list info
    group = g_tuple[0]
    last = g_tuple[1]
    first = g_tuple[2]
    flag = g_tuple[3]
    # Parse newsgroup info
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = executor.submit(download_nntp_file)
I can't test it with XSUseNet.
I wouldn't use global variables, because when workers run at the same time they may all read the same values from those variables.
You should pass the values to the function as parameters instead.
Something like this:
def download_nntp_file(g_tuple):
    # ... code which uses `g_tuple` instead of global variables ...

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    for g_tuple in groups_list_tuple:
        executor.submit(download_nntp_file, g_tuple)
But it would be simpler to use map() instead of submit(), because it takes the list of arguments directly and doesn't need the for-loop:
def download_nntp_file(g_tuple):
    # ... code which uses `g_tuple` instead of global variables ...

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    executor.map(download_nntp_file, groups_list_tuple)
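For illustration only, a minimal sketch of what a parameterised download_nntp_file could look like, assuming each call opens its own NNTP connection (a single nntplib.NNTP object is not safe to share between threads, which would explain the repeated message ids) and reusing remove_non_ascii_2 from the question. The server, credentials and 5-connection limit are taken from the question; error handling is omitted.

def download_nntp_file(g_tuple):
    group = g_tuple[0]
    # One connection per call: sharing one global connection between threads
    # interleaves the NEXT/ARTICLE commands and yields duplicate articles.
    with nntplib.NNTP('free.xsusenet.com', user='USERNAME', password='PASSWORD') as conn:
        resp, count, first, last, name = conn.group(group)
        os.makedirs(group, exist_ok=True)
        for message_id in range(first, last):
            resp, info = conn.article(str(message_id))  # address articles by number instead of next()
            with open(os.path.join(group, str(message_id)), 'a', encoding='utf-8') as outfile:
                for line in info.lines:
                    outfile.write(remove_non_ascii_2(str(line)) + '\n')

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    executor.map(download_nntp_file, groups_list_tuple)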

asynchronous API calls in apache beam

As the title says, I want to make asynchronous API calls in Apache Beam using Python.
Currently, I am calling the API inside a DoFn for each element in the PCollection.
DoFn code
class textapi_call(beam.DoFn):
    def __init__(self, api_key):
        self.api_key = api_key

    def setup(self):
        self.session = requests.session()

    def process(self, element):
        address = element[3] + ", " + element[4] + ", " + element[5] + ", " + element[6] + ", " + element[7]
        url = findplace_url(address, api_key=self.api_key)
        params = {"inputtype": "textquery",
                  "fields": "name,place_id,geometry,type,icon,permanently_closed,business_status"}
        start = time.time()
        res = self.session.get(url, params=params)
        results = json.loads(res.content)
        time_taken = time.time() - start
        return [[element[0], address, str(results), time_taken]]
pipeline code:
with beam.Pipeline(options=pipeline_options) as p:
    lines = p | ReadFromText(input_file, skip_header_lines=1)
    lines_list = lines | "to list" >> beam.Map(parse_csv)
    res = lines_list | "API calls" >> beam.ParDo(textapi_call(api_key))
How can I modify the code to make the API calls asynchronous or concurrent?
I could not find any Python examples of this.
I mainly want to improve performance. Please let me know if there is another way to make the API calls faster in Beam, apart from horizontal scaling.
For something like this, where you're spending most of your time waiting on external services, there are a couple of ways to speed things up.
The easiest is to use something like BatchElements followed by a DoFn that processes a whole batch in parallel. This works best when the service in question has a batch API, but it can be done manually as well, e.g.
class ProcessBatchDoFn(beam.DoFn):
    def setup(self):
        self.session = requests.session()
        self.executor = concurrent.futures.ThreadPoolExecutor(...)

    def process(self, batch_of_elements):
        urls = [element_to_url(e) for e in batch_of_elements]
        for element, url, result in zip(
                batch_of_elements,
                urls,
                self.executor.map(self.session.get, urls)):
            yield element[0], url, str(result)  # or whatever is needed downstream
used as
with beam.Pipeline(options=pipeline_options) as p:
    lines = p | ReadFromText(input_file, skip_header_lines=1)
    lines_list = lines | "to list" >> beam.Map(parse_csv)
    res = lines_list | beam.BatchElements() | beam.ParDo(ProcessBatchDoFn())
Maybe using something like https://github.com/ross/requests-futures could make this simpler.
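For instance, a rough sketch of the same batched DoFn with requests-futures doing the fan-out; FuturesSession comes from that library, and element_to_url is still the hypothetical helper used in the snippet above.

from requests_futures.sessions import FuturesSession

class ProcessBatchWithFuturesDoFn(beam.DoFn):
    def setup(self):
        # FuturesSession wraps requests.Session and returns concurrent.futures.Future objects.
        self.session = FuturesSession(max_workers=10)

    def process(self, batch_of_elements):
        # Fire off every request in the batch, then collect the results.
        futures = [(e, self.session.get(element_to_url(e))) for e in batch_of_elements]
        for element, future in futures:
            yield element[0], str(future.result())  # adapt to whatever is needed downstream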
This will work well as long as the URLs all take about the same amount of time to fetch. Another option is to do rolling requests, e.g.
class ExpensivePerElementDoFn(beam.DoFn):
    def setup(self):
        self.executor = concurrent.futures.ThreadPoolExecutor(...)

    def start_bundle(self):
        self.pending = []

    def process(self, element):
        self.pending.append(self.executor.submit(fn, element))
        yield from self.flush_done()
        while len(self.pending) > MAX_CONCURRENT_REQUESTS:
            time.sleep(0.1)
            yield from self.flush_done()

    def flush_done(self):
        done = [ix for ix, future in enumerate(self.pending) if future.done()]
        for ix in reversed(done):
            future = self.pending.pop(ix)
            yield future.result()

    def finish_bundle(self):
        for future in self.pending:
            yield future.result()
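A pipeline using it would look much like the earlier ones. This is only a sketch: fn and MAX_CONCURRENT_REQUESTS above still need to be filled in with the actual request function and a concrete limit.

with beam.Pipeline(options=pipeline_options) as p:
    lines = p | ReadFromText(input_file, skip_header_lines=1)
    lines_list = lines | "to list" >> beam.Map(parse_csv)
    # No BatchElements here: the DoFn itself keeps a rolling window of in-flight requests.
    res = lines_list | "API calls" >> beam.ParDo(ExpensivePerElementDoFn())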

Improve speed on current multiprocessing.Pool()

I have a JSON file with a list of URLs.
After reading the documentation, I figured multiprocessing.Pool was the best option for me.
I ran 10 URLs with multiprocessing.Pool(10) and expected the results to be pretty much instant, but it takes about 12 seconds to complete everything. I'm not sure if I am using it correctly; my code is below.
def download_site(data, outputArr, proxyArr=None):
    session = requests.Session()
    # print("Scraping last name {lastName}".format(lastName=data['lastName']))
    userAgents = open('user-agents.txt').read().split('\n')
    params = (
        ('Name', data['lastName']),
        ('type', 'P'),
    )
    url = 'someurl'
    if not proxyArr:
        proxyArr = {
            'http': data['proxy']['http']
        }
    try:
        with session.get(url, params=params, proxies=proxyArr, headers=headers) as response:
            name = multiprocessing.current_process().name
            try:
                content = response.json()
                loadJson = json.loads(content)['nameBirthDetails']
                for case in loadJson:
                    dateFile = loadJson[case]['dateFiled']
                    year = int(dateFile.split('/')[-1])
                    if year > 2018:
                        profileDetailUrlParams = (
                            ('caseId', loadJson[case]['caseYear']),
                            ('caseType', 'WC'),
                            ('caseNumber', loadJson[case]['caseSeqNbr']),
                        )
                        loadJson[case]['caseDetail'] = getAttorneyData(profileDetailUrlParams, session, proxyArr)
                        outputArr.append(loadJson[case])
                        # print("Total Scraped Results so far ", len(outputArr))
            except (requests.exceptions.ConnectionError, json.decoder.JSONDecodeError):
                print("Error Found JSON DECODE ERROR - passing for last name", data['lastName'])
            except simplejson.errors.JSONDecodeError:
                print("Found Simple Json Error", data['lastName'])
                pass
                # newProxy = generate_random_proxy()
                # download_site(data, outputArr, newProxy)
    except:
        raise

def queueList(sites):
    manager = multiprocessing.Manager()
    outputArr = manager.list()
    functionMain = partial(download_site, outputArr=outputArr)
    p = multiprocessing.Pool(10)
    records = p.map(functionMain, sites)
    p.terminate()
    p.join()

if __name__ == "__main__":
    outputArr = []
    fileData = json.loads(open('lastNamesWithProxy.json').read())[:10]
    start_time = time.time()
    queueList(fileData)
    duration = time.time() - start_time
    print(f"Downloaded {len(fileData)} in {duration} seconds")
In download_site I fetch a list via the requests library, then for each item in the list I make another request via getAttorneyData.
How can I tune this further to run faster? I have a high-end computer, so CPU shouldn't be an issue; I want to use it to its full potential.
My goal is to spawn 10 workers and have each worker handle one request, so 10 requests would take 1-2 seconds instead of the 12 it currently takes.

python web scraping from a list of urls

I am new to asks and trio in Python, and I have some sample code. Let me explain:
I have a list of URLs, each one a news URL, and each has sub-URLs.
The first URL is requested, all the other hrefs are extracted and added to a list,
and then the article text is fetched for every href in that list.
The issue is that sometimes the articles come back and other times they are empty.
When I tried the sample code with single URLs it worked.
import asks
import trio
from goose3 import Goose
import logging as log
from goose3.configuration import ArticleContextPattern
from pprint import pprint
import json
import time

asks.init('trio')


async def extractor(path, htmls, paths, session):
    try:
        r = await session.get(path, timeout=2)
        out = r.content
        htmls.append(out)
        paths.append(path)
    except Exception as e:
        out = str(e)
        htmls.append(out)
        paths.append(path)


async def main(path_list, session):
    htmls = []
    paths = []
    async with trio.open_nursery() as n:
        for path in path_list:
            n.start_soon(extractor, path, htmls, paths, session)
    return htmls, paths


async def run(urls, conns=50):
    s = asks.Session(connections=conns)
    g = Goose()
    htmls, paths = await main(urls, s)
    print(htmls, " ", paths)
    cleaned = []
    for html, path in zip(htmls, paths):
        dic = {}
        dic['url'] = path
        if html is not None:
            try:
                #g.config.known_context_pattern = ArticleContextPattern(attr='class', value='the-post')
                article = g.extract(raw_html=html)
                author = article.authors
                dic['goose_text'] = article.cleaned_text
                #print(article.cleaned_text)
                #dic['goose_date'] = article.publish_datetime
                dic['goose_title'] = article.title
                if author:
                    dic['authors'] = author[0]
                else:
                    dic['authors'] = ''
            except Exception as e:
                raise
                print(e)
                log.info('goose found no text using html')
                dic['goose_html'] = html
                dic['goose_text'] = ''
                dic['goose_date'] = None
                dic['goose_title'] = None
                dic['authors'] = ''
        cleaned.append(dic)
    return cleaned


async def real_main():
    sss = '[{"crawl_delay_sec": 0, "name": "mining","goose_text":"","article_date":"","title":"", "story_url": "http://www.mining.com/canalaska-start-drilling-west-mcarthur-uranium-project","url": "http://www.mining.com/tag/latin-america/page/1/"},{"crawl_delay_sec": 0, "name": "mining", "story_url": "http://www.mining.com/web/tesla-fires-sound-alarms-safety-electric-car-batteries", "url": "http://www.mining.com/tag/latin-america/page/1/"}]'
    obj = json.loads(sss)
    pprint(obj)
    articles = []
    for l in obj:
        articles.append(await run([l['story_url']]))
        #await trio.sleep(3)
    pprint(articles)


if __name__ == "__main__":
    trio.run(real_main)
I want to get the article data reliably, without any of them coming back empty.
I lack some further information to answer your question in depth, but most likely it has to do with the way goose searches for text within the html. See this answer for more details: https://stackoverflow.com/a/30408761/8867146
"asks" does not always raise an exception when the status code is != 200. You need to examine the response's status code before using its content. You might also want to increase the timeout; 2 seconds is not enough, particularly when you're firing off up to 50 connections in parallel.
In any case, here's a simplified program: all the Goose stuff is completely unnecessary for showing the actual error, two separate result arrays are not a good idea, and adding error messages to the result array looks broken.
Also, you should investigate running the URL fetching and the processing in parallel. trio.open_memory_channel is your friend here (a sketch of that pattern follows the program below).
import asks
asks.init('trio')

import trio
from pprint import pprint


async def extractor(path, session, results):
    try:
        r = await session.get(path, timeout=2)
        if r.status_code != 200:
            raise asks.errors.BadStatus("Not OK", r.status_code)
        out = r.content
    except Exception as e:
        # do some reasonable error handling
        print(path, repr(e))
    else:
        results.append((out, path))


async def main(path_list, session):
    results = []
    async with trio.open_nursery() as n:
        for path in path_list:
            n.start_soon(extractor, path, session, results)
    return results


async def run(conns=50):
    s = asks.Session(connections=conns)
    urls = [
        "http://www.mining.com/web/tesla-fires-sound-alarms-safety-electric-car-batteries",
        "http://www.mining.com/canalaska-start-drilling-west-mcarthur-uranium-project",
        "https://www.google.com",  # just for testing more parallel connections
        "https://www.debian.org",
    ]
    results = await main(urls, s)
    for content, path in results:
        pass  # analyze this result
    print("OK")


if __name__ == "__main__":
    trio.run(run)
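And here is a rough sketch of the trio.open_memory_channel idea mentioned above (not part of the original answer): one set of tasks fetches pages and sends them into a channel while a separate task consumes and processes them as they arrive. process_page is a placeholder for the Goose extraction step.

import asks
asks.init('trio')

import trio


async def fetch(path, session, send_channel):
    r = await session.get(path, timeout=10)
    if r.status_code == 200:
        await send_channel.send((r.content, path))


async def process_pages(receive_channel):
    async for content, path in receive_channel:
        process_page(content, path)  # placeholder for the Goose extraction step


async def run(urls, conns=50):
    session = asks.Session(connections=conns)
    send_channel, receive_channel = trio.open_memory_channel(conns)
    async with trio.open_nursery() as nursery:
        nursery.start_soon(process_pages, receive_channel)
        async with send_channel:
            async with trio.open_nursery() as fetchers:
                for url in urls:
                    fetchers.start_soon(fetch, url, session, send_channel)
        # closing send_channel ends the `async for` loop in process_pages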

Asyncio Loop Within Asyncio Loop

I'm just starting to use asyncio and I'm trying to use it to parse a website.
I'm trying to parse 6 sections (self.signals) of the site; each section has N pages with tables on them. So essentially I'm trying to async the loop that picks the section, and async the pages within each section. This is what I have so far.
class FinViz():
    def __init__(self):
        self.url = 'https://finviz.com/screener.ashx?v=160&s='
        self.signals = {
            'Earnings_Before' : 'n_earningsbefore',
            'Earnings_After' : 'n_earningsafter',
            'Most_Active' : 'ta_mostactive',
            'Top_Gainers' : 'ta_topgainers',
            'Most_Volatile' : 'ta_mostvolatile',
            'News' : 'n_majornews',
            'Upgrade' : 'n_upgrades',
            'Unusual_Volume' : 'ta_unusualvolume'
        }
        self.ticks = []

    def _parseTable(self, data):
        i, signal = data
        url = self.signals[signal] if i == 0 else self.signals[signal] + '&r={}'.format(str(i * 20 + 1))
        soup = BeautifulSoup(urlopen(self.url + url, timeout=3).read(), 'html5lib')
        table = soup.find('div', {'id': 'screener-content'}).find('table',
            {'width': '100%', 'cellspacing': '1', 'cellpadding': '3', 'border': '0', 'bgcolor': '#d3d3d3'})
        for row in table.findAll('tr'):
            col = row.findAll('td')[1]
            if col.find('a'):
                self.ticks.append(col.find('a').text)

    async def parseSignal(self, signal):
        try:
            soup = BeautifulSoup(urlopen(self.url + self.signals[signal], timeout=3).read(), 'html5lib')
            tot = int(soup.find('td', {'class': 'count-text'}).text.split()[1])
            with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
                loop = asyncio.get_event_loop()
                futures = []
                for i in range(tot // 20 + (tot % 20 > 0)):
                    futures.append(loop.run_in_executor(executor, self._parseTable, (i, signal)))
                for response in await asyncio.gather(*futures):
                    pass
        except URLError:
            pass

    async def getAll(self):
        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
            loop = asyncio.get_event_loop()
            futures = []
            for signal in self.signals:
                futures.append(await loop.run_in_executor(executor, self.parseSignal, signal))
            for response in await asyncio.gather(*futures):
                pass
        print(self.ticks)


if __name__ == '__main__':
    x = FinViz()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(x.getAll())
This does do the job successfully, but somehow it's slower than if I did the parsing without asyncio.
Any tips for an asyncio noob?
Edit: Added full code
Remember Python has a GIL, so threaded code will not help performance here. To potentially speed things up use a ProcessPoolExecutor; however, note you'll incur the following overhead:
1. pickling/unpickling the data sent to the sub-process worker
2. pickling/unpickling the result sent back to the main process
You can avoid 1. if you run in a fork-safe environment and store the data in a global variable.
You can also do things like share a memory-mapped file... also, sharing raw strings/bytes is the fastest.
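A minimal sketch of that suggestion, not the asker's exact class: the CPU-heavy parsing runs in a ProcessPoolExecutor via run_in_executor, and the worker returns its results instead of appending to shared state, since a sub-process cannot mutate the parent's self.ticks. parse_page and list_of_raw_html_pages are placeholders.

import asyncio
import concurrent.futures


def parse_page(raw_html):
    # CPU-bound work (e.g. BeautifulSoup parsing) runs in a separate process,
    # so it is not serialized by the parent interpreter's GIL.
    tickers = []  # placeholder: extract tickers from raw_html here
    return tickers


async def parse_all(pages):
    loop = asyncio.get_event_loop()
    ticks = []
    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = [loop.run_in_executor(executor, parse_page, page) for page in pages]
        for result in await asyncio.gather(*futures):
            ticks.extend(result)  # collect returned results in the parent process
    return ticks

# ticks = asyncio.get_event_loop().run_until_complete(parse_all(list_of_raw_html_pages))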
