I am looking for guidance around best practices with asyncio and aiohttp in Python 3. I have a basic scraper but I am not sure how to:
Properly implement error handling. More specific around my fetch function.
Do I really need the last main function to wrap my async crawler around?
Here is my code so far, it is working but I would like feedback on the two item above.
urls = []
async def fetch(url, payload={}):
async with ClientSession() as s:
async with s.get(url, params=payload) as resp:
content = await resp.read()
return content
async def get_profile_urls(url, payload):
content = await fetch(url, payload)
soup = BeautifulSoup(content, 'html.parser')
soup = soup.find_all(attrs={'class': 'classname'})
if soup:
urls.extend([s.find('a')['href'] for s in soup])
async def main():
tasks = []
payload = {
'page': 0,
'filter': 88}
for i in range(max_page + 1):
payload['page'] += 1
tasks.append(get_profile_urls(search_ulr, payload))
await asyncio.wait(tasks)
asyncio.run(main())
Related
async def get_html(self, url):
async with aiohttp.ClientSession() as session:
async with session.get(url, headers=headers) as resp:
return await resp.text()
async def getrank(self, url):
response = await self.get_html(f'url')
print(f'{response.site} | got site')
soup = BeautifulSoup(response, "html.parser")
rank = soup.find("div", {"id": "productDetails_db_sections"})
test2 = rank.find_all("span")
rank = str(test2[-2].text).replace(",","")
finalRank = int(re.search("\d+", rank).group())
if finalRank < 20000:
print(f'product has low rank, starting new function')
await self.getPriceFinal(url, finalrank)
async def getPriceFinal(self, url, rank):
try:
print(f'Checking for Price....') #THIS PRINTS
s = aiohttp.ClientSession()
response = await s.get(f"{url}", headers = self.headers) #THIS WAITS UNTIL getrank finished
print(response.status)
The main problem I have is that the function getPriceFinal() runs to the print and after that waits for the getrank() function to finish ... however what I would like to do is to start the getPriceFinal() function with the url from getrank() function concurrently .. and ideas on how to solve this issue?
New to asyncio, using it to try to make a very large number of API requests more quickly and store the data returned from each request in a dict. I think I've got the syntax of using asyncio and aiohttp figured out mostly, because I'm getting the data returned but I'm having a hard time taking that data and storing it in a dict.
search_ids = [1,2,3,4,5,6,7,8,9,10]
stats = {"Date":[],"Instance ID":[],"Result":[],"Display Name":[]}
async def main():
async with aiohttp.ClientSession() as session:
tasks = []
for search_id in search_ids:
task = asyncio.ensure_future(get_data(session, search_id))
tasks.append(task)
responses = await asyncio.gather(*tasks)
for y in responses['entries']:
stats['Display Name'].append(y['player']['UserInfo']['displayName'])
async def get_data(session, search_id):
url = f'https://www.myapi.com/{search_id}'
async with session.get(url, headers=HEADERS, ssl=False) as response:
results = await response.json()
return results['Response']
asyncio.run(main())
So when I run this, I get an error: TypeError: list indices must be integers or slices, not str
Which makes it seem to me as if the data that has been returned isn't iterable. However, I've looked at what's being returned and it's exactly what I'm expecting it to be. So much that if I change the code to look like this instead, it works fine:
search_ids = [1,2,3,4,5,6,7,8,9,10]
stats = {"Date":[],"Instance ID":[],"Result":[],"Display Name":[]}
async def main():
async with aiohttp.ClientSession() as session:
tasks = []
for search_id in search_ids:
task = asyncio.ensure_future(get_data(session, search_id))
tasks.append(task)
responses = await asyncio.gather(*tasks)
for y in responses:
stats['Display Name'].append(y['entries'][0]['player']['UserInfo']['displayName'])
stats['Display Name'].append(y['entries'][1]['player']['UserInfo']['displayName'])
stats['Display Name'].append(y['entries'][2]['player']['UserInfo']['displayName'])
stats['Display Name'].append(y['entries'][3]['player']['UserInfo']['displayName'])
stats['Display Name'].append(y['entries'][4]['player']['UserInfo']['displayName'])
stats['Display Name'].append(y['entries'][5]['player']['UserInfo']['displayName'])
async def get_data(session, search_id):
url = f'https://www.myapi.com/{search_id}'
async with session.get(url, headers=HEADERS, ssl=False) as response:
results = await response.json()
return results['Response']
asyncio.run(main())
Am I not basically doing the same thing manually here that I'm trying to do with a For loop on the top snippet? I would just go with this workaround except that I plan on pulling out much more data from each of these responses and it's not practical to manually do this over and over.
Plus obviously this makes me question if I'm understanding async correctly or not if this is giving me such a simple error.
Appreciate any help.
You’re iterating over two different things. In the first one you iterate over responses[“entries”]. In the second you use responses. responses is a list (of dictionaries), not a dictionary, so it can only be accessed by index, not by key.
When you ran your code synchronously, all you had to do was iterate over the entries in the response. Now that you're working with multiple responses in a list, you need to iterate over both the responses and the entries in each. To do this, you need to use two separate for loops.
responses = await asyncio.gather(
*[get_data(session, search_id) for search_id in search_ids]
)
for response in responses:
for entry in response["entries"]:
stats["Display Name"].append(
entry["player"]["UserInfo"]["displayName"]
)
This might help
for index, y in enumerate(responses['entries']):
stats['Display Name'].append(y['entries'][index]['player']['UserInfo']['displayName'])
the error is because of the key in data type. Please try this code based on your 2nd code
search_ids = [1,2,3,4,5,6,7,8,9,10]
stats = {"Date":[],"Instance ID":[],"Result":[],"Display Name":[]}
async def main():
async with aiohttp.ClientSession() as session:
tasks = []
for search_id in search_ids:
task = asyncio.ensure_future(get_data(session, search_id))
tasks.append(task)
responses = await asyncio.gather(
*[get_data(session, search_id) for search_id in search_ids])
for response in responses:
for entry in response["entries"]:
stats["Display Name"].append(
entry["player"]["UserInfo"]["displayName"]
)
async def get_data(session, carnage_id):
url = f'https://www.myapi.com/{search_id}'
async with session.get(url, headers=HEADERS, ssl=False) as response:
results = await response.json()
return results['Response']
asyncio.run(main())
I am using aysnc and aiohttp to crawl web image, but when it was running, I found it was not crawling as fast as I expected.
Is there any code that I can improve there?
In the for loop I am using many await inside, is that the correct way to deal with that?
async def fetch(url):
async with aiohttp.ClientSession() as session:
async with session.get(url=url,
headers=HEADERS,
proxy=PROXY_STR,
) as response:
text = await response.text()
resp = Selector(text=text)
nodes = resp.xpath('//div[#class="kl1-2"]')
for node in nodes:
next_url = node.xpath('.//div[#class="kl1-2a2"]/a/#href').extract_first()
title = node.xpath('.//div[#class="kl1-2a2"]/a/#title').extract_first()
await detail(session=session, next_url=next_url, title=title)
print('next page')
async def detail(**kwargs):
session = kwargs['session']
next_url = kwargs['next_url']
title = kwargs['title']
print(next_url)
print(title)
async with session.get(
url=next_url,
headers=HEADERS,
proxy=PROXY_STR,
) as response:
text = await response.text()
resp = Selector(text=text)
nodes = resp.xpath('//div[#class="kl2-1"]//img/#src').extract()
nodes = list(set(nodes))
for img in nodes:
await download_img(session=session,url=img,title=title)
print('next image')
async def download_img(**kwargs):
url= kwargs['url']
title= kwargs['title']
try:
conn = aiohttp.TCPConnector(ssl=False) # 防止ssl报错
async with aiohttp.ClientSession(connector=conn, trust_env=True) as session:
async with session.get(url=url, headers=SIMPLE_HEADERS, proxy=PROXY_STR) as response:
if response.status>=200 and response.status<300:
f=await aiofiles.open(save_file,'wb')
await f.write(await response.read())
await f.close()
except Exception as e:
return
async def main():
total_page = 3640
for page in range(0,total_page,35):
url = START_URL.format(page=page)
await fetch(url)
await asyncio.sleep(0)
print(f'downing page {page}-')
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
I am trying to write a program using asyncio and was oriented towards this blog post. What I am trying to do is fetch some JSON data concurrently. For one input data frame. however, I would like to process the requested data further as soon as it becomes available.
So basically there are two groups of tasks:
process data in df1 concurrently and do some calc once JSON returned
process data in df2 concurrently
They are more or less independent of each other, but I want to run the group of tasks concurrently as well. Once both task groups are finished I want to further process them.
My question is if my implementation is properly designed in terms of asyncio patterns, where I just used two gather statements? Or whether this is the wrong concept? Here is a scatch:
import asyncio
import aiohttp
from aiohttp import ClientSession
async def fetch_json(url: str, session: ClientSession, data: json.dumps) -> Dict:
resp = await session.get(url=url, headers={"content-type": "application/json"}, data=data)
resp.raise_for_status()
logger.info("Got response [%s] for URL: %s", resp.status, url)
json = await resp.json()
return json
async def some_calc(url: str, session: ClientSession, data: json.dumps):
res = await fetch_json(url=url, session=session, data=data)
return [float(x) for x in res]
async def process_data(df: Dict, url: str, session: ClientSession):
async with session:
tasks = []
for data in df:
try:
if df1:
task = some_calc(url=url, session=session, data=data)
else:
task = fetch_json(url=url, session=session, data=data)
except Exception as e:
# ...
tasks.append(
task
)
res = await asyncio.gather(*tasks)
return res
async def bulk_execute(df1, df2):
url = "http://some.url/"
async with ClientSession() as session:
res = await asyncio.gather(process_data(df1, url, session), process_data(df2, url, session))
return res
if __name__ == "__main__":
res = asyncio.run(bulk_execute(df1, df2))
I try to use aiohttp and asyncio to do the request.But I got the error
' An asyncio.Future, a coroutine or an awaitable is required'
here's my code.How can I fix it.
import requests
from bs4 import BeautifulSoup
import asyncio
import aiohttp
res = requests.get('https://www.rottentomatoes.com/top/')
soup = BeautifulSoup(res.text,'lxml')
movie_list=[]
for link in soup.select('section li a[href]'):
movie_list.append('https://www.rottentomatoes.com'+link.get('href'))
async def request(url):
async with aiohttp.ClientSession() as session:
async with session.get(url) as resp:
body = await resp.text(encoding='utf-8')
soup =BeautifulSoup(body,'lxml')
movie = []
async for link in soup.select('tbody tr td a '):
await movie.append(link.get('href'))
return movie
async def main():
results = await asyncio.gather(*[request(url) for url in movie_list])
print(results)
return results
print(movie_list)
loop = asyncio.get_event_loop()
results = loop.run_until_complete(main)
You need to call loop.run_until_complete(main()), not just a function main (without parenthesis). The next thing is you don't need async keyword in soup.select(). I also changed a select string, to parse something:
import requests
from bs4 import BeautifulSoup
import asyncio
import aiohttp
res = requests.get('https://www.rottentomatoes.com/top/')
soup = BeautifulSoup(res.text,'lxml')
movie_list=[]
for link in soup.select('section li a[href]'):
movie_list.append('https://www.rottentomatoes.com'+link.get('href'))
async def request(url):
async with aiohttp.ClientSession() as session:
async with session.get(url) as resp:
body = await resp.text(encoding='utf-8')
soup = BeautifulSoup(body,'lxml')
movie = []
# no need to call async for here!
for link in soup.select('section#top_movies_main table a'):
movie.append(link['href'])
return movie
async def main():
results = await asyncio.gather(*[request(url) for url in movie_list])
print(results)
return results
print(movie_list)
loop = asyncio.get_event_loop()
results = loop.run_until_complete(main()) # you need to create coroutine
Prints:
['https://www.rottentomatoes.com/top/bestofrt/top_100_action__adventure_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_animation_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_art_house__international_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_classics_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_comedy_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_documentary_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_drama_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_horror_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_kids__family_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_musical__performing_arts_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_mystery__suspense_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_romance_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_science_fiction__fantasy_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_special_interest_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_sports__fitness_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_television_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_western_movies/']
[['/m/mad_max_fury_road', '/m/1013775-metropolis', '/m/wonder_woman_2017', '/m/logan_2017', '/m/1011615-king_kong', '/m/zootopia', '/m/1000355-adventures_of_robin_hood', '/m/star_wars_episode_vii_the_force_awakens',
... and so on