async request for python - python

I am trying to use aiohttp and asyncio to make the requests, but I get the error
'An asyncio.Future, a coroutine or an awaitable is required'.
Here's my code. How can I fix it?
import requests
from bs4 import BeautifulSoup
import asyncio
import aiohttp
# Fetch the index page synchronously and build the list of absolute category URLs.
res = requests.get('https://www.rottentomatoes.com/top/')
soup = BeautifulSoup(res.text,'lxml')
movie_list=[]
for link in soup.select('section li a[href]'):
    movie_list.append('https://www.rottentomatoes.com'+link.get('href'))


async def request(url):
    """Download `url` with aiohttp and collect the hrefs of the anchors in its table cells."""
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            body = await resp.text(encoding='utf-8')
            soup =BeautifulSoup(body,'lxml')
            movie = []
            # NOTE(review): soup.select() is iterated with a plain `for` at module level above;
            # `async for` needs an asynchronous iterator, so this line is one of the defects.
            async for link in soup.select('tbody tr td a '):
                # NOTE(review): list.append() returns None, which cannot be awaited.
                await movie.append(link.get('href'))
            return movie


async def main():
    """Run request() for every URL in movie_list concurrently and return the gathered results."""
    results = await asyncio.gather(*[request(url) for url in movie_list])
    print(results)
    return results


print(movie_list)
loop = asyncio.get_event_loop()
# NOTE(review): `main` is passed uncalled here, i.e. the function object rather than the
# coroutine returned by main(); this is what triggers the quoted
# "An asyncio.Future, a coroutine or an awaitable is required" error.
results = loop.run_until_complete(main)

You need to call loop.run_until_complete(main()) rather than passing the bare function main (without parentheses). Next, you don't need the async keyword on the loop over soup.select(), nor await on list.append(). I also changed the select string so that it actually matches something:
import requests
from bs4 import BeautifulSoup
import asyncio
import aiohttp
# Resolve the category pages up front with a blocking request; only the
# per-category downloads below run on the event loop.
res = requests.get('https://www.rottentomatoes.com/top/')
soup = BeautifulSoup(res.text, 'lxml')
movie_list = [
    'https://www.rottentomatoes.com' + link.get('href')
    for link in soup.select('section li a[href]')
]


async def request(url):
    """Download one category page and return the href of every anchor in its top-movies table."""
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            body = await resp.text(encoding='utf-8')
    page = BeautifulSoup(body, 'lxml')
    # no `async for` needed here: select() hands back an ordinary list
    anchors = page.select('section#top_movies_main table a')
    return [anchor['href'] for anchor in anchors]


async def main():
    """Fetch every URL in movie_list concurrently, print the results and return them."""
    pending = [request(url) for url in movie_list]
    results = await asyncio.gather(*pending)
    print(results)
    return results


print(movie_list)
loop = asyncio.get_event_loop()
# main() is *called* so that a coroutine object, not the function, is handed to the loop
results = loop.run_until_complete(main())
Prints:
['https://www.rottentomatoes.com/top/bestofrt/top_100_action__adventure_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_animation_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_art_house__international_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_classics_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_comedy_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_documentary_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_drama_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_horror_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_kids__family_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_musical__performing_arts_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_mystery__suspense_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_romance_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_science_fiction__fantasy_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_special_interest_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_sports__fitness_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_television_movies/', 'https://www.rottentomatoes.com/top/bestofrt/top_100_western_movies/']
[['/m/mad_max_fury_road', '/m/1013775-metropolis', '/m/wonder_woman_2017', '/m/logan_2017', '/m/1011615-king_kong', '/m/zootopia', '/m/1000355-adventures_of_robin_hood', '/m/star_wars_episode_vii_the_force_awakens',
... and so on

Related

Request within async function is not running concurrently, only starts after everything finished

async def get_html(self, url):
    """Open a short-lived aiohttp session, GET `url` and return the response body as text.

    The request is sent with the free name `headers` (not `self.headers`).
    """
    async with aiohttp.ClientSession() as session, session.get(url, headers=headers) as resp:
        body = await resp.text()
    return body
async def getrank(self, url):
    """Fetch a product page, extract its rank and trigger a price check for low ranks.

    The rank is the first run of digits (thousands separators removed) in the
    second-to-last <span> of the element with id "productDetails_db_sections".
    If that element, the span or the digits are missing, the method returns
    without doing anything.

    Args:
        url: address of the product page.
    """
    # The original requested the literal string 'url' (f'url' has no placeholder).
    response = await self.get_html(url)
    # get_html() returns plain text, which has no `.site` attribute; log the URL instead.
    print(f'{url} | got site')
    soup = BeautifulSoup(response, "html.parser")
    details = soup.find("div", {"id": "productDetails_db_sections"})
    if details is None:
        return
    spans = details.find_all("span")
    if len(spans) < 2:
        return
    rank_text = spans[-2].text.replace(",", "")
    match = re.search(r"\d+", rank_text)
    if match is None:
        return
    final_rank = int(match.group())
    if final_rank < 20000:
        print('product has low rank, starting new function')
        # NOTE(review): awaiting here means getrank() does not finish until
        # getPriceFinal() has completed; the two do not overlap.
        await self.getPriceFinal(url, final_rank)
async def getPriceFinal(self, url, rank):
try:
print(f'Checking for Price....') #THIS PRINTS
s = aiohttp.ClientSession()
response = await s.get(f"{url}", headers = self.headers) #THIS WAITS UNTIL getrank finished
print(response.status)
The main problem I have is that getPriceFinal() runs up to the print and then waits for getrank() to finish. What I would like instead is to start getPriceFinal() with the URL from getrank() so that the two run concurrently. Any ideas on how to solve this?

Discord bot does not send message to channel

I'm trying to make my bot send a message to a discord channel if certain conditions are met, but I can't seem to get the code working. The code checks every 5 seconds if a list contains the string '.12.' and should then forward the message.
import requests
import time
import discord
from discord.ext import commands, tasks
from bs4 import BeautifulSoup
# Poll the page and, when '.12.' is among the scraped cell texts, define a coroutine
# that posts to the channel.
# NOTE(review): client.run() is a blocking call in discord.py, so while the bot is
# connected the loop never reaches time.sleep(5) or the next poll, and nothing in
# this script ever invokes send(). This is the behaviour the question is about.
while True:
    client = commands.Bot(command_prefix='.')

    @client.event
    async def on_ready():
        """Print a marker when the on_ready event is dispatched."""
        print('bot is active')

    url = 'website link'
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')
    html_element = soup.find_all('td', {"class": "eksam-ajad-aeg"})
    ret = []
    for t in html_element:
        ret.append(t.text)
    print(ret)
    if '.12.' in ret:
        @client.event
        async def send():
            """Send the notification to the hard-coded channel."""
            channel = client.get_channel(758088198852182037)
            await channel.send('message')
    client.run('token')
    time.sleep(5)
Here is a bot script that appears to be working. Without having the url that you're attempting to search, I'm not able to help completely, but give this a try and see if it works for you:
import discord
import requests
from bs4 import BeautifulSoup
import asyncio
client = discord.Client()


@client.event
async def on_ready():
    """Start the background polling task once the on_ready event is dispatched."""
    # Create a task and run check_html and feed it a parameter of 5 seconds
    client.loop.create_task(check_html(5))
    print("Bot is active")


async def check_html(time):
    """Poll the page forever and post to the channel whenever '.12.' is listed.

    Args:
        time: number of seconds to wait between two polls.
    """
    while True:
        url = 'url here'
        # requests.get() blocks; run it in the default executor so the bot's
        # event loop keeps running while the page downloads.
        res = await client.loop.run_in_executor(None, requests.get, url)
        soup = BeautifulSoup(res.text, 'html.parser')
        html_element = soup.find_all('td', {"class": "eksam-ajad-aeg"})
        ret = [t.text for t in html_element]
        print(ret)
        # Exact match against the cell texts, as in the original.
        if '.12.' in ret:
            for guild in client.guilds:
                for channel in guild.channels:
                    if channel.id == 758088198852182037:
                        await channel.send('message')
        # Asynchronously sleep for 'time' seconds
        await asyncio.sleep(time)


client.run('token')

How to change async crawl program based on aiohttp lib faster?

I am using asyncio and aiohttp to crawl web images, but when I ran the program I found that it was not crawling as fast as I expected.
Is there anything in the code that I can improve?
Inside the for loops I use many awaits; is that the correct way to handle this?
async def fetch(url):
    """Download one listing page and process the detail page of every entry on it.

    Args:
        url: address of the listing page.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(
            url=url,
            headers=HEADERS,
            proxy=PROXY_STR,
        ) as response:
            text = await response.text()
        # The listing response is released above; the session stays open because
        # detail() reuses it.
        resp = Selector(text=text)
        # XPath selects attributes with '@'; the '#' in the original is not valid XPath.
        nodes = resp.xpath('//div[@class="kl1-2"]')
        for node in nodes:
            next_url = node.xpath('.//div[@class="kl1-2a2"]/a/@href').extract_first()
            title = node.xpath('.//div[@class="kl1-2a2"]/a/@title').extract_first()
            # NOTE(review): each detail() call is awaited before the next one starts,
            # so the entries are processed one after another.
            await detail(session=session, next_url=next_url, title=title)
            print('next page')
async def detail(**kwargs):
    """Download one detail page and save every distinct image it references.

    Keyword Args:
        session: the aiohttp session used for the page request.
        next_url: address of the detail page.
        title: title of the entry, forwarded to download_img().
    """
    session = kwargs['session']
    next_url = kwargs['next_url']
    title = kwargs['title']
    print(next_url)
    print(title)
    async with session.get(
        url=next_url,
        headers=HEADERS,
        proxy=PROXY_STR,
    ) as response:
        text = await response.text()
    resp = Selector(text=text)
    # XPath selects attributes with '@'; the '#' in the original is not valid XPath.
    nodes = resp.xpath('//div[@class="kl2-1"]//img/@src').extract()
    # De-duplicate the image URLs (order is not preserved, as in the original).
    nodes = list(set(nodes))
    for img in nodes:
        # NOTE(review): images are downloaded one at a time; each await finishes
        # before the next download starts.
        await download_img(session=session, url=img, title=title)
        print('next image')
async def download_img(**kwargs):
    """Download one image and write it to disk; failures are logged and skipped.

    Keyword Args:
        url: address of the image.
        title: required key, read as in the original but not otherwise used here.

    Any `session` keyword passed by the caller is ignored: a dedicated session
    with SSL verification disabled is opened for the download.
    """
    import logging

    url = kwargs['url']
    title = kwargs['title']  # kept so a missing 'title' still raises KeyError as before
    try:
        conn = aiohttp.TCPConnector(ssl=False)  # avoid SSL errors
        async with aiohttp.ClientSession(connector=conn, trust_env=True) as session:
            async with session.get(url=url, headers=SIMPLE_HEADERS, proxy=PROXY_STR) as response:
                if 200 <= response.status < 300:
                    # Read the body first so a failed download does not leave an empty file.
                    data = await response.read()
                    # NOTE(review): save_file is not defined in this function; presumably
                    # a module-level path. Confirm against the rest of the file.
                    # `async with` closes the file even if the write raises.
                    async with aiofiles.open(save_file, 'wb') as f:
                        await f.write(data)
    except Exception:
        # Best effort, as before: one failed image must not stop the crawl, but record why.
        logging.getLogger(__name__).warning("failed to download %s", url, exc_info=True)
        return
async def main():
    """Crawl the listing pages one after another, stepping the page offset by 35."""
    total_page = 3640
    page = 0
    while page < total_page:
        await fetch(START_URL.format(page=page))
        # Yield to the event loop once between pages.
        await asyncio.sleep(0)
        print(f'downing page {page}-')
        page += 35


loop = asyncio.get_event_loop()
loop.run_until_complete(main())

How to perform proper error handling with asyncio and aiohttp?

I am looking for guidance around best practices with asyncio and aiohttp in Python 3. I have a basic scraper but I am not sure how to:
Properly implement error handling. More specific around my fetch function.
Do I really need the last main function to wrap my async crawler around?
Here is my code so far, it is working but I would like feedback on the two item above.
urls = []
async def fetch(url, payload=None):
    """GET `url` with `payload` as query parameters and return the raw response body.

    Args:
        url: address to request.
        payload: optional mapping of query parameters. Defaults to None rather
            than a shared mutable `{}`.

    Returns:
        The response body as bytes.
    """
    async with ClientSession() as s:
        async with s.get(url, params=payload) as resp:
            return await resp.read()
async def get_profile_urls(url, payload):
    """Fetch a search page and append the link of every matching element to `urls`.

    Elements of class 'classname' that contain no <a> tag, or whose <a> has no
    href, are skipped instead of raising.

    Args:
        url: address of the search page.
        payload: query parameters forwarded to fetch().
    """
    content = await fetch(url, payload)
    page = BeautifulSoup(content, 'html.parser')
    for element in page.find_all(attrs={'class': 'classname'}):
        anchor = element.find('a')
        if anchor is None:
            continue
        href = anchor.get('href')
        if href is not None:
            urls.append(href)
async def main():
    """Request pages 1 .. max_page + 1 of the search concurrently.

    Each coroutine gets its own payload dict. The original mutated one shared
    dict, and because the coroutines only start running once they are awaited,
    every request saw the final page number.
    """
    tasks = [
        get_profile_urls(search_ulr, {'page': page, 'filter': 88})
        for page in range(1, max_page + 2)
    ]
    # asyncio.wait() no longer accepts bare coroutines (Python 3.11+);
    # gather() schedules them and propagates the first exception.
    await asyncio.gather(*tasks)


if __name__ == "__main__":
    asyncio.run(main())

Asyncio exception handling, possible to not gather exceptions?

I have some code that makes API calls with asyncio and aiohttp. For some URLs a request raises an exception, so I let asyncio.gather return it (with return_exceptions=True) so that it doesn't break the event loop. Is it possible to not gather the returned exceptions, so that only the results that worked are returned? Or do I need to clean up the list manually afterwards?
This is the code:
import asyncio
import aiohttp
import ssl
import datetime as dt
limit = 30
epoch_step = 40000
start_epoch = int(dt.datetime(2018, 7, 1).timestamp())
end_epoch = int(dt.datetime.now().timestamp())

# Build one search URL per `epoch_step`-second window. At least one URL is always
# produced, and start_epoch is advanced until it passes end_epoch.
url_list = []
while True:
    window_end = start_epoch + epoch_step
    url = (
        "https://api.pushshift.io/reddit/search/comment/"
        f"?q=Nestle&size={limit}&after={start_epoch}&before={window_end}"
    )
    url_list.append(url)
    start_epoch = window_end
    if not start_epoch <= end_epoch:
        break
async def fetch(session, url):
    """GET `url` on the given session, passing a fresh ssl.SSLContext(), and return the decoded JSON."""
    request = session.get(url, ssl=ssl.SSLContext())
    async with request as response:
        decoded = await response.json()
    return decoded
async def fetch_all(urls, loop):
    """Fetch every URL concurrently on one session.

    Returns a list in the order of `urls`; because of return_exceptions=True,
    a failed request contributes its exception object instead of a result.
    """
    async with aiohttp.ClientSession(loop=loop) as session:
        pending = [fetch(session, target) for target in urls]
        return await asyncio.gather(*pending, return_exceptions=True)
# Script entry point: run fetch_all() over the prepared URL list and print the
# gathered results (exception objects included).
if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    urls = url_list
    # The same loop object is both passed to fetch_all() and used to drive it.
    htmls = loop.run_until_complete(fetch_all(urls, loop))
    print(htmls)
and it returns a list which looks something like this:
[ContentTypeError("0, message='Attempt to decode JSON with unexpected mimetype: text/html'",), {'data': [{'author':...]

Categories

Resources