How can I use the aiter method in aiomultiprocess?
I have a small code to download urls. They are directly available in the "get(url)"-function, but I need/want them in the "main()"-function.
Here a small code example:
import aiomultiprocess
from bs4 import BeautifulSoup
from requests_html import AsyncHTMLSession


async def get(url):
    """Download *url* and return its body parsed into a BeautifulSoup tree."""
    asession = AsyncHTMLSession()
    response = await asession.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    return soup


async def main():
    """Run ``get`` over ``urls`` on a pool of 4 processes, 8 coroutines each.

    NOTE(review): ``urls`` is not defined in this snippet; it is presumably a
    module-level list of URL strings.
    """
    async with aiomultiprocess.Pool(processes=4, childconcurrency=8) as pool:
        await pool.map(get, urls)  #--> What would be here the code to use them as they are ready?
I would be thankful for an example to use the aiter-method on the main()-function.
Thank you.
Related
Is there anyway I can make all this requests at the same time? I'm trying to reduce the time...
def pokemons():
    """Request Pokémon 1 through 799 from the PokéAPI, one after another.

    Each body is decoded as JSON; a body that cannot be decoded is reported
    by printing ``ERRO TIPO``.  The decoded data is not kept or returned.
    """
    for number in range(1, 800):
        resposta = requests.get(f"https://pokeapi.co/api/v2/pokemon/{number}")
        try:
            resposta.json()
        except ValueError:
            print("ERRO TIPO")
What you want is AIOHTTP Asynchronous HTTP Client/Server.
import asyncio
import aiohttp
import ssl
async def fetch(session, pokemon_num):
    """Return the decoded JSON document for one Pokémon.

    Args:
        session: Open client session used to issue the GET request.
        pokemon_num: Numeric id appended to the PokéAPI URL.

    Returns:
        Whatever ``response.json()`` produces for the response body.
    """
    url = f"https://pokeapi.co/api/v2/pokemon/{pokemon_num}"
    # No ``ssl=`` override: a bare ``ssl.SSLContext()`` performs no
    # certificate or hostname verification, and building one per request is
    # wasted work.  The session's default TLS handling is used instead.
    async with session.get(url) as response:
        return await response.json()
async def fetch_all(loop):
    """Fetch Pokémon 1 through 799 concurrently over one shared session.

    Args:
        loop: Kept for backward compatibility and otherwise unused; passing
            a loop to ``ClientSession`` is deprecated.

    Returns:
        A list with one entry per id.  Because ``return_exceptions=True``, a
        failed request contributes its exception object instead of raising.
    """
    async with aiohttp.ClientSession() as session:
        # Start at 1 to match the ids the sequential version requests;
        # ``range(800)`` also asked for id 0.
        pending = [fetch(session, pokemon_n) for pokemon_n in range(1, 800)]
        results = await asyncio.gather(*pending, return_exceptions=True)
        return results
if __name__ == '__main__':
    # Drive fetch_all() to completion on the default event loop, then show
    # everything it collected.
    loop = asyncio.get_event_loop()
    result = loop.run_until_complete(
        fetch_all(loop)
    )
    print(result)
I ran this code and got all the requests in a total of 19.5 seconds. I hope it's good for your case.
The snippet above comes from another answer from YuriiKramarenko, if it suits you, you can give him a thumbs up. I adjusted it for your specific parameters.
Good luck catching them all!
I'm making a python web scraper script. I should do this using asyncio. So for Async HTTP request I use AioHTTP.
That works, but when I try to make the app non-blocking (using await), beautifulsoup4 blocks the application (because beautifulsoup4 doesn't support async).
This is what I have tried.
import asyncio, aiohttp
from bs4 import BeautifulSoup
async def extractLinks(html):
    """Return the elements matching ``.c-pro-box__title a`` in *html*.

    The body contains no ``await``, so the parsing runs synchronously inside
    the coroutine.
    """
    return BeautifulSoup(html, 'html.parser').select(".c-pro-box__title a")
async def getHtml(session, url):
    """GET *url* through *session* and return the response body as text."""
    async with session.get(url) as response:
        body = await response.text()
    return body
async def loadPage(url):
    """Download *url* in a fresh client session and return its extracted links."""
    async with aiohttp.ClientSession() as session:
        page = await getHtml(session, url)
        return await extractLinks(page)
# Run the scraper to completion on the default event loop.
# NOTE(review): loadPage is declared with a required `url` parameter but is
# called here with no argument — confirm which URL was intended.
loop = asyncio.get_event_loop()
loop.run_until_complete(loadPage())
The extractLinks() will block program flow.
So is it possible to make it non-blocking? Or is there a library other than beautifulsoup4 that supports async?
I am trying to scrape some data from https://www.officialcharts.com/ by parallelising web requests using asyncio/aiohttp. I implemented the code given at the link here.
I followed two different procedures. The first one goes like this.
from bs4 import BeautifulSoup
from urllib.request import urlopen
from selenium import webdriver
import time
import pandas as pd
import numpy as np
import re
import json
import requests
from bs4 import BeautifulSoup
from datetime import date, timedelta
from IPython.display import clear_output
import memory_profiler
import spotipy
import spotipy.util as util
import pandas as pd
from more_itertools import unique_everseen
# Chart dates as YYYYMMDD strings: 2500 consecutive Sundays, starting with
# the first Sunday on or after 1970-01-01.
d = date(1970, 1, 1)
d += timedelta(days=6 - d.weekday())
weeks = [(d + timedelta(weeks=n)).strftime('%Y%m%d') for n in range(2500)]
# Leave `d` one week past the last generated date, as the loop form did.
d += timedelta(weeks=2500)
import asyncio
from aiohttp import ClientSession
import nest_asyncio
nest_asyncio.apply()
# Collects one list of response bodies per run() call.
result = []
async def fetch(url, session):
    """GET *url* through *session* and return the raw response body."""
    async with session.get(url) as response:
        payload = await response.read()
    return payload
async def run(r):
    """Fetch the first *r* weekly chart pages concurrently.

    The gathered bodies are appended, as one list, to the module-level
    ``result``; nothing is returned.
    """
    prefix = 'https://www.officialcharts.com/charts/singles-chart/'
    # A single client session serves every request so connections are
    # kept alive between them.
    async with ClientSession() as session:
        tasks = [
            asyncio.ensure_future(fetch(prefix + weeks[i] + '/', session))
            for i in range(r)
        ]
        responses = await asyncio.gather(*tasks)
        result.append(responses)
# Fetch the first five weeks on the default event loop.
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(5))
loop.run_until_complete(future)
print('Done')
# Identity comparison is the correct test for None.
print(result[0][0] is None)
The problem with the above code is that it fails when I make more than 1000 simultaneous requests.
The author of the post implemented a different procedure to address this issue and he claims we can do as many as 10K requests. I followed along his second procedure and here is my code for that.
import random
import asyncio
from aiohttp import ClientSession
import nest_asyncio
nest_asyncio.apply()
# Collects one list of gathered task results per run() call.
result = []
async def fetch(url, session):
    """GET *url*, print a trace line from the response headers, return the body.

    The trace line shows the ``DATE`` header, the response URL and the
    ``DELAY`` header (``None`` when a header is absent).
    """
    async with session.get(url) as response:
        headers = response.headers
        delay, date = headers.get("DELAY"), headers.get("DATE")
        print("{}:{} with delay {}".format(date, response.url, delay))
        return await response.read()
async def bound_fetch(sem, url, session):
    """Run ``fetch(url, session)`` while holding *sem*.

    The value produced by ``fetch`` is awaited but not returned, so this
    coroutine always evaluates to ``None``.
    """
    await sem.acquire()
    try:
        await fetch(url, session)
    finally:
        sem.release()
async def run(r):
    """Fetch the first *r* weekly chart pages through ``bound_fetch``.

    The gathered task results are appended, as one list, to the
    module-level ``result``; nothing is returned.
    """
    # One semaphore instance is handed to every bound_fetch() call.
    sem = asyncio.Semaphore(1000)
    prefix = 'https://www.officialcharts.com/charts/singles-chart/'
    # A single client session, so a new connection is not opened per request.
    async with ClientSession() as session:
        tasks = [
            asyncio.ensure_future(bound_fetch(sem, prefix + weeks[i] + '/', session))
            for i in range(r)
        ]
        responses = await asyncio.gather(*tasks)
        result.append(responses)
# Fetch the first `number` weeks on the default event loop.
number = 5
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(number))
loop.run_until_complete(future)
print('Done')
# Identity comparison is the correct test for None.
print(result[0][0] is None)
For some reason, this doesn't return any responses.
PS:I am not from CS background and just program for fun. I have no clue what's going on inside the asyncio code.
Try to use the latest version.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from aiohttp import ClientSession, client_exceptions
from asyncio import Semaphore, ensure_future, gather, run
from json import dumps, loads
# Size of the semaphore created in scrape().
limit = 10
# Status codes scrape_one() accepts; any other status makes it return False.
http_ok = [200]
async def scrape(url_list):
    """Scrape every URL in *url_list* concurrently and return the gathered results.

    All requests share one client session and one semaphore of size ``limit``.
    """
    sem = Semaphore(limit)
    async with ClientSession() as session:
        tasks = [
            ensure_future(scrape_bounded(url, sem, session))
            for url in url_list
        ]
        result = await gather(*tasks)
    return result
async def scrape_bounded(url, sem, session):
    """Return ``scrape_one(url, session)``, run while holding *sem*."""
    async with sem:
        outcome = await scrape_one(url, session)
    return outcome
async def scrape_one(url, session):
    """Download *url* and return its body decoded as UTF-8 JSON.

    Args:
        url: Address to request.
        session: Open client session used to issue the GET request.

    Returns:
        The decoded JSON value, or ``False`` when the connection fails or
        the response status is not listed in ``http_ok``.
    """
    try:
        async with session.get(url) as response:
            content = await response.read()
    except client_exceptions.ClientConnectorError:
        # print() does not interpolate %-placeholders, so build the text here.
        print(f'Scraping {url} failed due to the connection problem')
        return False
    if response.status not in http_ok:
        print(f'Scraping {url} failed due to the return code {response.status}')
        return False
    return loads(content.decode('UTF-8'))
if __name__ == '__main__':
    # Scrape the two demo endpoints and pretty-print what came back.
    urls = [
        'http://demin.co/echo1/',
        'http://demin.co/echo2/',
    ]
    res = run(scrape(urls))
    print(dumps(res, indent=4))
This is a template of a real project that works as predicted.
You can find this source code here
Recently, I'm looking at the python aiohttp lib, play around it, compare with python requests. Here is the code:
import aiohttp
import asyncio
import requests
# Page under comparison, fetched once synchronously with requests.
request_url = 'http://www.baidu.com'
requests_resp = requests.get(request_url)
async def fetch(session, url):
    """GET *url* through *session* and return the response body as text."""
    async with session.get(url) as response:
        body = await response.text()
    return body
async def main():
    """Fetch ``request_url`` with aiohttp and print the length of the body."""
    async with aiohttp.ClientSession() as session:
        aio_resp = await fetch(session, request_url)
        size = len(aio_resp)
        print('aio_resp_length =', size)
# Run the aiohttp fetch first, then report the body length seen by requests.
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
print('requests_resp_length = ', len(requests_resp.text))
The response lengths differ hugely:
aio_resp_length = 152576
requests_resp_length = 2381
I'm not sure what happens in aiohttp's session.get, but the result is not always like this. When you change request_url to http://www.example.com, the
response lengths are the same. Can someone tell me what happened here?
Cheers
Because aiohttp has newlines in its response and requests doesn't.
You can check their responses like this:
# Print the first 100 characters of each body for a side-by-side look
# (the labels still read "length", but a slice of the text is printed).
print('requests_resp_length = ', requests_resp.text[0:100])
print('aio_resp_length =', aio_resp[0:100])
I've written a script in Python, using asyncio together with the aiohttp library, to asynchronously parse the names out of the pop-up boxes that open when clicking the contact info buttons of the different agencies listed in a table on this website. The webpage displays the tabular contents across 513 pages.
I encountered this error too many file descriptors in select() when I tried with asyncio.get_event_loop() but when I came across this thread I could see that there is a suggestion to use asyncio.ProactorEventLoop() to avoid such error so I used the latter but noticed that, even when I complied with the suggestion, the script collects the names only from few pages until it throws the following error. How can i fix this?
raise client_error(req.connection_key, exc) from exc
aiohttp.client_exceptions.ClientConnectorError: Cannot connect to host www.tursab.org.tr:443 ssl:None [The semaphore timeout period has expired]
This is my try so far with:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
# Search-result listing URLs, pages 1 through 513.
links = ["https://www.tursab.org.tr/en/travel-agencies/search-travel-agency?sayfa={}".format(page) for page in range(1,514)]
# Detail-page URL template; process_docs() fills in each row's data-id.
lead_link = "https://www.tursab.org.tr/en/displayAcenta?AID={}"
async def get_links(url):
    """Download one listing page and hand its HTML to ``process_docs``.

    A new ``asyncio.Semaphore(10)`` is constructed on every call, so calls
    do not share a semaphore.  Returns whatever ``process_docs`` returns.
    """
    async with asyncio.Semaphore(10):
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                page_html = await response.text()
                return await process_docs(page_html)
async def process_docs(html):
    """Fetch the detail page of every agency row found in *html*.

    Rows are selected with ``#acentaTbl tr[data-id]``; each ``data-id`` is
    substituted into ``lead_link`` and passed to ``fetch_again``.  The
    gathered results are discarded, so nothing is returned.
    """
    soup = BeautifulSoup(html,"lxml")
    rows = soup.select("#acentaTbl tr[data-id]")
    coros = [fetch_again(lead_link.format(row.get("data-id"))) for row in rows]
    await asyncio.gather(*coros)
async def fetch_again(link):
    """Download *link* and print the text of its first ``p > b`` element.

    An empty string is printed when that lookup raises.  A new
    ``asyncio.Semaphore(10)`` is constructed on every call.
    """
    async with asyncio.Semaphore(10):
        async with aiohttp.ClientSession() as session:
            async with session.get(link) as response:
                markup = await response.text()
                sauce = BeautifulSoup(markup,"lxml")
                try:
                    name = sauce.select_one("p > b").text
                except Exception:
                    name = ""
                print(name)
if __name__ == '__main__':
    # Install a ProactorEventLoop as the current loop, then crawl every
    # listing page concurrently.
    loop = asyncio.ProactorEventLoop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(
        asyncio.gather(*[get_links(link) for link in links])
    )
In short, What the process_docs() function does is collect data-id numbers from each pages to reuse them as the prefix of this https://www.tursab.org.tr/en/displayAcenta?AID={} link to collect the names from pop up boxes. One such id is 8757 and one such qualified links therefore https://www.tursab.org.tr/en/displayAcenta?AID=8757.
Btw, If I change the highest number used in the links variable to 20 or 30 or so, It goes smoothly.
async def get_links(url):
async with asyncio.Semaphore(10):
You can't do that: a new semaphore instance is created on every function call, whereas you need a single semaphore instance shared by all requests. Change your code this way:
sem = asyncio.Semaphore(10) # module level
async def get_links(url):
async with sem:
# ...
async def fetch_again(link):
async with sem:
# ...
You can also return default loop once you're using semaphore correctly:
if __name__ == '__main__':
loop = asyncio.get_event_loop()
loop.run_until_complete(...)
Finally, you should alter both get_links(url) and fetch_again(link) to do parsing outside of semaphore to release it as soon as possible, before semaphore is needed inside process_docs(text).
Final code:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
# Search-result listing URLs, pages 1 through 513.
links = ["https://www.tursab.org.tr/en/travel-agencies/search-travel-agency?sayfa={}".format(page) for page in range(1,514)]
# Detail-page URL template; process_docs() fills in each row's data-id.
lead_link = "https://www.tursab.org.tr/en/displayAcenta?AID={}"
# Single semaphore shared by get_links() and fetch_again().
sem = asyncio.Semaphore(10)
async def get_links(url):
    """Download one listing page, then process it once the semaphore is free.

    Only the download runs under the shared ``sem``.  ``process_docs`` is
    awaited after the semaphore is released, because the ``fetch_again``
    calls it makes need to acquire that same semaphore.
    """
    async with sem:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                page_html = await response.text()
    return await process_docs(page_html)
async def process_docs(html):
    """Fetch the detail page of every agency row found in *html*.

    Rows are selected with ``#acentaTbl tr[data-id]``; each ``data-id`` is
    substituted into ``lead_link`` and passed to ``fetch_again``.  The
    gathered results are discarded, so nothing is returned.
    """
    soup = BeautifulSoup(html,"lxml")
    rows = soup.select("#acentaTbl tr[data-id]")
    coros = [fetch_again(lead_link.format(row.get("data-id"))) for row in rows]
    await asyncio.gather(*coros)
async def fetch_again(link):
    """Download *link* and print the text of its first ``p > b`` element.

    Only the download runs under the shared ``sem``; parsing happens after
    the semaphore is released.  ``"o"`` is printed when the lookup raises.
    """
    async with sem:
        async with aiohttp.ClientSession() as session:
            async with session.get(link) as response:
                markup = await response.text()
    sauce = BeautifulSoup(markup,"lxml")
    try:
        name = sauce.select_one("p > b").text
    except Exception:
        name = "o"
    print(name)
if __name__ == '__main__':
    # Crawl every listing page concurrently on the default event loop.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(
        asyncio.gather(*[get_links(link) for link in links])
    )