I am trying to make GET requests asynchronously, and the client's authentication system is implemented with Digest Auth.
async def run(self, url_list, api_auth):
    sem = asyncio.Semaphore(10)
    tasks = []
    async with ClientSession(auth=HTTPDigestAuth("user_name", "password")) as session:
        for url in url_list:
            task = asyncio.ensure_future(self.get_content(sem, url, session))
            tasks.append(task)
        response = await asyncio.gather(*tasks)
        logging.critical(f"Total size of response: {len(response)}")
This raises:
TypeError: BasicAuth() tuple is required instead
When I create the authentication with BasicAuth() instead:
async def run(self, url_list, api_auth):
    sem = asyncio.Semaphore(10)
    tasks = []
    async with ClientSession(auth=BasicAuth("user_name", "password", verify=True)) as session:
        for url in url_list:
            task = asyncio.ensure_future(self.get_content(sem, url, session))
            tasks.append(task)
        response = await asyncio.gather(*tasks)
        logging.critical(f"Total size of response: {len(response)}")
the server responds with 401 - Unauthorized.
Is there a proper approach or a workaround to solve this?
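aiohttp does not ship a digest-auth helper (ClientSession only accepts aiohttp.BasicAuth), so one workaround, sketched below under the assumption that switching the HTTP client is acceptable, is to use httpx, whose AsyncClient supports httpx.DigestAuth. The get_content logic is folded into a local coroutine here for brevity:

import asyncio
import logging

import httpx

async def run(url_list):
    sem = asyncio.Semaphore(10)
    auth = httpx.DigestAuth("user_name", "password")

    async with httpx.AsyncClient(auth=auth) as client:
        async def get_content(url):
            async with sem:                      # cap concurrency at 10 requests
                resp = await client.get(url)
                resp.raise_for_status()
                return resp.text

        responses = await asyncio.gather(*(get_content(u) for u in url_list))
        logging.critical(f"Total size of response: {len(responses)}")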
Related
Please explain when I should use asyncio.ensure_future vs. asyncio.create_task, and what the difference between them is.
We can write it like this:
async def run(r):
    url = "http://localhost:8080/{}"
    tasks = []
    # Fetch all responses within one client session,
    # keeping the connection alive for all requests.
    async with ClientSession() as session:
        for i in range(r):
            task = asyncio.ensure_future(fetch(url.format(i), session))
            tasks.append(task)
        responses = await asyncio.gather(*tasks)
        # you now have all response bodies in this variable
        print(responses)
Or like this:
async def get_all(*names: str):
    started_at = time.time()
    # Create tasks, so we start requesting all of them concurrently
    tasks = [asyncio.create_task(get_pokemon(name)) for name in names]
    # Await ALL of them
    results = await asyncio.gather(*tasks)
    for result in results:
        if result:
            pokemon = parse_pokemon(result)
            print(f"💁 {pokemon.name} is of type(s) {','.join(pokemon.types)}")
        else:
            print(f"❌ No data found for...")
async def get_html(self, url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            return await resp.text()

async def getrank(self, url):
    response = await self.get_html(url)
    print(f'{url} | got site')
    soup = BeautifulSoup(response, "html.parser")
    rank = soup.find("div", {"id": "productDetails_db_sections"})
    test2 = rank.find_all("span")
    rank = str(test2[-2].text).replace(",", "")
    finalRank = int(re.search(r"\d+", rank).group())
    if finalRank < 20000:
        print(f'product has low rank, starting new function')
        await self.getPriceFinal(url, finalRank)

async def getPriceFinal(self, url, rank):
    try:
        print(f'Checking for Price....')  # THIS PRINTS
        s = aiohttp.ClientSession()
        response = await s.get(f"{url}", headers=self.headers)  # THIS WAITS UNTIL getrank FINISHED
        print(response.status)
The main problem I have is that getPriceFinal() runs up to the print and then waits for the getrank() function to finish. What I would like instead is to start getPriceFinal() with the url from getrank() concurrently. Any ideas on how to solve this?
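One way to stop getrank() from blocking on the price check is to schedule getPriceFinal() with asyncio.create_task() instead of awaiting it inline, keeping a reference so the task can be awaited later. A minimal self-contained sketch of that pattern (the Scraper class, the sleep, and the hard-coded rank are stand-ins, not the real scraping code):

import asyncio

class Scraper:
    def __init__(self):
        self.price_tasks = []              # keep references so tasks are not lost

    async def getPriceFinal(self, url, rank):
        print(f'Checking for Price.... {url} (rank {rank})')
        await asyncio.sleep(1)             # stands in for the real HTTP request

    async def getrank(self, url):
        finalRank = 10000                  # stands in for the parsed rank
        if finalRank < 20000:
            # Schedule the price check and return immediately instead of awaiting it
            self.price_tasks.append(
                asyncio.create_task(self.getPriceFinal(url, finalRank))
            )

    async def run(self, urls):
        await asyncio.gather(*(self.getrank(u) for u in urls))
        await asyncio.gather(*self.price_tasks)   # wait for the price checks at the end

asyncio.run(Scraper().run(["https://example.com/item1", "https://example.com/item2"]))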
I am using async and aiohttp to crawl web images, but when it runs I find it is not crawling as fast as I expected.
Is there any code I can improve here?
In the for loop I am using many awaits; is that the correct way to handle this?
import asyncio

import aiofiles
import aiohttp
from parsel import Selector  # Selector assumed to come from parsel (or scrapy)

async def fetch(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url,
                               headers=HEADERS,
                               proxy=PROXY_STR,
                               ) as response:
            text = await response.text()
            resp = Selector(text=text)
            nodes = resp.xpath('//div[@class="kl1-2"]')
            for node in nodes:
                next_url = node.xpath('.//div[@class="kl1-2a2"]/a/@href').extract_first()
                title = node.xpath('.//div[@class="kl1-2a2"]/a/@title').extract_first()
                await detail(session=session, next_url=next_url, title=title)
                print('next page')

async def detail(**kwargs):
    session = kwargs['session']
    next_url = kwargs['next_url']
    title = kwargs['title']
    print(next_url)
    print(title)
    async with session.get(
            url=next_url,
            headers=HEADERS,
            proxy=PROXY_STR,
    ) as response:
        text = await response.text()
        resp = Selector(text=text)
        nodes = resp.xpath('//div[@class="kl2-1"]//img/@src').extract()
        nodes = list(set(nodes))
        for img in nodes:
            await download_img(session=session, url=img, title=title)
            print('next image')

async def download_img(**kwargs):
    url = kwargs['url']
    title = kwargs['title']
    try:
        conn = aiohttp.TCPConnector(ssl=False)  # prevent SSL errors
        async with aiohttp.ClientSession(connector=conn, trust_env=True) as session:
            async with session.get(url=url, headers=SIMPLE_HEADERS, proxy=PROXY_STR) as response:
                if response.status >= 200 and response.status < 300:
                    f = await aiofiles.open(save_file, 'wb')
                    await f.write(await response.read())
                    await f.close()
    except Exception as e:
        return

async def main():
    total_page = 3640
    for page in range(0, total_page, 35):
        url = START_URL.format(page=page)
        await fetch(url)
        await asyncio.sleep(0)
        print(f'downloading page {page}-')

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
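The page loop in main() awaits one page at a time, and inside fetch() and detail() every sub-request is awaited sequentially, so very little actually overlaps. A minimal sketch of a faster main(), assuming the fetch() coroutine and START_URL above, that runs the pages concurrently under a semaphore (the same gather-instead-of-await idea also applies to the detail() and download_img() calls):

import asyncio

async def main():
    sem = asyncio.Semaphore(10)                  # cap concurrent page fetches

    async def bounded_fetch(url):
        async with sem:
            await fetch(url)                     # fetch() as defined above

    total_page = 3640
    tasks = [asyncio.create_task(bounded_fetch(START_URL.format(page=page)))
             for page in range(0, total_page, 35)]
    await asyncio.gather(*tasks)

asyncio.run(main())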
I am trying to write a program using asyncio and was oriented towards this blog post. What I am trying to do is fetch some JSON data concurrently for one input data frame; however, I would like to process the requested data further as soon as it becomes available.
So basically there are two groups of tasks:
process the data in df1 concurrently and do some calculations once the JSON is returned
process the data in df2 concurrently
They are more or less independent of each other, but I want to run the two groups of tasks concurrently as well. Once both task groups are finished, I want to process them further.
My question is whether my implementation is properly designed in terms of asyncio patterns, where I just used two gather statements, or whether this is the wrong concept. Here is a sketch:
import asyncio
import json
import logging
from typing import Dict

import aiohttp
from aiohttp import ClientSession

logger = logging.getLogger(__name__)

async def fetch_json(url: str, session: ClientSession, data: json.dumps) -> Dict:
    resp = await session.get(url=url, headers={"content-type": "application/json"}, data=data)
    resp.raise_for_status()
    logger.info("Got response [%s] for URL: %s", resp.status, url)
    json_body = await resp.json()
    return json_body

async def some_calc(url: str, session: ClientSession, data: json.dumps):
    res = await fetch_json(url=url, session=session, data=data)
    return [float(x) for x in res]

async def process_data(df: Dict, url: str, session: ClientSession):
    async with session:
        tasks = []
        for data in df:
            try:
                if df1:
                    task = some_calc(url=url, session=session, data=data)
                else:
                    task = fetch_json(url=url, session=session, data=data)
            except Exception as e:
                # ...
                pass
            tasks.append(task)
        res = await asyncio.gather(*tasks)
        return res

async def bulk_execute(df1, df2):
    url = "http://some.url/"
    async with ClientSession() as session:
        res = await asyncio.gather(process_data(df1, url, session), process_data(df2, url, session))
        return res

if __name__ == "__main__":
    res = asyncio.run(bulk_execute(df1, df2))
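Two gather calls in bulk_execute are a reasonable pattern for running the groups concurrently. One thing to watch: process_data re-enters async with session:, which closes the shared session as soon as the first group finishes. And if the goal is to post-process each response as soon as it arrives rather than after the whole group, asyncio.as_completed is an alternative to a single gather; a minimal sketch, assuming the fetch_json coroutine above:

async def process_data(df, url: str, session: ClientSession):
    # The caller (bulk_execute) owns the session; do not close it here.
    tasks = [asyncio.create_task(fetch_json(url=url, session=session, data=data))
             for data in df]
    results = []
    # as_completed yields each request as soon as it finishes, so per-item
    # post-processing can start without waiting for the whole group.
    for fut in asyncio.as_completed(tasks):
        res = await fut
        results.append([float(x) for x in res])   # the "some_calc" step from above
    return results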
I'm working on a Python client that will asynchronously download vinyl cover art. My problem is that I'm new to Python (especially asynchronous Python) and I don't think my code is running asynchronously. I have another client written in Node.js that is able to get approx. 40 images/sec, whereas this Python one is only managing around 1.5/sec.
import aiohttp
import asyncio
from os import path, makedirs

caa_base_url = "https://coverartarchive.org/release"
image_download_dir = path.realpath('images')
# small, large, None = Max
image_size = None

async def getImageUrls(release_mbid, session):
    async with session.get(f'{caa_base_url}/{release_mbid}') as resp:
        if resp.status == 404 or resp.status == 403:
            return
        return [release_mbid, await resp.json()]

async def getImage(url, session):
    try:
        async with session.get(url) as resp:
            return [url, await resp.read()]
    except aiohttp.ServerDisconnectedError:
        return await getImage(url, session)

async def getMBIDs(mb_page_url):
    async with aiohttp.ClientSession() as session:
        async with session.get(mb_page_url) as resp:
            mb_json = await resp.json()
    tasks = []
    async with aiohttp.ClientSession() as caa_session:
        for release in mb_json["releases"]:
            task = asyncio.ensure_future(getImageUrls(release["id"], caa_session))
            tasks.append(task)
        responses = await asyncio.gather(*tasks)
        async with aiohttp.ClientSession() as caa_image_session:
            for response in responses:
                if response is not None:
                    caaTasks = []
                    release_mbid = response[0]
                    result = response[1]
                    for image in result["images"]:
                        if image["front"] == True:
                            caaTask = asyncio.ensure_future(getImage(image["image"], caa_session))
                            caaTasks.append(caaTask)
                    image_responses = await asyncio.gather(*caaTasks)
                    for image_response in image_responses:
                        image_url = image_response[0]
                        image_binary = image_response[1]
                        new_file_dir = path.join(image_download_dir, release_mbid)
                        if not path.isdir(new_file_dir):
                            makedirs(new_file_dir)
                        file_name = image_url[image_url.rfind("/")+1:]
                        file_path = path.join(new_file_dir, file_name)
                        new_file = open(file_path, 'wb')
                        new_file.write(image_binary)

mb_base_url = "https://musicbrainz.org/ws/2/release"
num_pages = 100
releases_per_page = 100
mb_page_urls = []

async def getMBPages():
    for page_index in range(num_pages):
        await getMBIDs('%s?query=*&type=album&format=Vinyl&limit=%s&offset=%s&fmt=json' % (mb_base_url, releases_per_page, page_index*releases_per_page))
        await asyncio.sleep(1)

loop = asyncio.get_event_loop()
loop.run_until_complete(getMBPages())
P.S. The sleep is there because the MusicBrainz API limits clients to 1 request/sec.
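One reason throughput stays low is that the cover-art downloads are gathered one release at a time (a new caaTasks list and gather per response), so each release's images must finish before the next release starts. A sketch of an alternative, assuming the getImage coroutine and imports above (save_image is a hypothetical helper standing in for the file-writing code):

async def download_all_front_images(responses, caa_session):
    tasks = []
    for response in responses:
        if response is None:
            continue
        release_mbid, result = response
        for image in result["images"]:
            if image["front"]:
                tasks.append((release_mbid,
                              asyncio.create_task(getImage(image["image"], caa_session))))
    # One gather for every image across all releases, instead of one gather per release
    results = await asyncio.gather(*(task for _, task in tasks))
    for (release_mbid, _), (image_url, image_binary) in zip(tasks, results):
        save_image(release_mbid, image_url, image_binary)   # hypothetical helper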