Not able to create session with HTTPDigestAuth? - python

I am trying to make GET requests asynchronously. The client authentication system is implemented with DigestAuth, so I first passed HTTPDigestAuth to the session, which fails with the TypeError shown after the code:
async def run(self, url_list, api_auth):
    """Schedule ``self.get_content`` for every URL and log how many results came back.

    All requests share one ``ClientSession`` that is constructed with
    ``HTTPDigestAuth`` credentials, and each call receives the same
    semaphore of size 10. ``api_auth`` is accepted but not read here.

    NOTE(review): this variant is reported to fail with
    "TypeError: BasicAuth() tuple is required instead".
    """
    limiter = asyncio.Semaphore(10)
    async with ClientSession(auth=HTTPDigestAuth("user_name", "password")) as session:
        pending = [
            asyncio.ensure_future(self.get_content(limiter, url, session))
            for url in url_list
        ]
        # Wait for every scheduled request while the session is still open.
        response = await asyncio.gather(*pending)
    logging.critical(f"Total size of response: {len(response)}")
TypeError: BasicAuth() tuple is required instead
When I instead created the authentication tuple with BasicAuth(), the requests were rejected with the 401 shown after the code:
async def run(self, url_list, api_auth):
    """Schedule ``self.get_content`` for every URL and log how many results came back.

    Same flow as the digest-auth attempt, but the shared ``ClientSession``
    is built with ``BasicAuth`` credentials. ``api_auth`` is accepted but
    not read here.

    NOTE(review): this variant is reported to end in "401 - Unauthorized".
    """
    limiter = asyncio.Semaphore(10)
    credentials = BasicAuth("user_name", "password", verify=True)
    async with ClientSession(auth=credentials) as session:
        pending = [
            asyncio.ensure_future(self.get_content(limiter, url, session))
            for url in url_list
        ]
        # Wait for every scheduled request while the session is still open.
        response = await asyncio.gather(*pending)
    logging.critical(f"Total size of response: {len(response)}")
401 - Unauthorized
Is there any proper/work-around approach to solve this?

Related

What's the difference between ensure_future and create_task?

Please explain when I should use asyncio.ensure_future and when asyncio.create_task, and what the difference between them is.
We can write like this:
async def run(r):
    """Request ``r`` numbered localhost URLs concurrently and print the results."""
    url = "http://localhost:8080/{}"
    # A single client session serves every request, so the connection
    # stays alive across all of them.
    async with ClientSession() as session:
        tasks = [
            asyncio.ensure_future(fetch(url.format(index), session))
            for index in range(r)
        ]
        # After this line every response body is available in `responses`.
        responses = await asyncio.gather(*tasks)
        print(responses)
Or like this:
async def get_all(*names: str):
    """Look up every name concurrently via ``get_pokemon`` and print each outcome."""
    # Captured at entry; not read again inside this function.
    started_at = time.time()
    # Turning each coroutine into a task starts all requests right away.
    pending = []
    for name in names:
        pending.append(asyncio.create_task(get_pokemon(name)))
    # Block until every task has produced its result.
    results = await asyncio.gather(*pending)
    for result in results:
        if not result:
            print(f"❌ No data found for...")
            continue
        pokemon = parse_pokemon(result)
        print(f"💁 {pokemon.name} is of type(s) {','.join(pokemon.types)}")

Request within async function is not running concurrently, only starts after everything finished

async def get_html(self, url):
    """GET ``url`` with a fresh client session and return the body as text.

    The request uses the ``headers`` name from the enclosing scope.
    """
    async with aiohttp.ClientSession() as session, session.get(url, headers=headers) as resp:
        body = await resp.text()
        return body
async def getrank(self, url):
    """Fetch the page at ``url``, extract its rank, and check the price for low ranks.

    The rank is the first run of digits in the second-to-last ``<span>``
    inside the ``productDetails_db_sections`` div, with thousands
    separators removed. When the rank is below 20000,
    ``self.getPriceFinal`` is awaited with the URL and the rank.

    Raises:
        AttributeError: if the details div or a digit run is missing
            (``find`` / ``re.search`` return ``None``).
        IndexError: if the details div has fewer than two spans.
    """
    # Pass the actual URL; the original passed the literal string 'url'.
    response = await self.get_html(url)
    # `response` is the page text (a str), so report the URL that was fetched.
    print(f'{url} | got site')
    soup = BeautifulSoup(response, "html.parser")
    details = soup.find("div", {"id": "productDetails_db_sections"})
    spans = details.find_all("span")
    rank_text = str(spans[-2].text).replace(",", "")
    # Raw string so that \d is a regex class rather than a string escape.
    finalRank = int(re.search(r"\d+", rank_text).group())
    if finalRank < 20000:
        print(f'product has low rank, starting new function')
        # Use the variable as spelled above; `finalrank` was undefined.
        await self.getPriceFinal(url, finalRank)
async def getPriceFinal(self, url, rank):
try:
print(f'Checking for Price....') #THIS PRINTS
s = aiohttp.ClientSession()
response = await s.get(f"{url}", headers = self.headers) #THIS WAITS UNTIL getrank finished
print(response.status)
The main problem I have is that getPriceFinal() runs up to the print and then waits for getrank() to finish. What I would like instead is to start getPriceFinal() concurrently, using the URL from getrank(). Any ideas on how to solve this issue?

How to change async crawl program based on aiohttp lib faster?

I am using async and aiohttp to crawl web images, but when it was running, I found it was not crawling as fast as I expected.
Is there any code that I can improve there?
In the for loop I am using many await inside, is that the correct way to deal with that?
async def fetch(url):
    """Load one listing page and process every entry found on it.

    Each ``div.kl1-2`` node is searched for the link and title inside its
    ``div.kl1-2a2`` child, and ``detail`` is awaited for each entry in
    turn, reusing this function's client session.

    Args:
        url: address of the listing page to load.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url,
                               headers=HEADERS,
                               proxy=PROXY_STR,
                               ) as response:
            text = await response.text()
        resp = Selector(text=text)
        # XPath attribute tests are written with '@'; the '#' form is not valid XPath.
        nodes = resp.xpath('//div[@class="kl1-2"]')
        for node in nodes:
            next_url = node.xpath('.//div[@class="kl1-2a2"]/a/@href').extract_first()
            title = node.xpath('.//div[@class="kl1-2a2"]/a/@title').extract_first()
            if not next_url:
                # extract_first() yields None when the entry has no link.
                continue
            await detail(session=session, next_url=next_url, title=title)
        print('next page')
async def detail(**kwargs):
    """Load one detail page and download every distinct image it references.

    Keyword Args:
        session: client session used for the detail-page request.
        next_url: address of the detail page.
        title: title of the entry, forwarded to ``download_img``.

    Raises:
        KeyError: if any of the three keyword arguments is missing.
    """
    session = kwargs['session']
    next_url = kwargs['next_url']
    title = kwargs['title']
    print(next_url)
    print(title)
    async with session.get(
            url=next_url,
            headers=HEADERS,
            proxy=PROXY_STR,
    ) as response:
        text = await response.text()
    resp = Selector(text=text)
    # XPath attribute tests are written with '@'; the '#' form is not valid XPath.
    nodes = resp.xpath('//div[@class="kl2-1"]//img/@src').extract()
    # Drop duplicate URLs while keeping their order on the page.
    nodes = list(dict.fromkeys(nodes))
    for img in nodes:
        await download_img(session=session, url=img, title=title)
        print('next image')
async def download_img(**kwargs):
    """Download one image and write it to ``save_file``; failures are reported, not raised.

    Keyword Args:
        url: address of the image.
        title: title of the entry the image belongs to (not read below).

    Raises:
        KeyError: if ``url`` or ``title`` is missing.

    NOTE(review): ``save_file`` is not assigned in this function. It is
    presumably a module-level path or meant to be derived from ``title``;
    confirm.
    """
    url = kwargs['url']
    title = kwargs['title']
    try:
        conn = aiohttp.TCPConnector(ssl=False)  # avoid SSL errors
        async with aiohttp.ClientSession(connector=conn, trust_env=True) as session:
            async with session.get(url=url, headers=SIMPLE_HEADERS, proxy=PROXY_STR) as response:
                if 200 <= response.status < 300:
                    payload = await response.read()
                    # The context manager closes the file even if the write fails.
                    async with aiofiles.open(save_file, 'wb') as f:
                        await f.write(payload)
    except Exception as e:
        # Still best-effort, but say which download failed instead of hiding it.
        print(f'download failed for {url}: {e!r}')
        return
async def main():
    """Fetch the listing pages one after another, stepping the page offset by 35."""
    total_page = 3640
    page = 0
    while page < total_page:
        await fetch(START_URL.format(page=page))
        # Hand control back to the event loop between pages.
        await asyncio.sleep(0)
        print(f'downing page {page}-')
        page += 35
# asyncio.run() creates a fresh event loop, runs main() to completion and
# closes the loop afterwards; calling asyncio.get_event_loop() with no
# running loop is deprecated on current Python versions.
asyncio.run(main())

Run two concurrent task groups asynchronously with asyncio

I am trying to write a program using asyncio and was oriented towards this blog post. What I am trying to do is fetch some JSON data concurrently. For one input data frame, however, I would like to process the requested data further as soon as it becomes available.
So basically there are two groups of tasks:
process data in df1 concurrently and do some calc once JSON returned
process data in df2 concurrently
They are more or less independent of each other, but I want to run the group of tasks concurrently as well. Once both task groups are finished I want to further process them.
My question is whether my implementation is properly designed in terms of asyncio patterns, given that I just used two gather statements, or whether this is the wrong concept. Here is a sketch:
import asyncio
import aiohttp
from aiohttp import ClientSession
async def fetch_json(url: str, session: ClientSession, data: str) -> Dict:
    """GET ``url`` with ``data`` as the request body and return the decoded JSON.

    Args:
        url: address to request.
        session: client session used for the request; it is not closed here.
        data: request body. Presumably a JSON string produced by
            ``json.dumps``, as the original annotation suggested; confirm.

    Returns:
        The decoded JSON payload of the response.

    Raises:
        Whatever ``raise_for_status`` raises for an error status.
    """
    # The context manager releases the response on every exit path.
    async with session.get(url=url, headers={"content-type": "application/json"}, data=data) as resp:
        resp.raise_for_status()
        logger.info("Got response [%s] for URL: %s", resp.status, url)
        # A distinct local name avoids shadowing the `json` module.
        payload = await resp.json()
    return payload
async def some_calc(url: str, session: ClientSession, data: str):
    """Fetch JSON via ``fetch_json`` and convert every item of the result to ``float``.

    Args:
        url: address to request.
        session: client session forwarded to ``fetch_json``.
        data: request body forwarded to ``fetch_json``. Annotated as ``str``
            because ``json.dumps`` is a function, not a type; presumably
            the value is its output; confirm.

    Returns:
        A list with ``float(x)`` for each ``x`` in the fetched result.
    """
    res = await fetch_json(url=url, session=session, data=data)
    return [float(x) for x in res]
async def process_data(df: Dict, url: str, session: ClientSession, with_calc: bool = False):
    """Run one request per item of ``df`` concurrently and return all results.

    Args:
        df: iterable of request bodies; one request is made per item.
        url: address every request is sent to.
        session: shared client session. It is owned by the caller and is
            deliberately NOT entered or closed here: closing it when this
            group finishes would break any other group still using it.
        with_calc: when true each item goes through ``some_calc``,
            otherwise through ``fetch_json``. This replaces the former
            ``if df1`` test, which read a name that is not in this scope.

    Returns:
        The list produced by ``asyncio.gather``, in the order of ``df``.
    """
    worker = some_calc if with_calc else fetch_json
    # Creating a coroutine object cannot fail with a request error; such
    # errors surface from gather() below, so that is where to handle them.
    tasks = [worker(url=url, session=session, data=data) for data in df]
    return await asyncio.gather(*tasks)


async def bulk_execute(df1, df2):
    """Process both data groups concurrently over one shared client session.

    ``df1`` items are fetched and post-processed with ``some_calc``;
    ``df2`` items are only fetched.

    Returns:
        A two-item list: the results for ``df1`` followed by those for ``df2``.
    """
    url = "http://some.url/"
    # The session is opened and closed here only, after both groups are done.
    async with ClientSession() as session:
        res = await asyncio.gather(
            process_data(df1, url, session, with_calc=True),
            process_data(df2, url, session),
        )
    return res
if __name__ == "__main__":
    # df1 and df2 are not defined in this sketch; they must exist by this point.
    res = asyncio.run(
        bulk_execute(df1, df2)
    )

API calls not running asynchronously

I'm working on a Python client that will asynchronously download vinyl cover art. My problem is that I'm new to Python (especially asynchronous Python) and I don't think my code is running asynchronously. I have another client written in Node.js that is able to get approx. 40 images/sec, whereas this Python one is only managing to get around 1.5/sec.
import aiohttp
import asyncio
from os import path,makedirs
# Base URL of the Cover Art Archive release endpoint; getImageUrls appends
# the release MBID to it.
caa_base_url = "https://coverartarchive.org/release"
# Resolved path of the local 'images' directory; getMBIDs writes the
# downloaded files into per-release subdirectories beneath it.
image_download_dir = path.realpath('images')
# Requested image size: small, large, or None for the maximum.
# NOTE(review): not read by any code visible here; confirm it is still needed.
image_size = None
async def getImageUrls(release_mbid,session):
    """Fetch the cover art listing for one release.

    Returns ``[release_mbid, <decoded JSON>]``, or ``None`` when the
    response status is 403 or 404.
    """
    listing_url = f'{caa_base_url}/{release_mbid}'
    async with session.get(listing_url) as resp:
        if resp.status in (403, 404):
            return
        listing = await resp.json()
        return [release_mbid, listing]
async def getImage(url,session):
    """Download ``url`` and return ``[url, <body bytes>]``.

    The request is repeated for as long as the server disconnects. The
    retry is a loop rather than a recursive call, so repeated disconnects
    no longer deepen the call chain.

    Args:
        url: address of the image.
        session: client session used for the request.
    """
    while True:
        try:
            async with session.get(url) as resp:
                return [url, await resp.read()]
        except aiohttp.ServerDisconnectedError:
            # Same policy as before: try again until the download succeeds.
            continue
async def getMBIDs(mb_page_url):
    """Download the front cover images of every release listed on one result page.

    Steps:
      1. Load ``mb_page_url`` and decode its JSON.
      2. Request the cover art listing of every release concurrently.
      3. For each release that has a listing, download its front images
         concurrently and write them to
         ``<image_download_dir>/<release id>/<last URL segment>``.

    Args:
        mb_page_url: address of the release result page.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(mb_page_url) as resp:
            mb_json = await resp.json()

    async with aiohttp.ClientSession() as caa_session:
        tasks = [
            asyncio.ensure_future(getImageUrls(release["id"], caa_session))
            for release in mb_json["releases"]
        ]
        responses = await asyncio.gather(*tasks)

    async with aiohttp.ClientSession() as caa_image_session:
        for response in responses:
            if response is None:
                # getImageUrls returns None for releases without a listing.
                continue
            release_mbid, result = response
            # Use the session opened for image downloads; the listing
            # session above has already been left at this point.
            caaTasks = [
                asyncio.ensure_future(getImage(image["image"], caa_image_session))
                for image in result["images"]
                if image["front"]
            ]
            image_responses = await asyncio.gather(*caaTasks)
            new_file_dir = path.join(image_download_dir, release_mbid)
            for image_url, image_binary in image_responses:
                makedirs(new_file_dir, exist_ok=True)
                file_name = image_url[image_url.rfind("/")+1:]
                file_path = path.join(new_file_dir, file_name)
                # `with` closes the file; it was previously left open.
                with open(file_path, 'wb') as new_file:
                    new_file.write(image_binary)
# Base URL of the MusicBrainz release endpoint that getMBPages queries.
mb_base_url = "https://musicbrainz.org/ws/2/release"
# Number of result pages getMBPages walks through.
num_pages = 100
# Sent as the `limit` query parameter and used to compute each page's offset.
releases_per_page = 100
# NOTE(review): not referenced by any code visible here; confirm it is still needed.
mb_page_urls = []
async def getMBPages():
    """Process ``num_pages`` result pages one after another via ``getMBIDs``."""
    for page_index in range(num_pages):
        offset = page_index*releases_per_page
        page_url = '%s?query=*&type=album&format=Vinyl&limit=%s&offset=%s&fmt=json' % (mb_base_url,releases_per_page,offset)
        await getMBIDs(page_url)
        # One-second pause between pages (the MusicBrainz API allows 1 request/sec).
        await asyncio.sleep(1)
# asyncio.run() creates a fresh event loop, runs getMBPages() to completion
# and closes the loop afterwards; calling asyncio.get_event_loop() with no
# running loop is deprecated on current Python versions.
asyncio.run(getMBPages())
P.S. The sleep is because musicbrainz api limits to 1 request/sec

Categories

Resources