How to change async crawl program based on aiohttp lib faster? - python

I am using aysnc and aiohttp to crawl web image, but when it was running, I found it was not crawling as fast as I expected.
Is there any code that I can improve there?
In the for loop I am using many await inside, is that the correct way to deal with that?
async def fetch(url):
async with aiohttp.ClientSession() as session:
async with session.get(url=url,
headers=HEADERS,
proxy=PROXY_STR,
) as response:
text = await response.text()
resp = Selector(text=text)
nodes = resp.xpath('//div[#class="kl1-2"]')
for node in nodes:
next_url = node.xpath('.//div[#class="kl1-2a2"]/a/#href').extract_first()
title = node.xpath('.//div[#class="kl1-2a2"]/a/#title').extract_first()
await detail(session=session, next_url=next_url, title=title)
print('next page')
async def detail(**kwargs):
session = kwargs['session']
next_url = kwargs['next_url']
title = kwargs['title']
print(next_url)
print(title)
async with session.get(
url=next_url,
headers=HEADERS,
proxy=PROXY_STR,
) as response:
text = await response.text()
resp = Selector(text=text)
nodes = resp.xpath('//div[#class="kl2-1"]//img/#src').extract()
nodes = list(set(nodes))
for img in nodes:
await download_img(session=session,url=img,title=title)
print('next image')
async def download_img(**kwargs):
url= kwargs['url']
title= kwargs['title']
try:
conn = aiohttp.TCPConnector(ssl=False) # 防止ssl报错
async with aiohttp.ClientSession(connector=conn, trust_env=True) as session:
async with session.get(url=url, headers=SIMPLE_HEADERS, proxy=PROXY_STR) as response:
if response.status>=200 and response.status<300:
f=await aiofiles.open(save_file,'wb')
await f.write(await response.read())
await f.close()
except Exception as e:
return
async def main():
total_page = 3640
for page in range(0,total_page,35):
url = START_URL.format(page=page)
await fetch(url)
await asyncio.sleep(0)
print(f'downing page {page}-')
loop = asyncio.get_event_loop()
loop.run_until_complete(main())

Related

Request within async function is not running concurrently, only starts after everything finished

async def get_html(self, url):
async with aiohttp.ClientSession() as session:
async with session.get(url, headers=headers) as resp:
return await resp.text()
async def getrank(self, url):
response = await self.get_html(f'url')
print(f'{response.site} | got site')
soup = BeautifulSoup(response, "html.parser")
rank = soup.find("div", {"id": "productDetails_db_sections"})
test2 = rank.find_all("span")
rank = str(test2[-2].text).replace(",","")
finalRank = int(re.search("\d+", rank).group())
if finalRank < 20000:
print(f'product has low rank, starting new function')
await self.getPriceFinal(url, finalrank)
async def getPriceFinal(self, url, rank):
try:
print(f'Checking for Price....') #THIS PRINTS
s = aiohttp.ClientSession()
response = await s.get(f"{url}", headers = self.headers) #THIS WAITS UNTIL getrank finished
print(response.status)
The main problem I have is that the function getPriceFinal() runs to the print and after that waits for the getrank() function to finish ... however what I would like to do is to start the getPriceFinal() function with the url from getrank() function concurrently .. and ideas on how to solve this issue?

Getting duplicate Network Response Playwright Python

I have a working script but when I changed the page.on in playwright it actually runs the network response a certain number of times as per the loop count. I have been trying to figure out why that happens.
For example at i=0 it gives one response.url print but at i=10 it prints response.url 10 times and then send 10 duplicate data to mongodb. I have no idea why this is happening. The link being sent based on the print are all the same.
Would be a great help if anyone can let me know what it is that I am doing wrong that is causing this issue.
Pls see sample code here.
#imports here
today = datetime.today().strftime("%m%d%Y")
filenamearr = []
mongousername = 'XXX'
mongopassword = 'XXXX'
client = MongoClient("mongodb+srv://%s:%s#XXXXX.XXXX.mongodb.net/?retryWrites=true&w=majority"%(mongousername,mongopassword))
db = client.DB1
logg = []
async def runbrowser(playwright,url):
async def handle_response(response,buttonnumber):
l = str(response.url)
para = 'param'
if para in l:
print(response.url)
textdata = await response.text()
subtask = asyncio.create_task(jsonparse(textdata))
done, pending = await asyncio.wait({subtask})
if subtask in done:
print("Success in Json parser")
result = await subtask
status = [buttonnumber,result]
logg.append(status)
print(status)
logdf = pd.DataFrame(logg)
logdf.columns = ['BUTTON','RESULT']
fname = 'XXXX' + today +".csv"
logdf.to_csv(fname,index=False)
async def jsonparse(textdata):
try:
#parsing happens here to output to MongoDB
return "Success"
except Exception as e:
print("Failled parsing")
return e
browser = await playwright.firefox.launch(
headless=True,
)
context = await browser.new_context(
locale='en-US',
ignore_https_errors = True,
)
page = await context.new_page()
await page.goto(url,timeout=0)
button = page.locator("xpath=//button[#event-list-item='']")
bcount = button.locator(":scope",has_text="Locator")
count = await bcount.count()
print(count)
for i in range(count):
print("\n\n\n\n\nSleeping 10 seconds before clicking button")
buttonnumber = i
await asyncio.sleep(10)
print("Clickking Button: ", i)
cbtn = bcount.nth(i)
await cbtn.hover()
await asyncio.sleep(4)
await cbtn.click()
if i==0:
print("i=0")
await page.reload(timeout=0)
retry = page.on("response",lambda response: handle_response(response,buttonnumber))
title = await page.title()
print(title)
print("Heading back to the main page.")
await page.go_back(timeout=0)
await page.reload()
await page.wait_for_timeout(5000)
await page.close()
print("Closing Tab")
await browser.close()
async def main():
tasks = []
async with async_playwright() as playwright:
url = 'https://samplelink.com'
tasks.append(asyncio.create_task(runbrowser(playwright,url)))
for t in asyncio.as_completed(tasks):
print(await t)
await asyncio.gather(*tasks)
asyncio.run(main())

Not able to create session with HTTPDigestAuth?

I am trying to make the GET request in an asynchronous manner. And the client authentication system has implemented with DigestAuth.
async def run(self, url_list, api_auth):
sem = asyncio.Semaphore(10)
tasks = []
async with ClientSession(auth = HTTPDigestAuth("user_name", "password")) as session:
for url in url_list:
task = asyncio.ensure_future(self.get_content(sem, url, session))
tasks.append(task)
response = await asyncio.gather(*tasks)
logging.critical(f"Total size of response: {len(response)}")
TypeError: BasicAuth() tuple is required instead
When I created the authentication tuple with BasicAuth().
async def run(self, url_list, api_auth):
sem = asyncio.Semaphore(10)
tasks = []
async with ClientSession(auth = BasicAuth("user_name", "password", verify=True)) as session:
for url in url_list:
task = asyncio.ensure_future(self.get_content(sem, url, session))
tasks.append(task)
response = await asyncio.gather(*tasks)
logging.critical(f"Total size of response: {len(response)}")
401 - Unauthorized
Is there any proper/work-around approach to solve this?

How to perform proper error handling with asyncio and aiohttp?

I am looking for guidance around best practices with asyncio and aiohttp in Python 3. I have a basic scraper but I am not sure how to:
Properly implement error handling. More specific around my fetch function.
Do I really need the last main function to wrap my async crawler around?
Here is my code so far, it is working but I would like feedback on the two item above.
urls = []
async def fetch(url, payload={}):
async with ClientSession() as s:
async with s.get(url, params=payload) as resp:
content = await resp.read()
return content
async def get_profile_urls(url, payload):
content = await fetch(url, payload)
soup = BeautifulSoup(content, 'html.parser')
soup = soup.find_all(attrs={'class': 'classname'})
if soup:
urls.extend([s.find('a')['href'] for s in soup])
async def main():
tasks = []
payload = {
'page': 0,
'filter': 88}
for i in range(max_page + 1):
payload['page'] += 1
tasks.append(get_profile_urls(search_ulr, payload))
await asyncio.wait(tasks)
asyncio.run(main())

API calls not running ansynchronously

I'm working on a python client that will asynchronously download vinyl cover art. My problem is that I'm new to python (especially asynchronous python) and I don't think my code is running ansychronously. I have another client written in Node.js that is able to get approx. 40 images/sec whereas this python one is only managing to get around 1.5/sec.
import aiohttp
import asyncio
from os import path,makedirs
caa_base_url = "https://coverartarchive.org/release"
image_download_dir = path.realpath('images')
# small,large, None = Max
image_size = None
async def getImageUrls(release_mbid,session):
async with session.get(f'{caa_base_url}/{release_mbid}') as resp:
if resp.status == 404 or resp.status == 403:
return
return [release_mbid,await resp.json()]
async def getImage(url,session):
try:
async with session.get(url) as resp:
return [url,await resp.read()]
except (aiohttp.ServerDisconnectedError):
return await getImage(url,session)
async def getMBIDs(mb_page_url):
async with aiohttp.ClientSession() as session:
async with session.get(mb_page_url) as resp:
mb_json = await resp.json()
tasks = []
async with aiohttp.ClientSession() as caa_session:
for release in mb_json["releases"]:
task = asyncio.ensure_future(getImageUrls(release["id"],caa_session))
tasks.append(task)
responses = await asyncio.gather(*tasks)
async with aiohttp.ClientSession() as caa_image_session:
for response in responses:
if response is not None:
caaTasks = []
release_mbid = response[0]
result = response[1]
for image in result["images"]:
if image["front"] == True:
caaTask = asyncio.ensure_future(getImage(image["image"],caa_session))
caaTasks.append(caaTask)
image_responses = await asyncio.gather(*caaTasks)
for image_response in image_responses:
image_url = image_response[0]
image_binary = image_response[1]
new_file_dir = path.join(image_download_dir,release_mbid)
if not path.isdir(new_file_dir):
makedirs(new_file_dir)
file_name = image_url[image_url.rfind("/")+1:]
file_path = path.join(new_file_dir,file_name)
new_file = open(file_path,'wb')
new_file.write(image_binary)
mb_base_url = "https://musicbrainz.org/ws/2/release"
num_pages = 100
releases_per_page = 100
mb_page_urls = []
async def getMBPages():
for page_index in range(num_pages):
await getMBIDs('%s?query=*&type=album&format=Vinyl&limit=%s&offset=%s&fmt=json' % (mb_base_url,releases_per_page,page_index*releases_per_page))
await asyncio.sleep(1)
loop = asyncio.get_event_loop()
loop.run_until_complete(getMBPages())
P.S. The sleep is because musicbrainz api limits to 1 request/sec

Categories

Resources