AttributeError when using cloudscraper object in asynchronous function - python

import cloudscraper
import requests
import asyncio

async def scrape_prices():
    scraper_object = cloudscraper.create_scraper()
    async with scraper_object as session:
        for item_number in prices_to_be_scraped:
            hyperlink = "https://opensea.io/assets/" + str(get_contract_address(get_project_name())) + "/" + str(item_number)
            async with str(scraper_object.get(hyperlink).text) as response:
                print(await response)
The error I'm getting in the terminal:
    async with scraper_object as session:
AttributeError: __aenter__

I was having a similar issue, and I found this library https://pypi.org/project/aiocfscrape/ which should do the same as cloudscraper, but asynchronously.
The usage example shown in its documentation is this one:
import asyncio
from aiocfscrape import CloudflareScraper

async def test_open_page(url):
    async with CloudflareScraper() as session:
        async with session.get(url) as resp:
            return await resp.text()

if __name__ == '__main__':
    asyncio.run(test_open_page('<your url>'))
I hope you find this library useful.
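Applied to the loop from your question, a minimal sketch could look like the following. It assumes prices_to_be_scraped, get_contract_address and get_project_name exist as in your code, and that CloudflareScraper behaves like the aiohttp-style session in the example above:
import asyncio
from aiocfscrape import CloudflareScraper

async def scrape_prices():
    async with CloudflareScraper() as session:
        for item_number in prices_to_be_scraped:
            hyperlink = ("https://opensea.io/assets/"
                         + str(get_contract_address(get_project_name()))
                         + "/" + str(item_number))
            # session.get returns an async context manager; await the body inside it
            async with session.get(hyperlink) as response:
                print(await response.text())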

Related

How to implement asyncio GET requests as self-contained class?

Much is written about creating self-contained functions for other tasks, e.g.
How to call an async function contained in a class?
How to set class attribute with await in __init__
but none addresses how to do so for GET requests.
Considering the following MWE -- how might this be transformed into a self-contained class?
import aiohttp
import asyncio
import time

async def get_pokemon(session, url):
    async with session.get(url) as resp:
        pokemon = await resp.json()
        return pokemon['name']

async def main():
    async with aiohttp.ClientSession() as session:
        tasks = []
        for number in range(1, 151):
            url = f'https://pokeapi.co/api/v2/pokemon/{number}'
            tasks.append(asyncio.ensure_future(get_pokemon(session, url)))
        original_pokemon = await asyncio.gather(*tasks)
        for pokemon in original_pokemon:
            print(pokemon)

asyncio.run(main())
Code credit: https://www.twilio.com/blog/asynchronous-http-requests-in-python-with-aiohttp
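One possible way to make this self-contained is to move the session and the gather call into methods of a class. A minimal sketch, assuming the same PokeAPI endpoint as the MWE; the class and method names (PokemonClient, fetch_names) are illustrative only:
import aiohttp
import asyncio

class PokemonClient:
    def __init__(self, base_url='https://pokeapi.co/api/v2/pokemon/'):
        self.base_url = base_url

    async def _get_name(self, session, number):
        # One GET per Pokémon; reuses the shared session passed in
        async with session.get(f'{self.base_url}{number}') as resp:
            data = await resp.json()
            return data['name']

    async def fetch_names(self, numbers):
        # The session is created inside the coroutine, so the class stays
        # self-contained and usable from a single asyncio.run() call
        async with aiohttp.ClientSession() as session:
            tasks = [self._get_name(session, n) for n in numbers]
            return await asyncio.gather(*tasks)

if __name__ == '__main__':
    names = asyncio.run(PokemonClient().fetch_names(range(1, 151)))
    print(names[:5])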

Python asyncio.gather returns None

I'm using Python asyncio to implement a fast http client.
As you can see in the comment inside the worker function below, I get the responses correctly as soon as they are finished. I would like to get the responses in order, which is why I'm using asyncio.gather.
Why is it returning None? Can anybody help?
Thank you so much!
import time
import aiohttp
import asyncio

MAXREQ = 100
MAXTHREAD = 500
URL = 'https://google.com'
g_thread_limit = asyncio.Semaphore(MAXTHREAD)

async def worker(session):
    async with session.get(URL) as response:
        await response.read()  # If I print this line I get the responses correctly

async def run(worker, *argv):
    async with g_thread_limit:
        await worker(*argv)

async def main():
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*[run(worker, session) for _ in range(MAXREQ)])

if __name__ == '__main__':
    totaltime = time.time()
    print(asyncio.get_event_loop().run_until_complete(main()))  # I'm getting a None here
    print(time.time() - totaltime)
Your function run doesn't return anything explicitly, so it returns None implicitly. Add return statements and you'll get a result:
async def worker(session):
    async with session.get(URL) as response:
        return await response.read()

async def run(worker, *argv):
    async with g_thread_limit:
        return await worker(*argv)
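Note that main returns None as well unless it passes the gathered results along, so the final print would still show None without one more return. A minimal sketch using the same names as in the question:
async def main():
    async with aiohttp.ClientSession() as session:
        # gather now yields one response body per request, in request order
        return await asyncio.gather(*[run(worker, session) for _ in range(MAXREQ)])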

asyncio tasks using aiohttp.ClientSession

I'm using python 3.7 and trying to make a crawler that can crawl multiple domains asynchronously. I'm using asyncio and aiohttp for this, but I'm experiencing problems with aiohttp.ClientSession. This is my reduced code:
import aiohttp
import asyncio

async def fetch(session, url):
    async with session.get(url) as response:
        print(await response.text())

async def main():
    loop = asyncio.get_event_loop()
    async with aiohttp.ClientSession(loop=loop) as session:
        cwlist = [loop.create_task(fetch(session, url)) for url in ['http://python.org', 'http://google.com']]
        asyncio.gather(*cwlist)

if __name__ == "__main__":
    asyncio.run(main())
The thrown exception is this:
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=RuntimeError('Session is closed')>
What am I doing wrong here?
You forgot to await the asyncio.gather result:
async with aiohttp.ClientSession(loop=loop) as session:
    cwlist = [loop.create_task(fetch(session, url)) for url in ['http://python.org', 'http://google.com']]
    await asyncio.gather(*cwlist)
If you ever have an async with whose body contains no await expressions, you should be fairly suspicious.
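As a side note, not required for the fix: under asyncio.run on Python 3.7+ you don't need to fetch the loop yourself, and asyncio.gather accepts coroutines directly, so a version without create_task is a minimal alternative sketch:
async def main():
    async with aiohttp.ClientSession() as session:
        # gather wraps each coroutine in a task for us
        await asyncio.gather(*(fetch(session, url)
                               for url in ['http://python.org', 'http://google.com']))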

asyncio/aiohttp not returning response

I am trying to scrape some data from https://www.officialcharts.com/ by parallelising web requests using asyncio/aiohttp. I implemented the code given at the link here.
I followed two different procedures. The first one goes like this.
from bs4 import BeautifulSoup
from urllib.request import urlopen
from selenium import webdriver
import time
import pandas as pd
import numpy as np
import re
import json
import requests
from datetime import date, timedelta
from IPython.display import clear_output
import memory_profiler
import spotipy
import spotipy.util as util
from more_itertools import unique_everseen

weeks = []
d = date(1970, 1, 1)
d += timedelta(days = 6 - d.weekday())

for i in range(2500):
    weeks.append(d.strftime('%Y%m%d'))
    d += timedelta(days = 7)

import asyncio
from aiohttp import ClientSession
import nest_asyncio
nest_asyncio.apply()

result = []

async def fetch(url, session):
    async with session.get(url) as response:
        return await response.read()

async def run(r):
    tasks = []
    # Fetch all responses within one Client session,
    # keep connection alive for all requests.
    async with ClientSession() as session:
        for i in range(r):
            url = 'https://www.officialcharts.com/charts/singles-chart/' + weeks[i] + '/'
            task = asyncio.ensure_future(fetch(url, session))
            tasks.append(task)
        responses = await asyncio.gather(*tasks)
        result.append(responses)

loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(5))
loop.run_until_complete(future)

print('Done')
print(result[0][0] == None)
The problem with the above code is that it fails when I make more than 1000 simultaneous requests.
The author of the post implemented a different procedure to address this issue, and he claims we can do as many as 10K requests. I followed his second procedure, and here is my code for that.
import random
import asyncio
from aiohttp import ClientSession
import nest_asyncio
nest_asyncio.apply()

result = []

async def fetch(url, session):
    async with session.get(url) as response:
        delay = response.headers.get("DELAY")
        date = response.headers.get("DATE")
        print("{}:{} with delay {}".format(date, response.url, delay))
        return await response.read()

async def bound_fetch(sem, url, session):
    # Getter function with semaphore.
    async with sem:
        await fetch(url, session)

async def run(r):
    tasks = []
    # create instance of Semaphore
    sem = asyncio.Semaphore(1000)
    # Create client session that will ensure we dont open new connection
    # per each request.
    async with ClientSession() as session:
        for i in range(r):
            url = 'https://www.officialcharts.com/charts/singles-chart/' + weeks[i] + '/'
            task = asyncio.ensure_future(bound_fetch(sem, url, session))
            tasks.append(task)
        responses = await asyncio.gather(*tasks)
        result.append(responses)

number = 5
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(number))
loop.run_until_complete(future)

print('Done')
print(result[0][0] == None)
For some reason, this doesn't return any responses.
PS: I am not from a CS background and just program for fun. I have no clue what's going on inside the asyncio code.
Try to use the latest version.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from aiohttp import ClientSession, client_exceptions
from asyncio import Semaphore, ensure_future, gather, run
from json import dumps, loads

limit = 10
http_ok = [200]

async def scrape(url_list):
    tasks = list()
    sem = Semaphore(limit)
    async with ClientSession() as session:
        for url in url_list:
            task = ensure_future(scrape_bounded(url, sem, session))
            tasks.append(task)
        result = await gather(*tasks)
    return result

async def scrape_bounded(url, sem, session):
    async with sem:
        return await scrape_one(url, session)

async def scrape_one(url, session):
    try:
        async with session.get(url) as response:
            content = await response.read()
    except client_exceptions.ClientConnectorError:
        print('Scraping %s failed due to the connection problem' % url)
        return False
    if response.status not in http_ok:
        print('Scraping %s failed due to the return code %s' % (url, response.status))
        return False
    content = loads(content.decode('UTF-8'))
    return content

if __name__ == '__main__':
    urls = ['http://demin.co/echo1/', 'http://demin.co/echo2/']
    res = run(scrape(urls))
    print(dumps(res, indent=4))
This is a template of a real project that works as expected.
You can find this source code here
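As for why the second procedure in the question returns no responses: bound_fetch never returns the value of fetch, so gather collects a list of Nones. A minimal fix, sketched against the code shown in the question with nothing else changed, would be:
async def bound_fetch(sem, url, session):
    # Getter function with semaphore; return the body so gather can collect it.
    async with sem:
        return await fetch(url, session)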

Can someone help explain why python aiohttp returns more response content than requests.get?

Recently I've been looking at the python aiohttp lib, playing around with it and comparing it with python requests. Here is the code:
import aiohttp
import asyncio
import requests

request_url = 'http://www.baidu.com'
requests_resp = requests.get(request_url)

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

async def main():
    async with aiohttp.ClientSession() as session:
        aio_resp = await fetch(session, request_url)
        print('aio_resp_length =', len(aio_resp))

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
print('requests_resp_length = ', len(requests_resp.text))
The response lengths differ hugely:
aio_resp_length = 152576
requests_resp_length = 2381
Not sure what happens in aiohttp.session.get, but the result is not always like this. When you change request_url to http://www.example.com, the response lengths are the same. Can someone tell me what happened here?
Cheers
Because aiohttp has newlines in its response and requests doesn't.
You can check their responses like this:
print('requests_resp_length = ', requests_resp.text[0:100])
print('aio_resp_length =', aio_resp[0:100])
