asyncio/aiohttp not returning response - python

I am trying to scrape some data from https://www.officialcharts.com/ by parallelising web requests using asyncio/aiohttp. I implemented the code given at the link here.
I followed two different procedures. The first one goes like this.
from bs4 import BeautifulSoup
from urllib.request import urlopen
from selenium import webdriver
import time
import pandas as pd
import numpy as np
import re
import json
import requests
from bs4 import BeautifulSoup
from datetime import date, timedelta
from IPython.display import clear_output
import memory_profiler
import spotipy
import spotipy.util as util
import pandas as pd
from more_itertools import unique_everseen
# Build the weekly chart dates (formatted YYYYMMDD) that are later spliced into
# the chart URLs: 2500 consecutive Sundays, starting with the first Sunday on
# or after 1 January 1970.
d = date(1970, 1, 1)
d += timedelta(days=6 - d.weekday())  # weekday(): Monday == 0 ... Sunday == 6
weeks = [(d + timedelta(weeks=i)).strftime('%Y%m%d') for i in range(2500)]
import asyncio
from aiohttp import ClientSession
import nest_asyncio
nest_asyncio.apply()
result = []
async def fetch(url, session):
    """Request ``url`` through ``session`` and return the raw response body.

    The body is whatever ``response.read()`` yields for the response.
    """
    async with session.get(url) as resp:
        payload = await resp.read()
    return payload
async def run(r):
    """Fetch the chart pages for the first ``r`` entries of ``weeks`` concurrently.

    All requests share a single ``ClientSession`` so connections are kept
    alive. The gathered list of bodies is appended to the module-level
    ``result`` list as one element.
    """
    base = 'https://www.officialcharts.com/charts/singles-chart/'
    async with ClientSession() as session:
        # One scheduled task per week index.
        jobs = [
            asyncio.ensure_future(fetch(base + weeks[idx] + '/', session))
            for idx in range(r)
        ]
        bodies = await asyncio.gather(*jobs)
        result.append(bodies)
# Schedule run(5) on the event loop and block until it has finished.
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(5))
loop.run_until_complete(future)
print('Done')
# Identity comparison is the idiomatic (PEP 8) way to test for None.
print(result[0][0] is None)
The problem with the above code is that it fails when I make more than 1000 simultaneous requests.
The author of the post implemented a different procedure to address this issue and he claims we can do as many as 10K requests. I followed along his second procedure and here is my code for that.
import random
import asyncio
from aiohttp import ClientSession
import nest_asyncio
nest_asyncio.apply()
result = []
async def fetch(url, session):
    """Request ``url``, log two response headers, and return the raw body.

    Prints the values of the ``DATE`` and ``DELAY`` response headers together
    with the response URL before reading the body.
    """
    async with session.get(url) as resp:
        headers = resp.headers
        # Named so that it does not shadow datetime.date imported earlier.
        server_date = headers.get("DATE")
        lag = headers.get("DELAY")
        print("{}:{} with delay {}".format(server_date, resp.url, lag))
        return await resp.read()
async def bound_fetch(sem, url, session):
    """Run ``fetch`` while holding ``sem`` and return the body it produces.

    Args:
        sem: semaphore limiting how many fetches run at once.
        url: address to request.
        session: client session passed through to ``fetch``.

    Returns:
        Whatever ``fetch(url, session)`` returns.
    """
    # Getter function with semaphore.
    async with sem:
        # Propagate the body: without ``return`` this coroutine evaluates to
        # None, so asyncio.gather() would collect nothing but None values.
        return await fetch(url, session)
async def run(r):
    """Fetch the first ``r`` weekly chart pages, throttled by a semaphore.

    Every request goes through ``bound_fetch`` with one shared semaphore and
    one shared ``ClientSession``. The gathered results are appended to the
    module-level ``result`` list as a single element.
    """
    base = 'https://www.officialcharts.com/charts/singles-chart/'
    # Semaphore with 1000 slots, shared by every bound_fetch call.
    limiter = asyncio.Semaphore(1000)
    # A single session, so a new connection is not opened per request.
    async with ClientSession() as session:
        jobs = [
            asyncio.ensure_future(
                bound_fetch(limiter, base + weeks[n] + '/', session)
            )
            for n in range(r)
        ]
        gathered = await asyncio.gather(*jobs)
        result.append(gathered)
# Number of weekly chart pages to request.
number = 5
# Schedule run(number) on the event loop and block until it has finished.
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(number))
loop.run_until_complete(future)
print('Done')
# Identity comparison is the idiomatic (PEP 8) way to test for None.
print(result[0][0] is None)
For some reason, this doesn't return any responses.
PS:I am not from CS background and just program for fun. I have no clue what's going on inside the asyncio code.

Try to use the latest version.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from aiohttp import ClientSession, client_exceptions
from asyncio import Semaphore, ensure_future, gather, run
from json import dumps, loads
limit = 10
http_ok = [200]
async def scrape(url_list):
    """Scrape every URL in ``url_list`` concurrently and return the results.

    Each URL is handled by ``scrape_bounded`` with a shared semaphore of
    ``limit`` slots and a shared ``ClientSession``. Returns the list produced
    by ``gather`` over those calls.
    """
    throttle = Semaphore(limit)
    async with ClientSession() as session:
        jobs = [
            ensure_future(scrape_bounded(url, throttle, session))
            for url in url_list
        ]
        outcome = await gather(*jobs)
    return outcome
async def scrape_bounded(url, sem, session):
    """Call ``scrape_one`` for ``url`` while holding ``sem``; return its result."""
    async with sem:
        outcome = await scrape_one(url, session)
    return outcome
async def scrape_one(url, session):
    """Download ``url`` and return its body parsed as JSON.

    Args:
        url: address to request.
        session: client session used to issue the GET request.

    Returns:
        The decoded JSON document, or ``False`` when the connection cannot be
        established or the status code is not listed in ``http_ok``.
    """
    try:
        async with session.get(url) as response:
            content = await response.read()
    except client_exceptions.ClientConnectorError:
        # print() does not interpolate %-style arguments the way logging
        # does, so the message is formatted explicitly.
        print(f'Scraping {url} failed due to the connection problem')
        return False
    if response.status not in http_ok:
        print(f'Scraping {url} failed due to the return code {response.status}')
        return False
    return loads(content.decode('UTF-8'))
if __name__ == '__main__':
    # Scrape the two demo endpoints and pretty-print the collected results.
    urls = [
        'http://demin.co/echo1/',
        'http://demin.co/echo2/',
    ]
    res = run(scrape(urls))
    print(dumps(res, indent=4))
This is a template of a real project that works as predicted.
You can find this source code here

Related

Python Requests : How to send many post requests in the same time wait response the first and second

_1 = requests.post(logUrl,data=userDayta, headers=logHead)
I want to send many POST requests like this at the same time.
Here are two methods that work:
The reason that i posted both methods in full is that the examples given on the main website throw (RuntimeError: Event loop is closed) messages whereas both of these work.
Method 1: few lines of code, but longer run time (6.5 seconds):
import aiohttp
import asyncio
import time
start_time = time.time()
async def main():
    """Print the name of Pokemon 1 through 150, issuing one request at a time."""
    urls = (f'https://pokeapi.co/api/v2/pokemon/{n}' for n in range(1, 151))
    async with aiohttp.ClientSession() as session:
        for pokemon_url in urls:
            # Each request is awaited before the next one is started.
            async with session.get(pokemon_url) as resp:
                data = await resp.json()
                print(data['name'])
# Drive main() on an explicitly managed event loop.
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
# Wait 250 ms for the underlying SSL connections to close
loop.run_until_complete(asyncio.sleep(0.250))
# Close the loop only after that grace period has elapsed.
loop.close()
Method 2: more code, but shorter run time (1.5 seconds):
import aiohttp
import asyncio
import time
start_time = time.time()
async def get_pokemon(session, url):
    """GET ``url`` with ``session`` and return the ``name`` field of its JSON body."""
    async with session.get(url) as resp:
        data = await resp.json()
    return data['name']
async def main():
    """Fetch Pokemon 1 through 150 concurrently and print each name."""
    async with aiohttp.ClientSession() as session:
        # Schedule every request up front, then wait for all of them together.
        jobs = [
            asyncio.ensure_future(
                get_pokemon(session, f'https://pokeapi.co/api/v2/pokemon/{n}')
            )
            for n in range(1, 151)
        ]
        names = await asyncio.gather(*jobs)
        for name in names:
            print(name)
# Drive main() on an explicitly managed event loop.
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
# Wait 250 ms for the underlying SSL connections to close
loop.run_until_complete(asyncio.sleep(0.250))
# Close the loop only after that grace period has elapsed.
loop.close()
Both methods are considerably faster than the equivalent synchronous code!

AttributeError when using cloudscraper object in asynchronous function

import cloudscraper
import requests
import asyncio
# Question snippet, reproduced as posted; it is not runnable as written.
# prices_to_be_scraped, get_contract_address and get_project_name are not
# defined anywhere in the snippet.
async def scrape_prices():
scraper_object = cloudscraper.create_scraper()
# This is the statement the reported AttributeError (__aenter__) points at:
# the scraper object is being used as an asynchronous context manager.
async with scraper_object as session:
for item_number in prices_to_be_scraped:
# NOTE(review): as shown here, the expression continues over the next two
# lines without enclosing parentheses, which Python does not accept.
hyperlink = "https://opensea.io/assets/" +
str(get_contract_address(get_project_name())) + "/" + str(
item_number)
# NOTE(review): `async with` is applied to a plain str here, and the request
# itself is issued through the scraper's ordinary (non-awaited) get() call.
async with str(scraper_object.get(hyperlink).text) as response:
print(await response)
ERROR IM GETTING IN TERMINAL:
async with scraper_object as session:
AttributeError: __aenter__
I was having a similar issue, and I found this library https://pypi.org/project/aiocfscrape/ which should be doing the same as cloudscraper does but asynchronous.
The use example that they show in the documents is this one:
import asyncio
from aiocfscrape import CloudflareScraper
async def test_open_page(url):
    """Open ``url`` through a ``CloudflareScraper`` session and return the page text."""
    async with CloudflareScraper() as session:
        async with session.get(url) as resp:
            page = await resp.text()
    return page
if __name__ == '__main__':
    # Replace the placeholder with the page to fetch. The text returned by
    # test_open_page() is not used here.
    page_url = '<your url>'
    asyncio.run(test_open_page(page_url))
I hope you find this library useful.

trying to reduce the time or make requests at the same time

Is there any way I can make all these requests at the same time? I'm trying to reduce the time...
def pokemons():
    """Request Pokemon 1 through 799 one after another.

    Prints ``ERRO TIPO`` for every response whose body cannot be parsed as
    JSON; the parsed data itself is not kept.
    """
    for number in range(1, 800):
        response = requests.get(f"https://pokeapi.co/api/v2/pokemon/{number}")
        try:
            response.json()
        except ValueError:
            print("ERRO TIPO")
What you want is AIOHTTP Asynchronous HTTP Client/Server.
import asyncio
import aiohttp
import ssl
async def fetch(session, pokemon_num):
    """Fetch the PokeAPI record for ``pokemon_num`` and return it as parsed JSON.

    Args:
        session: client session used to issue the GET request.
        pokemon_num: numeric id appended to the PokeAPI pokemon endpoint.

    Returns:
        The decoded JSON body of the response.
    """
    url = f"https://pokeapi.co/api/v2/pokemon/{pokemon_num}"
    # No custom SSL context is passed: a bare ssl.SSLContext() performs no
    # certificate verification and was being rebuilt for every request. The
    # session's default TLS handling is used instead.
    async with session.get(url) as response:
        return await response.json()
async def fetch_all(loop):
    """Fetch Pokemon 1 through 799 concurrently and return the gathered results.

    Args:
        loop: event loop handed to ``aiohttp.ClientSession``.

    Returns:
        A list with one entry per id. Because ``return_exceptions=True`` is
        used, a failed request contributes its exception object instead of
        aborting the whole batch.
    """
    # NOTE(review): aiohttp documents the ``loop`` argument as deprecated;
    # confirm whether it can be dropped for the aiohttp version in use.
    async with aiohttp.ClientSession(loop=loop) as session:
        # Ids start at 1, matching the range used in the question; id 0 does
        # not correspond to a Pokemon.
        results = await asyncio.gather(
            *(fetch(session, pokemon_n) for pokemon_n in range(1, 800)),
            return_exceptions=True,
        )
    return results
if __name__ == '__main__':
    # Obtain the event loop, run fetch_all() on it to completion, and show
    # everything that was gathered.
    loop = asyncio.get_event_loop()
    result = loop.run_until_complete(
        fetch_all(loop)
    )
    print(result)
I ran this code and got all the requests in a total of 19.5 seconds. I hope its good for your case.
The snippet above comes from another answer from YuriiKramarenko, if it suits you, you can give him a thumbs up. I adjusted it for your specific parameters.
Good luck catching them all!

Instagram API start returning loading page after some calls

I am using the code below to get the account information of one thousand Instagram accounts using asyncio. For the first requests the output is correct, but after 10-20 calls Instagram starts returning the HTML of its loading page. What could I be doing wrong here? Below is the Python code.
import random
import asyncio
from aiohttp import ClientSession
import urllib.request
import aiohttp
async def fetch(url, session, sem):
    """Fetch ``url`` and, on HTTP 200, save the body to ``<name>.txt``.

    The semaphore is held for the whole request so that it limits how many
    requests are in flight, not merely how many responses are being written.
    ``<name>`` is the first path segment after the 22-character URL prefix
    (the length of ``https://instagram.com/``).

    Args:
        url: address to request.
        session: client session used to issue the GET request.
        sem: semaphore bounding the number of concurrent requests.
    """
    print("------")
    print(url)
    # ``async with`` releases the semaphore even if the request raises.
    async with sem:
        async with session.get(url=url) as response:
            # Read the body once and reuse it for both printing and saving.
            body = await response.text()
            print(body)
            if response.status == 200:
                fname = url[22:].split('/')[0] + '.txt'
                # The context manager closes the file even if the write fails.
                with open(fname, 'w') as f:
                    f.write(body)
async def run(url_list):
    """Fetch every URL in ``url_list`` concurrently through one shared session."""
    # Semaphore with two slots, handed to every fetch() call.
    gate = asyncio.Semaphore(2)
    # A single session, so a new connection is not opened per request.
    async with ClientSession() as session:
        jobs = [
            asyncio.ensure_future(fetch(url, session, gate))
            for url in url_list
        ]
        await asyncio.gather(*jobs)
# making the url list here
url_list = []
# The context manager closes url.txt once it has been read.
with open('url.txt', 'r') as file:
    for line in file:
        # Lines read from a file keep their trailing newline; strip it so it
        # does not become part of the requested URL, and skip blank lines.
        url = line.strip()
        if url:
            url_list.append(url)
print(url_list)
import time
old = time.time()
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(url_list))
loop.run_until_complete(future)
# Elapsed wall-clock time in seconds.
print(time.time() - old)
Here are some of the URL's from url.txt file
https://instagram.com/johanna_kre/?__a=1
https://instagram.com/channie_f/?__a=1
https://instagram.com/lilakuh68/?__a=1
https://instagram.com/nataliacallisto/?__a=1
https://instagram.com/edbastian/?__a=1
https://instagram.com/sylvana.h/?__a=1
https://instagram.com/munich_bombon/?__a=1
https://instagram.com/younotus/?__a=1
https://instagram.com/meet.herbert/?__a=1
https://instagram.com/inaaogo/?__a=1
https://instagram.com/dennisaogo/?__a=1
https://instagram.com/mrslight__/?__a=1
https://instagram.com/reneturrek/?__a=1
https://instagram.com/_eeasyyy/?__a=1
https://instagram.com/sentinobln/?__a=1
https://instagram.com/eri.ka_g/?__a=1
Your semaphore is not limiting the requests as you want it to; you should acquire it before making the request, not before processing the content.
With your current implementation you are making 100 concurrent requests (aiohttp's client default limit) but only process the responses two at a time (however at this point from the server's perspective the requests are already processed).
Use:
async def fetch(url, session,sem):
    """Fetch ``url`` while holding ``sem`` so the semaphore bounds the requests."""
    print("------")
    print(url)
    # Acquire before issuing the request; ``async with`` also guarantees the
    # semaphore is released if the request or the processing raises.
    async with sem:
        async with session.get(url = url) as response:
            print(await response.text())
            await response.text()
            ...
    ...

Aiohttp try to get page response with page numbers until hit empty response

I'm trying to have a small function scraping data from a JSON end point,
the url is like https://xxxxxxxx.com/products.json?&page=" which I can insert a page number,
While I was using the requests module, I just had a while loop that incremented the page number and broke out when I got an empty response (i.e. a page with no data).
Is there a way to do the same thing with aiohttp?
All I have achieved so far is to pre-generate a certain number of URLs and pass them into tasks.
I am wondering whether I can use a loop here as well and stop when I see an empty response.
Thank you very much
'''
import asyncio
import aiohttp
async def download_one(url):
    """Open a fresh session, GET ``url``, and pretty-print its JSON body."""
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            payload = await resp.json(content_type=None)
            pprint.pprint(payload)
async def download_all(sites):
    """Run ``download_one`` for every entry of ``sites`` concurrently."""
    jobs = []
    for site in sites:
        jobs.append(asyncio.create_task(download_one(site)))
    await asyncio.gather(*jobs)
def main():
    """Build the URLs for pages 1 through 49 and download them all."""
    sites = [request_url + str(page) for page in range(1, 50)]
    asyncio.run(download_all(sites))
'''
Here is a piece of untested code. Even if it won't work, it will give you an idea how to do the job
import asyncio
import aiohttp
async def download_one(session, url):
    """Fetch ``url`` with ``session`` and return its decoded JSON body.

    Args:
        session: client session used to issue the GET request.
        url: address to request.

    Returns:
        The decoded JSON body when it is non-empty.

    Raises:
        Exception: if the body decodes to an empty value. The caller waits
            with ``FIRST_EXCEPTION``, so this is what stops the batch.
    """
    async with session.get(url) as resp:
        # Keep the response object and the decoded data under separate names.
        data = await resp.json()
    if not data:
        raise Exception("No data found")  # needs to be there for breaking the loop
    return data
async def download_all(sites):
    """Download every URL in ``sites``, stopping as soon as one download fails.

    All downloads share one ``ClientSession``. When any ``download_one`` call
    raises, the downloads that have not finished yet are cancelled.

    Args:
        sites: iterable of URLs to download.
    """
    async with aiohttp.ClientSession() as session:
        # asyncio.wait() needs Task objects: passing bare coroutines was
        # deprecated in Python 3.8 and is rejected from Python 3.11 on.
        tasks = [
            asyncio.create_task(download_one(session, site)) for site in sites
        ]
        if not tasks:
            # asyncio.wait() raises ValueError for an empty collection.
            return
        done, pending = await asyncio.wait(
            # Returns as soon as any task raises an exception.
            tasks, return_when=asyncio.FIRST_EXCEPTION
        )
        for task in pending:
            task.cancel()  # it will shut down all redundant jobs
        # Let the cancellations finish before the session is closed.
        await asyncio.gather(*pending, return_exceptions=True)
        for task in done:
            # Retrieve the outcome so asyncio does not log
            # "Task exception was never retrieved".
            task.exception()
def main():
    """Build the URLs for pages 1 through 49 and run ``download_all`` on them."""
    sites = [request_url + str(page) for page in range(1, 50)]
    # The asyncio module has no run_until_complete() function (that is an
    # event-loop method); asyncio.run() creates a loop and runs the coroutine.
    asyncio.run(download_all(sites))

Categories

Resources