import asyncio
import time
import aiohttp
from bs4 import BeautifulSoup
from xlrd import open_workbook
from xlwt import Workbook

url_list = ["https://www.facebook.com", "https://www.baidu.com", "https://www.yahoo.com", ...]
# There are more than 20000 different websites in the list
# Some websites may not be accessible
keywords = ['xxx', 'xxx', ...]

start = time.time()
localtime = time.asctime(time.localtime(time.time()))
print("start time :", localtime)

choose_url = []
url_title = []

async def get(url, session):
    try:
        async with session.get(url=url, timeout=0) as response:
            resp = await response.text()
            soup = BeautifulSoup(resp, "lxml")
            title = soup.find("title").text.strip()
            for keyword in keywords:
                if keyword in title:
                    choose_url.append(url)
                    url_title.append(title)
                    print("Successfully got url {} with resp's name {}.".format(url, title))
                    break
    except Exception as e:
        pass

async def main(urls):
    connector = aiohttp.TCPConnector(ssl=False, limit=0, limit_per_host=0)
    session = aiohttp.ClientSession(connector=connector)
    ret = await asyncio.gather(*[get(url, session) for url in urls])
    print("Finalized all. Return is a list of outputs.")
    await session.close()

def write_excel(choose_url, url_title):
    # write choose_url, url_title to excel
    pass

asyncio.run(main(url_list))
write_excel(choose_url, url_title)

localtime = time.asctime(time.localtime(time.time()))
print("now time is :", localtime)
end = time.time()
print('time used:', end - start)
I have 20000 URLs to request, but it takes a long time (more than 4 or 5 hours). It only takes 3 hours if I use requests + multiprocessing (Pool of 4).
I tried to use aiohttp + multiprocessing, but it doesn't seem to work. Can this code be made as fast as possible, either by optimizing it or by using any available technology? Thanks.
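For comparison, here is a minimal sketch of a commonly suggested variant, assuming the bottleneck is unbounded concurrency: reuse one session, cap in-flight requests with an asyncio.Semaphore, and give each request a finite timeout. The helper names (fetch_title, crawl) and the numbers (500 concurrent requests, 30-second timeout) are illustrative guesses, not measured values:

import asyncio
import aiohttp
from bs4 import BeautifulSoup

CONCURRENCY = 500                          # assumption: tune for your machine and network
TIMEOUT = aiohttp.ClientTimeout(total=30)  # assumption: finite timeout so dead hosts don't hold slots forever

async def fetch_title(url, session, sem):
    # Download one page while holding a semaphore slot; return (url, title) or None.
    async with sem:
        try:
            async with session.get(url, timeout=TIMEOUT) as response:
                html = await response.text()
        except Exception:
            return None
    title_tag = BeautifulSoup(html, "lxml").find("title")
    return (url, title_tag.text.strip()) if title_tag else None

async def crawl(urls, keywords):
    sem = asyncio.Semaphore(CONCURRENCY)
    connector = aiohttp.TCPConnector(ssl=False, limit=0)
    async with aiohttp.ClientSession(connector=connector) as session:
        results = await asyncio.gather(*(fetch_title(u, session, sem) for u in urls))
    # Keep only pages whose title contains one of the keywords.
    matched = []
    for item in results:
        if item:
            url, title = item
            if any(k in title for k in keywords):
                matched.append((url, title))
    return matched

Note that parsing 20000 pages with BeautifulSoup inside the event loop is CPU-bound work; if the network side is already fast, moving the parsing into a ProcessPoolExecutor is the next thing worth trying.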
I don't know if the following method is fast or not.
import time
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils

class MySpider(Spider):
    name = 'demo_spider'
    start_urls = ["https://www.facebook.com", "https://www.baidu.com", "https://www.yahoo.com"]  # Entry page
    keywords = ['xxx', 'xxx']
    choose_url = []
    url_title = []
    concurrencyPer1s = 10

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        title = doc.title
        if title.containsOr(self.keywords):
            self.choose_url.append(url.url)
            self.url_title.append(title.text)
            print("Successfully got url {} with resp's name {}.".format(url, title.text))

    def urlCount(self):
        count = Spider.urlCount(self)
        if count == 0:
            SimplifiedMain.setRunFlag(False)
        return count

start = time.time()
localtime = time.asctime(time.localtime(time.time()))
print("start time :", localtime)

SimplifiedMain.startThread(MySpider(), {"concurrency": 600, "concurrencyPer1S": 100, "intervalTime": 0.001, "max_workers": 10})  # Start download

localtime = time.asctime(time.localtime(time.time()))
print("now time is :", localtime)
end = time.time()
print('time used:', end - start)
I need to parse the HTML from a list of domains (main pages only).
The script works well for a while, then it starts returning no data at very high speed; it looks like the requests aren't even being sent.
My code:
import asyncio
import time
import aiohttp
import pandas as pd
import json
from bs4 import BeautifulSoup

df = pd.read_excel('work_file.xlsx')
domains_count = df.shape[0]

start_time = time.time()
print(start_time)

data = {}

async def get_data(session, url, j):
    try:
        async with session.get(url) as resp:
            html = await resp.text()
            rawhtml = BeautifulSoup(html, 'lxml')
            title = rawhtml.title
            data[url] = {'url': url, 'resp': resp.status, 'title': str(title), 'html': str(rawhtml)}
            print(j)
            print(url)
    except Exception as e:
        data[url] = {'url': url, 'resp': str(e), 'title': 'None', 'html': 'None'}
        print(j)
        print(url)
        print(str(e))

async def get_queue():
    tasks = []
    async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=120)) as session:
        for j, i in enumerate(df.domain):
            i = 'http://' + i.lower()
            task = asyncio.create_task(get_data(session, i, j))
            tasks.append(task)
        await asyncio.gather(*tasks)

if __name__ == '__main__':
    asyncio.run(get_queue())

    with open('parsed_data.json', 'a+') as file:
        file.write(json.dumps(data))

    end_time = time.time() - start_time
    print(end_time)
That was a timeout error.
'resp': str(e)
This code records only the exception message, and TimeoutError has no message, so str(e) is an empty string.
str(repr(e)) helps you see the actual error.
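A tiny illustration of the difference (hypothetical snippet, not from the post): asyncio's TimeoutError stringifies to an empty string, while repr at least shows the exception class:

import asyncio

async def demo():
    try:
        # Force a timeout on purpose.
        await asyncio.wait_for(asyncio.sleep(10), timeout=0.01)
    except asyncio.TimeoutError as e:
        print('str(e):  %r' % str(e))    # ''               -> empty message
        print('repr(e): %r' % repr(e))   # 'TimeoutError()' -> at least shows the class

asyncio.run(demo())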
I want to extract data from tickertape.in (just the % change of the price), but when I have more than 100 links it runs forever.
Here is my code:
import asyncio
import aiohttp
import os
import time
import pandas as pd

data = pd.read_csv('ticker_tape_links.csv')
results = []

start = time.time()

def get_tasks(session, start, end):
    tasks = []
    for url in data['links'][start:end]:
        tasks.append(asyncio.create_task(session.get(url, ssl=False)))
    return tasks

async def get_symbols(start, end):
    async with aiohttp.ClientSession() as session:
        tasks = get_tasks(session, start, end)
        responses = await asyncio.gather(*tasks)
        for response in responses:
            results.append(response)

await get_symbols(0, 1000)

end = time.time()
total_time = end - start
print("It took {} seconds to make {} API calls".format(total_time, len(data['links'][0:1000])))
I used await get_symbols(0, 1000) instead of asyncio.run(get_symbols(...)) because the latter is not supported in a Jupyter notebook, but the former works.
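A hedged sketch of one likely fix (the fetch helper and the limit of 100 concurrent requests below are assumptions): read the response body inside each task and cap concurrency with a semaphore, so connections are released back to the pool instead of 1000 open responses being held at once:

import asyncio
import aiohttp
import pandas as pd

data = pd.read_csv('ticker_tape_links.csv')  # same file as above
results = []

async def fetch(session, sem, url):
    # Hold a semaphore slot, read the body, and let the connection go back to the pool.
    async with sem:
        async with session.get(url, ssl=False) as response:
            return await response.text()

async def get_symbols(start, end):
    sem = asyncio.Semaphore(100)  # guess: tune for the site's tolerance
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, sem, url) for url in data['links'][start:end]]
        for body in await asyncio.gather(*tasks, return_exceptions=True):
            results.append(body)

# In a Jupyter cell: await get_symbols(0, 1000)
if __name__ == '__main__':
    asyncio.run(get_symbols(0, 1000))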
So basically I'm trying to get the currently playing track from an online radio stream's direct link (example:
http://air.radiorecord.ru:8101/rr_320).
First I found something on the internet written with urllib; my application is asynchronous, so I needed to use aiohttp. With urllib it worked perfectly, while aiohttp sometimes just can't find anything. Please help :(
Before:
def get_now(self, session):
    try:
        request = urllib.request.Request(self.data.get('url'), headers={'Icy-MetaData': 1})  # request metadata
        response = urllib.request.urlopen(request)
        metadata = response.headers
        metaint = int(response.headers['icy-metaint'])
        for _ in range(10):  # title may be empty initially, try several times
            response.read(metaint)  # skip to metadata
            metadata_length = struct.unpack('B', response.read(1))[0] * 16  # length byte
            metadata = response.read(metadata_length).rstrip(b'\0')
            # extract title from the metadata
            m = re.search(br"StreamTitle='([^']*)';", metadata)
            if m:
                title = m.group(1)
                if title:
                    break
        else:
            return "No title found"
        return title.decode('utf8', errors='replace')
    except:
        return "No title found"
After:

async def get_now(self, session):
    async with session.get(self.stream_url, headers={'Icy-MetaData': "1"}) as resp:
        content = resp.content
        metadata = resp.headers
        metaint = int(metadata['icy-metaint'])
        for _ in range(30):
            await content.read(metaint)
            metadata_length = struct.unpack('B', await content.read(1))[0] * 16  # length byte
            metadata = (await content.read(metadata_length)).rstrip(b'\0')
            m = re.search(br"StreamTitle='([^']*)';", metadata)
            if m:
                title = m.group(1)
                if title:
                    return title.decode('utf8', errors='replace')
                else:
                    return "No title found"
        return "Nothing found"
The snippet below is always able to detect the current track (in around 400ms) but instead of processing only part of the chunk it checks the whole chunk as it's read:
import aiohttp
import asyncio
import re

async def get_now(stream_url, session):
    headers = {"Icy-MetaData": "1"}
    async with session.get(stream_url, headers=headers) as resp:
        for _ in range(10):
            data = await resp.content.read(8192)
            m = re.search(br"StreamTitle='([^']*)';", data.rstrip(b"\0"))
            if m:
                title = m.group(1)
                if title:
                    return title.decode("utf8", errors="replace")
                else:
                    return "No title found"
        return "Nothing found"

async def get_track():
    session = aiohttp.ClientSession()
    stream_url = "http://air.radiorecord.ru:8101/rr_320"
    result = await get_now(stream_url, session)
    print(f"result: {result}")
    await session.close()

asyncio.run(get_track())
Result on my computer (CPU usage is very low on a quite old CPU: i7-3517U):
[ionut#ionut-pc ~]$ time python test.py
result: Record Club - Nejtrino & Baur
real 0m0.401s
user 0m0.198s
sys 0m0.031s
I have a list with a lot of links and I want to use multiprocessing to speed up the process. Here is a simplified version; I need the output to be ordered like this:
I tried a lot of things: Process, Pool, etc. I always got errors. I need to do it with 4 or 8 threads and keep the output ordered like this. Thank you for all the help. Here is the code:
from bs4 import BeautifulSoup
import requests
import time

links = ["http://www.tennisexplorer.com/match-detail/?id=1672704", "http://www.tennisexplorer.com/match-detail/?id=1699387", "http://www.tennisexplorer.com/match-detail/?id=1698990", "http://www.tennisexplorer.com/match-detail/?id=1696623", "http://www.tennisexplorer.com/match-detail/?id=1688719", "http://www.tennisexplorer.com/match-detail/?id=1686305"]
data = []

def essa(match, omega):
    aaa = BeautifulSoup(requests.get(match).text, "lxml")
    center = aaa.find("div", id="center")
    p1_l = center.find_all("th", class_="plName")[0].find("a").get("href")
    p2_l = center.find_all("th", class_="plName")[1].find("a").get("href")
    return p1_l + " - " + p2_l + " - " + str(omega)

i = 1
start_time = time.clock()
for link in links:
    data.append(essa(link, i))
    i += 1

for d in data:
    print(d)

print(time.clock() - start_time, "seconds")
Spawn several threads of the function and join them together:
from threading import Thread

def essa(match, omega):
    aaa = BeautifulSoup(requests.get(match).text, "lxml")
    center = aaa.find("div", id="center")
    p1_l = center.find_all("th", class_="plName")[0].find("a").get("href")
    p2_l = center.find_all("th", class_="plName")[1].find("a").get("href")
    print(p1_l + " - " + p2_l + " - " + str(omega))

if __name__ == '__main__':
    threadlist = []
    for index, url in enumerate(links):
        t = Thread(target=essa, args=(url, index))
        t.start()
        threadlist.append(t)
    for b in threadlist:
        b.join()
You won't get them to print in order, for the simple reason that some HTTP responses take longer than others.
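If the order does matter, one workaround (a sketch, not from the original answer; essa_into and the slot index are made-up names) is to have each thread write its result into a fixed slot of a shared list and only print after all threads have joined:

from threading import Thread
import requests
from bs4 import BeautifulSoup

def essa_into(results, slot, match, omega):
    # Same scraping as essa(), but the result is stored instead of printed.
    aaa = BeautifulSoup(requests.get(match).text, "lxml")
    center = aaa.find("div", id="center")
    p1_l = center.find_all("th", class_="plName")[0].find("a").get("href")
    p2_l = center.find_all("th", class_="plName")[1].find("a").get("href")
    results[slot] = p1_l + " - " + p2_l + " - " + str(omega)

if __name__ == '__main__':
    # `links` is the list from the question above.
    results = [None] * len(links)
    threads = [Thread(target=essa_into, args=(results, i, url, i + 1))
               for i, url in enumerate(links)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    for line in results:
        print(line)  # printed in the same order as `links`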
As far as I can understand, you have a list of links and want to make the requests concurrently to speed up the process. Here is sample code for multithreading; I hope it helps. Read the documentation for concurrent.futures.
import concurrent.futures
import urllib.request

URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://europe.wsj.com/',
        'http://www.bbc.co.uk/',
        'http://some-made-up-domain.com/']

# Retrieve a single page and report the URL and contents
def load_url(url, timeout):
    with urllib.request.urlopen(url, timeout=timeout) as conn:
        return conn.read()

# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Start the load operations and mark each future with its URL
    future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))
I am writing a web crawler that is running parallel fetches for many different domains. I want to limit the number of requests-per-second that are made to each individual domain, but I do not care about the total number of connections that are open, or the total requests per second that are made across all domains. I want to maximize the number of open connections and requests-per-second overall, while limiting the number of requests-per-second made to individual domains.
All of the currently existing examples I can find either (1) limit the number of open connections or (2) limit the total number of requests-per-second made in the fetch loop. Examples include:
aiohttp: rate limiting parallel requests
aiohttp: set maximum number of requests per second
Neither of them does what I am asking for, which is to limit requests-per-second on a per-domain basis. The first question only answers how to limit requests-per-second overall. The second one doesn't even have answers to the actual question (the OP asks about requests per second, while the answers all talk about limiting the number of connections).
Here is the code that I tried, using a simple rate limiter I made for a synchronous version, which doesn't work when the DomainTimer code is run in an async event loop:
from collections import defaultdict
from datetime import datetime, timedelta
import asyncio
import async_timeout
import aiohttp
from urllib.parse import urlparse
from queue import Queue, Empty

from HTMLProcessing import processHTML
import URLFilters

SEED_URLS = ['http://www.bbc.co.uk', 'http://www.news.google.com']
url_queue = Queue()
for u in SEED_URLS:
    url_queue.put(u)

# number of pages to download per run of crawlConcurrent()
BATCH_SIZE = 100
DELAY = timedelta(seconds=1.0)  # delay between requests from a single domain, in seconds

HTTP_HEADERS = {'Referer': 'http://www.google.com',
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'}

class DomainTimer():
    def __init__(self):
        self.timer = None

    def resetTimer(self):
        self.timer = datetime.now()

    def delayExceeded(self, delay):
        if not self.timer:  # we haven't fetched this domain before
            return True
        if (datetime.now() - self.timer) >= delay:
            return True
        else:
            return False

crawl_history = defaultdict(dict)  # given a URL, when was it last crawled?
domain_timers = defaultdict(DomainTimer)

async def fetch(session, url):
    domain = urlparse(url).netloc
    print('here fetching ' + url + "\n")
    dt = domain_timers[domain]

    if dt.delayExceeded(DELAY) or not dt:
        with async_timeout.timeout(10):
            try:
                dt.resetTimer()  # reset domain timer
                async with session.get(url, headers=HTTP_HEADERS) as response:
                    if response.status == 200:
                        crawl_history[url] = datetime.now()
                        html = await response.text()
                        return {'url': url, 'html': html}
                    else:
                        # log HTTP response, put into crawl_history so
                        # we don't attempt to fetch again
                        print(url + " failed with response: " + str(response.status) + "\n")
                        return {'url': url, 'http_status': response.status}
            except aiohttp.ClientConnectionError as e:
                print("Connection failed " + str(e))
            except aiohttp.ClientPayloadError as e:
                print("Received bad data from server @ " + url + "\n")
    else:  # delay hasn't passed yet: skip for now & put at end of queue
        url_queue.put(url)
        return None
async def fetch_all(urls):
    """Launch requests for all web pages."""
    tasks = []
    async with aiohttp.ClientSession() as session:
        for url in urls:
            task = asyncio.ensure_future(fetch(session, url))
            tasks.append(task)  # create list of tasks
        return await asyncio.gather(*tasks)  # gather task responses

def batch_crawl():
    """Launch requests for all web pages."""
    start_time = datetime.now()

    # Here we build the list of URLs to crawl for this batch
    urls = []
    for i in range(BATCH_SIZE):
        try:
            next_url = url_queue.get_nowait()  # get next URL from queue
            urls.append(next_url)
        except Empty:
            print("Processed all items in URL queue.\n")
            break

    loop = asyncio.get_event_loop()
    asyncio.set_event_loop(loop)
    pages = loop.run_until_complete(fetch_all(urls))

    crawl_time = (datetime.now() - start_time).seconds
    print("Crawl completed. Fetched " + str(len(pages)) + " pages in " + str(crawl_time) + " seconds.\n")
    return pages

def parse_html(pages):
    """Parse the HTML for each page downloaded in this batch."""
    start_time = datetime.now()
    results = {}

    for p in pages:
        if not p or not p['html']:
            print("Received empty page")
            continue
        else:
            url, html = p['url'], p['html']
            results[url] = processHTML(html)

    processing_time = (datetime.now() - start_time).seconds
    print("HTML processing finished. Processed " + str(len(results)) + " pages in " + str(processing_time) + " seconds.\n")
    return results

def extract_new_links(results):
    """Extract links from """
    # later we could track where links were from here, anchor text, etc.,
    # and weight queue priority based on that
    links = []
    for k in results.keys():
        new_urls = [l['href'] for l in results[k]['links']]
        for u in new_urls:
            if u not in crawl_history.keys():
                links.append(u)
    return links

def filterURLs(urls):
    urls = URLFilters.filterDuplicates(urls)
    urls = URLFilters.filterBlacklistedDomains(urls)
    return urls

def run_batch():
    pages = batch_crawl()
    results = parse_html(pages)
    links = extract_new_links(results)
    for l in filterURLs(links):
        url_queue.put(l)
    return results
There are no errors or exceptions thrown, and the rate-limiting code works fine for synchronous fetches, but the DomainTimer has no apparent effect when run in the async event loop. The delay of one request per second per domain is not upheld...
How would I modify this synchronous rate-limiting code to work within the async event loop? Thanks!
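For reference, a per-domain limit can also be sketched with the third-party aiolimiter package: one AsyncLimiter per domain, each allowing a fixed number of acquisitions per second. This is a hedged sketch, not part of the original post; RPS_PER_DOMAIN, fetch and main are placeholder names, and the seed URLs are just the ones from the code above:

import asyncio
from collections import defaultdict
from urllib.parse import urlparse

import aiohttp
from aiolimiter import AsyncLimiter

RPS_PER_DOMAIN = 1  # assumption: one request per second per domain
limiters = defaultdict(lambda: AsyncLimiter(RPS_PER_DOMAIN, 1))

async def fetch(session, url):
    domain = urlparse(url).netloc
    async with limiters[domain]:          # waits only if this domain is over its rate
        async with session.get(url) as response:
            return url, response.status

async def main(urls):
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(*(fetch(session, u) for u in urls))
    for url, status in results:
        print(url, status)

if __name__ == '__main__':
    urls = ['http://www.bbc.co.uk', 'http://www.news.google.com'] * 3
    asyncio.run(main(urls))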
It's hard to debug your code since it contains a lot of unrelated stuff; it's easier to show the idea with a new, simple example.
Main idea:
write your own Semaphore-like class using __aenter__ and __aexit__ that accepts a url (domain)
use a domain-specific Lock to prevent multiple simultaneous requests to the same domain
sleep before allowing the next request, based on the domain's last request time and the allowed RPS
track the time of each domain's last request
Code:
import asyncio
import aiohttp
from urllib.parse import urlparse
from collections import defaultdict

class Limiter:
    # domain -> req/sec:
    _limits = {
        'httpbin.org': 4,
        'eu.httpbin.org': 1,
    }

    # domain -> its lock:
    _locks = defaultdict(lambda: asyncio.Lock())

    # domain -> its last request time
    _times = defaultdict(lambda: 0)

    def __init__(self, url):
        self._host = urlparse(url).hostname

    async def __aenter__(self):
        await self._lock
        to_wait = self._to_wait_before_request()
        print(f'Wait {to_wait} sec before next request to {self._host}')
        await asyncio.sleep(to_wait)

    async def __aexit__(self, *args):
        print(f'Request to {self._host} just finished')
        self._update_request_time()
        self._lock.release()

    @property
    def _lock(self):
        """Lock that prevents multiple requests to same host."""
        return self._locks[self._host]

    def _to_wait_before_request(self):
        """What time we need to wait before request to host."""
        request_time = self._times[self._host]
        request_delay = 1 / self._limits[self._host]
        now = asyncio.get_event_loop().time()
        to_wait = request_time + request_delay - now
        to_wait = max(0, to_wait)
        return to_wait

    def _update_request_time(self):
        now = asyncio.get_event_loop().time()
        self._times[self._host] = now

# request that uses Limiter instead of Semaphore:
async def get(url):
    async with Limiter(url):
        async with aiohttp.ClientSession() as session:  # TODO reuse session for different requests.
            async with session.get(url) as resp:
                return await resp.text()

# main:
async def main():
    coros = [
        get('http://httpbin.org/get'),
        get('http://httpbin.org/get'),
        get('http://httpbin.org/get'),
        get('http://httpbin.org/get'),
        get('http://httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
        get('http://eu.httpbin.org/get'),
    ]
    await asyncio.gather(*coros)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(main())
    finally:
        loop.run_until_complete(loop.shutdown_asyncgens())
        loop.close()
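One caveat with the Limiter above: on recent Python versions, awaiting an asyncio.Lock directly (await self._lock) is no longer supported (it was deprecated and later removed), so __aenter__ would raise a TypeError. A minimal sketch of the pattern that still works, awaiting Lock.acquire() explicitly (SimpleAsyncGate is a made-up name for illustration only):

import asyncio

class SimpleAsyncGate:
    # Minimal illustration of the acquire/release pattern that works on current
    # Python: Lock.acquire() is awaited explicitly instead of awaiting the Lock.
    def __init__(self):
        self._lock = asyncio.Lock()

    async def __aenter__(self):
        await self._lock.acquire()   # replaces `await self._lock`
        # ...the sleep-before-request logic from Limiter would go here...

    async def __aexit__(self, *args):
        self._lock.release()

async def main():
    gate = SimpleAsyncGate()
    async with gate:
        print("inside the gate")

asyncio.run(main())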
I developed a library named octopus-api (https://pypi.org/project/octopus-api/) that lets you rate-limit and set the number of connections to an endpoint, using aiohttp under the hood. Its goal is to simplify all the aiohttp setup needed.
Here is an example of how to use it, where get_ethereum is the user-defined request function. It could just as well have been a web-crawler request function or whatever fits:
from octopus_api import TentacleSession, OctopusApi
from typing import Dict, List

if __name__ == '__main__':
    async def get_ethereum(session: TentacleSession, request: Dict):
        async with session.get(url=request["url"], params=request["params"]) as response:
            body = await response.json()
            return body

    client = OctopusApi(rate=50, resolution="sec", connections=6)
    result: List = client.execute(requests_list=[{
        "url": "https://api.pro.coinbase.com/products/ETH-EUR/candles?granularity=900&start=2021-12-04T00:00:00Z&end=2021-12-04T00:00:00Z",
        "params": {}}] * 1000, func=get_ethereum)
    print(result)
TentacleSession works the same way as aiohttp.ClientSession when it comes to writing POST, GET, PUT and PATCH requests.
Let me know if it helps with your rate-limit and connection issues for crawling.
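Based on the statement that TentacleSession mirrors aiohttp.ClientSession, a POST would presumably look like the sketch below (hedged, not taken from the library's docs; post_example, the URL, and the "data" field are placeholders):

from octopus_api import TentacleSession, OctopusApi
from typing import Dict, List

async def post_example(session: TentacleSession, request: Dict):
    # Mirrors the aiohttp.ClientSession.post style shown in the GET example above.
    async with session.post(url=request["url"], data=request["data"]) as response:
        return await response.json()

client = OctopusApi(rate=50, resolution="sec", connections=6)
result: List = client.execute(requests_list=[{"url": "https://example.com/api", "data": {}}],
                              func=post_example)
print(result)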