I'm trying to make my search method work with a list of search keywords.
Is there a way to make await/asyncio work over a list of searches?
async def _request(query: dict):
    async with httpx.AsyncClient() as client:
        r = await client.post('https://nmmgzjq6qi-2.algolianet.com/1/indexes/public_prod_inventory_track_index/query?x-algolia-agent=Algolia%20for%20JavaScript%20(4.12.0)%3B%20Browser', headers=headers, json=query)
        return r.json()
async def to_search(query: str, tags: list[str] = [], page=0, hitsPerPage=100):
    data = {
        "query": query,
        "page": page,
        "hitsPerPage": hitsPerPage,
        "facets": [
            "*"
        ],
        "analytics": True,
        "clickAnalytics": True,
        "tagFilters": [],
        "facetFilters": [
            make_tags_filter(tags)
        ],
        "maxValuesPerFacet": hitsPerPage,
        "enableABTest": False,
        "userToken": userToken,
        "filters": "",
        "ruleContexts": []
    }
    return await _request(data)
import asyncio

search = ['coffee', 'banana', 'apple']
#search = input()

for x in search:
    r = await asyncio.gather(*[to_search(x, page=i) for i in range(10)])
Also, is there a way to make search = input() accept a list of keywords (assuming they are separated by commas)?
You can make several requests with any parameters in the way shown in the code snippet below. Note that this is the simplest way to do it; if you have many tasks, you should implement a producer-consumer pattern using asyncio.Queue.
import asyncio
from typing import List

import httpx


async def to_search(url, client: httpx.AsyncClient):
    res = await client.get(url)
    await asyncio.sleep(3)
    return res.status_code


async def main_wrapper(urls: List[str]):
    # you need only one AsyncClient for an asyncio app.
    async with httpx.AsyncClient() as client:
        results = await asyncio.gather(*[to_search(i, client) for i in urls])
        print(results)


if __name__ == '__main__':
    urls = ["http://google.com"] * 20
    asyncio.run(main_wrapper(urls=urls))
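To address the follow-up question too, here is a minimal sketch that reuses your to_search from the question unchanged (so headers, userToken, etc. are assumed to exist as in your code), gathers every keyword/page combination in one go, and reads comma-separated keywords from input(); search_all and pages are names I've made up:

import asyncio


async def search_all(keywords, pages=10):
    # one gather over every (keyword, page) combination
    tasks = [to_search(kw, page=i) for kw in keywords for i in range(pages)]
    return await asyncio.gather(*tasks)


if __name__ == '__main__':
    # "coffee, banana, apple" -> ['coffee', 'banana', 'apple']
    keywords = [kw.strip() for kw in input().split(',')]
    results = asyncio.run(search_all(keywords))

asyncio.run replaces the bare top-level await in your loop, which is only valid inside an async function (or a REPL/notebook that supports it).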
I have a dataframe with 287,000+ file URLs in the column df1['Gravação']. I want to get the length of each file and save it into the list tamanho_mb = [].
I was able to write the synchronous version normally, but since the amount of data is very large, I would like to use asyncio to make it faster.
The synchronous approach is:
import pandas as pd
import csv
import urllib.request as ur
import requests
import asyncio

df1 = pd.read_csv("report.csv", sep = ';', encoding = 'utf-8')
df1['Gravação'] = ['https://www.pngall.com/wp-content/uploads/11/Harbor-Seal-PNG-Pic.png', 'https://www.pngall.com/wp-content/uploads/11/Harbor-Seal-PNG-File.png', 'https://www.pngall.com/wp-content/uploads/11/Harbor-Seal-PNG-Photo.png', 'https://www.pngall.com/wp-content/uploads/11/Harbor-Seal-PNG-Clipart.png']

tamanho_mb = []
for i in df1['Gravação']:
    tamanho = ur.urlopen(i)
    tamanho = tamanho.length
    tamanho_mb.append(tamanho)
How could I achieve the same result using asyncio?
You can try:
import asyncio

import aiohttp

lst = [
    "https://www.pngall.com/wp-content/uploads/11/Harbor-Seal-PNG-Pic.png",
    "https://www.pngall.com/wp-content/uploads/11/Harbor-Seal-PNG-File.png",
    "https://www.pngall.com/wp-content/uploads/11/Harbor-Seal-PNG-Photo.png",
    "https://www.pngall.com/wp-content/uploads/11/Harbor-Seal-PNG-Clipart.png",
]

# limit concurrency to 2 so we don't spam the server:
sem = asyncio.Semaphore(2)


async def get_length(session, url):
    async with sem, session.get(url) as resp:
        content = await resp.read()
        return url, len(content)


async def main():
    async with aiohttp.ClientSession() as session:
        tasks = {get_length(session, url) for url in lst}
        for task in asyncio.as_completed(tasks):
            result = await task
            print(result)


asyncio.run(main())
Prints:
('https://www.pngall.com/wp-content/uploads/11/Harbor-Seal-PNG-File.png', 209659)
('https://www.pngall.com/wp-content/uploads/11/Harbor-Seal-PNG-Photo.png', 99739)
('https://www.pngall.com/wp-content/uploads/11/Harbor-Seal-PNG-Pic.png', 240212)
('https://www.pngall.com/wp-content/uploads/11/Harbor-Seal-PNG-Clipart.png', 13111524)
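If you want the sizes back in the same order as the dataframe column (for example to assign them to tamanho_mb), note that asyncio.gather preserves input order, unlike as_completed, which yields results as they finish. A minimal sketch, assuming df1 is already loaded as in the question and reusing get_length from above (main_ordered is a name I've made up):

async def main_ordered():
    async with aiohttp.ClientSession() as session:
        # gather returns results in the same order as the input URLs
        results = await asyncio.gather(
            *(get_length(session, url) for url in df1['Gravação'])
        )
        return [length for _url, length in results]


tamanho_mb = asyncio.run(main_ordered())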
I'm trying to handle websockets with Python for the first time.
I created two clients and I want to do some calculations with these two results
(which are real-time crypto prices).
Is it possible to add or multiply the numbers from these results?
import websockets
import asyncio
import json
from binance import AsyncClient, BinanceSocketManager


async def upbit_ws_client():
    uri = "wss://api.upbit.com/websocket/v1"
    async with websockets.connect(uri) as websocket:
        subscribe_fmt = [
            {"ticket": "test"},
            {
                "type": "ticker",
                "codes": ["KRW-BTC"],
                "isOnlyRealtime": True
            },
            {"format": "SIMPLE"}
        ]
        subscribe_data = json.dumps(subscribe_fmt)
        await websocket.send(subscribe_data)

        while True:
            data = await websocket.recv()
            data = json.loads(data)
            print(data['cd'], data['hp'])


async def binance_ws_client():
    client = await AsyncClient.create()
    bm = BinanceSocketManager(client)
    ts = bm.symbol_book_ticker_socket("BTCUSDT")
    async with ts as tscm:
        while True:
            res = await tscm.recv()
            print(res['b'], res['a'])


if __name__ == '__main__':
    my_loop = asyncio.get_event_loop()
    my_loop.run_until_complete(asyncio.gather(*[upbit_ws_client(), binance_ws_client()]))
    my_loop.close()
The results are strings, so you need to convert them to floats first in order to have a data type you can do arithmetic with:
a = float(res['a'])
b = float(res['b'])
Then you can subtract b from a:
result = a - b
print(result)
which you can add to your code like below:
import websockets
import asyncio
import json
from binance import AsyncClient, BinanceSocketManager


async def upbit_ws_client():
    uri = "wss://api.upbit.com/websocket/v1"
    async with websockets.connect(uri) as websocket:
        subscribe_fmt = [
            {"ticket": "test"},
            {
                "type": "ticker",
                "codes": ["KRW-BTC"],
                "isOnlyRealtime": True
            },
            {"format": "SIMPLE"}
        ]
        subscribe_data = json.dumps(subscribe_fmt)
        await websocket.send(subscribe_data)

        while True:
            data = await websocket.recv()
            data = json.loads(data)
            print(data['cd'], data['hp'])


async def binance_ws_client():
    client = await AsyncClient.create()
    bm = BinanceSocketManager(client)
    ts = bm.symbol_book_ticker_socket("BTCUSDT")
    async with ts as tscm:
        while True:
            res = await tscm.recv()
            print(res['b'], res['a'])
            a = float(res['a'])
            b = float(res['b'])
            result = a - b
            print(result)


if __name__ == '__main__':
    my_loop = asyncio.get_event_loop()
    my_loop.run_until_complete(asyncio.gather(*[upbit_ws_client(), binance_ws_client()]))
    my_loop.close()
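The snippet above only combines the two fields of the Binance ticker. If you also want to combine a value from the Upbit stream with one from the Binance stream, one possible approach (a sketch of my own, not part of the original answer; latest_prices and maybe_compute are made-up names) is to have both clients write into a shared dict and compute whenever either side updates:

# shared state updated by both websocket clients
latest_prices = {"upbit_krw_btc": None, "binance_btcusdt_bid": None}


def maybe_compute():
    # only compute once both streams have delivered at least one price
    if None not in latest_prices.values():
        print(latest_prices["upbit_krw_btc"] * latest_prices["binance_btcusdt_bid"])


# inside upbit_ws_client(), after data = json.loads(data):
#     latest_prices["upbit_krw_btc"] = float(data['hp'])
#     maybe_compute()

# inside binance_ws_client(), after res = await tscm.recv():
#     latest_prices["binance_btcusdt_bid"] = float(res['b'])
#     maybe_compute()

Because both coroutines run on the same event loop, updating a plain dict like this is safe; no extra locking is needed.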
I'm trying to make async API calls this way:
Function to send the request:
async def get_data(client, postdata):
    res = await client.post(url=_url, headers=_headers, data=postdata)
    return res
Function to parse the JSON:
async def parse_res(client, postdata):
    res = await get_data(client, postdata)
    if bool(json.loads(res.text)['suggestions']):
        _oks = <...grab some JSON fields...>
    else:
        _oks = {}
    return _oks
I wrap these two functions in main():
async def main(_jobs):
    async with httpx.AsyncClient() as client:
        batch = []
        calls = []
        for job in _jobs:
            _postdata = '{ "query": "' + job + '" }'
            calls.append(asyncio.create_task(parse_res(client, _postdata)))
        batch = await asyncio.gather(*calls)
        return batch
and then just run main().
But the API can only handle about 30-50 fast (nearly simultaneous) requests before it throws a 429 HTTP error.
So I need to send batches of 30 calls and process the 10,000 requests in chunks.
How do I process 10,000 (ten thousand) API calls in batches of 30?
One library that comes in handy here is funcy. It offers various helpers for working with sequences. One of them is chunks, which splits a sequence into chunks of equal size (the last chunk is smaller if the total size does not divide evenly).
from funcy import chunks

result = []
for job_chunk in chunks(30, _jobs):
    calls = [parse_res(client, '{ "query": "' + job + '" }') for job in job_chunk]
    batch = await asyncio.gather(*calls)
    result.extend(batch)
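For completeness, a minimal sketch of how that loop could slot into the main() from the question, so that client exists and the awaits are legal; the batch size of 30 is simply taken from the question:

from funcy import chunks

import asyncio
import httpx


async def main(_jobs):
    results = []
    async with httpx.AsyncClient() as client:
        # run at most 30 requests at a time, waiting for each batch to finish
        for job_chunk in chunks(30, _jobs):
            calls = [parse_res(client, '{ "query": "' + job + '" }') for job in job_chunk]
            results.extend(await asyncio.gather(*calls))
    return results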
You could use Simon Hawe's answer; however, here's a different approach that doesn't use any external libraries.
Use asyncio.Semaphore to limit the number of calls made concurrently; when the semaphore is released, it lets another coroutine run.
import asyncio

sem = asyncio.Semaphore(30)  # max number of simultaneous requests


async def get_data(client, postdata):
    async with sem:
        res = await client.post(url=_url, headers=_headers, data=postdata)
        return res


async def parse_res(client, postdata):
    res = await get_data(client, postdata)
    if bool(json.loads(res.text)['suggestions']):
        _oks = <...grab some JSON fields...>
    else:
        _oks = {}
    return _oks


async def main(_jobs):
    async with httpx.AsyncClient() as client:
        calls = [
            asyncio.create_task(parse_res(client, '{ "query": "' + job + '" }'))
            for job in _jobs
        ]
        return await asyncio.gather(*calls)
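Run it the same way as before, passing the full list of jobs. Unlike the chunked approach, the semaphore keeps up to 30 requests in flight at all times rather than waiting for each batch of 30 to finish completely. A usage sketch, assuming _jobs is your list of 10,000 query strings:

batch = asyncio.run(main(_jobs))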
I want to invoke a Lambda function synchronously (request-response) but want to use Python async/await to await the response.
response = await client.invoke('my-func', InvocationType='RequestResponse', Payload='...'))
I found a kind of solution here but it is cumbersome and from 2016.
Is there a better approach today?
I found a way of doing it by running the blocking invoke call in a thread-pool executor from the asyncio event loop:
import asyncio
import concurrent.futures
import boto3
import json
import botocore.config


class LambdaClient():
    def __init__(self, concurrency: int = 20):
        self.executor = concurrent.futures.ThreadPoolExecutor(
            max_workers=concurrency,
        )
        client_config = botocore.config.Config(
            max_pool_connections=concurrency
        )
        self.client = boto3.client('lambda', config=client_config)

    async def invoke_async(self, snapshot):
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(self.executor, lambda: self.invoke(snapshot))
        return result

    def invoke(self, snapshot):
        payload = {
            'path': '/calculate/value',
            'body': json.dumps(snapshot)
        }
        b = bytes(json.dumps(payload), encoding='utf8')
        response = self.client.invoke(
            FunctionName='function-name',
            InvocationType='RequestResponse',
            LogType='None',
            Payload=b)
        if 'StatusCode' not in response or response['StatusCode'] != 200:
            raise ValueError(f'Lambda invocation failed with response {response}')
        output = response["Payload"].read()
        return output
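A usage sketch of my own (the snapshot payloads here are made up) showing how this lets you await many invocations concurrently, while the thread pool caps how many boto3 calls actually run at once:

async def main():
    client = LambdaClient(concurrency=20)
    snapshots = [{'id': i} for i in range(100)]  # hypothetical payloads
    outputs = await asyncio.gather(*(client.invoke_async(s) for s in snapshots))
    print(len(outputs))


asyncio.run(main())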
I am looking for guidance around best practices with asyncio and aiohttp in Python 3. I have a basic scraper but I am not sure how to:
Properly implement error handling, specifically around my fetch function.
Do I really need the last main function to wrap my async crawler in?
Here is my code so far; it is working, but I would like feedback on the two items above.
import asyncio

from aiohttp import ClientSession
from bs4 import BeautifulSoup

urls = []


async def fetch(url, payload={}):
    async with ClientSession() as s:
        async with s.get(url, params=payload) as resp:
            content = await resp.read()
            return content


async def get_profile_urls(url, payload):
    content = await fetch(url, payload)
    soup = BeautifulSoup(content, 'html.parser')
    soup = soup.find_all(attrs={'class': 'classname'})
    if soup:
        urls.extend([s.find('a')['href'] for s in soup])


async def main():
    tasks = []
    payload = {
        'page': 0,
        'filter': 88}
    for i in range(max_page + 1):
        payload['page'] += 1
        tasks.append(get_profile_urls(search_ulr, payload))
    await asyncio.wait(tasks)


asyncio.run(main())
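On the first point, one possible way to add error handling (a sketch of my own, not a canonical pattern): create a single ClientSession, pass it into fetch, and catch aiohttp's client errors there so one failed page doesn't take down the whole crawl. On the second point: yes, some top-level coroutine like main() is needed, because asyncio.run expects exactly one coroutine to drive.

import asyncio

import aiohttp
from aiohttp import ClientSession


async def fetch(session: ClientSession, url, payload=None):
    # catch network/HTTP errors here so a single bad page doesn't cancel the crawl
    try:
        async with session.get(url, params=payload) as resp:
            resp.raise_for_status()
            return await resp.read()
    except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
        print(f'fetch failed for {url}: {exc!r}')
        return None

Callers then check for None (or collect the failures for a retry pass) instead of crashing mid-run.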