Setting an API rate limit in Python with the ratelimit library - python

import requests
from ratelimit import limits, RateLimitException, sleep_and_retry
from backoff import on_exception, expo

max_hit = 5
period = 300

@limits(calls=max_hit, period=period)
def StashNotes(self):
    url = "https://www.r10.net/"
    raw_data = requests.get(url, headers=headers)  # headers is defined elsewhere (two cookie values)
    if raw_data.status_code != 200:
        raise Exception('API response: {}'.format(raw_data.status_code))
    else:
        pass  ## some unnecessary things here ##
I am trying to limit the API rate with a max hit of 5 and a period of 300, so my requests.get will not be hit more than 5 times in a 300-second period. @limits(calls=max_hit, period=period) doesn't work, and I can't really figure out why.
Is there any other way to do this besides the ratelimit library, or how can I fix the @limits decorator? Any kind of solution is appreciated, thanks.
headers=headers contains sensitive information, but that doesn't matter anyway; it's just 2 cookie values.

It throws an exception for me when I try to make more than 5 calls to the API; after the 5th call I get
ratelimit.exception.RateLimitException: too many calls
Full code:
from flask import Flask
from ratelimit import limits

max_hit = 5
period = 300

@limits(calls=max_hit, period=period)
def StashNotes():
    return "sany"

app = Flask(__name__)

@app.route("/")
def hello_world():
    return StashNotes()
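If the goal is to wait out the window instead of raising, ratelimit also ships the sleep_and_retry decorator (already imported in the question); stacked above @limits it sleeps until the period resets. A minimal sketch reusing the names from the snippet above:

from ratelimit import limits, sleep_and_retry

max_hit = 5
period = 300

@sleep_and_retry                       # sleep until the window resets instead of raising
@limits(calls=max_hit, period=period)  # at most 5 calls per 300 seconds
def StashNotes():
    return "sany"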

Related

Threading using Python limiting the number of threads and passing list of different values as arguments

Here I am basically calling the API with various values coming from the list list_of_string_ids.
I am expecting to create 20 threads, tell them to do something, write the values to the DB, have them all return zero, and then go again to take the next data, etc.
I have a problem getting this to work using threading. Below is code which works correctly as expected, however it takes very long to finish execution (around 45 minutes or more). The website I am getting the data from allows async I/O at a rate of 20 requests.
I assume this can make my code 20x faster, but I am not really sure how to implement it.
import requests
import json
import time
import threading
import queue

headers = {'Content-Type': 'application/json',
           'Authorization': 'Bearer TOKEN'}

start = time.perf_counter()

project_id_number = 123
project_id_string = 'pjiji4533'
name = "Assignment"
list_of_string_ids = [132, 123, 5345, 123, 213, 213, ..., n]  # Len of list is 20000

def construct_url_threaded(project_id_number, id_string):
    url = "https://api.test.com/{}/{}".format(project_id_number, id_string)
    r = requests.get(url, headers=headers)  # Max rate allowed is 20 requests at once.
    json_text = r.json()
    comments = json.dumps(json_text, indent=2)
    for item in json_text['data']:
        pass  # DO STUFF

for string_id in list_of_string_ids:
    construct_url_threaded(project_id_number=project_id_number, id_string=string_id)
My trial is below
def main():
    q = queue.Queue()
    threads = [threading.Thread(target=construct_url_threaded, args=(project_id_number, string_id, q)) for i in range(5)]  # 5 is for testing
    for th in threads:
        th.daemon = True
        th.start()
    result1 = q.get()
    result2 = q.get()
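A minimal sketch of a bounded-worker alternative using concurrent.futures.ThreadPoolExecutor, assuming construct_url_threaded is changed to return its parsed data rather than writing it inside the loop (max_workers=20 matches the site's stated limit):

from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_all(ids, max_workers=20):
    results = []
    # at most max_workers requests are in flight at any moment
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(construct_url_threaded, project_id_number, i): i
                   for i in ids}
        for future in as_completed(futures):
            results.append(future.result())  # collect (or write to the DB) as each finishes
    return results

all_data = fetch_all(list_of_string_ids)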

Questions regarding python requests package

So, I started learning about Python's requests package lately and I got into a challenge.
At first I was given a link, "http://pms.zelros.com/", that only gave me a tip: the query param id must be a uuid v4.
I started working on that and so far I've come up with this:
import time
import uuid

import requests

def get_optimal_frequency(nb_of_requests=50):
    """
    This sends a number of requests in a row to raise Error 429 and gets the "optimal frequency" by setting it
    to the maximal 'X-Rate-Limit-Remaining' that we got + 10% as a margin of error
    :param nb_of_requests: The number of requests sent to raise the Error
    :return: The safe time to wait between requests in ms
    :rtype: int
    """
    session = requests.Session()
    query = uuid.uuid4()
    optimal_frequency = 0
    headers = {
        'User-Agent': 'Chrome/79.0.3945.88',
    }
    for i in range(nb_of_requests):
        response = session.get("http://pms.zelros.com", params={'id': query}, headers=headers)
        if response.headers.get('X-Rate-Limit-Remaining') is not None and int(response.headers.get('X-Rate-Limit-Remaining')) > optimal_frequency:
            optimal_frequency = int(response.headers.get('X-Rate-Limit-Remaining'))
    return 1.1 * optimal_frequency

def spam_until_score(score):
    """
    This sends requests with a uuidv4 until the desired score is reached
    :param score: The score wanted
    :return: The response of the last request
    :rtype: requests.models.Response
    """
    start = time.time()
    current_score = 0
    query = uuid.uuid4()
    session = requests.Session()
    optimal_frequency = get_optimal_frequency()
    headers = {
        'User-Agent': 'Chrome/79.0.3945.88',
    }
    while current_score < score:
        response = session.get("http://pms.zelros.com", params={'id': query}, headers=headers)
        dict_response = response.json()
        if int(dict_response.get('score')) < current_score:
            break
        else:
            current_score = int(dict_response.get('score'))
        time.sleep(optimal_frequency / 1000)
    end = time.time()
    duration = end - start
    return response, duration
But I'm stuck: the goal is to reach a score of 1,000,000, and getting to 10,000 took 5536 s.
The hints I've got so far are these:
Level 10000
From /people
Let's add a people payload
"people": [x, x, x]
Level 2000
And you can add a score payload to optimize your preparation
Level 700
You can /prepare your request.
Level 300
Nice start. It was easy :)
Let's use some fancy http verbs.
Level 100
You already know that you cannot spam me.
But do you know that there is an optimal frequency to contact me ?
Level 0
Hello !
Welcome to the Zelros challenge
The goal is to reach a one million score.
Sorry for the long message, but here are my questions:
- Is there a way to send more requests without raising error 429, maybe using parallel requests? If yes, how should I do it?
- I don't really get how preparing requests could help me (see the sketch after this list).
- What other HTTP methods besides GET could I be using?
Thanks for your time and help.
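On the prepared-requests question, a minimal sketch of what requests itself offers: build a Request once, prepare it against a Session, and re-send the same PreparedRequest, so headers and URL encoding are only built once (whether this is what the challenge hint means is an assumption):

import uuid
import requests

session = requests.Session()
req = requests.Request(
    "GET",
    "http://pms.zelros.com",
    params={"id": str(uuid.uuid4())},
    headers={"User-Agent": "Chrome/79.0.3945.88"},
)
prepared = session.prepare_request(req)  # merge session state, encode params once

for _ in range(3):
    response = session.send(prepared)    # re-send the already-prepared request
    print(response.status_code)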

Trying to add throttle control to paralleled API calls in python

I am using the Google Places API, which has a query-per-second limit of 10. This means I cannot make more than 10 requests within a second. If we were using serial execution this wouldn't be an issue, as the API's average response time is 250 ms, so I would only be able to make about 4 calls in a second.
To utilize the entire 10 QPS limit I used multithreading and made parallel API calls. But now I need to control the number of calls that can happen in a second; it should not go beyond 10 (the Google API starts throwing errors if I cross the limit).
Below is the code that I have so far. I am not able to figure out why the program just gets stuck sometimes or takes a lot longer than required.
import time
from datetime import datetime
import random
from threading import Lock
from concurrent.futures import ThreadPoolExecutor as pool
import concurrent.futures
import requests
import matplotlib.pyplot as plt
from statistics import mean
from ratelimiter import RateLimiter

def make_parallel(func, qps=10):
    lock = Lock()
    threads_execution_que = []
    limit_hit = False

    def qps_manager(arg):
        current_second = time.time()
        lock.acquire()
        if len(threads_execution_que) >= qps or limit_hit:
            limit_hit = True
            if current_second - threads_execution_que[0] <= 1:
                time.sleep(current_second - threads_execution_que[0])
        current_time = time.time()
        threads_execution_que.append(current_time)
        lock.release()
        res = func(arg)
        lock.acquire()
        threads_execution_que.remove(current_time)
        lock.release()
        return res

    def wrapper(iterable, number_of_workers=12):
        result = []
        with pool(max_workers=number_of_workers) as executer:
            bag = {executer.submit(func, i): i for i in iterable}
            for future in concurrent.futures.as_completed(bag):
                result.append(future.result())
        return result

    return wrapper

@make_parallel
def api_call(i):
    min_func_time = random.uniform(.25, .3)
    start_time = time.time()
    try:
        response = requests.get('https://jsonplaceholder.typicode.com/posts', timeout=1)
    except Exception as e:
        response = e
    if (time.time() - start_time) - min_func_time < 0:
        time.sleep(min_func_time - (time.time() - start_time))
    return response

api_call([1]*50)
Ideally the code should take no more than 1.5 seconds, but currently it is taking about 12-14 seconds.
The script speeds up to its expected speed as soon as I remove the QPS manager logic.
Please suggest what I am doing wrong, and also whether there is any package already available which does this mechanism out of the box.
Looks like ratelimit does just that:
from ratelimit import limits, sleep_and_retry

@make_parallel
@sleep_and_retry
@limits(calls=10, period=1)
def api_call(i):
    try:
        response = requests.get("https://jsonplaceholder.typicode.com/posts", timeout=1)
    except Exception as e:
        response = e
    return response
EDIT: I did some testing and it looks like @sleep_and_retry is a little too optimistic, so just increase the period a little, to 1.2 seconds:

from datetime import datetime, timedelta

s = datetime.now()
api_call([1] * 50)
elapsed_time = datetime.now() - s
print(elapsed_time > timedelta(seconds=50 / 10))

How to retrieve large amounts of data (5000+ videos) from YouTube Data API v3?

My goal is to extract all videos from a playlist, which can have many videos, ranging from ~3,000 to more than 5,000. With maxResults=50 and after implementing pagination with nextPageToken, I'm only able to call the API 20 times, after which nextPageToken isn't sent with the response.
I'm calling the API from a Python application. I have a while loop running until nextPageToken isn't sent; ideally this should happen AFTER all the videos are extracted, but it prematurely exits after calling the API 19-20 times.
def main():
    youtube = get_authorised_youtube()  # returns YouTube resource authorized with OAuth.

    first_response = make_single_request(youtube, None)  # make_single_request() takes in the youtube resource and nextPageToken, if any.
    nextPageToken = first_response["nextPageToken"]

    try:
        count = 0
        while True:
            response = make_single_request(youtube, nextPageToken)
            nextPageToken = response["nextPageToken"]
            count += 1
            print(count, end=" ")
            print(nextPageToken)
    except KeyError as e:  # KeyError to catch if nextPageToken wasn't present
        response.pop("items")
        print(response)  # prints the last response for analysis

if __name__ == '__main__':
    main()
snippet of make_single_request():
def make_single_request(youtube, nextPageToken):
    if nextPageToken is None:
        request = youtube.videos().list(
            part="id",
            myRating="like",
            maxResults=50
        )
    else:
        request = youtube.videos().list(
            part="id",
            myRating="like",
            pageToken=nextPageToken,
            maxResults=50
        )
    response = request.execute()
    return response
Expected the code to make upwards of 50 API calls, but it is observed to only make around 20 calls, consistently.
Note: The following code was executed with an unpaid GCP account. The calls are made with part="id", which has a quota cost of 0. The quota limit according to GCP is 10,000; according to the quota on the console, I make only 20 calls.
Output:
1 CGQQAA
2 CJYBEAA
3 CMgBEAA
4 CPoBEAA
5 CKwCEAA
6 CN4CEAA
7 CJADEAA
8 CMIDEAA
9 CPQDEAA
10 CKYEEAA
11 CNgEEAA
12 CIoFEAA
13 CLwFEAA
14 CO4FEAA
15 CKAGEAA
16 CNIGEAA
17 CIQHEAA
18 CLYHEAA
19 {'kind': 'youtube#videoListResponse', 'etag': '"ETAG"', 'prevPageToken': 'CLYHEAE', 'pageInfo': {'totalResults': TOTAL_RESULTS(>4000), 'resultsPerPage': 50}}
EDIT: After changing maxResults=20, it is observed that the code makes around 50 API calls, therefore the total number of videos that can be extracted is constant at 1000.
For obtaining the entire list of liked videos of a given channel without any omissions, I suggest you use the PlaylistItems endpoint instead, queried for the given channel's liked-videos playlist by passing a proper value to the endpoint's playlistId parameter.
A given channel's liked-videos playlist ID is obtained upon querying the channel's own endpoint. The needed ID is to be found at .items.contentDetails.relatedPlaylists.likes.
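A minimal sketch of that two-step flow with the google-api-python-client already used in the question; error and quota handling are omitted, and the response paths follow the answer above:

def get_liked_video_ids(youtube):
    # 1) look up the authorized channel's liked-videos playlist ID
    channel = youtube.channels().list(part="contentDetails", mine=True).execute()
    likes_playlist_id = channel["items"][0]["contentDetails"]["relatedPlaylists"]["likes"]

    # 2) page through PlaylistItems until no nextPageToken is returned
    video_ids, page_token = [], None
    while True:
        kwargs = dict(part="contentDetails", playlistId=likes_playlist_id, maxResults=50)
        if page_token:
            kwargs["pageToken"] = page_token
        response = youtube.playlistItems().list(**kwargs).execute()
        video_ids += [item["contentDetails"]["videoId"] for item in response["items"]]
        page_token = response.get("nextPageToken")
        if not page_token:
            return video_ids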
If the goal is to retrieve the FULL list of liked videos in a tedious but working way, you can check out this question.
You basically scrape the data off a deeplink page...
What's not mentioned in this post is that after you have retrieved the video IDs and you want more data, you can use the videos endpoint with a list of comma-separated video IDs to get more information.
If you need inspiration for the script, this is an adjusted version of the API scripts that are provided by YouTube.
Just adjust the credentials file path and the input path of the file that was retrieved by doing the web scrape.
import os
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors
import json

scopes = ["https://www.googleapis.com/auth/youtube.readonly"]

def do_request(youtube, video_ids):
    # https://developers.google.com/youtube/v3/docs/videos/list
    request = youtube.videos().list(
        part='contentDetails,id,snippet,statistics',
        id=','.join(video_ids),
        maxResults=50
    )
    return request.execute()["items"]

def main(video_ids):
    # Disable OAuthlib's HTTPS verification when running locally.
    # *DO NOT* leave this option enabled in production.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

    api_service_name = "youtube"
    api_version = "v3"
    client_secrets_file = "INPUTAPICREDFILEHERE./creds.json"

    # Get credentials and create an API client
    flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
        client_secrets_file, scopes)
    credentials = flow.run_console()
    youtube = googleapiclient.discovery.build(
        api_service_name, api_version, credentials=credentials)

    data = {'items': []}
    current_id_batch = []
    for id in video_ids:
        if len(current_id_batch) == 50:
            print(f"Fetching.. current batch {len(data['items'])} of {len(video_ids)}")
            result = do_request(youtube, current_id_batch)
            data['items'].extend(result)
            current_id_batch = []
        current_id_batch.append(id)

    result = do_request(youtube, current_id_batch)
    data['items'].extend(result)

    with open('./data.json', 'w') as outfile:
        outfile.write(json.dumps(data, indent=4))

if __name__ == "__main__":
    liked_vids = {}
    f = open('PATHTOLIKEDVIDEOS/liked_videos.json', encoding="utf8")
    liked_vids = json.load(f)
    main(list(liked_vids.keys()))
Try waiting some time between calls, like this:
import time
time.sleep(1)  # time here is in seconds

How to rotate proxies on a Python requests

I'm trying to do some scraping, but I get blocked every 4 requests. I have tried to change proxies, but the error is the same. What should I do to change them properly?
Here is some code where I try it. First I get proxies from a free website. Then I make the request with the new proxy, but it doesn't work because I get blocked.
from fake_useragent import UserAgent
import requests

def get_player(id, proxy):
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    url = 'https://www.transfermarkt.es/jadon-sancho/profil/spieler/' + str(id)
    try:
        print(proxy)
        r = requests.get(url, headers=headers, proxies=proxy)
    except:
        ....
        code to manage the data
        ....
Getting proxies
from bs4 import BeautifulSoup

def get_proxies():
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    url = 'https://free-proxy-list.net/'
    r = requests.get(url, headers=headers)
    page = BeautifulSoup(r.text, 'html.parser')
    proxies = []
    for proxy in page.find_all('tr'):
        i = ip = port = 0
        for data in proxy.find_all('td'):
            if i == 0:
                ip = data.get_text()
            if i == 1:
                port = data.get_text()
            i += 1
        if ip != 0 and port != 0:
            proxies += [{'http': 'http://' + ip + ':' + port}]
    return proxies
Calling functions
proxies = get_proxies()
for i in range(1, 100):
    player = get_player(i, proxies[i//4])
    ....
    code to manage the data
    ....
I know that the proxy scraping works, because when I print them I see something like:
{'http': 'http://88.12.48.61:42365'}
I would just like to not get blocked.
I recently had this same issue, but using the online proxy servers recommended in other answers is always risky (from a privacy standpoint), slow, or unreliable.
Instead, you can use the requests-ip-rotator Python library to proxy traffic through AWS API Gateway, which gives you a new IP each time:
pip install requests-ip-rotator
This can be used as follows (for your site specifically):
import requests
from requests_ip_rotator import ApiGateway, EXTRA_REGIONS
gateway = ApiGateway("https://www.transfermarkt.es")
gateway.start()
session = requests.Session()
session.mount("https://www.transfermarkt.es", gateway)
response = session.get("https://www.transfermarkt.es/jadon-sancho/profil/spieler/your_id")
print(response.status_code)
# Only run this line if you are no longer going to run the script, as it takes longer to boot up again next time.
gateway.shutdown()
Combined with multithreading/multiprocessing, you'll be able to scrape the site in no time.
The AWS free tier provides you with 1 million requests per region, so this option will be free for all reasonable scraping.
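A minimal sketch of that combination, reusing the gateway-mounted session from the snippet above; the ID range and worker count here are assumptions, not part of the original answer:

from concurrent.futures import ThreadPoolExecutor

player_ids = range(1, 100)  # hypothetical IDs to fetch

def fetch(player_id):
    # each request leaves AWS API Gateway from a different IP
    url = f"https://www.transfermarkt.es/jadon-sancho/profil/spieler/{player_id}"
    return session.get(url).status_code

with ThreadPoolExecutor(max_workers=10) as executor:
    print(list(executor.map(fetch, player_ids)))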
import requests
from itertools import cycle

list_proxy = ['socks5://Username:Password@IP1:20000',
              'socks5://Username:Password@IP2:20000',
              'socks5://Username:Password@IP3:20000',
              'socks5://Username:Password@IP4:20000',
              ]
proxy_cycle = cycle(list_proxy)
# Prime the pump
proxy = next(proxy_cycle)

for i in range(1, 10):
    proxy = next(proxy_cycle)
    print(proxy)
    proxies = {
        "http": proxy,
        "https": proxy
    }
    r = requests.get(url='https://ident.me/', proxies=proxies)
    print(r.text)
The problem with using free proxies from sites like this is:
websites know about these and may block you just because you're using one of them
you don't know that other people haven't gotten them blacklisted by doing bad things with them
the site is likely using some other form of identifier to track you across proxies based on other characteristics (device fingerprinting, proxy-piercing, etc.)
Unfortunately, there's not a lot you can do other than be more sophisticated (distribute across multiple devices, use VPN/TOR, etc.) and risk your IP being blocked for attempting DDOS-like traffic, or, preferably, see if the site has an API for access.
Presumably you have your own pool of proxies - what is the best way to rotate them?
First, blindly picking a random proxy risks repeating a connection from the same proxy multiple times in a row. In addition, most connection-pattern-based blocking uses the proxy subnet (3rd number) rather than the host, so it's best to prevent repeats at the subnet level.
It's also a good idea to track proxy performance, as not all proxies are equal: we want to use our better-performing proxies more often and let dead proxies cool down.
All of this can be done with weighted randomization, which is implemented by Python's random.choices() function:
import random
from time import time
from typing import List, Literal

class Proxy:
    """container for a proxy"""

    def __init__(self, ip, type_="datacenter") -> None:
        self.ip: str = ip
        self.type: Literal["datacenter", "residential"] = type_
        _, _, self.subnet, self.host = ip.split(":")[0].split(".")
        self.status: Literal["alive", "unchecked", "dead"] = "unchecked"
        self.last_used: int = None

    def __repr__(self) -> str:
        return self.ip

    def __str__(self) -> str:
        return self.ip

class Rotator:
    """weighted random proxy rotator"""

    def __init__(self, proxies: List[Proxy]):
        self.proxies = proxies
        self._last_subnet = None

    def weigh_proxy(self, proxy: Proxy):
        weight = 1_000
        if proxy.subnet == self._last_subnet:
            weight -= 500
        if proxy.status == "dead":
            weight -= 500
        if proxy.status == "unchecked":
            weight += 250
        if proxy.type == "residential":
            weight += 250
        if proxy.last_used:
            _seconds_since_last_use = time() - proxy.last_used
            weight += _seconds_since_last_use
        return weight

    def get(self):
        proxy_weights = [self.weigh_proxy(p) for p in self.proxies]
        proxy = random.choices(
            self.proxies,
            weights=proxy_weights,
            k=1,
        )[0]
        proxy.last_used = time()
        self._last_subnet = proxy.subnet  # remember the subnet so it is de-prioritized on the next pick
        return proxy
If we mock run this Rotator we can see how weighted randoms distribute our connections:
from collections import Counter

if __name__ == "__main__":
    proxies = [
        # these will be used more often
        Proxy("xx.xx.121.1", "residential"),
        Proxy("xx.xx.121.2", "residential"),
        Proxy("xx.xx.121.3", "residential"),
        # these will be used less often
        Proxy("xx.xx.122.1"),
        Proxy("xx.xx.122.2"),
        Proxy("xx.xx.123.1"),
        Proxy("xx.xx.123.2"),
    ]
    rotator = Rotator(proxies)

    # let's mock some runs:
    _used = Counter()
    _failed = Counter()

    def mock_scrape():
        proxy = rotator.get()
        _used[proxy.ip] += 1
        if proxy.host == "1":  # simulate proxies with .1 being significantly worse
            _fail_rate = 60
        else:
            _fail_rate = 20
        if random.randint(0, 100) < _fail_rate:  # simulate some failure
            _failed[proxy.ip] += 1
            proxy.status = "dead"
            mock_scrape()
        else:
            proxy.status = "alive"
            return

    for i in range(10_000):
        mock_scrape()

    for proxy, count in _used.most_common():
        print(f"{proxy} was used {count:>5} times")
        print(f"    failed {_failed[proxy]:>5} times")

# will print:
# xx.xx.121.2 was used 2629 times
#     failed 522 times
# xx.xx.121.3 was used 2603 times
#     failed 508 times
# xx.xx.123.2 was used 2321 times
#     failed 471 times
# xx.xx.122.2 was used 2302 times
#     failed 433 times
# xx.xx.121.1 was used 1941 times
#     failed 1187 times
# xx.xx.122.1 was used 1629 times
#     failed 937 times
# xx.xx.123.1 was used 1572 times
#     failed 939 times
By using weighted randoms we can create a connection pattern that appears random but is smart. We can apply generic patterns, like not using proxies from the same IP subnet twice in a row, as well as custom per-target logic, like prioritizing North American IPs for NA targets, etc.
For more on this see my blog post How to Rotate Proxies in Web Scraping.
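As a final hedged sketch, plugging the Rotator above into requests is just a matter of formatting the picked proxy into the proxies dict; the proxy addresses here are placeholders and will not actually relay traffic:

import requests

rotator = Rotator([Proxy("203.0.113.5:8080"), Proxy("203.0.113.77:8080")])

proxy = rotator.get()
proxies = {"http": f"http://{proxy.ip}", "https": f"http://{proxy.ip}"}
response = requests.get("https://httpbin.org/ip", proxies=proxies, timeout=10)
print(response.json())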
