So I am creating a script to communicate with our asset-management API server and retrieve some information. I've found that the largest share of the script's total time is spent in:
{method 'read' of '_ssl._SSLSocket' objects}
Currently we're pulling information for about 25 assets, and that portion alone takes 18.89 seconds.
Is there any way to optimize this so it doesn't take 45 minutes to do all 2,700 computers we have?
I can provide a copy of the actual code if that would be helpful.
import urllib2
import base64
import json
import csv

# Count Number so that process only runs for 25 assets at a time will be
# replaced with a variable that is determined by the number of computers added
# to the list
Count_Stop = 25

final_output_list = []


def get_creds():
    # Credentials Function that retrieves username:pw from .file
    with open('.cred') as cred_file:
        cred_string = cred_file.read().rstrip()
        return cred_string
        print(cred_string)


def get_all_assets():
    # Function to retrieve computer ID + computer names and store the ID in a
    # new list called computers_parsed
    request = urllib2.Request('jss'
                              'JSSResource/computers')
    creds = get_creds()
    request.add_header('Authorization', 'Basic ' + base64.b64encode(creds))
    response = urllib2.urlopen(request).read()
    # At this point the request for ID + name has been retrieved and now to be
    # formatted in json
    parsed_ids_json = json.loads(response)
    # Then assign the parsed list (which has nested lists) at key 'computers'
    # to a new list variable called computer_set
    computer_set = parsed_ids_json['computers']
    # New list to store just the computer ID's obtained in Loop below
    computer_ids = []
    # Count variable, when equal to max # of computers in Count_stop it stops.
    count = 0
    # This for loop iterates over ID + name in computer_set and returns the ID
    # to the list computer_ids
    for computers in computer_set:
        count += 1
        computer_ids.append(computers['id'])
        # This IF condition allows for the script to be tested at 25 assets
        # instead of all 2,000+ (comment out other announce_all_assets call)
        if count == Count_Stop:
            announce_all_assets(computer_ids, count)
    # announce_all_assets(computer_ids, count)


def announce_all_assets(computer_ids, count):
    print('Final list of ID\'s for review: ' + str(computer_ids))
    print('Total number of computers to check against JSS: ' +
          str(count))
    extension_attribute_request(computer_ids, count)


def extension_attribute_request(computer_ids, count):
    # Creating new variable, first half of new URL used in loop to get
    # extension attributes using the computer ID's in computers_ids
    base_url = 'jss'
    what_we_want = '/subset/extensionattributes'
    creds = get_creds()
    print('Extension attribute function starts now:')
    for ids in computer_ids:
        request_url = base_url + str(ids) + what_we_want
        request = urllib2.Request(request_url)
        request.add_header('Authorization', 'Basic ' + base64.b64encode(creds))
        response = urllib2.urlopen(request).read()
        parsed_ext_json = json.loads(response)
        ext_att_json = parsed_ext_json['computer']['extension_attributes']
        retrieve_all_ext(ext_att_json)


def retrieve_all_ext(ext_att_json):
    new_computer = {}
    # new_computer['original_id'] = ids['id']
    # new_computer['original_name'] = ids['name']
    for computer in ext_att_json:
        new_computer[str(computer['name'])] = computer['value']
    add_to_master_list(new_computer)


def add_to_master_list(new_computer):
    final_output_list.append(new_computer)
    print(final_output_list)


def main():
    # Function to run the get all assets function
    get_all_assets()


if __name__ == '__main__':
    # Function to run the functions in order: main > get all assets >
    main()
I'd highly recommend using the 'requests' module over 'urllib2'. It handles a lot of stuff for you and will save you many a headache.
I believe it will also give you better performance, but I'd love to hear your feedback.
Here's your code using requests. (I've added newlines to highlight my changes. Note the built-in .json() decoder.):
# Requires requests module be installed.:
# `pip install requests` or `pip3 install requests`
# https://pypi.python.org/pypi/requests/
import requests
import base64
import json
import csv

# Count Number so that process only runs for 25 assets at a time will be
# replaced with a variable that is determined by the number of computers added
# to the list
Count_Stop = 25

final_output_list = []


def get_creds():
    # Credentials Function that retrieves username:pw from .file
    with open('.cred') as cred_file:
        cred_string = cred_file.read().rstrip()
        return cred_string
        print(cred_string)


def get_all_assets():
    # Function to retrieve computer ID + computer names and store the ID in a
    # new list called computers_parsed
    base_url = 'jss'
    what_we_want = 'JSSResource/computers'
    request_url = base_url + what_we_want

    # NOTE the request_url is constructed based on your request assignment just below.
    # As such, it is malformed as a URL, and I assume anonymized for your posting on SO.
    # request = urllib2.Request('jss'
    #                           'JSSResource/computers')
    #
    creds = get_creds()
    headers = {
        'Authorization': 'Basic ' + base64.b64encode(creds),
    }
    # Pass the headers via the 'headers' keyword argument; the second positional
    # argument to requests.get is 'params', not headers.
    response = requests.get(request_url, headers=headers)
    parsed_ids_json = response.json()

    # [NO NEED FOR THE FOLLOWING. 'requests' HANDLES DECODING JSON. SEE ABOVE ASSIGNMENT.]
    # At this point the request for ID + name has been retrieved and now to be
    # formatted in json
    # parsed_ids_json = json.loads(response)

    # Then assign the parsed list (which has nested lists) at key 'computers'
    # to a new list variable called computer_set
    computer_set = parsed_ids_json['computers']
    # New list to store just the computer ID's obtained in Loop below
    computer_ids = []
    # Count variable, when equal to max # of computers in Count_stop it stops.
    count = 0
    # This for loop iterates over ID + name in computer_set and returns the ID
    # to the list computer_ids
    for computers in computer_set:
        count += 1
        computer_ids.append(computers['id'])
        # This IF condition allows for the script to be tested at 25 assets
        # instead of all 2,000+ (comment out other announce_all_assets call)
        if count == Count_Stop:
            announce_all_assets(computer_ids, count)
    # announce_all_assets(computer_ids, count)


def announce_all_assets(computer_ids, count):
    print('Final list of ID\'s for review: ' + str(computer_ids))
    print('Total number of computers to check against JSS: ' +
          str(count))
    extension_attribute_request(computer_ids, count)


def extension_attribute_request(computer_ids, count):
    # Creating new variable, first half of new URL used in loop to get
    # extension attributes using the computer ID's in computers_ids
    base_url = 'jss'
    what_we_want = '/subset/extensionattributes'
    creds = get_creds()
    print('Extension attribute function starts now:')
    for ids in computer_ids:
        request_url = base_url + str(ids) + what_we_want

        headers = {
            'Authorization': 'Basic ' + base64.b64encode(creds),
        }
        response = requests.get(request_url, headers=headers)
        parsed_ext_json = response.json()

        ext_att_json = parsed_ext_json['computer']['extension_attributes']
        retrieve_all_ext(ext_att_json)


def retrieve_all_ext(ext_att_json):
    new_computer = {}
    # new_computer['original_id'] = ids['id']
    # new_computer['original_name'] = ids['name']
    for computer in ext_att_json:
        new_computer[str(computer['name'])] = computer['value']
    add_to_master_list(new_computer)


def add_to_master_list(new_computer):
    final_output_list.append(new_computer)
    print(final_output_list)


def main():
    # Function to run the get all assets function
    get_all_assets()


if __name__ == '__main__':
    # Function to run the functions in order: main > get all assets >
    main()
Please do let me know the relative performance time with your 25 assets in 18.89 seconds! I'm very curious.
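One further thing worth trying with requests, since most of your time is going into that '_ssl read': a requests.Session reuses the underlying TCP/TLS connection between calls, so the handshake cost is paid once rather than once per computer. A minimal sketch of the idea, with a placeholder host and credentials standing in for your anonymized 'jss' URL and .cred file:

import base64
import requests

creds = 'user:password'   # placeholder; you would read this from your .cred file
computer_ids = [1, 2, 3]  # placeholder for the list built in get_all_assets()

# One Session shared across the loop keeps the HTTPS connection alive.
with requests.Session() as session:
    session.headers.update({'Authorization': 'Basic ' + base64.b64encode(creds)})
    for computer_id in computer_ids:
        url = ('https://jss.example.com/JSSResource/computers/' + str(computer_id)
               + '/subset/extensionattributes')
        response = session.get(url)
        print(response.status_code)

Whether that alone is enough for 2,700 computers I can't promise, but connection reuse is usually the first thing to reach for before parallelizing the requests.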
I'd still recommend my other answer below(?) regarding the use of the requests module from a pure cleanliness perspective (requests is very clean to work with), but I recognize it may or may not address your original question.
If you want to try PyCurl, which likely will impact your original question, here's the same code implemented with that approach:
# Requires pycurl module be installed.:
# `pip install pycurl` or `pip3 install pycurl`
# https://pypi.python.org/pypi/pycurl/7.43.0

# NOTE: The syntax used herein for pycurl is python 3 compliant.
# Not python 2 compliant.
import pycurl
import base64
import json
import csv
from io import BytesIO  # needed for the response buffer in pycurl_data below
def pycurl_data(url, headers):
    buffer = BytesIO()
    connection = pycurl.Curl()
    connection.setopt(connection.URL, url)
    connection.setopt(pycurl.HTTPHEADER, headers)
    connection.setopt(connection.WRITEDATA, buffer)
    connection.perform()
    connection.close()

    body = buffer.getvalue()
    # NOTE: The following assumes a byte string and a utf8 format. Change as desired.
    return json.loads(body.decode('utf8'))
# Count Number so that process only runs for 25 assets at a time will be
# replaced with a variable that is determined by the number of computers added
# to the list
Count_Stop = 25

final_output_list = []


def get_creds():
    # Credentials Function that retrieves username:pw from .file
    with open('.cred') as cred_file:
        cred_string = cred_file.read().rstrip()
        return cred_string
        print(cred_string)


def get_all_assets():
    # Function to retrieve computer ID + computer names and store the ID in a
    # new list called computers_parsed
    base_url = 'jss'
    what_we_want = 'JSSResource/computers'
    request_url = base_url + what_we_want

    # NOTE the request_url is constructed based on your request assignment just below.
    # As such, it is malformed as a URL, and I assume anonymized for your posting on SO.
    # request = urllib2.Request('jss'
    #                           'JSSResource/computers')
    #
    creds = get_creds()
    # In Python 3, b64encode takes bytes and returns bytes, so encode the
    # credential string and decode the result back to str for the header value.
    headers = ['Authorization: Basic ' + base64.b64encode(creds.encode()).decode()]
    response = pycurl_data(request_url, headers)
    # At this point the request for ID + name has been retrieved; pycurl_data
    # has already decoded the JSON, so response is a plain dictionary.
    parsed_ids_json = response
    # Then assign the parsed list (which has nested lists) at key 'computers'
    # to a new list variable called computer_set
    computer_set = parsed_ids_json['computers']
    # New list to store just the computer ID's obtained in Loop below
    computer_ids = []
    # Count variable, when equal to max # of computers in Count_stop it stops.
    count = 0
    # This for loop iterates over ID + name in computer_set and returns the ID
    # to the list computer_ids
    for computers in computer_set:
        count += 1
        computer_ids.append(computers['id'])
        # This IF condition allows for the script to be tested at 25 assets
        # instead of all 2,000+ (comment out other announce_all_assets call)
        if count == Count_Stop:
            announce_all_assets(computer_ids, count)
    # announce_all_assets(computer_ids, count)


def announce_all_assets(computer_ids, count):
    print('Final list of ID\'s for review: ' + str(computer_ids))
    print('Total number of computers to check against JSS: ' +
          str(count))
    extension_attribute_request(computer_ids, count)


def extension_attribute_request(computer_ids, count):
    # Creating new variable, first half of new URL used in loop to get
    # extension attributes using the computer ID's in computers_ids
    base_url = 'jss'
    what_we_want = '/subset/extensionattributes'
    creds = get_creds()
    print('Extension attribute function starts now:')
    for ids in computer_ids:
        request_url = base_url + str(ids) + what_we_want
        headers = ['Authorization: Basic ' + base64.b64encode(creds.encode()).decode()]
        response = pycurl_data(request_url, headers)
        # pycurl_data already returns the decoded JSON as a dictionary
        parsed_ext_json = response
        ext_att_json = parsed_ext_json['computer']['extension_attributes']
        retrieve_all_ext(ext_att_json)


def retrieve_all_ext(ext_att_json):
    new_computer = {}
    # new_computer['original_id'] = ids['id']
    # new_computer['original_name'] = ids['name']
    for computer in ext_att_json:
        new_computer[str(computer['name'])] = computer['value']
    add_to_master_list(new_computer)


def add_to_master_list(new_computer):
    final_output_list.append(new_computer)
    print(final_output_list)


def main():
    # Function to run the get all assets function
    get_all_assets()


if __name__ == '__main__':
    # Function to run the functions in order: main > get all assets >
    main()
Related
I was wondering if I could get some input from some seasoned Python experts; I have a couple of questions.
I am extracting data from an API request and calculating the total vulnerabilities.
What is the best way to return this data so that I can call it in another function?
And how can I add up all the vulnerabilities? (Right now it is only summing 500 at a time; I'd like the sum across every vulnerability.)
def _request():
    third_party_patching_filer = {
        "asset": "asset.agentKey IS NOT NULL",
        "vulnerability" : "vulnerability.categories NOT IN ['microsoft patch']"}
    headers = _headers()
    print(headers)
    url1 = f"https://us.api.insight.rapid7.com/vm/v4/integration/assets"
    resp = requests.post(url=url1, headers=headers, json=third_party_patching_filer, verify=False).json()
    jsonData = resp
    #print(jsonData)
    has_next_cursor = False
    nextKey = ""
    if "cursor" in jsonData["metadata"]:
        has_next_cursor = True
        nextKey = jsonData["metadata"]["cursor"]
    while has_next_cursor:
        url2 = f"https://us.api.insight.rapid7.com/vm/v4/integration/assets?&size=500&cursor={nextKey}"
        resp2 = requests.post(url=url2, headers=headers, json=third_party_patching_filer, verify=False).json()
        cursor = resp2["metadata"]
        print(cursor)
        if "cursor" in cursor:
            nextKey = cursor["cursor"]
            print(f"next key {nextKey}")
            #print(desktop_support)
            for data in resp2["data"]:
                for tags in data['tags']:
                    total_critical_vul_osswin = []
                    total_severe_vul_osswin = []
                    total_modoer_vuln_osswin = []
                    if tags["name"] == 'OSSWIN':
                        print("OSSWIN")
                        critical_vuln_osswin = data['critical_vulnerabilities']
                        severe_vuln_osswin = data['severe_vulnerabilities']
                        modoer_vuln_osswin = data['moderate_vulnerabilities']
                        total_critical_vul_osswin.append(critical_vuln_osswin)
                        total_severe_vul_osswin.append(severe_vuln_osswin)
                        total_modoer_vuln_osswin.append(modoer_vuln_osswin)
                        print(sum(total_critical_vul_osswin))
                        print(sum(total_severe_vul_osswin))
                        print(sum(total_modoer_vuln_osswin))
                    if tags["name"] == 'DESKTOP_SUPPORT':
                        print("Desktop")
                        total_critical_vul_desktop = []
                        total_severe_vul_desktop = []
                        total_modorate_vuln_desktop = []
                        critical_vuln_desktop = data['critical_vulnerabilities']
                        severe_vuln_desktop = data['severe_vulnerabilities']
                        moderate_vuln_desktop = data['moderate_vulnerabilities']
                        total_critical_vul_desktop.append(critical_vuln_desktop)
                        total_severe_vul_desktop.append(severe_vuln_desktop)
                        total_modorate_vuln_desktop.append(moderate_vuln_desktop)
                        print(sum(total_critical_vul_desktop))
                        print(sum(total_severe_vul_desktop))
                        print(sum(total_modorate_vuln_desktop))
                    else:
                        pass
        else:
            has_next_cursor = False
If you have a lot of parameters to pass, consider using a dict to combine them. Then you can just return the dict and pass it along to the next function that needs that data. Another approach would be to create a class and either access the variables directly or have helper functions that do so. The latter is a cleaner solution vs a dict, since with a dict you have to quote every variable name, and with a class you can easily add additional functionally beyond just being a container for a bunch of instance variables.
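As a rough sketch of both approaches (the key names come from your code; the function and class names here are just illustrative):

def summarize_asset(data):
    # Pack everything the caller needs into one dict and return it.
    return {
        "critical": data['critical_vulnerabilities'],
        "severe": data['severe_vulnerabilities'],
        "moderate": data['moderate_vulnerabilities'],
    }


class VulnerabilityTotals:
    # Same container idea as the dict, but with attribute access and room
    # for helper methods such as add().
    def __init__(self):
        self.critical = 0
        self.severe = 0
        self.moderate = 0

    def add(self, data):
        self.critical += data['critical_vulnerabilities']
        self.severe += data['severe_vulnerabilities']
        self.moderate += data['moderate_vulnerabilities']

Either one can be returned from _request() and passed straight into whatever function needs the numbers next.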
If you want the total across all the data, you should put these initializations:
total_critical_vul_osswin = []
total_severe_vul_osswin = []
total_modoer_vuln_osswin = []
before the while has_next_cursor loop (and similarly for the desktop totals). The way your code is currently written, they are re-initialized on each cursor pass (i.e., each batch of 500 assets based on the URL), so the sums never cover more than the current batch.
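A minimal sketch of that reordering (the request/cursor handling is stubbed out as a plain list of page dictionaries; the key names come from your code, with 'moderate' spelled out in the illustrative variable names):

pages = []  # stand-in for the batches of 500 returned by each cursor request

# Initialize the accumulators once, before walking the pages
total_critical_vul_osswin = []
total_severe_vul_osswin = []
total_moderate_vul_osswin = []

for page in pages:  # in your script this is the while has_next_cursor loop
    for data in page["data"]:
        for tags in data["tags"]:
            if tags["name"] == "OSSWIN":
                total_critical_vul_osswin.append(data["critical_vulnerabilities"])
                total_severe_vul_osswin.append(data["severe_vulnerabilities"])
                total_moderate_vul_osswin.append(data["moderate_vulnerabilities"])

# The sums now cover every page, not just the last batch of 500
print(sum(total_critical_vul_osswin))
print(sum(total_severe_vul_osswin))
print(sum(total_moderate_vul_osswin))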
I'm calling a LinkedIn API with the code below and it does what I want.
However, when I use almost identical code inside a loop, it returns a TypeError:
File "C:\Users\pchmurzynski\OneDrive - Centiq Ltd\Documents\Python\mergedreqs.py", line 54, in <module>
auth_headers = headers(access_token)
TypeError: 'dict' object is not callable
It has a problem with this line (which again, works fine outside of the loop):
headers = headers(access_token)
I tried changing it to
headers = headers.get(access_token)
or
headers = headers[access_token]
EDIT:
I have also tried this, with the same error:
auth_headers = headers(access_token)
But it didn't help. What am I doing wrong? Why does the dictionary work fine outside the loop but not inside it, and what should I do to make it work?
What I am hoping to achieve is a list of share statistics, called for each ID in the "shids" list, which I can then save as JSON. That can be done with individual requests, one link per ID:
(f'https://api.linkedin.com/v2/organizationalEntityShareStatistics?q=organizationalEntity&organizationalEntity=urn%3Ali%3Aorganization%3A77487&ugcPosts=List(urn%3Ali%3AugcPost%3A{shid})
or with a single request containing a list of IDs:
(f'https://api.linkedin.com/v2/organizationalEntityShareStatistics?q=organizationalEntity&organizationalEntity=urn%3Ali%3Aorganization%3A77487&ugcPosts=List(urn%3Ali%3AugcPost%3A{shid},urn%3Ali%3AugcPost%3A{shid2},...,urn%3Ali%3AugcPost%3A{shidx})
Updated Code thanks to your comments.
shlink = ("https://api.linkedin.com/v2/organizationalEntityShareStatistics?q=organizationalEntity&organizationalEntity=urn%3Ali%3Aorganization%3A77487&shares=List(urn%3Ali%3Ashare%3A{})")
#loop through the list of share ids and make an api request for each of them
shares = []
token = auth(credentials) # Authenticate the API
headers = fheaders(token) # Make the headers to attach to the API call.
for shid in shids:
#create a request link for each sh id
r = (shlink.format(shid))
#call the api
res = requests.get(r, headers = auth_headers)
share_stats = res.json()
#append the shares list with the responce
shares.append(share_stats["elements"])
"works fine outside the loop"
Because in the loop, you re-define the variable. I've added print statements to show it:
from liapiauth import auth, headers  # one type

for ...:
    ...
    print(type(headers))
    headers = headers(access_token)  # now set to another type
    print(type(headers))
Lesson learned: don't overwrite your imports.
Some refactors: your auth token isn't changing, so don't build the headers inside the loop, and you can use one method for all your LinkedIn API queries:
from liapiauth import auth, headers

import requests

API_PREFIX = 'https://api.linkedin.com/v2'
SHARES_ENDPOINT_FMT = '/organizationalEntityShareStatistics?q=organizationalEntity&organizationalEntity=urn%3Ali%3Aorganization%3A77487&shares=List(urn%3Ali%3Ashare%3A{}'


def get_linkedin_response(endpoint, headers):
    return requests.get(API_PREFIX + endpoint, headers=headers)


def main(access_token=None):
    if access_token is None:
        raise ValueError('Access-Token not defined')

    auth_headers = headers(access_token)
    shares = []
    for shid in shids:
        endpoint = SHARES_ENDPOINT_FMT.format(shid)
        resp = get_linkedin_response(endpoint, auth_headers)
        if resp.status_code // 100 == 2:
            share_stats = resp.json()
            shares.append(share_stats[1])
            # TODO: extract your data here
            idlist = [el["id"] for el in shares_list["elements"]]


if __name__ == '__main__':
    credentials = 'credentials.json'
    main(auth(credentials))
I have this loop in my app.py. For some reason it extends the load time by over 3 seconds. Are there any solutions?
import dateutil.parser as dp

# Converts date from ISO-8601 string to formatted string and returns it
def dateConvert(date):
    return dp.parse(date).strftime("%H:%M # %e/%b/%y")

def nameFromID(userID):
    if userID is None:
        return 'Unknown'
    else:
        response = requests.get("https://example2.org/" + str(userID), headers=headers)
        return response.json()['firstName'] + ' ' + response.json()['lastName']

logs = []
response = requests.get("https://example.org", headers=headers)
for response in response.json():
    logs.append([nameFromID(response['member']), dateConvert(response['createdAt'])])
It extends the load time by over 3 seconds because it does a lot of unnecessary work.
You're not using requests Sessions. Each request will require creating and tearing down an HTTPS connection. That's slow.
You're doing another HTTPS request for each name conversion. (See above.)
You're parsing the JSON you get in that function twice.
Whatever dp.parse() is (dateutil?), it's probably doing a lot of extra work parsing from a free-form string. If you know the input format, use strptime.
Here's a rework that should be significantly faster. Please see the TODO points first, of course.
Also, if you know that the member id -> name mapping doesn't change, you can make name_cache a suitably named global variable too (but remember it may be persisted between requests).
import datetime

import requests

INPUT_DATE_FORMAT = "TODO_FILL_ME_IN"  # TODO: FILL ME IN.


def dateConvert(date: str):
    return datetime.datetime.strptime(date, INPUT_DATE_FORMAT).strftime(
        "%H:%M # %e/%b/%y"
    )


def nameFromID(sess: requests.Session, userID):
    if userID is None:
        return "Unknown"
    response = sess.get(f"https://example2.org/{userID}")
    response.raise_for_status()
    data = response.json()
    return "{firstName} {lastName}".format_map(data)


def do_thing():
    headers = {}  # TODO: fill me in
    name_cache = {}
    with requests.Session() as sess:
        sess.headers.update(headers)
        logs = []
        response = sess.get("https://example.org")
        for response in response.json():
            member_id = response["member"]
            name = name_cache.get(member_id)
            if not name:
                name = name_cache[member_id] = nameFromID(sess, member_id)
            logs.append([name, dateConvert(response["createdAt"])])
Question: Is a time delay a good way of dealing with request rate limits?
I am very new to requests, APIs and web services. I am trying to create a web service that, given an ID, makes a request to MusicBrainz API and retrieves some information. However, apparently I am making too many requests, or making them too fast. In the last line of the code, if the delay parameter is set to 0, this error will appear:
{'error': 'Your requests are exceeding the allowable rate limit. Please see http://wiki.musicbrainz.org/XMLWebService for more information.'}
And looking into that link, I found out that:
The rate at which your IP address is making requests is measured. If that rate is too high, all your requests will be declined (http 503) until the rate drops again. Currently that rate is (on average) 1 request per second.
Therefore I thought, okay, I will insert a time delay of 1 second and it will work. And it did work, but I imagine there are nicer, neater and smarter ways of dealing with such a problem. Do you know one?
CODE:
####################################################
################### INSTRUCTIONS ###################
####################################################
'''
This script runs locally and returns a JSON formatted file, containing
information about the release-groups of an artist whose MBID must be provided.
'''

#########################################
############ CODE STARTS ################
#########################################

#IMPORT PACKAGES
#All of them come with Anaconda3 installation, otherwise they can be installed with pip
import requests
import json
import math
import time

#Base URL for looking-up release-groups on musicbrainz.org
root_URL = 'http://musicbrainz.org/ws/2/'

#Parameters to run an example
offset = 10
limit = 1
MBID = '65f4f0c5-ef9e-490c-aee3-909e7ae6b2ab'


def collect_data(MBID, root_URL):
    '''
    Description: Auxiliar function to collect data from the MusicBrainz API

    Arguments:
        MBID - MusicBrainz Identity of some artist.
        root_URL - MusicBrainz root_URL for requests

    Returns:
        decoded_output - dictionary file containing all the information about the release-groups
                         of type album of the requested artist
    '''
    #Joins paths. Note: Release-groups can be filtered by type.
    URL_complete = root_URL + 'release-group?artist=' + MBID + '&type=album' + '&fmt=json'

    #Creates a requests object and sends a GET request
    request = requests.get(URL_complete)
    assert request.status_code == 200
    output = request.content  #bits file
    decoded_output = json.loads(output)  #dict file

    return decoded_output


def collect_releases(release_group_id, root_URL, delay = 1):
    '''
    Description: Auxiliar function to collect data from the MusicBrainz API

    Arguments:
        release_group_id - ID of the release-group whose number of releases is to be extracted
        root_URL - MusicBrainz root_URL for requests

    Returns:
        releases_count - integer containing the number of releases of the release-group
    '''
    URL_complete = root_URL + 'release-group/' + release_group_id + '?inc=releases' + '&fmt=json'

    #Creates a requests object and sends a GET request
    request = requests.get(URL_complete)

    #Parses the content of the request to a dictionary
    output = request.content
    decoded_output = json.loads(output)

    #Time delay to not exceed MusicBrainz request rate limit
    time.sleep(delay)

    releases_count = 0
    if 'releases' in decoded_output:
        releases_count = len(decoded_output['releases'])
    else:
        print(decoded_output)
        #raise ValueError(decoded_output)

    return releases_count


def paginate(store_albums, offset, limit = 50):
    '''
    Description: Auxiliar function to paginate results

    Arguments:
        store_albums - Dictionary containing information about each release-group
        offset - Integer. Corresponds to starting album to show.
        limit - Integer. Default to 50. Maximum number of albums to show per page

    Returns:
        albums_paginated - Paginated albums according to specified limit and offset
    '''
    #Restricts limit to 150
    if limit > 150:
        limit = 150

    if offset > len(store_albums['albums']):
        raise ValueError('Offset is greater than number of albums')

    #Apply offset
    albums_offset = store_albums['albums'][offset:]

    #Count pages
    pages = math.ceil(len(albums_offset) / limit)

    albums_limited = []
    if len(albums_offset) > limit:
        for i in range(pages):
            albums_limited.append(albums_offset[i * limit : (i+1) * limit])
    else:
        albums_limited = albums_offset

    albums_paginated = {'albums' : None}
    albums_paginated['albums'] = albums_limited

    return albums_paginated


def post(MBID, offset, limit, delay = 1):
    #Calls the auxiliar function 'collect_data' that retrieves the JSON file from MusicBrainz API
    json_file = collect_data(MBID, root_URL)

    #Creates list and dictionary for storing the information about each release-group
    album_details_list = []
    album_details = {"id": None, "title": None, "year": None, "release_count": None}

    #Loops through all release-groups in the JSON file
    for item in json_file['release-groups']:
        album_details["id"] = item["id"]
        album_details["title"] = item["title"]
        album_details["year"] = item["first-release-date"].split("-")[0]
        album_details["release_count"] = collect_releases(item["id"], root_URL, delay)
        album_details_list.append(album_details.copy())

    #Creates dictionary with all the albums of the artist
    store_albums = {"albums": None}
    store_albums["albums"] = album_details_list

    #Paginates the dictionary
    stored_paginated_albums = paginate(store_albums, offset, limit)

    #Returns JSON typed file containing the different albums arranged according to offset&limit
    return json.dumps(stored_paginated_albums)


#Runs the program and prints the JSON output as specified in the wording of the exercise
print(post(MBID, offset, limit, delay = 1))
There aren't any nicer ways of dealing with this problem, other than asking the API owner to increase your rate limit. The only way to avoid a rate limit problem is by not making too many requests at a time, and besides hacking the API in such a way that you bypass its requests counter, you're stuck with waiting one second between each request.
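If you do keep the sleep approach, one small refinement is to centralize the throttling in a helper so it lives in one place and only waits for whatever is left of the one-second window (the one-request-per-second figure comes from the MusicBrainz documentation quoted above; the helper name and structure are just a sketch):

import time
import requests

MIN_INTERVAL = 1.0  # seconds between requests, per the MusicBrainz rate limit
_last_request_time = 0.0

def throttled_get(url):
    # Wait only for whatever remains of the interval since the previous call,
    # then issue the GET and remember when it happened.
    global _last_request_time
    wait = MIN_INTERVAL - (time.time() - _last_request_time)
    if wait > 0:
        time.sleep(wait)
    response = requests.get(url)
    _last_request_time = time.time()
    return response

collect_data and collect_releases could then call throttled_get instead of requests.get, and collect_releases could drop its explicit time.sleep(delay).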
I've got a script here which (ideally) iterates through multiple pages X of JSON data for each entity Y (in this case, multiple loans X for each team Y). The way the API is constructed, I believe I must change a path segment within the URL in order to iterate through the different entities. Here is the explicit documentation and URL:
GET /teams/:id/loans
Returns loans belonging to a particular team.
Example http://api.kivaws.org/v1/teams/2/loans.json
Parameters:
  id (number)        Required. The team ID for which to return loans.
  page (number)      The page position of results to return. Default: 1
  sort_by (string)   The order by which to sort results. One of: oldest, newest. Default: newest
  app_id (string)    The application id in reverse DNS notation.
  ids_only (string)  Return IDs only to make the return object smaller. One of: true, false. Default: false
Response: loan_listing – HTML, JSON, XML, RSS
Status: Production
And here is my script, which does run and appears to extract the correct data, but doesn't seem to write any of it to the outfile:
# -*- coding: utf-8 -*-
import urllib.request as urllib
import json
import time

# storing team loans dict. The key is the team id, en value is the list of lenders
team_loans = {}

url = "http://api.kivaws.org/v1/teams/"

#teams_id range 1 - 11885
for i in range(1, 100):
    params = dict(
        id = i
    )
    #i =1
    try:
        handle = urllib.urlopen(str(url+str(i)+"/loans.json"))
        print(handle)
    except:
        print("Could not handle url")
        continue

    # reading response
    item_html = handle.read().decode('utf-8')

    # converting bytes to str
    data = str(item_html)

    # converting to json
    data = json.loads(data)

    # getting number of pages to crawl
    numPages = data['paging']['pages']

    # deleting paging data
    data.pop('paging')

    # calling additional pages
    if numPages > 1:
        for pa in range(2, numPages+1, 1):
            #pa = 2
            handle = urllib.urlopen(str(url+str(i)+"/loans.json?page="+str(pa)))
            print("Pulling loan data from team " + str(i) + "...")

            # reading response
            item_html = handle.read().decode('utf-8')

            # converting bytes to str
            datatemp = str(item_html)

            # converting to json
            datatemp = json.loads(datatemp)

            #Pagings are redundant headers
            datatemp.pop('paging')

            # adding data to initial list
            for loan in datatemp['loans']:
                data['loans'].append(loan)

            time.sleep(2)

    # recording loans by team in dict
    team_loans[i] = data['loans']

    if (data['loans']):
        print("===Data added to the team_loan dictionary===")
    else:
        print("!!!FAILURE to add data to team_loan dictionary!!!")

    # recorging data to file when 10 teams are read
    print("===Finished pulling from page " + str(i) + "===")
    if (int(i) % 10 == 0):
        outfile = open("team_loan.json", "w")
        print("===Now writing data to outfile===")
        json.dump(team_loans, outfile, sort_keys = True, indent = 2, ensure_ascii=True)
        outfile.close()
    else:
        print("!!!FAILURE to write data to outfile!!!")

    # compliance with API # of requests
    time.sleep(2)

print('Done! Check your outfile (team_loan.json)')
I know that may be a hefty amount of code to throw in your faces, but it's a pretty sequential process.
Again, this program is pulling the correct data, but it is not writing this data to the outfile. Can anyone understand why?
For others who may read this post: the script does in fact write data to an outfile; it was simply the test/print logic in the code that was wrong. Ignore the misleading print statements I had put into place.
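Concretely, the "!!!FAILURE to write data to outfile!!!" message above fires whenever i is not a multiple of 10, not when a write actually fails. A small sketch of the same dump-every-10-teams logic with less misleading output (team_loans stands in for the dict built by the loop above; the data-pulling part is omitted):

import json

team_loans = {}  # filled in as each team is pulled, as in the script above

for i in range(1, 100):
    # ... pull loan data for team i (omitted) ...
    if i % 10 == 0:
        # Dump everything collected so far; other iterations just keep accumulating.
        print("===Now writing data to outfile===")
        with open("team_loan.json", "w") as outfile:
            json.dump(team_loans, outfile, sort_keys=True, indent=2, ensure_ascii=True)

print('Done! Check your outfile (team_loan.json)')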