handle url pagination with python generator - python

Currently I'm only fetching the first page from the server. Part of the JSON is:
{"status":"success","count":100,"total":22188,"next":"https://pimber.ly/api/v2/products/?sinceId=5981e16fcde47c0854dc540b","previous":"https://pimber.ly/api/v2/products/?maxId=5981e01dcde47c0854dc4afd","sinceId":"5981e01dcde47c0854dc4afd","maxId":"5981e16fcde47c0854dc540b","data":[.....]}
and the function is:
_fetch_data = response.json()
while _fetch_data['next'] is not None:
    response = requests.get(
        url=API_DOMAIN,
        headers=headers
    )
    _page_data = response.json()['data']
    for _data in _page_data:
        yield _data
As it stands, the function only processes the first page, and it will keep doing that forever. How can I fix the function to check next so it fetches all of the data?

I guess it should be
_fetch_data = response.json()
while _fetch_data['next'] is not None:
    response = requests.get(_fetch_data['next'], headers=headers)
    _fetch_data = response.json()
    for _data in _fetch_data['data']:
        yield _data
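That guess is essentially right, but note that it never yields the records from the first page: the first response's data is skipped because the loop fetches next before yielding anything. A complete generator might look like this (a sketch; API_DOMAIN and headers are assumed to be defined as in your snippet):

import requests

def fetch_all_products(api_domain, headers):
    # Fetch the first page, then keep following the 'next' link.
    page = requests.get(api_domain, headers=headers).json()
    while True:
        # Yield every record on the current page, the first page included.
        for record in page['data']:
            yield record
        # The API sets 'next' to None on the last page.
        if page['next'] is None:
            break
        page = requests.get(page['next'], headers=headers).json()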

Related

get request payload in python

My code sends a GET request with query parameters that depend on a page number. After that I have to run a for loop to collect some IDs from the response, read the next page number from the same response, send a new GET request with that next page number, and collect the IDs from the new response as well.
My code works fine, but I'm using two loops, which I don't think is the right way. I couldn't do it with one loop; any ideas?
def get():
    response = requests.get(url, headers=header)
    data = response.text
    data = json.loads(data)
    check_if_theres_next_page = data['pagination']['hasMorePages']
    check_for_next_page_number = data['pagination']['nextPage']
    last_page_number = data['pagination']['lastPage']
    orders = data['orders']
    list_of_ids = []
    for manufacturingOrderId in orders:
        ids = manufacturingOrderId['manufacturingOrderId']
        list_of_ids.append(ids)
    if check_for_next_page_number == 4:
        check_for_next_page_number = last_page_number
    while check_if_theres_next_page:
        url_ = url + '&page_number=' + str(check_for_next_page_number)
        response = requests.get(url_, headers=header)
        data = response.text
        data = json.loads(data)
        orders = data['orders']
        for manufacturingOrderId_ in orders:
            ids = manufacturingOrderId_['manufacturingOrderId']
            list_of_ids.append(ids)
        if "nextPage" in data['pagination']:
            check_for_next_page_number = data['pagination']['nextPage']
        else:
            check_if_theres_next_page = False
    return list_of_ids
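A single while loop can cover both the first page and all later ones if you build the page URL at the top of each iteration. A rough sketch, assuming url and header are defined as in the question and that the first request needs no page_number parameter:

import requests

def get_all_ids():
    list_of_ids = []
    page_url = url  # first request: the base url, no page_number parameter
    while True:
        data = requests.get(page_url, headers=header).json()
        # Collect the ids from the current page.
        for order in data['orders']:
            list_of_ids.append(order['manufacturingOrderId'])
        pagination = data['pagination']
        if not pagination['hasMorePages'] or 'nextPage' not in pagination:
            break
        page_url = url + '&page_number=' + str(pagination['nextPage'])
    return list_of_ids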

Paginating API error (Nested 'next' link)

I am trying to build a class to automate requests to our supplier, but I am having trouble with pagination.
This is the working snippet so far, but the API limits each request to 1000 records, and if the endpoint has more, then I have to paginate:
response = requests.get(url, data=params, headers=headers).json()
return response
This is what I tried, and it failed with a KeyError: 'next':
response = requests.get(url, data=params, headers=headers).json()
results = response['data']
while response['links']['next']:
    response = requests.get(response['links']['next'], data=params, headers=headers).json()
    results.extend(response['data'])
return results
You can check the basic structure of the response in the API doc.
Please enlighten me, thank you very much!
You could simply check

while "next" in response['links']:
    # ... code ...

But if other problems can come up, then putting it all in try/except can also be useful.
results = []  # empty list
try:
    response = requests.get(url, data=params, headers=headers)
    #print(response.status_code)  # for debug
    #print(response.text)  # for debug
    json_data = response.json()
    results += json_data['data']
    while "next" in json_data['links']:
        response = requests.get(json_data['links']['next'], data=params, headers=headers)
        #print(response.status_code)  # for debug
        #print(response.text)  # for debug
        json_data = response.json()
        results += json_data['data']
except Exception as ex:
    print("Exception:", ex)
return results
But I would reduce it to a single while True loop:
results = []  # empty list
try:
    while True:
        response = requests.get(url, data=params, headers=headers)
        #print(response.status_code)  # for debug
        #print(response.text)  # for debug
        json_data = response.json()
        results += json_data['data']
        if "next" not in json_data['links']:
            break
        url = json_data['links']['next']
except Exception as ex:
    print("Exception:", ex)
return results
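The same while True shape also works as a generator, which connects back to the original question at the top. A sketch under the same assumptions (url, params, and headers defined as in the answer):

import requests

def iter_records(url, params, headers):
    # Yield records one page at a time, following links['next'] until it disappears.
    while True:
        json_data = requests.get(url, data=params, headers=headers).json()
        for record in json_data['data']:
            yield record
        if "next" not in json_data['links']:
            break
        url = json_data['links']['next']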

Using "Next Page" in while loop

I am retrieving data from an API endpoint which only allows me to retrieve a maximum of 100 data points at a time. There is a "next page" field within the response which I could use to retrieve the next 100 data points and so on (there are about 70,000 in total) by plugging the next-page URL back into the GET request. How can I use a for loop or while loop to retrieve all the data available from the endpoint by automatically plugging the "next page" URL back into the GET request?
Here is the code I'm using. The problem is that when I execute the while loop I get the same response every time, because it keeps running on the first response instance. I can't work out how to adjust this.
response = requests.get(url + '/api/named_users?limit=100', headers=headers)
users = []
resp_json = response.json()
users.append(resp_json)
while resp_json.get('next_page') != '':
    response = s.get(resp_json.get('next_page'), headers=headers)
    resp_json = response.json()
    users.append(resp_json)
To summarize: I want to take the "next page" URL from every response, use it to get the next 100 data points, and append each response to a list until I have fetched all the data.
You can do it with a recursive function.
For example, something like this:
def next_page(url, users):
    if url != '':
        response = s.get(url, headers=headers)
        resp_json = response.json()
        users.append(resp_json)
        if resp_json.get('next_page') != '':
            return next_page(resp_json.get('next_page'), users)
    return users

response = requests.get(url + '/api/named_users?limit=100', headers=headers)
users = []
resp_json = response.json()
users.append(resp_json)
users = next_page(resp_json.get('next_page'), users)
But in general, APIs return the total number of items and the number of items per request, so you can compute the number of pages and loop through them. Here is some pseudo-code:

for i in range(total_number_of_items // items_returned_per_request):
    response = s.get(resp_json.get('next_page'), headers=headers)
    resp_json = response.json()
    users.append(resp_json)
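One caveat with the recursive approach: at roughly 70,000 records and 100 per page, that is about 700 nested calls, which gets close to Python's default recursion limit of 1000. An iterative loop avoids the problem entirely. A sketch, assuming s is a requests.Session and url and headers are defined as in the question:

users = []
next_url = url + '/api/named_users?limit=100'
while next_url:  # the API signals the last page with an empty string, which is falsy
    resp_json = s.get(next_url, headers=headers).json()
    users.append(resp_json)
    next_url = resp_json.get('next_page', '')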

How to read the next page on API using python?

I need help writing a loop so that each GET request I make always fetches the next page from the API.
I start by getting the first response. It includes a pointer to the next page, next_key:
{
    "result": [
        {
            ...,
            ...
        }
    ],
    "next_key": 123
}
Below is my current attempt
import requests
import json

url = "https://flespi.io/gw/channels/all/messages"
headers = {"Authorization": "FlespiToken 23ggh45"}

def getFirst():
    data = {"limit_count": 100, "limit_size": 10000}
    params = {"data": json.dumps(data, separators=(",", ":"))}
    reqFirst = requests.get(url, params=params, headers=headers).json()
    return reqFirst["next_key"]  ## this returns "123"

def getDataNext():
    data = {"limit_count": 100, "limit_size": 10000, "curr_key": getFirst()}
    params = {"data": json.dumps(data, separators=(",", ":"))}
    reqNext = requests.get(url, params=params, headers=headers)
    jsonData = reqNext.json()
    while True:
        if "next_key" in jsonData:
            data = {"limit_count": 100, "limit_size": 10000, "curr_key": jsonData["next_key"]}
            params = {"data": json.dumps(data, separators=(",", ":"))}
            req = requests.get(url, params=params, headers=headers).json()  ## this should do the GET request for the third page and so on...
            print(req["next_key"])  # this returns "3321", the value of "next_key" on the second page
        else:
            pass

getDataNext()
The full URL, including the limit count, limit size, and current key, is as follows: https://flespi.io/gw/channels/all/messages?data=%7B%22curr_key%22%123%2C%22limit_count%22%3A100%2C%22limit_size%22%3A10000%7D
As you can see, this only ever returns the second page, that is, the page at jsonData["next_key"]. What I want is for each GET request to read the next_key from the response and use it in the following GET request.
I thought about incrementing curr_key, but the key is random, and I also do not know how many pages there are.
I believe there must be a simple solution for this, but apparently I cannot think of it. Thank you for your help and suggestions.
Try this:

has_next_key = False
nextKey = ""
if "next_key" in jsonData:
    has_next_key = True
    nextKey = jsonData["next_key"]
while has_next_key:
    data = {"limit_count": 100, "limit_size": 10000, "curr_key": nextKey}
    params = {"data": json.dumps(data, separators=(",", ":"))}
    req = requests.get(url, params=params, headers=headers).json()  ## this does the GET request for the third page and so on...
    if "next_key" in req:
        nextKey = req["next_key"]
        print(nextKey)  # this returns "3321", the value of "next_key" on the second page
    else:
        has_next_key = False  # no next_key, stop the loop
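The same curr_key/next_key threading also fits naturally into a generator that yields the messages themselves. A sketch, assuming the url and headers from the question and that each page carries its records under "result":

import json
import requests

def iter_messages(url, headers):
    data = {"limit_count": 100, "limit_size": 10000}
    while True:
        params = {"data": json.dumps(data, separators=(",", ":"))}
        page = requests.get(url, params=params, headers=headers).json()
        for message in page.get("result", []):
            yield message
        if "next_key" not in page:
            break  # no next_key: that was the last page
        data["curr_key"] = page["next_key"]  # cursor for the next request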

Instagram get next_max_tag_id

I'm trying to do some analytics on Instagram photos that are posted with a specified hashtag. So now I'm trying to store all the images in a temporary database that will be used for the analysis.
I'm using Python and I have a Celery task to get all the images, but it does not work when I run it with a next_max_tag_id, which is probably wrong.
Does someone know how to get the correct next_max_tag_id?
This is the code I'm using:
@task()
def get_latest_photos():
    next_max_tag_id = get_option('next_max_tag_id')
    if not next_max_tag_id:
        next_max_tag_id = 0
    url = BASE + '/tags/{tag}/media/recent?client_id={cliend_id}' \
                 '&max_tag_id={max_id}'.format(**{
                     'tag': a_tag,
                     'cliend_id': getattr(settings, 'INSTAGRAM_CLIENT_ID'),
                     'max_id': next_max_tag_id
                 })
    while url:
        request = requests.get(url)
        if request.status_code != 200:
            pass  # TODO: error
        json_response = request.json()
        if json_response['meta']['code'] != 200:
            pass  # TODO: error
        # do something with json_response['data']:
        url = None
        if 'pagination' in json_response:
            pagination = json_response['pagination']
            if 'next_url' in pagination:
                url = json_response['pagination']['next_url']
            if 'next_max_tag_id' in pagination:
                next_max_tag_id = pagination['next_max_tag_id']
    update_option('next_max_tag_id', next_max_tag_id)
The flow is basically this:
1. get next_max_tag_id from the DB (defaults to 0)
2. while we have a valid URL, fetch the data, the next URL, and the next_max_tag_id
3. update the next_max_tag_id
The only thing that seems wrong to me is the next_max_tag_id, because every time I go to the API URL with the last next_max_tag_id I get the old images.
Yes. Here's how to use pagination correctly: you have to loop through the pages and call back into the function you're in. You can adapt the script below, which gets everyone you're following, to query for next_max_id as well.
currently_following = set([])

def parse_following(next_url=None):
    if next_url is None:
        urlUserMedia = "https://api.instagram.com/v1/users/self/follows?access_token=%s" % (auth_token)
    else:
        urlUserMedia = next_url
    values = {'client_id': client_id}
    try:
        data = urllib.urlencode(values)
        req = urllib2.Request(urlUserMedia, None, headers)
        response = urllib2.urlopen(req)
        result = response.read()
        dataObj = json.loads(result)
        next_url = None
        if dataObj.get('pagination') is not None:
            next_url = dataObj.get('pagination').get('next_url')
        currently_following.update(user['id'] for user in dataObj['data'])
        if next_url is not None:
            parse_following(next_url)
    except Exception as e:
        print e
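For reference, the same walk-the-next_url pattern can be written iteratively with requests instead of recursion and urllib2. A sketch, with auth_token carried over from the answer as an assumption (the legacy Instagram API this targets has since been retired):

import requests

def fetch_following(auth_token):
    # Walk pagination.next_url until the API stops returning one.
    following = set()
    next_url = "https://api.instagram.com/v1/users/self/follows?access_token=%s" % auth_token
    while next_url:
        data_obj = requests.get(next_url).json()
        following.update(user['id'] for user in data_obj['data'])
        next_url = (data_obj.get('pagination') or {}).get('next_url')
    return following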
