Paginating API error (Nested 'next' link) - python

I am trying to build a class to automate requests to our supplier, but I am having trouble with pagination.
This is the working snippet so far, but the API limits each request to 1000 records, so if the endpoint has more, I have to paginate:
response = requests.get(url, data=params, headers=headers).json()
return response
This is what I tried and failed, getting a KeyError: 'next':
response = requests.get(url, data=params, headers=headers).json()
results = response['data']
while response['links']['next']:
response = requests.get(response['links']['next'], data=params, headers=headers).json()
results.extend(response['data'])
return results
You can check the basic structure of the response here in the API doc.
Please enlighten me, thank you very much!

You could simply check
while "next" in response['links']:
# ... code ...
But if other problems can occur, then putting everything in `try/except` can also be useful.
results = [] # empty list
try:
response = requests.get(url, data=params, headers=headers)
#print(response.status_code) # for debug
#print(response.text) # for debug
json_data = response.json()
results += json_data['data']
while "next" in json_data['links']:
response = requests.get(json_data['links']['next'], data=params, headers=headers)
#print(response.status_code) # for debug
#print(response.text) # for debug
json_data = response.json()
results += json_data['data']
except Exception as ex:
print("Exception:", ex)
return results
But I would reduce it to while True
results = [] # empty list
try:
while True:
response = requests.get(url, data=params, headers=headers)
#print(response.status_code) # for debug
#print(response.text) # for debug
json_data = response.json()
results += json_data['data']
if "next" not in json_data['links']:
break
url = json_data['links']['next']
except Exception as ex:
print("Exception:", ex)
return results

Related

InvalidSchema: No connection adapters were found for "{'url':

I am trying to do pagination using pyspark and am getting the error below. My pagination link is in the header as key [Link] and value [rel="next"]. The error is raised at this line: r1 = requests.get(response.links['next']). The issue is that the baseURL is missing from the "next" URL being passed.
getURL = 'https://api.xxx.com/v3/direct-access/abc'
baseURL = 'https://api.xxx.com/v3/direct-access'
headers = {
"accept" : "application/json",
"Content-Type": "application/json",
"Authorization": "Bearer " + str(token)
}
results = []
response = requests.get(getURL, headers=headers)
r = response.json()
for i in r:
results.append(i)
while response.links['next']: ## != response.links['last']:
r1 = requests.get(response.links['next'])
r = r1.json()
for i in r:
results.append(i)
Error: InvalidSchema: No connection adapters were found for "{'url': '/abc? action=next&next_page=%28id%2Ccompletionid%29+%3C+%28840430000754002%2C840430413029241%29&pagesize=10000', 'rel': 'next'}"
InvalidSchema Traceback (most recent call last) <ipython-input-45-
f27cc7bf373e> in <module> 17 18
while response.links['next']: ## != response.links['last']: ---> 19
r1 = requests.get(response.links['next'])
20 r = r1.json()
21 for i in r:
InvalidSchema: No connection adapters were found for "{'url':
'/liners?
action=next&next_page=linerid+%3C+1010031264&pagesize=10000', 'rel':
'next'}"
How can I merge both baseURL and url into one link and pass it in the while loop? Something like below:
https://api.xxx.com/v3/direct-access/abc?action=next&next_page=%28id%2Ccompletionid%29+%3C+%28840430000754002%2C840430413029241%29&pagesize=10000
response = requests.get(getURL, headers=headers)
r = response.json()
for i in r:
results.append(i)
while response.links.get('next'):
response = requests.get(baseURL + response.links['next']
['url'],headers=headers)
r1 = response.json()
for i in response:
results.append(i)
#######below not returning results, running for ever ######
return results
rdd = spark.sparkContext.parallelize((results))
print(rdd)
df = spark.read.option('multiline','true').json(rdd)
df.repartition(1).write.json(stagingpath,mode="overwrite")
There are several problems with your code.
response.links['next'] is a dict {'url': ...}. requests.get(...) expects a URL.
# requests.get(response.links['next'])
requests.get(response.links['next']['url'])
# requests.get(baseURL + response.links['next']['url']) # With baseURL
headers are not passed in the subsequent calls.
# requests.get(response.links['next']['url'])
requests.get(response.links['next']['url'], headers=headers)
response is not modified, resulting in infinite loop.
while response.links['next']:
# r1 = requests.get(response.links['next']['url'], headers=headers)
response = requests.get(response.links['next']['url'], headers=headers)
For the last link, 'next' will not exist.
# while response.links['next']:
while response.links.get('next'):
Minimal, reproducible example:
import requests
getURL = 'https://api.github.com/users/acjh/repos'
baseURL = ''
headers = {}
results = []
response = requests.get(getURL, headers=headers)
r = response.json()
for i in r:
results.append(i)
while response.links.get('next'):
nextURL = baseURL + response.links['next']['url']
response = requests.get(nextURL, headers=headers)
r = response.json()
for i in r:
results.append(i)
assert len(results) == requests.get(getURL[:-6]).json()['public_repos']
like this:
while response.links['next']:
next_link = response.links['next']['url']
r1 = requests.get(baseURL + next_link, headers=headers)
r = r1.json()
for i in r:
results.append(i)

replace while loop in python with multiprocessing

I have been trying to optimize my code, which fetches data from a given IP address and involves pagination. I have tried applying multiprocessing/multithreading but have not been able to implement them successfully.
My goal in using multiprocessing is to reduce the execution time. Please help me out with this.
has_pagination = True
session_obj = some_value #like this <requests.sessions.Session object at 0x7fac952c4fa0>
headers = {'X-XSRF-TOKEN':token, 'Content-type':'application/json', 'Accept':'application/json', 'Cookie':jsessionid}
while has_pagination:
url = f"https://{self.ip}/data/page?count=100&scrollId={scroll_id}"
response = session_object.get(url=url, headers=headers, verify=False)
try:
resp_json = response.json()
data = resp_json['data']
has_pagination = resp_json['pageInfo']['hasMoreData']
except Exception as e:
print(f'status code: {response.status_code}, {e}')
# # Logging out if session is established
logout_url = f'https://{self.ip}/logout?nocache=123456'
logout_resp = session_obj.get(url=logout_url, headers=headers, verify=False, allow_redirects=False)
print(f'Logging out. Response code: {str(logout_resp.status_code)}')
# # Relogin
print(f'2. Trying to re-login with connnection details Token: {str(token)}. Jsessionid: {str(jsessionid)}. Headers: {str(headers)}.')
login_result = self.login()
if login_result == False:
return False
else:
number_of_relogin += 1
session_obj, token, jsessionid = login_result[0], login_result[1], login_result[2]
headers = {'X-XSRF-TOKEN':token, 'Content-type':'application/json', 'Accept':'application/json', 'Cookie':jsessionid}
logger.info(f'2. New session established with Token: {str(token)}. Jsessionid: {str(jsessionid)}. Headers: {str(headers)}')

handle url pagination with python generator

Currently I'm fetching only the first page from the server; part of the JSON is
{"status":"success","count":100,"total":22188,"next":"https://pimber.ly/api/v2/products/?sinceId=5981e16fcde47c0854dc540b","previous":"https://pimber.ly/api/v2/products/?maxId=5981e01dcde47c0854dc4afd","sinceId":"5981e01dcde47c0854dc4afd","maxId":"5981e16fcde47c0854dc540b","data":[.....]}
and the function is:
_fetch_data = response.json()
while _fetch_data['next'] is not None:
response = requests.get(
url=API_DOMAIN',
headers=headers
)
_page_data = response.json()['data']
for _data in _page_data:
yield _data
In its current state the function only processes the first page, and it will keep doing that forever. How can I fix the function to check next so that it fetches all of the data?
I guess it should be
_fetch_data = response.json()
while _fetch_data['next'] is not None:
response = requests.get(_fetch_data['next'], headers=headers)
_fetch_data = response.json()
for _data in fetch_data['data']:
yield _data

How to read the next page on API using python?

I need help writing a loop so that each GET request I make fetches a new page from the API.
I start by getting the first response. It includes a parameter for the next page, next_key:
{
"result": [
{
...,
...
}
],
"next_key": 123
}
Below is my current attempt
import requests
import json
url = "https://flespi.io/gw/channels/all/messages"
headers = {"Authorization": "FlespiToken 23ggh45"}
def getFirst():
data = {"limit_count":100, "limit_size":10000}
params = {"data":json.dumps(data, separators=(",", ":"))}
reqFirst = requests.get(url, params=params, headers=headers).json()
return reqFirst["next_key"] ## this returns "123"
def getDataNext():
data = {"limit_count":100, "limit_size":10000, "curr_key":getFirst()}
params = {"data":json.dumps(data, separators=(",", ":"))}
reqNext = requests.get(url, params=params, headers=headers)
jsonData = reqNext.json()
while True:
if "next_key" in jsonData:
data = {"limit_count":100, "limit_size":10000,"curr_key":jsonData["next_key"]}
params = {"data":json.dumps(data, separators=(",", ":"))}
req = requests.get(url, params=params, headers=headers).json() ## this should do GET request for the third page and so on...
print req["next_key"] # this returns "3321" which is the value for "next_key" in second page
else:
pass
getDataNext()
The full URL, including limit count, limit size and curr key, is as follows: https://flespi.io/gw/channels/all/messages?data=%7B%22curr_key%22%3A123%2C%22limit_count%22%3A100%2C%22limit_size%22%3A10000%7D
As you can see, this only returns the second page, i.e. the one for jsonData["next_key"]. What I want is for the program to read the next_key from each GET response and put it into the next GET request.
I was thinking of incrementing curr_key, but the key is random and I also do not know how many pages there are.
I believe there must be a simple solution for this, but apparently I could not think of it. Thank you for your help and suggestions.
try this
has_next_key = False
nextKey = ""
if "next_key" in jsonData:
has_next_key = True
nextKey = jsonData["next_key"]
while has_next_key:
data = {"limit_count":100, "limit_size":10000,"curr_key":nextKey}
params = {"data":json.dumps(data, separators=(",", ":"))}
req = requests.get(url, params=params, headers=headers).json() ## this should do GET request for the third page and so on...
if "next_key" in req:
nextKey = req["next_key"]
print nextKey # this returns "3321" which is the value for "next_key" in second page
else:
has_next_key = False
# no next_key, stop the loop

Http post successes with poster, but fails in python

In the code below, I'm trying to create a repository with an HTTP POST, but I always get 400 Bad Request. When I send the same HTTP POST with Poster, I get 201 Created. What's wrong with this code?
token = raw_input('Access Token: ')
url = 'https://api.github.com/user/repos?access_token=' + token
values = {"name":"newnewnewnew"}
data = urllib.urlencode(values)
req = urllib2.Request(url,data)
response = urllib2.urlopen(req)
the_page = response.read();
print the_page
Poster:
According to the GitHub API v3 documentation, for POST requests the parameters should be encoded as JSON and the content type should be application/json:
import json
....
token = raw_input('Access Token: ')
url = 'https://api.github.com/user/repos?access_token=' + token
values = {"name": "newnewnewnew"}
data = json.dumps(values) # <---
req = urllib2.Request(url, data, headers={'Content-Type': 'application/json'}) # <---
response = urllib2.urlopen(req)
the_page = response.read()
print the_page

Categories

Resources