I am trying to do pagination using pyspark and getting below error. My pagination link is in the header as Key [Link] and value [rel="next"]. The error is displayed at this line r1 = requests.get(response.links['next']).The issue is baseURL missing from the "next" URL being passed.
getURL = 'https://api.xxx.com/v3/direct-access/abc'
baseURL = 'https://api.xxx.com/v3/direct-access'
headers = {
"accept" : "application/json",
"Content-Type": "application/json",
"Authorization": "Bearer " + str(token)
}
results = []
response = requests.get(getURL, headers=headers)
r = response.json()
for i in r:
results.append(i)
while response.links['next']: ## != response.links['last']:
r1 = requests.get(response.links['next'])
r = r1.json()
for i in r:
results.append(i)
Error: InvalidSchema: No connection adapters were found for "{'url': '/abc? action=next&next_page=%28id%2Ccompletionid%29+%3C+%28840430000754002%2C840430413029241%29&pagesize=10000', 'rel': 'next'}"
InvalidSchema Traceback (most recent call last) <ipython-input-45-
f27cc7bf373e> in <module> 17 18
while response.links['next']: ## != response.links['last']: ---> 19
r1 = requests.get(response.links['next'])
20 r = r1.json()
21 for i in r:
InvalidSchema: No connection adapters were found for "{'url':
'/liners?
action=next&next_page=linerid+%3C+1010031264&pagesize=10000', 'rel':
'next'}"
How can i merge both baseURL and url into one link and pass it in while loop? Something like below
https://api.xxx.com/v3/direct-access/abc?action=next&next_page=%28id%2Ccompletionid%29+%3C+%28840430000754002%2C840430413029241%29&pagesize=10000
response = requests.get(getURL, headers=headers)
r = response.json()
for i in r:
results.append(i)
while response.links.get('next'):
response = requests.get(baseURL + response.links['next']
['url'],headers=headers)
r1 = response.json()
for i in response:
results.append(i)
#######below not returning results, running for ever ######
return results
rdd = spark.sparkContext.parallelize((results))
print(rdd)
df = spark.read.option('multiline','true').json(rdd)
df.repartition(1).write.json(stagingpath,mode="overwrite")
There are several problems with your code.
response.links['next'] is a dict {'url': ...}. requests.get(...) expects a URL.
# requests.get(response.links['next'])
requests.get(response.links['next']['url'])
# requests.get(baseURL + response.links['next']['url']) # With baseURL
headers are not passed in the subsequent calls.
# requests.get(response.links['next']['url'])
requests.get(response.links['next']['url'], headers=headers)
response is not modified, resulting in infinite loop.
while response.links['next']:
# r1 = requests.get(response.links['next']['url'], headers=headers)
response = requests.get(response.links['next']['url'], headers=headers)
For the last link, 'next' will not exist.
# while response.links['next']:
while response.links.get('next'):
Minimal, reproducible example:
import requests
getURL = 'https://api.github.com/users/acjh/repos'
baseURL = ''
headers = {}
results = []
response = requests.get(getURL, headers=headers)
r = response.json()
for i in r:
results.append(i)
while response.links.get('next'):
nextURL = baseURL + response.links['next']['url']
response = requests.get(nextURL, headers=headers)
r = response.json()
for i in r:
results.append(i)
assert len(results) == requests.get(getURL[:-6]).json()['public_repos']
like this:
while response.links['next']:
next_link = response.links['next']['url']
r1 = requests.get(baseURL + next_link, headers=headers)
r = r1.json()
for i in r:
results.append(i)
Related
I am trying to build a class to automate requests to our suplier, but i am having trouble with pagination.
This is the working snippet so far, but the api limits the request to 1000 records, and if the endpoint has more, then I would have to paginate:
response = requests.get(url, data=params, headers=headers).json()
return response
This is what I tried and failed, getting a KeyError: 'next':
response = requests.get(url, data=params, headers=headers).json()
results = response['data']
while response['links']['next']:
response = requests.get(response['links']['next'], data=params, headers=headers).json()
results.extend(response['data'])
return results
you can check the basic sctructure for the response here in the API doc.
Please enlighten me, thank you very much!
You could simply check
while "next" in response['links']:
# ... code ...
But if it can get other problems then putting all in `try/except can be also usefull.
results = [] # empty list
try:
response = requests.get(url, data=params, headers=headers)
#print(response.status_code) # for debug
#print(response.text) # for debug
json_data = response.json()
results += json_data['data']
while "next" in json_data['links']:
response = requests.get(json_data['links']['next'], data=params, headers=headers)
#print(response.status_code) # for debug
#print(response.text) # for debug
json_data = response.json()
results += json_data['data']
except Exception as ex:
print("Exception:", ex)
return results
But I would reduce it to while True
results = [] # empty list
try:
while True:
response = requests.get(url, data=params, headers=headers)
#print(response.status_code) # for debug
#print(response.text) # for debug
json_data = response.json()
results += json_data['data']
if "next" not in json_data['links']:
break
url = json_data['links']['next']
except Exception as ex:
print("Exception:", ex)
return results
I am working with an API that paginates results of a request, handily the API also generates some 'hypermedia' values that contains information on the pagination; one of these values is 'next_page'.
I am trying to create a loop that takes a request, logs this 'next_page' value, then runs another request using the value of 'next_page', and logs the next page, so on and so on, until the value of 'next_page' = 'None'.
Here's my function so far, but this is only populating one item in the list, when there should be 3.
pages = []
def build_requests(request):
request = 'https://api.performancehorizon.com'+request
job = requests.get(request, headers=headers, params=params)
req = job.json()
hypermedia_np = req['hypermedia']['pagination']['next_page']
if hypermedia_np != 'None':
next_page = req['hypermedia']['pagination']['next_page']
pages.append(next_page)
job = requests.get('https://api.performancehorizon.com'+next_page, headers=headers, params=params)
req = job.json()
else: print("Done!")
How can I change my code to loop over this job until the value of 'next_page' is 'None'?
You can use the below modified code
pages = []
def build_requests(request):
request = 'https://api.performancehorizon.com' + request
job = requests.get(request, headers=headers, params=params)
req = job.json()
hypermedia_np = req['hypermedia']['pagination']['next_page']
while hypermedia_np is not None or hypermedia_np != 'None':
next_page = req['hypermedia']['pagination']['next_page']
if next_page is not None or next_page != 'None':
pages.append(next_page)
job = requests.get('https://api.performancehorizon.com' + next_page, headers=headers, params=params)
req = job.json()
hypermedia_np = req['hypermedia']['pagination']['next_page']
else:
hypermedia_np = None
print("Done!")
Or you can use this simplified version, this should also work for your case
pages = []
def build_requests(request):
request = 'https://api.performancehorizon.com' + request
job = requests.get(request, headers=headers, params=params)
req = job.json()
hypermedia_np = req['hypermedia']['pagination']['next_page']
while hypermedia_np is not None or hypermedia_np != 'None':
pages.append(hypermedia_np)
job = requests.get('https://api.performancehorizon.com' + hypermedia_np, headers=headers, params=params)
req = job.json()
hypermedia_np = req['hypermedia']['pagination']['next_page']
print("Done!")
I need help on how to do a loop so each time I make a GET request, it will always be the new page from the API.
I start with getting the first response. It includes a parameter to the next page next_key
{
"result": [
{
...,
...
}
],
"next_key": 123
}
Below is my current attempt
import requests
import json
url = "https://flespi.io/gw/channels/all/messages"
headers = {"Authorization": "FlespiToken 23ggh45"}
def getFirst():
data = {"limit_count":100, "limit_size":10000}
params = {"data":json.dumps(data, separators=(",", ":"))}
reqFirst = requests.get(url, params=params, headers=headers).json()
return reqFirst["next_key"] ## this returns "123"
def getDataNext():
data = {"limit_count":100, "limit_size":10000, "curr_key":getFirst()}
params = {"data":json.dumps(data, separators=(",", ":"))}
reqNext = requests.get(url, params=params, headers=headers)
jsonData = reqNext.json()
while True:
if "next_key" in jsonData:
data = {"limit_count":100, "limit_size":10000,"curr_key":jsonData["next_key"]}
params = {"data":json.dumps(data, separators=(",", ":"))}
req = requests.get(url, params=params, headers=headers).json() ## this should do GET request for the third page and so on...
print req["next_key"] # this returns "3321" which is the value for "next_key" in second page
else:
pass
getDataNext()
The full url including limit count, limit size and curr key is as follows https://flespi.io/gw/channels/all/messages?data=%7B%22curr_key%22%123%2C%22limit_count%22%3A100%2C%22limit_size%22%3A10000%7D
As you can see this only returns the second page that is jsonData["next_key"]. What I want to do is that for each GET request, the program will read the next_key and put it on the next GET request.
I am thinking to use increment on the curr_key but the key is random and also I do not know how many page there is.
I believe there must be just a simple solution for this but apparently I could not think about it. Thank you for your help and suggestion.
try this
has_next_key = False
nextKey = ""
if "next_key" in jsonData:
has_next_key = True
nextKey = jsonData["next_key"]
while has_next_key:
data = {"limit_count":100, "limit_size":10000,"curr_key":nextKey}
params = {"data":json.dumps(data, separators=(",", ":"))}
req = requests.get(url, params=params, headers=headers).json() ## this should do GET request for the third page and so on...
if "next_key" in req:
nextKey = req["next_key"]
print nextKey # this returns "3321" which is the value for "next_key" in second page
else:
has_next_key = False
# no next_key, stop the loop
So basically I'm trying to make a request to this website - https://panel.talonro.com/login/ which is supposed to be 301 redirect.
I send data as I should but in the end there is no Location header in my request and status code is 200 instead of 301.
I can't figure out what I am doing wrong. Please help
def do_request():
req = requests.get('https://panel.talonro.com/login/').text
soup = BeautifulSoup(req, 'html.parser')
csrf = soup.find('input', {'name':'csrfKey'}).get('value')
ref = soup.find('input', {'name':'ref'}).get('value')
post_data = {
'auth':'mylogin',
'password':'mypassword',
'login__standard_submitted':'1',
'csrfKey':csrf,
'ref':ref,
'submit':'Go'
}
post = requests.post(url = 'https://forum.talonro.com/login/', data = post_data, headers = {'referer':'https://panel.talonro.com/login/'})
Right now push_data is in do_request(), so you cannot access it outside of that function.
Instead, try this where you return that info and then pass it in:
import requests
from bs4 import BeautifulSoup
def do_request():
req = requests.get('https://panel.talonro.com/login/').text
soup = BeautifulSoup(req, 'html.parser')
csrf = soup.find('input', {'name':'csrfKey'}).get('value')
ref = soup.find('input', {'name':'ref'}).get('value')
post_data = {
'auth':'mylogin',
'password':'mypassword',
'login__standard_submitted':'1',
'csrfKey':csrf,
'ref':ref,
'submit':'Go'
}
return post_data
post = requests.post(url = 'https://forum.talonro.com/login/', data = do_request(), headers = {'referer':'https://panel.talonro.com/login/'})
I'm trying to add a torrent with uTorrent web api (http://www.utorrent.com/community/developers/webapi), in Python using Requests library.
import requests
import re
UTORRENT_URL = 'http://%s:%s/gui/' % ('localhost', '55655')
UTORRENT_URL_TOKEN = '%stoken.html' % UTORRENT_URL
REGEX_UTORRENT_TOKEN = r'<div[^>]*id=[\"\']token[\"\'][^>]*>([^<]*)</div>'
auth = requests.auth.HTTPBasicAuth('x', 'x')
headers = {'content-type': 'application/json'}
r = requests.get(UTORRENT_URL_TOKEN, auth=auth, headers=headers)
token = re.search(REGEX_UTORRENT_TOKEN, r.text).group(1)
guid = r.cookies['GUID']
cookies = dict(GUID = guid)
headers = {'content-type': 'multipart/form-data'}
params = {'action':'add-file','token': token}
files = {'torrent_file':'C:\\x.torrent'}
r = requests.post(UTORRENT_URL, auth=auth, cookies=cookies, headers=headers, params=params, files=files)
print r.json()
Error - torrent file content not supplied in form parameter
Any help appreciated
Ok, the problem was setting the headers. Removing them and it finally works!
import requests
import re
UTORRENT_URL = 'http://%s:%s/gui/' % ('192.168.1.80', '55655')
UTORRENT_URL_TOKEN = '%stoken.html' % UTORRENT_URL
REGEX_UTORRENT_TOKEN = r'<div[^>]*id=[\"\']token[\"\'][^>]*>([^<]*)</div>'
auth = requests.auth.HTTPBasicAuth('x', 'x')
r = requests.get(UTORRENT_URL_TOKEN, auth=auth)
token = re.search(REGEX_UTORRENT_TOKEN, r.text).group(1)
guid = r.cookies['GUID']
cookies = dict(GUID = guid)
params = {'action':'add-file','token': token}
files = {'torrent_file': open('C:\\x.torrent', 'rb')}
r = requests.post(url=UTORRENT_URL, auth=auth, cookies=cookies, params=params, files=files)
You aren't sending the file data. You should read the relevant section of the documentation. The correct code would be this (I marked the line I changed):
import requests
import re
UTORRENT_URL = 'http://%s:%s/gui/' % ('localhost', '55655')
UTORRENT_URL_TOKEN = '%stoken.html' % UTORRENT_URL
REGEX_UTORRENT_TOKEN = r'<div[^>]*id=[\"\']token[\"\'][^>]*>([^<]*)</div>'
auth = requests.auth.HTTPBasicAuth('x', 'x')
headers = {'content-type': 'application/json'}
r = requests.get(UTORRENT_URL_TOKEN, auth=auth, headers=headers)
token = re.search(REGEX_UTORRENT_TOKEN, r.text).group(1)
guid = r.cookies['GUID']
cookies = dict(GUID = guid)
headers = {'content-type': 'multipart/form-data'}
params = {'action':'add-file','token': token}
files = {'torrent_file': open('C:\\x.torrent', 'rb')} # Changed this line
r = requests.post(UTORRENT_URL, auth=auth, cookies=cookies, headers=headers, params=params, files=files)
print r.json()