Get data by pages and merge it into one using Python (pagination)

I'm connecting to an API which has a 500-row limit per call.
This is my code for a single API call (works great):
def getdata(data):
    auth_token = access_token
    hed = {'Authorization': 'Bearer ' + auth_token, 'Accept': 'application/json'}
    urlApi = 'https://..../orders?Offset=0&Limit=499'
    datar = requests.get(urlApi, data=data, headers=hed, verify=True)
    return datar
Now I want to scale it up so it will get me all the records.
This is what I tried to do:
In order to make sure that I have all the rows, I must iterate until there is no more data:
get 1st page
get 2nd page
merge
get 3rd page
merge
etc...
each page is an API call.
This is what I'm trying to do:
def getData(data):
    auth_token = access_token
    value_offset = 0
    hed = {'Authorization': 'Bearer ' + auth_token, 'Accept': 'application/json'}
    datarALL = None
    while True:
        urlApi = 'https://..../orders?Offset=' + str(value_offset) + '&Limit=499'
        responsedata = requests.get(urlApi, data=data, headers=hed, verify=True)
        if responsedata.ok:
            value_offset = value_offset + 499
            # to do: merge the result of the get request
            datarALL = datarALL + responsedata (?)
            # to do: check if the response is empty, then break out.
    return datarALL
I couldn't find information about how to merge the results of the API calls, nor how to check whether I can break out of the loop.
Edit:
To clarify what I'm after:
I can see the results of the API call using:
logger.debug('response is : {0}'.format(datar.json()))
What I want to be able to do:
logger.debug('response is : {0}'.format(datarALL.json()))
and it will show all results from all calls. This requires generating API calls until there is no more data to fetch.
This is a sample of what the API call returns:
{
    "offset": 0,
    "limit": 0,
    "total": 0,
    "results": [
        {
            "field1": 0,
            "field2": "string",
            "field3": "string",
            "field4": "string"
        }
    ]
}

In this case, you are almost correct with the idea.
is_valid = True
while is_valid:
    is_valid = False
    ...
    ...
    responsedata = requests.get(urlApi, data=data, headers=hed, verify=True)
    if responsedata.status_code == 200:  # use the status code to check the request status, 200 for a successful call
        responsedata = responsedata.text
        value_offset = value_offset + 499
        # to do: merge the result of the get request
        jsondata = json.loads(responsedata)
        if "results" in jsondata:
            if jsondata["results"]:
                is_valid = True
        if is_valid:
            # concatenate the lists with the + operator
            datarALL = datarALL + jsondata["results"]
Since I don't know whether "results" is still present once the data runs out, I check both levels.
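Putting the pieces together, a minimal sketch of the whole loop could look like this (the host path is elided exactly as in the question, and the response is assumed to match the sample above, with each page's rows under "results"):
import requests

def getData(data, access_token):
    hed = {'Authorization': 'Bearer ' + access_token, 'Accept': 'application/json'}
    value_offset = 0
    datarALL = []  # all rows from every page end up in this list
    while True:
        urlApi = 'https://..../orders?Offset=' + str(value_offset) + '&Limit=499'
        responsedata = requests.get(urlApi, data=data, headers=hed, verify=True)
        if not responsedata.ok:
            break  # the request failed, stop paging
        page = responsedata.json().get("results", [])
        if not page:
            break  # an empty page means there is no more data to fetch
        datarALL = datarALL + page  # merge this page into the overall result
        value_offset = value_offset + 499
    return datarALL
Note that datarALL here is a plain list of dicts rather than a Response object, so it can be logged directly, e.g. logger.debug('all results: {0}'.format(datarALL)), instead of calling .json() on it.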


How do I extract all results from a GET request that spans multiple pages?

I have successfully written code that calls an API and then converts the results into a DataFrame.
wax_wallet = "zqsfm.wam"

# Get Assets from AtomicHub API
response1 = requests.get(
    "https://wax.api.atomicassets.io/atomicassets/v1/assets?"
    f"owner={wax_wallet}"
    "&collection_whitelist=nftdraft2121"
    "&page=1"
    "&limit=1000"
    "&order=asc"
    "&sort=name")

# Save Response as JSON
json_assets = response1.json()

# Convert JSON to DataFrame
df = pd.json_normalize(json_assets['data'])
This API returns at most 1000 items per page so I need to have it loop through as many pages as needed and ultimately get the results stored into a DataFrame.
I attempted to solve it with the below code, but was unsuccessful.
asset_count = 2500
pages = int(math.ceil(asset_count / 1000))

# Get Assets from AtomicHub API
all_assets = []
for page in range(1, pages):
    url = f'https://wax.api.atomicassets.io/atomicassets/v1/assets?owner={wax_wallet}' \
          f'&collection_whitelist=nftdraft2121&page={page}&limit=1000&order=asc&sort=name'
    response = rq.get(url)
    all_assets.append(json.loads(response.text))["response"]
Thanks in advance for any help!
You can turn them into dataframes and then concatenate the individual frames into a final result:
def get_page(page_num):
    wax_wallet = "zqsfm.wam"
    response = requests.get(
        "https://wax.api.atomicassets.io/atomicassets/v1/assets",
        params={
            "owner": wax_wallet,
            "collection_whitelist": "nftdraft2121",
            "page": page_num,
            "limit": "1000",
            "order": "asc",
            "sort": "name"
        }
    )
    json_assets = response.json()
    return pd.json_normalize(json_assets['data'])

# The number of pages you want
number_of_pages_requested = 10

# Get all pages as dataframes
pages = [get_page(n + 1) for n in range(number_of_pages_requested)]

# Combine the pages into a single dataframe
df = pd.concat(pages)
Edit: updated using params based on Olvin Roght's comment
Edit 2: fixed indexing error
I think this should help:
import requests

all_assets = []
URL = 'https://wax.api.atomicassets.io/atomicassets/v1/assets'
params = {
    'owner': 'zqsfm.wam',
    'collection_whitelist': 'nftdraft2121',
    'page': 1,
    'order': 'asc',
    'sort': 'name',
    'limit': 1000
}

with requests.Session() as session:
    while True:
        print(f"Getting page {params['page']}")
        response = session.get(URL, params=params)
        response.raise_for_status()
        _j = response.json()
        data = _j['data']
        if len(data) > 0:
            all_assets.append(data)
            params['page'] += 1
        else:
            break

print('Done')
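If the end goal is still a single DataFrame, as in the original question, the collected pages can be flattened afterwards. A minimal sketch, assuming all_assets was filled by the loop above (a list of pages, each page being a list of asset dicts):
import pandas as pd

# Normalize each page into a DataFrame, then concatenate them into one.
frames = [pd.json_normalize(page) for page in all_assets]
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(df.shape)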

pytest - got empty parameter set

I am trying to automate an API. Basically, what I want to do is execute test_001_post_desafio, save information from its response in a list, and use it in test_002_post_validate, but when executing the latter with pytest I get the error:
"SKIPPED [1] test_email.py:43: got empty parameter set ['otp', 'idOtp', 'subdomain']"
What am I doing wrong?
subdominios = ["20", "21", "22", "23", "24", "25", "99", "22", "11"]
desafios = []

# This works fine!!!
@pytest.mark.parametrize("subdominio", [subdominio for subdominio in subdominios])
def test_001_post_desafio(subdominio, payload, header):
    response_create = requests.post(
        "some url" + subdominio,
        data=json.dumps(payload),
        headers=header,
        verify=False)
    response_json = response_create.json()
    # Here I append the data I need for test_002_post_validate
    desafios.append((response_json["otp"], response_json["idOtp"], subdominio))
    assert response_create.status_code == 200

# Here is where I get SKIPPED [1] test_email.py:43: got empty parameter set ['otp', 'idOtp', 'subdominio']
@pytest.mark.parametrize("otp, idOtp, subdominio", [otp for otp in desafios])
def test_002_post_validate(otp, idOtp, subdominio, header):
    response = requests.post("some Url 1" + idOtp +
                             "some url2" + subdominio,
                             data=json.dumps({"otp": otp}),
                             headers=header,
                             verify=False)
    assert response.status_code == 204
I could do the whole test inside a single test case, but I don't think that would be very elegant.
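For what it's worth, the parameter set comes up empty because @pytest.mark.parametrize is evaluated at collection time, before any test has run, so desafios is still empty when test_002_post_validate is parametrized. A minimal sketch of the combined approach mentioned above, with one parametrized test per subdominio doing both the challenge and the validation (the placeholder URLs and the payload/header fixtures are taken from the question):
import json
import pytest
import requests

@pytest.mark.parametrize("subdominio", subdominios)
def test_desafio_and_validate(subdominio, payload, header):
    # Step 1: create the challenge for this subdominio
    response_create = requests.post(
        "some url" + subdominio,
        data=json.dumps(payload),
        headers=header,
        verify=False)
    assert response_create.status_code == 200
    response_json = response_create.json()

    # Step 2: immediately validate the challenge with the returned otp/idOtp
    response_validate = requests.post(
        "some Url 1" + response_json["idOtp"] +
        "some url2" + subdominio,
        data=json.dumps({"otp": response_json["otp"]}),
        headers=header,
        verify=False)
    assert response_validate.status_code == 204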

How to write a python loop to change a value for a dictionary key in API request?

I am writing an API request that gives paginated results.
To get results from the next page I need to take a value of 'next_page_cursor' and put it in the parameters of my request that is a dictionary.
This is what I have tried so far. I need to keep changing the cursor value in params until there are no more pages.
params = {'title': 'Cybertruck',
          'per_page': 100,
          'cursor': '*'
          }

response = requests.get("https://api.aylien.com/news/stories",
                        headers=headers, params=params).json()

if "next_page_cursor" in response:
    cursor = response["next_page_cursor"]
You can use a while loop:
params = {
    "title": "Cybertruck",
    "per_page": 100,
    "cursor": "initial_cursor"
}

def make_request(params):
    return requests.get("https://api.aylien.com/news/stories",
                        headers=headers, params=params).json()

result = []
response = make_request(params)
result.append(response["information_you_are_interested_in"])  # keep the first page too

while "next_page_cursor" in response:
    params["cursor"] = response["next_page_cursor"]
    response = make_request(params)
    result.append(response["information_you_are_interested_in"])
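This assumes the API stops including next_page_cursor once the last page has been returned. If the cursor is always present, an extra guard avoids looping forever; a minimal sketch, where "stories" is only an assumed name for the key holding each page's items:
result = []
response = make_request(params)
while True:
    page_items = response.get("stories", [])  # "stories" is an assumed key name
    if not page_items:
        break  # an empty page means there is nothing left to fetch
    result.extend(page_items)
    next_cursor = response.get("next_page_cursor")
    if not next_cursor or next_cursor == params["cursor"]:
        break  # cursor missing or unchanged, so the last page has been reached
    params["cursor"] = next_cursor
    response = make_request(params)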

python web scraping post form data using requests not working

I am trying to post input data into a form using a requests.session and it's returning a 500 status.
I am expecting to see the search results retrieved.
I was able to get around a previous login issue with __RequestVerificationToken and cookies, thanks to the help of Bertrand Martel. The next step in my process is to get the Search page, which I was able to do successfully. It now fails when I try to post data into the date fields on the form, which make up the search criteria. It works when I manually complete the form and press submit. It all seems very straightforward to me, but I'm not sure why it won't work. Is it still a cookies issue? Any help would be appreciated.
Here is my code:
import requests
from bs4 import BeautifulSoup

EMAIL = 'myemail#gmail.com'
PASSWORD = 'somepwd'
LOGIN_URL = 'https://www.idocmarket.com/Security/LogOn'
SEARCH_URL = 'https://www.idocmarket.com/RIOCO/Document/Search'

s = requests.Session()
s.get(LOGIN_URL)
result = s.post(LOGIN_URL, data={
    "Login.Username": EMAIL,
    "Login.Password": PASSWORD
})
soup = BeautifulSoup(result.text, "html.parser")

# Report successful login
print("Login succeeded: ", result.ok)
print("Status code:", result.status_code)

result = s.get(SEARCH_URL)
auth_token = soup.find("input", {'name': '__RequestVerificationToken'}).get('value')
print('auth token:', auth_token)
print("Get Search succeeded: ", result.ok)
print("Get Search status code:", result.status_code)

result = s.post(SEARCH_URL, data={
    "__RequestVerificationToken": auth_token,
    "StartRecordDate": "03/01/2019",
    "EndRecordDate": "03/31/2019",
    "StartDocNumber": "",
    "EndDocNumber": "",
    "Book": "",
    "Page": "",
    "Instrument": "",
    "InstrumentGroup": "",
    "PartyType": "Either",
    "PartyMatchType": "Contains",
    "PartyName": "",
    "Subdivision": "",
    "StartLot": "",
    "EndLot": "",
    "Block": "",
    "Section": "",
    "Township": "",
    "Range": "",
    "Legal": "",
    "CountyKey": "RIOCO"
})

print("Post dates succeeded: ", result.ok)
print("Post dates status code:", result.status_code)
print(result.text)
It seems that this time the XSRF token is needed in the post along with all the existing parameters. A simple solution is to get all the input values and pass them with the request:
import requests
from bs4 import BeautifulSoup

LOGIN_URL = 'https://www.idocmarket.com/Security/LogOn'
SEARCH_URL = 'https://www.idocmarket.com/RIOCO/Document/Search'
EMAIL = 'myemail#gmail.com'
PASSWORD = 'somepwd'

s = requests.Session()
s.get(LOGIN_URL)
r = s.post(LOGIN_URL, data={
    "Login.Username": EMAIL,
    "Login.Password": PASSWORD
})

if r.status_code == 200:
    r = s.get(SEARCH_URL)
    soup = BeautifulSoup(r.text, "html.parser")
    payload = {}
    for input_item in soup.select("input"):
        if input_item.has_attr('name'):
            payload[input_item["name"]] = input_item["value"]
    payload["StartRecordDate"] = '09/01/2019'
    payload["EndRecordDate"] = '09/30/2019'
    r = s.post(SEARCH_URL, data=payload)
    soup = BeautifulSoup(r.text, "html.parser")
    print(soup)
else:
    print("authentication failure")
Alternatively, using a list comprehension for the payload, you can write:
temp_pl = [
    (t['name'], t['value'])
    for t in soup.select("input")
    if t.has_attr('name')
]
payload = dict(temp_pl)
payload["StartRecordDate"] = '09/01/2019'
payload["EndRecordDate"] = '09/30/2019'

Python & API: Why do I get KeyErrors when run by Python but the value is there in Postman?

I am using the code below to return the raw JSON data from Harvest. By changing the URL called, I have successfully created working scripts for 6 files; however, I am having trouble with one and I can't work out why ...
import requests, json

AUTH = "Bearer REDACTED"
ACCOUNT = "REDACTED"
URL = "https://api.harvestapp.com/v2/clients/?"
HEADERS = {"Authorization": AUTH,
           "Harvest-Account-ID": ACCOUNT,
           "Accept": "application/json"}

r = requests.get(url=URL, headers=HEADERS).json()
total_pages = int(r['total_pages'])
total_entries = int(r['total_entries'])

results = []
for x in range(1, total_pages):
    response = requests.get(URL + "page=" + str(x), headers=HEADERS)
    data = response.json()
    next_page = data["next_page"]
    results.extend(data["time_entries"])

filepath = "Z:/System Administrator/System Backups/08. Harvest/HARVEST_Clients.json"
with open(filepath, 'w') as outfile:
    json.dump(results, outfile)

print('Done!')
print('Total Pages : ' + str(total_pages))
print('Total Entries : ' + str(total_entries))
When I run the above it gives me the desired result of
Done!
Total Pages : 3
Total Entries : 237
However if I try to use a URL with date variables I get KeyErrors. All I do is change the code from :
URL = "https://api.harvestapp.com/v2/clients/?"
to
URL = "https://api.harvestapp.com/v2/time_entries?from=2017-04-01&to=2018-03-31/?"
And the results.extend variable from
results.extend(data["clients"])
to
results.extend(data["time_entries"])
I get the error
Traceback (most recent call last):
File "Z:\System Administrator\System Backups\08. Harvest\Scripts\API_Harvest_Timesheets 2017-18.py", line 19, in
total_pages = int(r['total_pages'])
KeyError: 'total_pages'
When I run the URL and authorisation through postman I get the following result
{
    "time_entries": [FULL DATA RESULT HERE],
    "per_page": 100,
    "total_pages": 138,
    "total_entries": 13711,
    "next_page": 2,
    "previous_page": null,
    "page": 1,
    "links": {
        "first": "https://api.harvestapp.com/v2/time_entries?from=2017-04-01&page=1&per_page=100&to=2018-03-31",
        "next": "https://api.harvestapp.com/v2/time_entries?from=2017-04-01&page=2&per_page=100&to=2018-03-31",
        "previous": null,
        "last": "https://api.harvestapp.com/v2/time_entries?from=2017-04-01&page=138&per_page=100&to=2018-03-31"
    }
}
So I can see that the 'total_pages' value is returned from that URL, and the value is 138. Why doesn't this code run for this specific URL when it runs fine for the others?
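One thing worth checking is how the query string is built: the time_entries URL already ends in "&to=2018-03-31/?", so appending "page=" + str(x) produces a malformed to parameter, and a request the API rejects may come back as a JSON body without total_pages, which would surface as exactly this KeyError. A sketch that lets requests encode the parameters instead (the from/to values are taken from the question; the guard around total_pages and the assumption that next_page is null on the last page are mine):
import requests

AUTH = "Bearer REDACTED"
ACCOUNT = "REDACTED"
HEADERS = {"Authorization": AUTH,
           "Harvest-Account-ID": ACCOUNT,
           "Accept": "application/json"}

URL = "https://api.harvestapp.com/v2/time_entries"
params = {"from": "2017-04-01", "to": "2018-03-31", "page": 1}

results = []
while True:
    data = requests.get(URL, headers=HEADERS, params=params).json()
    if "total_pages" not in data:
        # Not a normal page of results; print the body to see what the API returned.
        print(data)
        break
    results.extend(data["time_entries"])
    if data["next_page"] is None:
        break  # assumption: next_page is null on the last page, mirroring previous_page on page 1
    params["page"] = data["next_page"]
Paging on next_page also sidesteps the off-by-one in range(1, total_pages), which stops one page before the last.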
