Retrieve the number of the last page of a JSON request in Python

I'm new to Python, and I'd like to scrape JSON data from
https://api.gleif.org/api/v1/lei-records?page[size]=200&page[number]=1&filter[entity.names]=*&filter[entity.legalAddress.country]=DE without hard-coding the number of pages.
Below is the code I use, and it works:
dfs = []
for i in np.arange(1, 20000):
    try:
        URL = f'https://api.gleif.org/api/v1/lei-records?page[size]=200&page[number]={i}&filter[entity.names]=*&filter[entity.legalAddress.country]=DE'
        r = requests.get(URL, proxies=proxies).json()
        v = pd.json_normalize(r['data'])
        dfs.append(v)
        print(f'Page {i}: Done')
    except Exception as e:
        print(f'Page {i}: Error', e)
        break
Here is the Response
URL = f'https://api.gleif.org/api/v1/lei-records?page[size]=200&page[number]=1&filter[entity.names]=*&filter[entity.legalAddress.country]=DE'
r = requests.get(URL, proxies=proxies).json()
print(r)
Below is the output response
{'meta': {'goldenCopy': {'publishDate': '2020-09-17T00:00:00Z'}, 'pagination': {'currentPage': 1, 'perPage': 200, 'from': 1, 'to': 200, 'total': 139644, 'lastPage': 699}},
Question: How can I store 'lastPage' = 699 in a variable?
The goal would be to use the following loop
for i in np.arange(1, lastPage):
....
Thanks in advance for any help!

lastPage = r.get('meta').get('pagination').get('lastPage')
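A minimal sketch combining the two snippets above (proxies as defined in the question, and a plain range in place of np.arange); note that range(1, lastPage) would stop one page short, so the loop runs up to lastPage + 1:

import pandas as pd
import requests

base = ('https://api.gleif.org/api/v1/lei-records?page[size]=200'
        '&page[number]={}&filter[entity.names]=*'
        '&filter[entity.legalAddress.country]=DE')

# first request only to read the pagination metadata
r = requests.get(base.format(1), proxies=proxies).json()
lastPage = r['meta']['pagination']['lastPage']

dfs = [pd.json_normalize(r['data'])]      # page 1 is already fetched
for i in range(2, lastPage + 1):          # + 1 so the last page is included
    r = requests.get(base.format(i), proxies=proxies).json()
    dfs.append(pd.json_normalize(r['data']))
    print(f'Page {i}: Done')

df = pd.concat(dfs, ignore_index=True)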

Related

Iterate through a nested dict inside of a list, with some missing keys

I need help extracting information from a nested dictionary inside a list. Here's the code to get the data:
import requests
import json
import time

all_urls = []
for x in range(5000, 5010):
    url = f'https://api.jikan.moe/v4/anime/{x}/full'
    all_urls.append(url)

all_responses = []
for page_url in all_urls:
    response = requests.get(page_url)
    all_responses.append(response)
    time.sleep(1)
print(all_responses)

data = []
for i in all_responses:
    json_data = json.loads(i.text)
    data.append(json_data)
print(data)
The sample of the extracted data is as follows:
[{'status': 404,
  'type': 'BadResponseException',
  'message': 'Resource does not exist',
  'error': '404 on https://myanimelist.net/anime/5000/'},
 {'status': 404,
  'type': 'BadResponseException',
  'message': 'Resource does not exist',
  'error': '404 on https://myanimelist.net/anime/5001/'},
 {'data': {'mal_id': 5002,
    'url': 'https://myanimelist.net/anime/5002/Bari_Bari_Densetsu',
    'images': {'jpg': {'image_url': 'https://cdn.myanimelist.net/images/anime/4/58873.jpg',
        'small_image_url': 'https://cdn.myanimelist.net/images/anime/4/58873t.jpg',
        'large_image_url': 'https://cdn.myanimelist.net/images/anime/4/58873l.jpg'},
      'webp': {'image_url': 'https://cdn.myanimelist.net/images/anime/4/58873.webp',
        'small_image_url': 'https://cdn.myanimelist.net/images/anime/4/58873t.webp',
        'large_image_url': 'https://cdn.myanimelist.net/images/anime/4/58873l.webp'}},
    'trailer': {'youtube_id': None,
      'url': None,
      'embed_url': None,
      'images': {'image_url': None,
        'small_image_url': None,
        'medium_image_url': None,
        'large_image_url': None,
        'maximum_image_url': None}},
    'title': 'Bari Bari Densetsu',
    'title_english': None,
    'title_japanese': 'バリバリ伝説',
    'title_synonyms': ['Baribari Densetsu',
    ......
I need to extract the title from the list of data. Any help is appreciated! Any recommendation for a better/simpler/cleaner way to extract the JSON data from an API would also be greatly appreciated!
Firstly, no need to create multiple lists. You can do everything in one loop:
import requests
import json

data = []
for x in range(5000, 5010):
    page_url = f'https://api.jikan.moe/v4/anime/{x}/full'
    response = requests.get(page_url)
    json_data = json.loads(response.text)
    data.append(json_data)
print(data)
Second, to address your problem: in your sample the title sits under the 'data' key (and the 404 entries have no 'data' at all), so you have two options. You can use dict.get:
for dic in data:
    title = dic.get('data', {}).get('title', 'no title')
Or use the try/except pattern:
for dic in data:
    try:
        title = dic['data']['title']
    except KeyError:
        # deal with the case where the dict has no title
        pass
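Putting the two pieces together, a short sketch (using the data list built above) that collects every available title and skips the 404 entries:

titles = []
for dic in data:
    entry = dic.get('data')          # missing for the 404 responses
    if entry is not None:
        titles.append(entry.get('title', 'no title'))
print(titles)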

How do I extract all results from a GET request that spans multiple pages?

I have successfully written code that calls an API and then converts the results into a DataFrame.
wax_wallet = "zqsfm.wam"

# Get Assets from AtomicHub API
response1 = requests.get(
    "https://wax.api.atomicassets.io/atomicassets/v1/assets?"
    f"owner={wax_wallet}"
    "&collection_whitelist=nftdraft2121"
    "&page=1"
    "&limit=1000"
    "&order=asc"
    "&sort=name")

# Save Response as JSON
json_assets = response1.json()

# Convert JSON to DataFrame
df = pd.json_normalize(json_assets['data'])
This API returns at most 1000 items per page so I need to have it loop through as many pages as needed and ultimately get the results stored into a DataFrame.
I attempted to solve it with the below code, but was unsuccessful.
asset_count = 2500
pages = int(math.ceil(asset_count / 1000))

# Get Assets from AtomicHub API
all_assets = []
for page in range(1, pages):
    url = f'https://wax.api.atomicassets.io/atomicassets/v1/assets?owner={wax_wallet}' \
          f'&collection_whitelist=nftdraft2121&page={page}&limit=1000&order=asc&sort=name'
    response = rq.get(url)
    all_assets.append(json.loads(response.text))["response"]
Thanks in advance for any help!
You can turn them into dataframes and then concatenate the individual frames into a final result:
def get_page(page_num):
    wax_wallet = "zqsfm.wam"
    response = requests.get(
        "https://wax.api.atomicassets.io/atomicassets/v1/assets",
        params={
            "owner": wax_wallet,
            "collection_whitelist": "nftdraft2121",
            "page": page_num,
            "limit": "1000",
            "order": "asc",
            "sort": "name"
        }
    )
    json_assets = response.json()
    return pd.json_normalize(json_assets['data'])

# The number of pages you want
number_of_pages_requested = 10

# Get all pages as dataframes
pages = [get_page(n + 1) for n in range(number_of_pages_requested)]

# Combine pages to single dataframe
df = pd.concat(pages)
Edit: updated using params based on Olvin Roght's comment
Edit 2: fixed indexing error
I think this should help:-
import requests

all_assets = []

URL = 'https://wax.api.atomicassets.io/atomicassets/v1/assets'

params = {
    'owner': 'zqsfm.wam',
    'collection_whitelist': 'nftdraft2121',
    'page': 1,
    'order': 'asc',
    'sort': 'name',
    'limit': 1000
}

with requests.Session() as session:
    while True:
        print(f"Getting page {params['page']}")
        response = session.get(URL, params=params)
        response.raise_for_status()
        _j = response.json()
        data = _j['data']
        if len(data) > 0:
            all_assets.append(data)
            params['page'] += 1
        else:
            break

print('Done')
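Since the goal is a DataFrame, the collected pages can be flattened afterwards; a minimal sketch, assuming all_assets is the list built by the loop above:

import pandas as pd

# each element of all_assets is one page's list of asset dicts;
# normalize every page and stack them into a single DataFrame
df = pd.concat(
    (pd.json_normalize(page) for page in all_assets),
    ignore_index=True
)
print(df.shape)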

Why does urllib.request.urlopen() only work some of the time?

I'm trying to write a parser based on the `urllib` and `beautifulsoup` libraries, but I don't understand why I sometimes get status 200 and sometimes 404 (with the same URL, of course); moreover, a URL that `urllib` requested and that returned 404 opens fine when visited manually in a browser.
Could anyone explain that behavior?
url = 'https://zakupki.gov.ru/epz/order/extendedsearch/results.html'

params = {'searchString': 'Сакубитрил',
          'morphology': 'on',
          'pageNumber': 1,
          'sortDirection': 'false',
          'recordsPerPage': '_10',
          'showLotsInfoHidden': 'false',
          'sortBy': 'UPDATE_DATE',
          'fz44': 'on',
          'fz223': 'on',
          'af': 'on',
          'ca': 'on',
          'pc': 'on',
          'pa': 'on',
          'currencyIdGeneral': -1,
          'publishDateFrom': '01.02.2021',
          'publishDateTo': '21.02.2021'}

def parser(url, params):
    attempt = 0
    while attempt < 10:
        try:
            data = urllib.parse.urlencode(params)
            full_url = url + '?' + data
            with urllib.request.urlopen(full_url, timeout=10) as response:
                the_page = response.read()
            soup = BeautifulSoup(the_page, 'html.parser')
            return soup
        except Exception:  # don't forget replace Exception with something more specific
            attempt += 1
            time.sleep(5)
            continue

data = parser(url, params=params)
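As the comment in the snippet already suggests, catching a narrower exception makes the intermittent failures visible instead of silently retrying; a minimal sketch of the same retry loop using urllib.error.HTTPError (this only surfaces the status codes, it does not by itself explain why the site answers 404 intermittently):

import time
import urllib.error
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

def parser(url, params, max_attempts=10):
    full_url = url + '?' + urllib.parse.urlencode(params)
    for attempt in range(1, max_attempts + 1):
        try:
            with urllib.request.urlopen(full_url, timeout=10) as response:
                return BeautifulSoup(response.read(), 'html.parser')
        except urllib.error.HTTPError as e:
            # log the actual status code instead of swallowing it
            print(f'Attempt {attempt}: HTTP {e.code} for {full_url}')
        except urllib.error.URLError as e:
            print(f'Attempt {attempt}: network error: {e.reason}')
        time.sleep(5)
    return None  # every attempt failed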

Web scraping with Python - code stops at ~10,000 rows and does not return the expected output size

I have the below code written in Python to scrape a price list off a website. According to "total results" there are supposed to be over 100k parts available, however the output only returns ~10k rows. Just wondering what might have caused this; any help appreciated!
import pandas as pd
import requests

query = 'GE Healthcare'

payload = {
    "facets": [],
    "facilityId": 38451,
    "id_ins": "a2a3d332-73a7-4194-ad87-fe7412388916",
    "limit": 200,
    "query": query,
    "referer": "/catalog/Service",
    "start": 0,
    "urlParams": []
}

r = requests.post('https://prodasf-vip.partsfinder.com/Orion/CatalogService/api/v1/search', json=payload)

if r.status_code == 200:
    js = r.json()
    df = pd.json_normalize(js["products"])
    while len(df) < js["totalResults"] and len(df) < 200000:
        payload["start"] += 200
        r = requests.post('https://prodasf-vip.partsfinder.com/Orion/CatalogService/api/v1/search', json=payload)
        if r.status_code == 200:
            df = pd.concat([df, pd.json_normalize(r.json()["products"])])
        else:
            break
    print(f"want: {js['totalResults']} got: {len(df)}")
    df.to_csv(r'C:\Users\212677036\Documents\output_final.csv')
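A hedged diagnostic sketch (reusing the endpoint and payload above): logging how many products each request actually returns, and stopping only when a page comes back empty, shows the offset at which the API stops serving rows:

import requests

url = 'https://prodasf-vip.partsfinder.com/Orion/CatalogService/api/v1/search'
payload = {
    "facets": [],
    "facilityId": 38451,
    "id_ins": "a2a3d332-73a7-4194-ad87-fe7412388916",
    "limit": 200,
    "query": "GE Healthcare",
    "referer": "/catalog/Service",
    "start": 0,
    "urlParams": []
}

total_rows = 0
while True:
    r = requests.post(url, json=payload)
    r.raise_for_status()
    products = r.json().get("products", [])
    print(f"start={payload['start']}: received {len(products)} products")
    if not products:
        break                      # the API returned nothing at this offset
    total_rows += len(products)
    payload["start"] += payload["limit"]

print(f"total rows actually served: {total_rows}")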

Get data by pages and merge it into one using Python (pagination)

I'm connecting to an API that has a 500-row limit per call.
This is my code for a single API call (works great):
def getdata(data):
    auth_token = access_token
    hed = {'Authorization': 'Bearer ' + auth_token, 'Accept': 'application/json'}
    urlApi = 'https://..../orders?Offset=0&Limit=499'
    datar = requests.get(urlApi, data=data, headers=hed, verify=True)
    return datar
Now I want to scale it up so it will get me all the records.
This is what I tried to do:
In order to make sure that I have all the rows, I must iterate until there is no more data:
get 1st page
get 2nd page
merge
get 3rd page
merge
etc...
each page is an API call.
This is what I'm trying to do:
def getData(data):
    auth_token = access_token
    value_offset = 0
    hed = {'Authorization': 'Bearer ' + auth_token, 'Accept': 'application/json'}
    datarALL = None
    while True:
        urlApi = 'https://..../orders?Offset=' + value_offset + '&Limit=499'
        responsedata = requests.get(urlApi, data=data, headers=hed, verify=True)
        if responsedata.ok:
            value_offset = value_offset + 499
            # to do: merge the result of the get request
            datarALL = datarALL + responsedata (?)
            # to do: check if response is empty then break out.
    return datarALL
I couldn't find information on how to merge the results of the API calls, nor on how to check when I can break the loop.
Edit:
To clarify what I'm after:
I can see the results of a single API call using:
logger.debug('response is : {0}'.format(datar.json()))
What I want to be able to do:
logger.debug('response is : {0}'.format(datarALL.json()))
and have it show all results from all calls. This requires generating API calls until there is no more data to get.
This is a sample of the API call's return value:
"offset": 0,
"limit": 0,
"total": 0,
"results": [
{
"field1": 0,
"field2": "string",
"field3": "string",
"field4": "string"
}
]
}
In this case, your idea is almost correct.
is_valid = True
while is_valid:
    is_valid = False
    ...
    ...
    responsedata = requests.get(urlApi, data=data, headers=hed, verify=True)
    if responsedata.status_code == 200:  # use the status code to check request status, 200 for a successful call
        responsedata = responsedata.text
        value_offset = value_offset + 499
        # to do: merge the result of the get request
        jsondata = json.loads(responsedata)
        if "results" in jsondata:
            if jsondata["results"]:
                is_valid = True
        if is_valid:
            # concat array by + operand
            datarALL = datarALL + jsondata["results"]
Since I don't know whether "results" still exists once the data runs out, I checked both levels.
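Putting it all together, a minimal end-to-end sketch under the assumptions above (access_token is defined elsewhere, and the placeholder URL 'https://..../orders' is kept exactly as in the question); starting datarALL as an empty list makes merging each page's "results" a simple concatenation:

import requests

def get_all_data(data):
    hed = {'Authorization': 'Bearer ' + access_token, 'Accept': 'application/json'}
    datarALL = []                      # accumulate every page's "results" here
    value_offset = 0
    while True:
        urlApi = 'https://..../orders?Offset={0}&Limit=499'.format(value_offset)
        responsedata = requests.get(urlApi, data=data, headers=hed, verify=True)
        if responsedata.status_code != 200:
            break                      # request failed, stop paging
        results = responsedata.json().get("results", [])
        if not results:
            break                      # an empty page means there is no more data
        datarALL = datarALL + results  # merge this page into the overall list
        value_offset = value_offset + 499
    return datarALL

Note that datarALL here is a plain list of result dicts rather than a Response object, so it would be logged with logger.debug('response is : {0}'.format(datarALL)) instead of datarALL.json().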
