How to use pandas DF as params in HTTP request - python

I have a list of places from an Excel file that I would like to enrich with their GeoNames IDs. Starting from the Excel file I made a pandas DataFrame, and I would like to use the values from the DF as params in my request.
Here is the script I made:
import pandas as pd
import requests
import json

require_cols = [1]
required_df = pd.read_excel('grp.xlsx', usecols=require_cols)
print(required_df)

url = 'http://api.geonames.org/searchJSON?'
params = {'username': "XXXXXXXX",
          'name_equals': (required_df),
          'maxRows': "1"}
e = requests.get(url, params=params)
pretty_json = json.loads(e.content)
print(json.dumps(pretty_json, indent=2))
The problem is related to the definition of this parameter:
'name_equals': (required_df)
I would like to use the places (around 15k) from the DF as the param, recursively retrieve the related GeoNames ID for each one, and write the output to a separate Excel file.
The simple request works:
import requests
import json

url = 'http://api.geonames.org/searchJSON?'
params = {'username': "XXXXXXX",
          'name_equals': "Aire",
          'maxRows': "1"}
e = requests.get(url, params=params)
pretty_json = json.loads(e.content)
print(json.dumps(pretty_json, indent=2))
#print(e.content)
As does the definition of the pandas DataFrame:
# import pandas lib as pd
import pandas as pd
require_cols = [0,1]
# only read specific columns from an excel file
required_df = pd.read_excel('grp.xlsx', usecols = require_cols)
print(required_df)
I also tried via SPARQL without results so I decided to go via Python.
Thanks for your time.

You can use a for-loop:
import pandas as pd

df = pd.DataFrame({'Places': ['London', 'Paris', 'Berlin']})

for item in df['Places']:
    print('requests for:', item)
    # ... rest of code ...
or df.apply():
import pandas as pd

def run(item):
    print('requests for:', item)
    # ... rest of code ...
    return 'result for ' + item

df = pd.DataFrame({'Places': ['London', 'Paris', 'Berlin']})
df['Results'] = df['Places'].apply(run)
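Putting it together with the original goal (writing the GeoNames IDs next to the places and saving them to a new Excel file), a minimal sketch along these lines might work; the GeoNames username placeholder, the output filename, and the assumption that the single column read from the file holds the place names are all assumptions:

import pandas as pd
import requests

def get_geoname_id(place):
    # one GeoNames search per place; return the first match's geonameId (or None)
    params = {'username': 'XXXXXXXX',   # your GeoNames username
              'name_equals': place,
              'maxRows': '1'}
    resp = requests.get('http://api.geonames.org/searchJSON?', params=params)
    results = resp.json().get('geonames', [])
    return results[0]['geonameId'] if results else None

df = pd.read_excel('grp.xlsx', usecols=[1])           # as in the question
df['geonameId'] = df.iloc[:, 0].apply(get_geoname_id)
df.to_excel('grp_with_ids.xlsx', index=False)         # assumed output filename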

Thanks @furas for your reply.
I solved it like this:
import pandas as pd
import requests
import json

url = 'http://api.geonames.org/searchJSON?'
df = pd.read_excel('Book.xlsx', sheet_name='Sheet1', usecols="B")

for item in df.place_name:
    df.place_name.head()
    params = {'username': "XXXXXX",
              'name_equals': item,
              'maxRows': "1"}
    e = requests.get(url, params=params)
    pretty_json = json.loads(e.content)
    for item in pretty_json["geonames"]:
        print(json.dumps(item["geonameId"], indent=2))
        with open('data.json', 'w', encoding='utf-8') as f:
            json.dump(item["geonameId"], f, ensure_ascii=False, indent=4)
#print(e.content)
The only problem now is related to the JSON output: when I print I get the complete list of IDs, but when I write the output to a file I get only the last ID from the list.
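That happens because the file is opened in 'w' mode inside the loop, so each iteration overwrites the previous one. A minimal fix is to collect the IDs first and write them once; a sketch reusing the variables from the script above:

geoname_ids = []
for item in df.place_name:
    params = {'username': "XXXXXX",
              'name_equals': item,
              'maxRows': "1"}
    e = requests.get(url, params=params)
    for result in e.json().get("geonames", []):
        geoname_ids.append(result["geonameId"])

# write the whole list once, after the loop
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(geoname_ids, f, ensure_ascii=False, indent=4)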

Related

How to loop through an API request for different parameters such as changing the month and header

I have the following API request, whose data I then clean and sort:
Base_URL = "https://api.jao.eu/OWSMP/getauctions?"
headers = {
    "AUTH_API_KEY": "06e690fb-697b-4ab2-9325-4268cbd14502"
}
params = {
    "horizon": "Daily",
    "corridor": "IF1-FR-GB",
    "fromdate": "2021-01-01"
}
data = "results"
r = requests.get(Base_URL, headers=headers, params=params, json=data)
j = r.json()
df = pd.DataFrame.from_dict(j)
df = df.explode('results')
df = df.join(pd.json_normalize(df.pop('results')).add_suffix('_new'))
df.drop(['ftroption','identification','horizonName','periodToBeSecuredStart','periodToBeSecuredStop','bidGateOpening','bidGateClosure','isBidGateOpen','atcGateOpening','atcGateClosure','marketPeriodStop','disputeSubmissionGateOpening','disputeSubmissionGateClosure','disputeProcessGateOpening','disputeProcessGateClosure','ltResaleGateOpening','ltResaleGateClosure','maintenances','xnRule','winningParties','operationalMessage','products','lastDataUpdate','cancelled','comment_new','corridorCode_new','productIdentification_new','additionalMessage_new'], axis=1, inplace=True)
df
I then sort it by the date column, which is why it is important to be able to run it for every month, as I need to repeat this process and hopefully automate it in the future:
df['new'] = pd.to_datetime(df['marketPeriodStart']).dt.strftime('%d/%m/%Y')
df = df.sort_values(by='new', ascending=True)
df
As the API can only return one month of data at a time, I am trying to loop through it so the "fromdate" param changes for every month. I can then change the "corridor" param and repeat the above for-loop. Thank you!
Get all data:
import pandas as pd
import requests

Base_URL = "https://api.jao.eu/OWSMP/getauctions?"
headers = {
    "AUTH_API_KEY": "api_key"
}
final_df = pd.DataFrame()  # all data will be stored here

# create dates like 2021-01-01, 2021-02-01, ...
year = ['2021', '2022']
month = list(range(1, 13))
dates = []
errors = []
for i in year:
    for j in month:
        if i == '2022' and j in [11, 12]:
            pass
        else:
            dates.append(i + '-' + f'{j:02}' + '-01')

# dates are ready. let's request each date and append the data to the final df.
for i in dates:
    params = {
        "horizon": "Daily",
        "corridor": "IF1-FR-GB",
        "fromdate": i
    }
    data = "results"
    r = requests.get(Base_URL, headers=headers, params=params, json=data)
    j = r.json()
    try:
        df = pd.DataFrame.from_dict(j)
        final_df = final_df.append(df)
    except:
        errors.append(j)

# now, do the same cleaning on the final data.
final_df = final_df.explode('results')
final_df = final_df.join(pd.json_normalize(final_df.pop('results')).add_suffix('_new'))
final_df.drop(['ftroption','identification','horizonName','periodToBeSecuredStart','periodToBeSecuredStop','bidGateOpening','bidGateClosure','isBidGateOpen','atcGateOpening','atcGateClosure','marketPeriodStop','disputeSubmissionGateOpening','disputeSubmissionGateClosure','disputeProcessGateOpening','disputeProcessGateClosure','ltResaleGateOpening','ltResaleGateClosure','maintenances','xnRule','winningParties','operationalMessage','products','lastDataUpdate','cancelled','comment_new','corridorCode_new','productIdentification_new','additionalMessage_new'], axis=1, inplace=True)
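A small aside: DataFrame.append was removed in pandas 2.0, so on a current pandas install the accumulation loop above will fail. A sketch of the same loop using a list and a single pd.concat, reusing the names defined above:

frames = []
for i in dates:
    params = {"horizon": "Daily", "corridor": "IF1-FR-GB", "fromdate": i}
    r = requests.get(Base_URL, headers=headers, params=params, json="results")
    j = r.json()
    try:
        frames.append(pd.DataFrame.from_dict(j))
    except Exception:
        errors.append(j)

# one concat at the end instead of appending inside the loop
final_df = pd.concat(frames, ignore_index=True)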
After you get all the data, if you want to fetch it automatically every month, set the script to run on the first day of every month (if you want a different day, change the days value in the timedelta).
import pandas as pd
import requests
from datetime import datetime, timedelta

Base_URL = "https://api.jao.eu/OWSMP/getauctions?"
headers = {
    "AUTH_API_KEY": "api_key"
}
now = (datetime.today() - timedelta(days=2)).strftime('%Y-%m-01')
params = {
    "horizon": "Daily",
    "corridor": "IF1-FR-GB",
    "fromdate": now
}
data = "results"
r = requests.get(Base_URL, headers=headers, params=params, json=data)
j = r.json()
df = pd.DataFrame.from_dict(j)
df = df.explode('results')
df = df.join(pd.json_normalize(df.pop('results')).add_suffix('_new'))
df.drop(['ftroption','identification','horizonName','periodToBeSecuredStart','periodToBeSecuredStop','bidGateOpening','bidGateClosure','isBidGateOpen','atcGateOpening','atcGateClosure','marketPeriodStop','disputeSubmissionGateOpening','disputeSubmissionGateClosure','disputeProcessGateOpening','disputeProcessGateClosure','ltResaleGateOpening','ltResaleGateClosure','maintenances','xnRule','winningParties','operationalMessage','products','lastDataUpdate','cancelled','comment_new','corridorCode_new','productIdentification_new','additionalMessage_new'], axis=1, inplace=True)

Stuck trying to use Pandas Dataframe or Requests to scrape a webpage

Like the title says, I am beating my head against a wall trying to get a usable result with Python pandas / requests.
import pandas as pd
import json
import requests
import csv
url = "https://itc.aeso.ca/itc/public/api/v2/interchange?startDate=20220222&endDate=20220222&pageNo=1&pageSize=1"
r = requests.get(url)
pdObj = pd.read_json(url, orient='records')
pdObj = pdObj.iloc[1:]
print(pdObj)
I have tried this method as well as the method below... but to be honest I am getting lost in the weeds on this one.
import requests
import time
import urllib.parse as urlparse
from urllib.parse import parse_qs
from datetime import datetime, timedelta

yesterday = datetime.now() - timedelta(1)
yesterday_date = f'{yesterday.strftime("%d")}-{yesterday.strftime("%B")[:3]}-{yesterday.strftime("%Y")}'

original_url = "https://itc.aeso.ca/itc/public/atc/historic/"
parsed = urlparse.urlparse(original_url)
target_url = "https://itc.aeso.ca/itc/public/api/v2/interchange?startDate=20220222&endDate=20220222&pageNo=1&pageSize=1"
stream_report_url = "https://noms.wei-pipeline.com/reports/ci_report/server/streamReport.php"

s = requests.Session()
# load the cookies
s.get(original_url)
# get id
r = s.post(target_url,
           params={
               "request.preventCache": int(round(time.time() * 1000))
           },
           data={
               "startDate": "20220222",
               "endDate": "20220222",
               "pageNo": "1",
               "pageSize": "1"
           })
r = s.get(stream_report_url, params=r.json())
print(r.text)
I just want a dataframe that looks like the excel file you can download.
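As a first step it may help to look at what the endpoint actually returns before trying to flatten it; a minimal sketch (the record path is left as a placeholder, since the response structure isn't reproduced in the question):

import requests
import pandas as pd

url = ("https://itc.aeso.ca/itc/public/api/v2/interchange"
       "?startDate=20220222&endDate=20220222&pageNo=1&pageSize=1")
r = requests.get(url)
payload = r.json()

# inspect the top-level structure first
print(type(payload))
print(list(payload.keys()) if isinstance(payload, dict) else payload[:1])

# once the path to the tabular records is known, flatten it, e.g.:
# df = pd.json_normalize(payload, record_path=['<records_key>'])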

How to remove the error shown when I convert the JSON response to a frame using pandas

import tkinter as tk
from tkinter import *
import json
import pandas as pd
import requests

url = "https://corona-virus-world-and-india-data.p.rapidapi.com/api"
headers = {
    'x-rapidapi-key': "04b9735d81mshf7bd2b7070903eap1ec6f9jsnbf3d52c11b5d",
    'x-rapidapi-host': "corona-virus-world-and-india-data.p.rapidapi.com"
}
response = requests.request("GET", url, headers=headers).json()
print(response)

parsed_data = json.loads(response)
print(parsed_data)

def flatten_json(json):
    dict1 = {}
    def flatten(i, name=''):
        if type(i) is dict:
            for a in i:
                flatten(i[a], name + a + '_')
        else:
            dict1[name[:-1]] = i
    flatten(json)
    return dict1

df = pd.DataFrame.from_dict(flatten_json(parsed_data), orient='index')
flatten_json(parsed_data)
# print(response)
{
  "countries_stat": [
    {
      "country_name": "USA",
      "cases": "29,920,366",
      "deaths": "543,571",
      ...
      "deaths_per_1m_population": "1,636",
      "total_tests": "374,406,501",
      "tests_per_1m_population": "1,126,554"
    }
  ],
  "statistic_taken_at": "2021-03-12 00:00:02",
  "world_total": {
    "total_cases": "119,091,428",
    "new_cases": "467,990",
    ...
    "deaths_per_1m_population": "338.8",
    "statistic_taken_at": "2021-03-12 00:00:02"
  }
}
# print(type(response))
<class 'dict'>
Analyzing the result, the response is already a dict, so the json.loads(response) call is what raises the error and can be dropped. To transform countries_stat to a DataFrame, you can simply do:
# The following methods all produce the same output.
df1 = pd.DataFrame(response['countries_stat'])
df2 = pd.DataFrame.from_dict(response['countries_stat'])
df3 = pd.DataFrame.from_records(response['countries_stat'])
# print(df1)
country_name cases deaths region ... total_cases_per_1m_population deaths_per_1m_population total_tests tests_per_1m_population
0 USA 29,920,366 543,571 ... 90,028 1,636 374,406,501 1,126,554
1 India 11,305,877 158,325 ... 8,137 114 224,258,293 161,409
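One follow-up worth noting: the numeric fields come back as comma-formatted strings (e.g. "29,920,366"); a small sketch of converting a few of them to numbers (column names taken from the sample output above):

num_cols = ['cases', 'deaths', 'total_tests']
for col in num_cols:
    df1[col] = pd.to_numeric(df1[col].str.replace(',', ''), errors='coerce')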

normalize a column in pandas dataframe

I am able to import data from a JSON file using this code...
import requests
from pandas.io.json import json_normalize
url = "https://datameetgeobk.s3.amazonaws.com/image_list.json"
resp = requests.get(url=url)
df = json_normalize(resp.json()['Images'])
df.head()
But the column "BlockDeviceMappings" is actually a list, and each item has DeviceName and Ebs parameters, which are a string and a dict respectively. How do I further normalize my dataframe to include all the details in separate columns?
My screenshot does not match the one shown in the answer. The Ebs column (second from left) is a dictionary.
import requests
import pandas as pd
url = "https://datameetgeobk.s3.amazonaws.com/image_list.json"
resp = requests.get(url=url)
resp = resp.json()
What you have so far:
df = pd.json_normalize(resp['Images'])
BlockDeviceMappings flattened, with all the other columns carried along as metadata:
inner_keys = [x for x in resp['Images'][0].keys() if x != 'BlockDeviceMappings']
df_bdm = pd.json_normalize(resp['Images'], record_path=['BlockDeviceMappings'], meta=inner_keys, errors='ignore')
Separate bdm_df:
bdm_df = pd.json_normalize(resp['Images'], record_path=['BlockDeviceMappings'])
You will no doubt wonder why df has 39995 entries, while bdm_df has 131691 entries. This is because BlockDeviceMappings is a list of dicts of varying lengths:
bdm_len = [len(x) for x in df.BlockDeviceMappings]
max(bdm_len)
>>> 31
Sample BlockDeviceMappings entry:
[{'DeviceName': '/dev/sda1',
'Ebs': {'DeleteOnTermination': True,
'SnapshotId': 'snap-0aac2591b85fe677e',
'VolumeSize': 80,
'VolumeType': 'gp2',
'Encrypted': False}},
{'DeviceName': 'xvdb',
'Ebs': {'DeleteOnTermination': True,
'SnapshotId': 'snap-0bd8d7828225924a7',
'VolumeSize': 80,
'VolumeType': 'gp2',
'Encrypted': False}}]
df_bdm.head()
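As a small follow-up (assuming the structure shown in the sample entry above): json_normalize flattens the nested Ebs dict into dot-separated columns, so the Ebs details already end up in their own columns:

# columns such as 'Ebs.SnapshotId', 'Ebs.VolumeSize', 'Ebs.VolumeType' should be present
print([c for c in df_bdm.columns if c.startswith('Ebs.')])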

JSON response from iterating multiple URLs to store in a DataFrame

I have dynamic API URLs; each URL returns JSON data in the response like the following.
{
  "#type": "connection",
  "id": "001ZOZ0B00000000006Z",
  "orgId": "001ZOZ",
  "name": "WWW3",
  "description": "Test connection2",
  "createTime": "2018-07-20T18:28:05.000Z",
  "updateTime": "2018-07-20T18:28:53.000Z",
  "createdBy": "xx.xx#xx.com.dev",
  "updatedBy": "xx.xx#xx.com.dev",
  "agentId": "001ZOZ08000000000007",
  "runtimeEnvironmentId": "001ZOZ25000000000007",
  "instanceName": "ShareConsumer",
  "shortDescription": "Test connection2",
  "type": "TOOLKIT",
  "port": 0,
  "majorUpdateTime": "2018-07-20T18:28:05.000Z",
  "timeout": 60,
  "connParams": {
    "WSDL URL": "https://xxxservices1.work.com/xxx/service/xxport2/n5/Integration%20System/API__Data?wsdl",
    "Must Understand": "true",
    "DOMAIN": "n5",
    "agentId": "001ZOZ0800XXX0007",
    "agentGroupId": "001ZOZ25000XXX0007",
    "AUTHENTICATION_TYPE": "Auto",
    "HTTP Password": "********",
    "Encrypt password": "false",
    "orgId": "001Z9Z",
    "PRIVATE_KEY_FILE": "",
    "KEY_FILE_TYPE": "PEM",
    "mode": "UPDATE",
    "CERTIFICATE_FILE_PASSWORD": null,
    "CERTIFICATE_FILE": null,
    "TRUST_CERTIFICATES_FILE": null,
    "Username": "xxx#xxx",
    "CERTIFICATE_FILE_TYPE": "PEM",
    "KEY_PASSWORD": null,
    "TIMEOUT": "60",
    "Endpoint URL": "https://wxxservices1.xx.com/xxx/service/xxport2/n5/Integration%20System/API__Data",
    "connectionTypes": "NOAUTH",
    "HTTP Username": "API#n5",
    "Password": "********"
  }
}
Now the catch here is that I have close to 50 URLs that return this type of JSON data. I am iterating over them using the following code, but I am not able to store each URL's response in a pandas DataFrame.
Only the last response ends up stored there.
I would also like to convert the whole dataframe to CSV.
What is the best method to append the response of each URL to the dataframe and then convert it to CSV?
Python code as follows:
import requests
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
import urllib.error
import json
import pandas as pd
from pandas.io.json import json_normalize
import os
import csv

# This CSV file is where we get the IDs; we iterate over it to get the JSON data for each URL
ConnID_data_read = pd.read_csv('ConnID.csv', delimiter=',')
df = pd.DataFrame(ConnID_data_read)

user_iics_loginURL = 'https://xx-us.xxx.com/ma/api/v2/user/login'
headers = {
    'Content-Type': "application/json",
    'Accept': "application/json",
    'cache-control': "no-cache"
}
payload = "{\r\n\"#type\": \"login\",\r\n\"username\": \"xx#xx.com.xx\",\r\n\"password\": \"xxxx\"\r\n}"
response = requests.request("POST", user_iics_loginURL, data=payload, headers=headers)
resp_obj = json.loads(response.text)
session_id = resp_obj['SessionId']
server_URL = resp_obj['serverUrl']
print(session_id)

Finaldf = pd.DataFrame()

for index, row in df.iterrows():
    api_ver = "/api/v2/connection/" + row['id']
    # https://xx-us.xxx.com/saas/api/v2/connection/001ZOZ0B000000000066
    conndetails_url = server_URL + api_ver
    print(conndetails_url)
    act_headers = {
        'icSessionId': session_id,
        'Content-Type': "application/json",
        'cache-control': "no-cache",
    }
    act_response = requests.get(conndetails_url.strip(), headers=act_headers)
    print(act_response.text)
    print("Creating Data Frame on this***********************")
    act_json_data = json.loads(act_response.text)
    flat_json = json_normalize(act_json_data)
    print(flat_json)
    Conndf = pd.DataFrame(flat_json)
    Finaldf.append(Conndf)

Finaldf.to_csv('NewTest.csv')
The first thing I notice is:
flat_json = json_normalize(act_json_data)
print(flat_json)
Conndf = pd.DataFrame(flat_json)
When you do flat_json = json_normalize(act_json_data), flat_json is already a DataFrame. Doing Conndf = pd.DataFrame(flat_json) is unnecessary and redundant; it shouldn't cause a problem, but it's extra code you don't need.
Secondly, here's the issue: when you append the dataframe, you need to assign the result back. So change:
Finaldf.append(Conndf)
to
Finaldf = Finaldf.append(Conndf)
I'd also just reset the index, as that's a habit of mine when I append dataframes:
Finaldf = Finaldf.append(Conndf).reset_index(drop=True)
Other than that 1 line, it looks fine and you should get the full dataframe saved to csv with Finaldf.to_csv('NewTest.csv')
