Storing the JSON response from multiple URLs in a pandas DataFrame - Python

I have a set of dynamic API URLs; each URL returns JSON data in the response, like the following:
{
  "@type": "connection",
  "id": "001ZOZ0B00000000006Z",
  "orgId": "001ZOZ",
  "name": "WWW3",
  "description": "Test connection2",
  "createTime": "2018-07-20T18:28:05.000Z",
  "updateTime": "2018-07-20T18:28:53.000Z",
  "createdBy": "xx.xx@xx.com.dev",
  "updatedBy": "xx.xx@xx.com.dev",
  "agentId": "001ZOZ08000000000007",
  "runtimeEnvironmentId": "001ZOZ25000000000007",
  "instanceName": "ShareConsumer",
  "shortDescription": "Test connection2",
  "type": "TOOLKIT",
  "port": 0,
  "majorUpdateTime": "2018-07-20T18:28:05.000Z",
  "timeout": 60,
  "connParams": {
    "WSDL URL": "https://xxxservices1.work.com/xxx/service/xxport2/n5/Integration%20System/API__Data?wsdl",
    "Must Understand": "true",
    "DOMAIN": "n5",
    "agentId": "001ZOZ0800XXX0007",
    "agentGroupId": "001ZOZ25000XXX0007",
    "AUTHENTICATION_TYPE": "Auto",
    "HTTP Password": "********",
    "Encrypt password": "false",
    "orgId": "001Z9Z",
    "PRIVATE_KEY_FILE": "",
    "KEY_FILE_TYPE": "PEM",
    "mode": "UPDATE",
    "CERTIFICATE_FILE_PASSWORD": null,
    "CERTIFICATE_FILE": null,
    "TRUST_CERTIFICATES_FILE": null,
    "Username": "xxx@xxx",
    "CERTIFICATE_FILE_TYPE": "PEM",
    "KEY_PASSWORD": null,
    "TIMEOUT": "60",
    "Endpoint URL": "https://wxxservices1.xx.com/xxx/service/xxport2/n5/Integration%20System/API__Data",
    "connectionTypes": "NOAUTH",
    "HTTP Username": "API@n5",
    "Password": "********"
  }
}
Now, the catch here is that I have around 50 URLs that return this type of JSON data. I am iterating over them using the following code, but I am not able to store the response from each URL in a pandas DataFrame; only the last response ends up stored there.
I would also like to convert this whole DataFrame to CSV.
What is the best method to append each URL's response to the DataFrame and then convert it to CSV?
Python code as follows:
import requests
import json
import pandas as pd
from pandas.io.json import json_normalize

# This CSV file gives us the connection IDs; we iterate over it to build each URL and get the JSON data for it
ConnID_data_read = pd.read_csv('ConnID.csv', delimiter=',')
df = pd.DataFrame(ConnID_data_read)

user_iics_loginURL = 'https://xx-us.xxx.com/ma/api/v2/user/login'
headers = {
    'Content-Type': "application/json",
    'Accept': "application/json",
    'cache-control': "no-cache"
}
payload = "{\r\n\"@type\": \"login\",\r\n\"username\": \"xx@xx.com.xx\",\r\n\"password\": \"xxxx\"\r\n}"

response = requests.request("POST", user_iics_loginURL, data=payload, headers=headers)
resp_obj = json.loads(response.text)
session_id = resp_obj['SessionId']
server_URL = resp_obj['serverUrl']
print(session_id)

Finaldf = pd.DataFrame()
for index, row in df.iterrows():
    api_ver = "/api/v2/connection/" + row['id']
    # e.g. https://xx-us.xxx.com/saas/api/v2/connection/001ZOZ0B000000000066
    conndetails_url = server_URL + api_ver
    print(conndetails_url)
    act_headers = {
        'icSessionId': session_id,
        'Content-Type': "application/json",
        'cache-control': "no-cache",
    }
    act_response = requests.get(conndetails_url.strip(), headers=act_headers)
    print(act_response.text)
    print("Creating Data Frame on this***********************")
    act_json_data = json.loads(act_response.text)
    flat_json = json_normalize(act_json_data)
    print(flat_json)
    Conndf = pd.DataFrame(flat_json)
    Finaldf.append(Conndf)

Finaldf.to_csv('NewTest.csv')

The first thing I notice is:
flat_json = json_normalize(act_json_data)
print(flat_json)
Conndf = pd.DataFrame(flat_json)
When you do flat_json = json_normalize(act_json_data), flat_json is already a DataFrame, so Conndf = pd.DataFrame(flat_json) is unnecessary and redundant. It shouldn't cause a problem; it's just extra code you don't need.
Secondly, here's the issue: append does not modify the DataFrame in place, it returns a new one, so when you append you need to assign the result back. Change:
Finaldf.append(Conndf)
to
Finaldf = Finaldf.append(Conndf)
I'd also reset the index, as that's just a habit of mine when I append DataFrames:
Finaldf = Finaldf.append(Conndf).reset_index(drop=True)
Other than that one line it looks fine, and you should get the full DataFrame saved to CSV with Finaldf.to_csv('NewTest.csv').
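As a side note, since each append creates a brand-new DataFrame, it can be cheaper to collect the flattened responses in a list and concatenate once at the end. A minimal sketch of that pattern, assuming the same df, server_URL, session_id, and act_headers as in the question:

frames = []
for index, row in df.iterrows():
    conndetails_url = server_URL + "/api/v2/connection/" + row['id']
    act_response = requests.get(conndetails_url.strip(), headers=act_headers)
    # json_normalize flattens the nested connParams keys into columns
    frames.append(json_normalize(json.loads(act_response.text)))

Finaldf = pd.concat(frames).reset_index(drop=True)
Finaldf.to_csv('NewTest.csv', index=False)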

Related

Payload from a defined list in Python

I am pretty new to Python and I am trying to create a script that will pull data from a ticketing platform.
I got the list of agents and their IDs, but when I try to pull the data it gives me this error:
KeyError: 'data'
Is there a way to have the "agents" parameter update automatically from the agent_id list?
Here is the code; I removed the links and the API key for privacy reasons:
import requests
import json
import openpyxl
from openpyxl import Workbook
from datetime import date
from datetime import timedelta

# Agents list
agents_list = ["Agent1", "Agent2", "Agent3"]
agent_id = []
agents_names = []

today = date.today()
yesterday = today - timedelta(days=1)
start_date = str(yesterday)
end_date = str(yesterday)
def extragere_date_agenti():
    url = "https://x.gorgias.com/api/users?limit=100&order_by=name%3Aasc&roles=agent&roles=admin"
    headers = {
        "accept": "application/json",
        "authorization": "Basic"
    }
    response = requests.get(url, headers=headers)
    text_name_id = json.loads(response.text)
    for names in text_name_id["data"]:
        agent_name = names["firstname"]
        agents_id = names["id"]
        if agent_name in agents_list:
            agents_names.append(agent_name)
            agent_id.append(agents_id)

extragere_date_agenti()
def extragere_numere():
    url = "https://x.gorgias.com/api/stats/total-messages-sent"
    payload = {"filters": {
        "period": {
            "start_datetime": start_date + "T00:00:00-05:00",
            "end_datetime": end_date + "T23:59:59-05:00"
        },
        "agents": [agent_id],  # This is the value that I want to modify
        "channels": ["email"]
    }}
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": "Basic"
    }
    response = requests.post(url, json=payload, headers=headers)
    text_numere = json.loads(response.text)
    numere_finale = text_numere["data"]["data"]["value"]
    print(numere_finale)
I've tried to do a for loop but it's giving me the same error. Any suggestions?
First, add a condition to check the response status code.
Also, add another condition to guard against this type of KeyError:
if "data" in text_name_id:
Your error:
KeyError: 'data'
means that text_name_id has no key named "data". It is difficult to tell you how to fix it without more info.
Are you sure the request returns a positive status? I see no error handling; if response.status_code == 200: should be enough to check.
Are you sure the response JSON has a key named "data"? Try this to set a default if the key is missing:
text_name_id.get("data", [{"firstname": "error", "id": 0}])
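Put together, a minimal sketch of both checks, using the url, headers, and lists from the question:

response = requests.get(url, headers=headers)
if response.status_code == 200:
    text_name_id = response.json()
    # fall back to an empty list if the payload has no "data" key
    for names in text_name_id.get("data", []):
        agent_name = names["firstname"]
        agents_id = names["id"]
        if agent_name in agents_list:
            agents_names.append(agent_name)
            agent_id.append(agents_id)
else:
    print("Request failed:", response.status_code, response.text)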
--- Edit ---
Okay, if that is the right JSON, I don't see an "id" or "firstname" key in it. But if it is the right JSON, then you can't iterate over a dict the way you did in Python. To do so you would want to do this:
for key, value in text_name_id['data']['data'].items():
    ...

JSON link from google developer tools not working in Python (or in browser)

I am trying to extract the data in the table at https://www.ecoregistry.io/emit-certifications/ra/10
Using the Google developer tools > Network tab, I was able to get the JSON link where the data for this table is stored: https://api-front.ecoregistry.io/api/project/10/emitcertifications
I am able to manually copy this JSON data and extract the information using this code I've written:
import json
import pandas as pd

data = '''PASTE JSON DATA HERE'''
info = json.loads(data)

columns = ['# Certificate', 'Carbon offsets destination', 'Final user', 'Taxpayer subject', 'Date', 'Tons delivered']
dat = list()
for x in info['emitcertifications']:
    dat.append([x['consecutive'], x['reasonUsingCarbonOffsets'], x['userEnd'], x['passiveSubject'], x['date'], x['quantity']])
df = pd.DataFrame(dat, columns=columns)
df.to_csv('Data.csv')
I want to automate this so that I can extract the data directly from the JSON link https://api-front.ecoregistry.io/api/project/10/emitcertifications instead of manually pasting the JSON data into:
data = '''PASTE JSON DATA HERE'''
The link is not working in Python, or even directly in the browser:
import requests
import json

url = 'https://api-front.ecoregistry.io/api/project/10/emitcertifications'
response = requests.get(url)
info = response.json()
print(json.dumps(info, indent=4))
The error output I get is:
{'status': 0, 'codeMessages': [{'codeMessage': 'ERROR_401', 'param': 'invalid', 'message': 'No autorizado'}]}
When I download the data from developer tools, this dictionary has 'status': 1, and after that all the data is there.
Edit: I tried adding request headers to the URL but it still did not work:
import requests
import json

url = 'https://api-front.ecoregistry.io/api/project/10/emitcertifications'
hdrs = {
    "accept": "application/json",
    "accept-language": "en-IN,en;q=0.9,hi-IN;q=0.8,hi;q=0.7,en-GB;q=0.6,en-US;q=0.5",
    "authorization": "Bearer null",
    "content-type": "application/json",
    "if-none-match": "W/\"1326f-t9xxnBEIbEANJdito3ai64aPjqA\"",
    "lng": "en",
    "platform": "ecoregistry",
    "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"100\", \"Google Chrome\";v=\"100\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-site"
}
response = requests.get(url, headers=hdrs)
print(response)
info = response.json()
print(json.dumps(info, indent=4))
print(response) gives the output '<Response [304]>', while info = response.json() gives the traceback error 'Expecting value: line 1 column 1 (char 0)'.
Can someone please point me in the right direction?
Thanks in advance!
Posting a comment as an answer:
The header required for that API in order to retrieve data is platform: ecoregistry.
import requests
import json

res = requests.get('https://api-front.ecoregistry.io/api/project/10/emitcertifications',
                   headers={'platform': 'ecoregistry'})
data = json.loads(res.text)
print(data.keys())
# dict_keys(['status', 'projectSerialYear', 'yearValidation', 'project', 'emitcertifications'])
print(data['emitcertifications'][0].keys())
# dict_keys(['id', 'auth', 'operation', 'typeRemoval', 'consecutive', 'serialInit', 'serialEnd', 'serial', 'passiveSubject', 'passiveSubjectNit', 'isPublicEndUser', 'isAccept', 'isCanceled', 'isCancelProccess', 'isUpdated', 'isKg', 'reasonUsingCarbonOffsetsId', 'reasonUsingCarbonOffsets', 'quantity', 'date', 'nitEnd', 'userEnd'])
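With that header in place, the manual copy-paste step can be dropped entirely. A sketch that feeds the live response straight into the question's original DataFrame code (same keys as above):

import requests
import pandas as pd

url = 'https://api-front.ecoregistry.io/api/project/10/emitcertifications'
info = requests.get(url, headers={'platform': 'ecoregistry'}).json()

columns = ['# Certificate', 'Carbon offsets destination', 'Final user', 'Taxpayer subject', 'Date', 'Tons delivered']
dat = [[x['consecutive'], x['reasonUsingCarbonOffsets'], x['userEnd'],
        x['passiveSubject'], x['date'], x['quantity']]
       for x in info['emitcertifications']]
pd.DataFrame(dat, columns=columns).to_csv('Data.csv', index=False)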

Urllib and databricks access token

I'm trying to score user input against a model hosted in Azure Databricks. For this I can only use urllib, which is a standard library inside Python. I did use requests for my need, and below is the code.
import numpy as np
import pandas as pd
import requests

def create_tf_serving_json(data):
    return {'inputs': {name: data[name].tolist() for name in data.keys()} if isinstance(data, dict) else data.tolist()}

def score_model(model_uri, databricks_token, data):
    headers = {
        "Authorization": f"Bearer {databricks_token}",
        "Content-Type": "application/json",
    }
    data_json = data.to_dict(orient='records') if isinstance(data, pd.DataFrame) else create_tf_serving_json(data)
    response = requests.request(method='POST', headers=headers, url=model_uri, json=data_json)
    if response.status_code != 200:
        raise Exception(f"Request failed with status {response.status_code}, {response.text}")
    return response.json()

# Scoring a model that accepts pandas DataFrames
data = pd.DataFrame([{
    "Pclass": 1,
    "Age": 22,
    "SibSp": 1,
    "Parch": 1,
    "Fare": 50
}])

score_model("My Model URL", 'Databricks Token', data)
Can someone help me do the same with urllib?
Thanks in advance!
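For reference, a urllib-only equivalent of score_model might look like the sketch below. It reuses create_tf_serving_json from above and assumes the same JSON contract as the requests version, with urllib.request and urllib.error from the standard library standing in for requests:

import json
import urllib.error
import urllib.request

import pandas as pd

def score_model_urllib(model_uri, databricks_token, data):
    headers = {
        "Authorization": f"Bearer {databricks_token}",
        "Content-Type": "application/json",
    }
    data_json = data.to_dict(orient='records') if isinstance(data, pd.DataFrame) else create_tf_serving_json(data)
    body = json.dumps(data_json).encode("utf-8")  # urllib wants the payload as bytes
    req = urllib.request.Request(model_uri, data=body, headers=headers, method="POST")
    try:
        with urllib.request.urlopen(req) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as e:
        # mirror the non-200 handling of the requests version
        raise Exception(f"Request failed with status {e.code}, {e.read().decode('utf-8')}")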

Display .csv in excel correctly

I have written a Python script that fetches data via an API and writes it to a CSV. But now I have the problem that when I open the CSV in Excel, everything is displayed in one column. I don't know what exactly I'm doing wrong; I have specified sep=";" (or sep=","), yet the same result is returned.
This is what my code looks like.
import os
import pandas as pd
import requests
from ftplib import FTP

url = "https://api.bexio.com/2.0/article"
headers = {
    'Accept': "application/json",
    'Content-Type': "application/json",
    'Authorization': "",
}

# Grab data from the Bexio API and export to CSV
response = requests.request("GET", url, headers=headers)
dic = response.json()
df = pd.DataFrame(dic)
column_map = {'intern_code': 'ProviderKey', 'stock_available_nr': 'QuantityOnStock'}
df.columns = df.columns.map(lambda x: column_map.get(x, x))
df = df[column_map.values()]
df = df[df['ProviderKey'].str.contains('MNM')]
df.to_csv('StockData.csv', index=False, sep=";")

# Send data to FTP
ftp = FTP("")
ftp.login("", "")
Output_Directory = ""
File2Send = ""
ftp.cwd(Output_Directory)
with open(File2Send, "rb") as f:
    ftp.storbinary('STOR ' + os.path.basename(File2Send), f)
Does anyone here have any idea what I am doing wrong?
Thanks a lot!
Update: added a screenshot of the CSV in vi.
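One thing to check: when Excel opens a .csv by double-clicking, it splits columns on the system list separator and ignores whatever sep you passed to to_csv. A common workaround is to write a sep=; hint, which Excel recognizes on the first line of the file; a sketch, assuming the df built above:

# write an Excel "sep=" hint first, then let pandas append the actual data
with open('StockData.csv', 'w', newline='', encoding='utf-8') as f:
    f.write('sep=;\n')  # tells Excel which delimiter to split on
    df.to_csv(f, index=False, sep=';')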

How to use pandas DF as params in HTTP request

I have a list of places from an Excel file which I would like to enrich with GeoNames IDs. Starting from the Excel file, I made a pandas DataFrame, and I would like to use the values from the DF as params in my request.
Here is the script I made:
import pandas as pd
import requests
import json

require_cols = [1]
required_df = pd.read_excel('grp.xlsx', usecols=require_cols)
print(required_df)

url = 'http://api.geonames.org/searchJSON?'
params = {'username': "XXXXXXXX",
          'name_equals': (required_df),
          'maxRows': "1"}
e = requests.get(url, params=params)
pretty_json = json.loads(e.content)
print(json.dumps(pretty_json, indent=2))
The problem is related to the definition of this parameter:
'name_equals': (required_df)
I would like to use the places (around 15k) from the DF as the param, iteratively retrieve the related GeoNames ID for each one, and write the output to a separate Excel file.
The simple request works:
import requests
import json

url = 'http://api.geonames.org/searchJSON?'
params = {'username': "XXXXXXX",
          'name_equals': "Aire",
          'maxRows': "1"}
e = requests.get(url, params=params)
pretty_json = json.loads(e.content)
print(json.dumps(pretty_json, indent=2))
#print(e.content)
As does the definition of the pandas DataFrame:
# import pandas lib as pd
import pandas as pd

require_cols = [0, 1]
# only read specific columns from an excel file
required_df = pd.read_excel('grp.xlsx', usecols=require_cols)
print(required_df)
I also tried via SPARQL without results, so I decided to go via Python.
Thanks for your time.
You can use a for loop:
import pandas as pd

df = pd.DataFrame({'Places': ['London', 'Paris', 'Berlin']})

for item in df['Places']:
    print('requests for:', item)
    # ... rest of code ...
or df.apply():
import pandas as pd

def run(item):
    print('requests for:', item)
    # ... rest of code ...
    return 'result for ' + item

df = pd.DataFrame({'Places': ['London', 'Paris', 'Berlin']})
df['Results'] = df['Places'].apply(run)
Thanks @furas for your reply.
I solved it like this:
import pandas as pd
import requests
import json

url = 'http://api.geonames.org/searchJSON?'
df = pd.read_excel('Book.xlsx', sheet_name='Sheet1', usecols="B")

for item in df.place_name:
    df.place_name.head()
    params = {'username': "XXXXXX",
              'name_equals': item,
              'maxRows': "1"}
    e = requests.get(url, params=params)
    pretty_json = json.loads(e.content)
    for item in pretty_json["geonames"]:
        print(json.dumps(item["geonameId"], indent=2))
        with open('data.json', 'w', encoding='utf-8') as f:
            json.dump(item["geonameId"], f, ensure_ascii=False, indent=4)
#print(e.content)
The only problem now is related to the JSON output: with print I get the complete list of IDs; however, when I write the output to a file I get just the last ID from the list.
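The likely cause is that data.json is reopened with mode 'w' on every iteration, so each ID overwrites the previous one. A sketch of one fix, collecting the IDs in a list and writing the file once after the loop:

import pandas as pd
import requests
import json

url = 'http://api.geonames.org/searchJSON?'
df = pd.read_excel('Book.xlsx', sheet_name='Sheet1', usecols="B")

geoname_ids = []
for place in df.place_name:
    params = {'username': "XXXXXX",
              'name_equals': place,
              'maxRows': "1"}
    e = requests.get(url, params=params)
    for record in json.loads(e.content)["geonames"]:
        geoname_ids.append(record["geonameId"])

# one write after the loop, so nothing gets overwritten
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(geoname_ids, f, ensure_ascii=False, indent=4)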
