normalize a column in pandas dataframe - python

I am able to import data from a JSON file using this code:
import requests
from pandas.io.json import json_normalize
url = "https://datameetgeobk.s3.amazonaws.com/image_list.json"
resp = requests.get(url=url)
df = json_normalize(resp.json()['Images'])
df.head()
But the column "BlockDeviceMappings" is actually a list, and each item has DeviceName and Ebs parameters, which are a string and a dict respectively. How do I further normalize my dataframe to include all the details in separate columns?
My screenshot does not match the one shown in the answer. The Ebs column (second from left) is a dictionary.

import requests
import pandas as pd
url = "https://datameetgeobk.s3.amazonaws.com/image_list.json"
resp = requests.get(url=url)
resp = resp.json()
What you have so far:
df = pd.json_normalize(resp['Images'])
Expanding BlockDeviceMappings into rows, with the other top-level keys kept as meta columns:
inner_keys = [x for x in resp['Images'][0].keys() if x != 'BlockDeviceMappings']
df_bdm = pd.json_normalize(resp['Images'], record_path=['BlockDeviceMappings'], meta=inner_keys, errors='ignore')
Separate bdm_df:
bdm_df = pd.json_normalize(resp['Images'], record_path=['BlockDeviceMappings'])
You will no doubt wonder why df has 39995 entries, while bdm_df has 131691 entries. This is because BlockDeviceMappings is a list of dicts of varying lengths:
bdm_len = [len(x) for x in df.BlockDeviceMappings]
max(bdm_len)
>>> 31
Sample BlockDeviceMappings entry:
[{'DeviceName': '/dev/sda1',
  'Ebs': {'DeleteOnTermination': True,
          'SnapshotId': 'snap-0aac2591b85fe677e',
          'VolumeSize': 80,
          'VolumeType': 'gp2',
          'Encrypted': False}},
 {'DeviceName': 'xvdb',
  'Ebs': {'DeleteOnTermination': True,
          'SnapshotId': 'snap-0bd8d7828225924a7',
          'VolumeSize': 80,
          'VolumeType': 'gp2',
          'Encrypted': False}}]
df_bdm.head()
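Note how json_normalize flattens the nested Ebs dict into dot-separated columns (Ebs.SnapshotId, Ebs.VolumeSize, and so on). A minimal sketch using just the sample entry above; the sample variable is a hypothetical stand-in for one BlockDeviceMappings list:
import pandas as pd

# hypothetical stand-in for one BlockDeviceMappings entry from the question
sample = [{'DeviceName': '/dev/sda1',
           'Ebs': {'DeleteOnTermination': True,
                   'SnapshotId': 'snap-0aac2591b85fe677e',
                   'VolumeSize': 80,
                   'VolumeType': 'gp2',
                   'Encrypted': False}}]

# nested dicts become dot-separated columns such as 'Ebs.SnapshotId'
print(pd.json_normalize(sample).columns.tolist())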

Related

read data from firebase using pandas in python

I have read data from Firebase in Python, and I want to read it using pandas.
The code:
import pandas as pd
from firebase_admin import credentials
from firebase_admin import firestore
import firebase_admin
# get list data
cred = "card.json"
login = credentials.Certificate(cred)
#initialize firebase
# firebase_admin.initialize_app(login)
#reading from the database
db = firestore.client()
lists = db.collection("Lists").stream()
It works just fine, but when I try to use pandas I'm not sure how to do it. I tried this:
listing = pd.DataFrame(lists)
print(listing)
it printed this:
Empty DataFrame
Columns: []
Index: []
Am I missing something? Am I doing something wrong? Please help.
I created an empty data frame with just the column names, then went through the data coming from Firebase and added each record to the data frame I created, like this:
data = {'ListID': [], 'Title': [], 'Description': [], 'Cover': [], 'Access': [], 'uid': []}
df = pd.DataFrame(data)
for doc in lists:
    l = doc.to_dict()
    df2 = pd.DataFrame({'ListID': [l.get("ListID")],
                        'Title': [l.get("Title")],
                        'Description': [l.get("Description")],
                        'Cover': [l.get("Cover")],
                        'Access': [l.get("Access")],
                        'uid': [l.get("uid")]})
    df = pd.concat([df, df2], ignore_index=True, axis=0)
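A leaner variant is to collect the dicts first and build the frame once, which avoids the repeated pd.concat. A sketch, assuming the same fields and a fresh lists stream (the generator above is consumed after one pass):
import pandas as pd

# re-run the query: stream() returns a one-shot generator
lists = db.collection("Lists").stream()
# one to_dict() per document, then a single DataFrame construction
rows = [doc.to_dict() for doc in lists]
df = pd.DataFrame(rows, columns=['ListID', 'Title', 'Description', 'Cover', 'Access', 'uid'])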

Creating multiple dataframes using a loop or function

I'm trying to extract the hash rate for 3 cryptocurrencies, and I have attached the code for it below. Now, I want to pass three URLs and get back three different dictionaries holding the values. I'm stuck and I don't understand how I should go about it. I have tried using loops but it is not working out for me.
url = {'Bitcoin': 'https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y',
       'Ethereum': 'https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y',
       'Litecoin': 'https://bitinfocharts.com/comparison/litecoin-hashrate.html'}

for ele in url:
    #### requesting the page and extracting the script which has date and values
    session = requests.Session()
    page = session.get(ele[i])
    soup = BeautifulSoup(page.content, 'html.parser')
    values = str(soup.find_all('script')[4])
    values = values.split('d = new Dygraph(document.getElementById("container"),')[1]
    #create an empty dict to append date and hashrates
    dict([("crypto_1 %s" % i, []) for i in range(len(url))])
    #run a loop over all the dates and adding to dictionary
    for i in range(values.count('new Date')):
        date = values.split('new Date("')[i+1].split('"')[0]
        value = values.split('"),')[i+1].split(']')[0]
        dict([("crypto_1 %s" % i)[date] = value
You can use the next example to get data from all 3 URLs and create a dataframe/dictionary from it:
import re
import requests
import pandas as pd

url = {
    "Bitcoin": "https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y",
    "Ethereum": "https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y",
    "Litecoin": "https://bitinfocharts.com/comparison/litecoin-hashrate.html",
}

data = []
for name, u in url.items():
    html_doc = requests.get(u).text
    for date, hash_rate in re.findall(
        r'\[new Date\("(.*?)"\),(.*?)\]', html_doc
    ):
        data.append(
            {
                "Name": name,
                "Date": date,
                "Hash Rate": float("nan")
                if hash_rate == "null"
                else float(hash_rate),
            }
        )

df = pd.DataFrame(data)
df["Date"] = pd.to_datetime(df["Date"])

# here save df to CSV

# this will create a dictionary, where the keys are crypto names and values
# are dicts with keys Date/HashRate:
out = {}
for name, g in df.groupby("Name"):
    out[name] = g[["Date", "Hash Rate"]].to_dict(orient="list")

print(out)
Prints:
{
"Bitcoin": {
"Date": [
Timestamp("2009-01-03 00:00:00"),
Timestamp("2009-01-04 00:00:00"),
Timestamp("2009-01-05 00:00:00"),
...
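If a single wide table is more convenient than the per-coin dictionary, a pivot over the same df is one option (a sketch against the df built above, nothing else assumed):
# one row per date, one hash-rate column per coin
df_wide = df.pivot_table(index="Date", columns="Name", values="Hash Rate")
print(df_wide.head())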

Json file not formatted correctly when writing json differences with pandas and numpy

I am trying to compare two JSON files and then write another JSON file with the column names and the differences marked as yes or no. I am using pandas and numpy.
Below are sample files. In reality these JSON files are dynamic, meaning we don't know upfront how many keys there will be.
Input files:
fut.json
[
{
"AlarmName": "test",
"StateValue": "OK"
}
]
Curr.json:
[
{
"AlarmName": "test",
"StateValue": "OK"
}
]
Below is the code I have tried:
import json
import pandas as pd
import numpy as np

with open(r"c:\csv\fut.json", 'r+') as f:
    data_b = json.load(f)
with open(r"c:\csv\curr.json", 'r+') as f:
    data_a = json.load(f)

df_a = pd.json_normalize(data_a)
df_b = pd.json_normalize(data_b)
_, df_a = df_b.align(df_a, fill_value=np.NaN)
_, df_b = df_a.align(df_b, fill_value=np.NaN)

with open(r"c:\csv\report.json", 'w') as _file:
    for col in df_a.columns:
        df_temp = pd.DataFrame()
        df_temp[col + '_curr'], df_temp[col + '_fut'], df_temp[col + '_diff'] = df_a[col], df_b[col], np.where((df_a[col] == df_b[col]), 'No', 'Yes')
        #[df_temp.rename(columns={c:'Missing'}, inplace=True) for c in df_temp.columns if df_temp[c].isnull().all()]
        df_temp.fillna('Missing', inplace=True)
        with pd.option_context('display.max_colwidth', -1):
            _file.write(df_temp.to_json(orient='records'))
Expected output:
[
{
"AlarmName_curr": "test",
"AlarmName_fut": "test",
"AlarmName_diff": "No"
},
{
"StateValue_curr": "OK",
"StateValue_fut": "OK",
"StateValue_diff": "No"
}
]
Actual output: I am not able to parse it in a JSON validator. Below is the problem; the `][` should be replaced by ',' to get valid JSON, and I don't know why it prints like that:
[{"AlarmName_curr":"test","AlarmName_fut":"test","AlarmName_diff":"No"}][{"StateValue_curr":"OK","StateValue_fut":"OK","StateValue_diff":"No"}]
Edit 1: I tried the below as well:
_file.write(df_temp.to_json(orient='records', lines=True))
Now I get JSON which is again not parsable: the ',' is missing, and unless I manually add ',' between the two dicts and '[' ']' at the beginning and end, it does not parse:
[{"AlarmName_curr":"test","AlarmName_fut":"test","AlarmName_diff":"No"}{"StateValue_curr":"OK","StateValue_fut":"OK","StateValue_diff":"No"}]
Honestly, pandas is overkill for this... however:
load the dataframes as you did
concat them as columns; rename the columns
do the calcs and map the booleans to the desired Yes/No
to_json() returns a string, so use json.loads() to get it back into a list/dict, then filter columns to get to your required format
import json
import pandas as pd

data_b = [
    {
        "AlarmName": "test",
        "StateValue": "OK"
    }
]
data_a = [
    {
        "AlarmName": "test",
        "StateValue": "OK"
    }
]

df_a = pd.json_normalize(data_a)
df_b = pd.json_normalize(data_b)

df = pd.concat([df_a, df_b], axis=1)
df.columns = [c + "_curr" for c in df_a.columns] + [c + "_fut" for c in df_a.columns]
# a difference exists when the values are not equal
df["AlarmName_diff"] = df["AlarmName_curr"] != df["AlarmName_fut"]
df["StateValue_diff"] = df["StateValue_curr"] != df["StateValue_fut"]
df = df.replace({True: "Yes", False: "No"})

js = json.loads(df.loc[:, [c for c in df.columns if c.startswith("Alarm")]].to_json(orient="records"))
js += json.loads(df.loc[:, [c for c in df.columns if c.startswith("State")]].to_json(orient="records"))
js
Output:
[{'AlarmName_curr': 'test', 'AlarmName_fut': 'test', 'AlarmName_diff': 'No'},
 {'StateValue_curr': 'OK', 'StateValue_fut': 'OK', 'StateValue_diff': 'No'}]
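For completeness, the original symptom has a simple cause: every df_temp.to_json(orient='records') call emits a complete JSON array, and the loop writes those arrays back to back, producing `][`. A minimal sketch of fixing the question's own loop instead, collecting the records and dumping once (assumes the df_a, df_b, and pandas/numpy imports from the question):
import json
import numpy as np

records = []
for col in df_a.columns:
    df_temp = pd.DataFrame({
        col + '_curr': df_a[col],
        col + '_fut': df_b[col],
        col + '_diff': np.where(df_a[col] == df_b[col], 'No', 'Yes'),
    })
    # parse each per-column array and extend one flat list
    records.extend(json.loads(df_temp.fillna('Missing').to_json(orient='records')))

with open(r"c:\csv\report.json", 'w') as _file:
    json.dump(records, _file)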

How to use pandas DF as params in HTTP request

I have a list of places from an Excel file which I would like to enrich with GeoNames IDs. Starting from the Excel file I made a pandas data frame, and now I would like to use the values from the DF as params in my request.
Here is the script I made:
import pandas as pd
import requests
import json

require_cols = [1]
required_df = pd.read_excel('grp.xlsx', usecols=require_cols)
print(required_df)

url = 'http://api.geonames.org/searchJSON?'
params = {'username': "XXXXXXXX",
          'name_equals': (required_df),
          'maxRows': "1"}
e = requests.get(url, params=params)
pretty_json = json.loads(e.content)
print(json.dumps(pretty_json, indent=2))
The problem is related to the definition of this parameter:
'name_equals': (required_df)
I would like to use the places (around 15k) from the DF as the param, iteratively retrieve the related GeoNames ID for each, and write the output to a separate Excel file.
The simple request works:
import requests
import json
url = 'http://api.geonames.org/searchJSON?'
params = {'username': "XXXXXXX",
          'name_equals': "Aire",
          'maxRows': "1"}
e = requests.get(url, params=params)
pretty_json = json.loads(e.content)
print (json.dumps(pretty_json, indent=2))
#print(e.content)
The definition of the pandas data frame works as well:
# import pandas lib as pd
import pandas as pd
require_cols = [0,1]
# only read specific columns from an excel file
required_df = pd.read_excel('grp.xlsx', usecols = require_cols)
print(required_df)
I also tried via SPARQL without results so I decided to go via Python.
Thanks for your time.
You can use a for-loop:
import pandas as pd

df = pd.DataFrame({'Places': ['London', 'Paris', 'Berlin']})

for item in df['Places']:
    print('requests for:', item)
    # ... rest of code ...
or df.apply()
import pandas as pd

def run(item):
    print('requests for:', item)
    # ... rest of code ...
    return 'result for ' + item

df = pd.DataFrame({'Places': ['London', 'Paris', 'Berlin']})
df['Results'] = df['Places'].apply(run)
Thanks @furas for your reply.
I solved it like this:
import pandas as pd
import requests
import json

url = 'http://api.geonames.org/searchJSON?'
df = pd.read_excel('Book.xlsx', sheet_name='Sheet1', usecols="B")

for item in df.place_name:
    df.place_name.head()
    params = {'username': "XXXXXX",
              'name_equals': item,
              'maxRows': "1"}
    e = requests.get(url, params=params)
    pretty_json = json.loads(e.content)
    for item in pretty_json["geonames"]:
        print(json.dumps(item["geonameId"], indent=2))
        with open('data.json', 'w', encoding='utf-8') as f:
            json.dump(item["geonameId"], f, ensure_ascii=False, indent=4)
#print(e.content)
The only problem now is related to the JSON output: with print I get the complete list of IDs, but when I write the output to a file I get just the last ID from the list.
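The file ends up with only the last ID because open('data.json', 'w') truncates the file on every pass through the inner loop. A sketch of one fix, using the same url, df, and key names as above: collect the IDs in a list, then write once after the loop:
ids = []
for place in df.place_name:
    params = {'username': "XXXXXX",
              'name_equals': place,
              'maxRows': "1"}
    e = requests.get(url, params=params)
    pretty_json = json.loads(e.content)
    for g in pretty_json["geonames"]:
        ids.append(g["geonameId"])

# open the file once, after all IDs are collected
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(ids, f, ensure_ascii=False, indent=4)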

What is the data format returned by the AdWords API TargetingIdeaPage service?

When I query the AdWords API to get search volume data and trends through their TargetingIdeaSelector using the Python client library the returned data looks like this:
(TargetingIdeaPage){
   totalNumEntries = 1
   entries[] =
      (TargetingIdea){
         data[] =
            (Type_AttributeMapEntry){
               key = "KEYWORD_TEXT"
               value =
                  (StringAttribute){
                     Attribute.Type = "StringAttribute"
                     value = "keyword phrase"
                  }
            },
            (Type_AttributeMapEntry){
               key = "TARGETED_MONTHLY_SEARCHES"
               value =
                  (MonthlySearchVolumeAttribute){
                     Attribute.Type = "MonthlySearchVolumeAttribute"
                     value[] =
                        (MonthlySearchVolume){
                           year = 2016
                           month = 2
                           count = 2900
                        },
                        ...
                        (MonthlySearchVolume){
                           year = 2015
                           month = 3
                           count = 2900
                        },
                  }
            },
      },
}
This isn't JSON and appears to just be a messy Python list. What's the easiest way to flatten the monthly data into a Pandas dataframe with a structure like this?
Keyword | Year | Month | Count
keyword phrase 2016 2 10
The output is a sudsobject. I found that this code does the trick:
import suds.sudsobject as sudsobject
import pandas as pd
a = [sudsobject.asdict(x) for x in output]
df = pd.DataFrame(a)
Addendum: This was once correct, but newer versions of the API (I tested 201802) now return zeep objects. However, zeep.helpers.serialize_object should do the same trick.
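A minimal sketch of the zeep variant (assuming output is the same list of returned entries as in the suds snippet above):
from zeep.helpers import serialize_object
import pandas as pd

# serialize_object recursively converts zeep objects to dicts
a = [serialize_object(x) for x in output]
df = pd.DataFrame(a)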
Here's the complete code that I used to query the TargetingIdeaSelector with requestType STATS, and the method I used to parse the data into a usable dataframe. Note the section starting "Parse results to pandas dataframe", as this takes the output given in the question above and converts it to a dataframe. Probably not the fastest or best, but it works! Tested with Python 2.7.
"""This code pulls trends for a set of keywords, and parses into a dataframe.
The LoadFromStorage method is pulling credentials and properties from a
"googleads.yaml" file. By default, it looks for this file in your home
directory. For more information, see the "Caching authentication information"
section of our README.
"""
from googleads import adwords
import pandas as pd

adwords_client = adwords.AdWordsClient.LoadFromStorage()
PAGE_SIZE = 10

# Initialize appropriate service.
targeting_idea_service = adwords_client.GetService(
    'TargetingIdeaService', version='v201601')

# Construct selector object and retrieve related keywords.
offset = 0
stats_selector = {
    'searchParameters': [
        {
            'xsi_type': 'RelatedToQuerySearchParameter',
            'queries': ['donald trump', 'bernie sanders']
        },
        {
            # Language setting (optional).
            # The ID can be found in the documentation:
            # https://developers.google.com/adwords/api/docs/appendix/languagecodes
            'xsi_type': 'LanguageSearchParameter',
            'languages': [{'id': '1000'}],
        },
        {
            # Location setting
            'xsi_type': 'LocationSearchParameter',
            'locations': [{'id': '1027363'}]  # Burlington, Vermont
        }
    ],
    'ideaType': 'KEYWORD',
    'requestType': 'STATS',
    'requestedAttributeTypes': ['KEYWORD_TEXT', 'TARGETED_MONTHLY_SEARCHES'],
    'paging': {
        'startIndex': str(offset),
        'numberResults': str(PAGE_SIZE)
    }
}
stats_page = targeting_idea_service.get(stats_selector)

##########################################################################
# Parse results to pandas dataframe
stats_pd = pd.DataFrame()
if 'entries' in stats_page:
    for stats_result in stats_page['entries']:
        stats_attributes = {}
        for stats_attribute in stats_result['data']:
            #print (stats_attribute)
            if stats_attribute['key'] == 'KEYWORD_TEXT':
                kt = stats_attribute['value']['value']
            else:
                for i, val in enumerate(stats_attribute['value'][1]):
                    data = {'keyword': kt,
                            'year': val['year'],
                            'month': val['month'],
                            'count': val['count']}
                    data = pd.DataFrame(data, index=[i])
                    stats_pd = stats_pd.append(data, ignore_index=True)
print(stats_pd)
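One caveat for anyone running this today: DataFrame.append was removed in pandas 2.0. A sketch of the same parse loop on modern pandas, collecting plain dicts and building the frame once (same keys and stats_page as above):
rows = []
if 'entries' in stats_page:
    for stats_result in stats_page['entries']:
        for stats_attribute in stats_result['data']:
            if stats_attribute['key'] == 'KEYWORD_TEXT':
                kt = stats_attribute['value']['value']
            else:
                for val in stats_attribute['value'][1]:
                    rows.append({'keyword': kt,
                                 'year': val['year'],
                                 'month': val['month'],
                                 'count': val['count']})
# a single DataFrame construction replaces the per-row append
stats_pd = pd.DataFrame(rows)
print(stats_pd)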
