Would you help, please, to parce 2-arrayed json via python, json_normalize.
Here is the code:
import json
from pandas.io.json import json_normalize
data5 = {
"id": "0001",
"type": "donut",
"name": "Cake",
"ppu": 0.55,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" },
{ "id": "1002", "type": "Chocolate" },
{ "id": "1003", "type": "Blueberry" },
{ "id": "1004", "type": "Devil's Food" }
]
},
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
{ "id": "5007", "type": "Powdered Sugar" },
{ "id": "5006", "type": "Chocolate with Sprinkles" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
}
df2 = json_normalize(data5
, record_path = ['topping']
, meta = ['id', 'type', 'name', 'ppu', 'batters']
, record_prefix='_'
, errors='ignore'
)
This parces "topping" object but doesn't parce the "batters".
To parce the "batters" may be applied the code:
# parce the part of json string into another dataframe
df3 = json_normalize(data5
,record_path = ['batters', 'batter'])
# cross join 2 dataframes
df2['key_'] = 1
df3['key_'] = 1
result = pd.merge(df2, df3, on ='key_').drop("key_", 1)
But this looks complicated.
Is it possible to combine 2 steps above in one query? E.g.:
df2 = json_normalize(data5
, record_path = ['topping', ['batters', 'batter']]
, meta = ['id', 'type', 'name', 'ppu', ]
, record_prefix='_'
, errors='ignore'
)
Thank you.
I don't think you can specify that within json_normalize. However, you can avoid creating the key_ column by specifying how="cross" in pd.merge (also no need to keep batters in df2):
import pandas as pd
df2 = pd.json_normalize(data5
, record_path = ['topping']
, meta = ['id', 'type', 'name', 'ppu']
, record_prefix='_'
)
df3 = pd.json_normalize(data5
,record_path = ['batters', 'batter'])
pd.merge(df2, df3, how="cross")
Related
I am trying to convert an object/dictionary to a Python DataFrame using the following code:
sr = pd.Series(object)
df = pd.DataFrame(sr.values.tolist())
display(df)
It works well but some of the output columns are of object/dictionary type, and I would like to break them up to multiple columns, for example, if column "Items" produces the following value in a cell:
obj = {
"item1": {
"id": "item1",
"relatedItems": [
{
"id": "1111",
"category": "electronics"
},
{
"id": "9999",
"category": "electronics",
"subcategory": "computers"
},
{
"id": "2222",
"category": "electronics",
"subcategory": "computers",
"additionalData": {
"createdBy": "Doron",
"inventory": 100
}
}
]
},
"item2": {
"id": "item2",
"relatedItems": [
{
"id": "4444",
"category": "furniture",
"subcategory": "sofas"
},
{
"id": "5555",
"category": "books",
},
{
"id": "6666",
"category": "electronics",
"subcategory": "computers",
"additionalData": {
"createdBy": "Joe",
"inventory": 5,
"condition": {
"name": "new",
"inspectedBy": "Doron"
}
}
}
]
}
}
The desired output is:
I tried using df.explode, but it multiplies the row to multiple rows, I am looking for a way to achieve the same but split into columns and retain a single row.
Any suggestions?
You can use the pd.json_normalize function to flatten the nested dictionary into multiple columns, with the keys joined with a dot (.).
sr = pd.Series({
'Items': {
'item_name': 'name',
'item_value': 'value'
}
})
df = pd.json_normalize(sr, sep='.')
display(df)
This will give you the following df
Items.item_name Items.item_value
0 name value
You can also specify the level of nesting by passing the record_path parameter to pd.json_normalize, for example, to only flatten the 'Items' key:
df = pd.json_normalize(sr, 'Items', sep='.')
display(df)
Seems like you're looking for pandas.json_normalize which has a (sep) parameter:
obj = {
'name': 'Doron Barel',
'items': {
'item_name': 'name',
'item_value': 'value',
'another_item_prop': [
{
'subitem1_name': 'just_another_name',
'subitem1_value': 'just_another_value',
},
{
'subitem2_name': 'one_more_name',
'subitem2_value': 'one_more_value',
}
]
}
}
df = pd.json_normalize(obj, sep='.')
ser = df.pop('items.another_item_prop').explode()
out = (df.join(pd.DataFrame(ser.tolist(), index=s.index)
.rename(columns= lambda x: ser.name+"."+x))
.groupby("name", as_index=False).first()
)
Output :
print(out)
name items.item_name items.item_value items.another_item_prop.subitem1_name items.another_item_prop.subitem1_value items.another_item_prop.subitem2_name items.another_item_prop.subitem2_value
0 Doron Barel name value just_another_name just_another_value one_more_name one_more_value
I'm trying to transform the JSON from this:
{
"meta": {
"ver": "3.0"
},
"cols": [
{
"name": "val"
}
],
"rows": [
"cols": [
{
"name": "ts"
},
{
"name": "v0"
},
{
"name": "v1"
},
{
"name": "v2"
},
{
"name": "v3"
},
{
"ts": {
"_kind": "dateTime",
"val": "2021-07-07T00:10:00-07:00",
"tz": "Los_Angeles"
},
"v3": {
"_kind": "number",
"val": 6167699.5,
"unit": "kWh"
}
},
{
"ts": {
"_kind": "dateTime",
"val": "2021-07-07T00:15:00-07:00",
"tz": "Los_Angeles"
},
"v0": {
"_kind": "number",
"val": 808926.0625,
"unit": "m\\u00b3"
},
"v1": {
"_kind": "number",
"val": 112999.3046875,
"unit": "m\\u00b3"
},
"v2": {
"_kind": "number",
"val": 8823498,
"unit": "kWh"
}
}
]
}
to a more simplified form using the pyjq module:
{
"data": {
"v0": [
[
"first timestamp",
val
],
[
"second timestamp",
val
]
],
"v1": [
[
"first timestamp",
val
],
[
"second timestamp",
val
]
]
}
}
I got started with the pyjq module, however I'm unsure about how to proceed with place two values (one str, one float) within the [] separated by a comma. Here's my code (returns error as expected).
import json
import pyjq
with open('file.json') as f:
data = json.load(f)
transformed = pyjq.all('{data: { meter_id_1: [[[.rows[].val.rows[].ts.val + "," + .rows[].val.rows[].v0.val]]}}', data)
Thanks in advance.
I am trying to link several Altair charts that share aspects of the same data. I can do this by merging all the data into one data frame, but because of the nature of the data the merged data frame is much larger than is needed to have two separate data frames for each of the two charts. This is because the columns unique to each chart have many repeated rows for each entry in the shared column.
Would using transform_lookup save space over just using the merged data frame, or does transform_lookup end up doing the whole merge internally?
No, the entire dataset is still included in the vegaspec when you use transform_lookup. You can see this by printing the json spec of the charts you create. With the example from the docs:
import altair as alt
import pandas as pd
from vega_datasets import data
people = data.lookup_people().head(3)
people
name age height
0 Alan 25 180
1 George 32 174
2 Fred 39 182
groups = data.lookup_groups().head(3)
groups
group person
0 1 Alan
1 1 George
2 1 Fred
With pandas merge:
merged = pd.merge(groups, people, how='left',
left_on='person', right_on='name')
print(alt.Chart(merged).mark_bar().encode(
x='mean(age):Q',
y='group:O'
).to_json())
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.8.1.json",
"config": {
"view": {
"continuousHeight": 300,
"continuousWidth": 400
}
},
"data": {
"name": "data-b41b97ffc89b39c92e168871d447e720"
},
"datasets": {
"data-b41b97ffc89b39c92e168871d447e720": [
{
"age": 25,
"group": 1,
"height": 180,
"name": "Alan",
"person": "Alan"
},
{
"age": 32,
"group": 1,
"height": 174,
"name": "George",
"person": "George"
},
{
"age": 39,
"group": 1,
"height": 182,
"name": "Fred",
"person": "Fred"
}
]
},
"encoding": {
"x": {
"aggregate": "mean",
"field": "age",
"type": "quantitative"
},
"y": {
"field": "group",
"type": "ordinal"
}
},
"mark": "bar"
}
With transform lookup all the data is there but as to separate dataset (so technically it takes a little bit of more space with the additional braces and the transform):
print(alt.Chart(groups).mark_bar().encode(
x='mean(age):Q',
y='group:O'
).transform_lookup(
lookup='person',
from_=alt.LookupData(data=people, key='name',
fields=['age'])
).to_json())
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.8.1.json",
"config": {
"view": {
"continuousHeight": 300,
"continuousWidth": 400
}
},
"data": {
"name": "data-5fe242a79352d1fe243b588af570c9c6"
},
"datasets": {
"data-2b374d1509415e1d327c3a7521f8117c": [
{
"age": 25,
"height": 180,
"name": "Alan"
},
{
"age": 32,
"height": 174,
"name": "George"
},
{
"age": 39,
"height": 182,
"name": "Fred"
}
],
"data-5fe242a79352d1fe243b588af570c9c6": [
{
"group": 1,
"person": "Alan"
},
{
"group": 1,
"person": "George"
},
{
"group": 1,
"person": "Fred"
}
]
},
"encoding": {
"x": {
"aggregate": "mean",
"field": "age",
"type": "quantitative"
},
"y": {
"field": "group",
"type": "ordinal"
}
},
"mark": "bar",
"transform": [
{
"from": {
"data": {
"name": "data-2b374d1509415e1d327c3a7521f8117c"
},
"fields": [
"age",
"height"
],
"key": "name"
},
"lookup": "person"
}
]
}
When transform_lookup can save space is if you use it with the URLs of two dataset:
people = data.lookup_people.url
groups = data.lookup_groups.url
print(alt.Chart(groups).mark_bar().encode(
x='mean(age):Q',
y='group:O'
).transform_lookup(
lookup='person',
from_=alt.LookupData(data=people, key='name',
fields=['age'])
).to_json())
{
"$schema": "https://vega.github.io/schema/vega-lite/v4.8.1.json",
"config": {
"view": {
"continuousHeight": 300,
"continuousWidth": 400
}
},
"data": {
"url": "https://vega.github.io/vega-datasets/data/lookup_groups.csv"
},
"encoding": {
"x": {
"aggregate": "mean",
"field": "age",
"type": "quantitative"
},
"y": {
"field": "group",
"type": "ordinal"
}
},
"mark": "bar",
"transform": [
{
"from": {
"data": {
"url": "https://vega.github.io/vega-datasets/data/lookup_people.csv"
},
"fields": [
"age",
"height"
],
"key": "name"
},
"lookup": "person"
}
]
}
How can we convert this to dataframes? I have tried multiple ways on how it can be achived, i have tried with json file on w3school but it is working correctly, i am new with python, any recommendations on this?
Json format is
[
{
"id": 14256,
"city": {
"id": {
"$numberLong": "14256"
},
"name": "Azadshahr",
"findname": "AZADSHAHR",
"country": "IR",
"coord": {
"lon": 48.570728,
"lat": 34.790878
},
"zoom": {
"$numberLong": "10"
}
}
},
{
"id": {
"$numberLong": "465726"
},
"city": {
"id": {
"$numberLong": "465726"
},
"name": "Zadonsk",
"findname": "ZADONSK",
"country": "RU",
"coord": {
"lon": 38.926102,
"lat": 52.3904
},
"zoom": {
"$numberLong": "16"
}
}
}
]
The expected output is :
it tried to do a conversion but i am receiving error and it is not the whole data
with open('data/history.city.list.json') as f:
data = json.load(f)
but not able to load as data, This is what i have tried but i feel
_id = []
country = []
coord_lat = []
coord_lon = []
counter = 0
for i in data:
_id.append(data[counter]['id'])
country.append(data[counter]['city']['country'])
coord_lat.append(data[counter]['city']['coord']['lon'])
coord_lat.append(data[counter]['city']['coord']['lat'])
counter += 1
When i have tried to print it as a dataframe
df = pd.DataFrame({'Longtitude' : coord_lat , 'Latitude' : coord_lat})
df.head(10)
This was able to set it to dataframe, but as soon as i add 'Country' to pd.dataframe() , it will return as ValueError: arrays must all be same length.
i understand that country column does not match the other columns but can we achieve this and is there a simpler way to do it ?
You can use json_normalize() as described here:
import pandas as pd
d = [
{
"id": 14256,
"city": {
"id": {
"$numberLong": "14256"
},
"name": "Azadshahr",
"findname": "AZADSHAHR",
"country": "IR",
"coord": {
"lon": 48.570728,
"lat": 34.790878
},
"zoom": {
"$numberLong": "10"
}
}
},
{
"id": {
"$numberLong": "465726"
},
"city": {
"id": {
"$numberLong": "465726"
},
"name": "Zadonsk",
"findname": "ZADONSK",
"country": "RU",
"coord": {
"lon": 38.926102,
"lat": 52.3904
},
"zoom": {
"$numberLong": "16"
}
}
}
]
pd.io.json.json_normalize(d)
Output:
id city.id.$numberLong city.name city.findname city.country city.coord.lon city.coord.lat city.zoom.$numberLong id.$numberLong
0 14256.0 14256 Azadshahr AZADSHAHR IR 48.570728 34.790878 10 NaN
1 NaN 465726 Zadonsk ZADONSK RU 38.926102 52.390400 16 465726
The column names do not match your expected output, but you can change that easily with df.columns = ['Id', 'city', ... 'Zoom']
I am trying to load json from a url and convert to a Pandas dataframe, so that the dataframe would look like the sample below.
I've tried json_normalize, but it duplicates the columns, one for each data type (value and stringValue). Is there a simpler way than this method and then dropping and renaming columns after creating the dataframe? I want to keep the stringValue.
Person ID Position ID Job ID Manager
0 192 936 93 Tom
my_json = {
"columns": [
{
"alias": "c3",
"label": "Person ID",
"dataType": "integer"
},
{
"alias": "c36",
"label": "Position ID",
"dataType": "string"
},
{
"alias": "c40",
"label": "Job ID",
"dataType": "integer",
"entityType": "job"
},
{
"alias": "c19",
"label": "Manager",
"dataType": "integer"
},
],
"data": [
{
"c3": {
"value": 192,
"stringValue": "192"
},
"c36": {
"value": "936",
"stringValue": "936"
},
"c40": {
"value": 93,
"stringValue": "93"
},
"c19": {
"value": 12412453,
"stringValue": "Tom"
}
}
]
}
If c19 is of type string, this should work
alias_to_label = {x['alias']: x['label'] for x in my_json["columns"]}
is_str = {x['alias']: ('string' == x['dataType']) for x in my_json["columns"]}
data = []
for x in my_json["data"]:
data.append({
k: v["stringValue" if is_str[k] else 'value']
for k, v in x.items()
})
df = pd.DataFrame(data).rename(columns=alias_to_label)