How do I turn JSON Objects into a dict? - python

Working on a freshwater fish conservation project. I scraped a JSON file that looks like this:
{
"fish": [
{
"id": 0,
"n": "NO INFORMATION",
"a": "NONE",
"i": "none.png"
},
{
"id": 1,
"n": "Hampala barb",
"a": "Hampala macrolepidota",
"i": "hampala.png"
},
{
"id": 2,
"n": "Giant snakehead",
"a": "Channa micropeltes",
"i": "toman.png"
},
{
"id": 3,
"n": "Clown featherback",
"a": "Chitala ornata",
"i": "belida.png"
}
]
}
And I'm trying to extract the keys "id" and "a" into a python dictionary like this:
fish_id = {
0 : "NONE",
1 : "Hampala macrolepidota",
2 : "Channa micropeltes",
3 : "Chitala ornata"
}

import json

# JSON payload exactly as scraped.
data = """{
"fish": [
{
"id": 0,
"n": "NO INFORMATION",
"a": "NONE",
"i": "none.png"
},
{
"id": 1,
"n": "Hampala barb",
"a": "Hampala macrolepidota",
"i": "hampala.png"
},
{
"id": 2,
"n": "Giant snakehead",
"a": "Channa micropeltes",
"i": "toman.png"
},
{
"id": 3,
"n": "Clown featherback",
"a": "Chitala ornata",
"i": "belida.png"
}
]
}"""
data_dict = json.loads(data)
# Map each fish's numeric "id" to its scientific name (the "a" key).
# A dict comprehension replaces the manual empty-dict-then-assign loop.
fish_id = {item["id"]: item["a"] for item in data_dict["fish"]}
print(fish_id)

First, save your JSON into a fish.json file and load it:
with open('fish.json') as json_file:
data = json.load(json_file)
Then, take your fishes;
fish1 = data['fish'][0]
fish2 = data['fish'][1]
fish3 = data['fish'][2]
fish4 = data['fish'][3]
After that take only values for each, because you want to create a dictionary only from values;
value_list1=list(fish1.values())
value_list2=list(fish2.values())
value_list3=list(fish3.values())
value_list4=list(fish4.values())
Finally, create fish_id dictionary;
fish_id = {
f"{value_list1[0]}" : f"{value_list1[2]}",
f"{value_list2[0]}" : f"{value_list2[2]}",
f"{value_list3[0]}" : f"{value_list3[2]}",
f"{value_list4[0]}" : f"{value_list4[2]}",
}
if you run;
print(fish_id)
The result will be as below. Note that the f-strings turn both keys and values into strings, so the keys come out as '0', '1', … rather than the integers 0, 1, … in your example; a for loop (as in the first answer) is more effective and keeps the integer keys.
{'0': 'NONE', '1': 'Hampala macrolepidota', '2': 'Channa micropeltes', '3': 'Chitala ornata'}

Related

Extract events data from GA4 via bigquery in ADF synapse delta table

We need to extract the events table from GA4 through bigquery (not connecting via Google API directly as it limits both - the number of rows & number of dimensions/metrics), however as there are several nested columns, the ADF reads data in the given format:
{
"v": [{
"v": {
"f": [{
"v": "firebase_conversion"
}, {
"v": {
"f": [{
"v": null
}, {
"v": "0"
}, {
"v": null
}, {
"v": null
}]
}
}]
}
}, {
"v": {
"f": [{
"v": "ga_session_id"
}, {
"v": {
"f": [{
"v": null
}, {
"v": "123"
}, {
"v": null
}, {
"v": null
}]
}
}]
}
}, {
"v": {
"f": [{
"v": "engaged_session_event"
}, {
"v": {
"f": [{
"v": null
}, {
"v": "1"
}, {
"v": null
}, {
"v": null
}]
}
}]
}
}, {
"v": {
"f": [{
"v": "ga_session_number"
}, {
"v": {
"f": [{
"v": null
}, {
"v": "9"
}, {
"v": null
}, {
"v": null
}]
}
}]
}
}, {
"v": {
"f": [{
"v": "page_referrer"
}, {
"v": {
"f": [{
"v": "ABC"
}, {
"v": null
}, {
"v": null
}, {
"v": null
}]
}
}]
}
}, {
"v": {
"f": [{
"v": "page_title"
}, {
"v": {
"f": [{
"v": "ABC"
}, {
"v": null
}, {
"v": null
}, {
"v": null
}]
}
}]
}
}, {
"v": {
"f": [{
"v": "page_location"
}, {
"v": {
"f": [{
"v": "xyz"
}, {
"v": null
}, {
"v": null
}, {
"v": null
}]
}
}]
}
}, {
"v": {
"f": [{
"v": "session_engaged"
}, {
"v": {
"f": [{
"v": null
}, {
"v": "1"
}, {
"v": null
}, {
"v": null
}]
}
}]
}
}]
}
Unnesting is a problem as there are several columns with such data structure, and unnest will increase the number of rows (3.5mn records becomes 40mn). The plan is to maybe extract data as is & use azure functions with python functions to flatten it as JSON, but again the null values are creating trouble there.
Someone can suggest the best way to get data on daily basis without extrapolation in the desired format in the data lake?

reformatting a json file (python)

I have a json file which is split into 3 sections..
{
"columns": {
"0": "Account Number",
"1": "Airport",
"2": "Terminal",
},
"rows": [
[
[
{
"v": "1234 "
},
{
"v": "LHR - London Heathrow"
},
{
"v": "T3"
}
]
]
]
,"types": [
{
"0": "TEXT",
"1": "TEXT",
"2": "TEXT"
}
]
}
what i want it to be like is this :
{
"Account Number" : "1234",
"Airport" : "LHR - London Heathrow",
"Terminal" : "T3"
}
How can I achieve this, please?
Dependencies
import json
import ast
Reading Json as file
with open("file.json") as f:
data = ast.literal_eval(f.read())
Reading Json as String
# NOTE: this payload is not strictly valid JSON -- it carries a trailing comma
# (after "Terminal"), so json.loads would raise a JSONDecodeError on it.
askersString = """{
"columns": {
"0": "Account Number",
"1": "Airport",
"2": "Terminal",
},
"rows": [
[
[
{
"v": "1234 "
},
{
"v": "LHR - London Heathrow"
},
{
"v": "T3"
}
]
]
]
,"types": [
{
"0": "TEXT",
"1": "TEXT",
"2": "TEXT"
}
]
}"""
# ast.literal_eval parses the text as a Python dict literal instead, and Python
# literals tolerate trailing commas. (It would still fail on JSON-only tokens
# such as null/true/false, which have no Python literal spelling.)
data = ast.literal_eval(askersString)
Creating new json
# Pair each column header with the matching cell of the (single) data row:
# columns maps a string position ("0", "1", ...) to a header name, and each
# row cell is a {"v": value} wrapper.
columns = data["columns"]
a = data["rows"][0][0]
newJson = {header: a[int(pos)]['v'] for pos, header in columns.items()}
updatedJson = json.dumps(newJson, indent=4)
print(updatedJson)
Output
{
"Account Number": "1234 ",
"Airport": "LHR - London Heathrow",
"Terminal": "T3"
}

How to custom indent json dump?

I use indent = 2, but I want the first level of indentation to be zero. For example:
Partial Code
json.dump(json_data, json_file, indent=2)
Output
{
"a": 1,
"b": "2",
"list": [
{
"c": 3,
"d": 4,
}
]
}
What I want instead
{
"a": 1,
"b": "2",
"list": [
{
"c": 3,
"d": 4,
}
]
}
As stated in the comments, it doesn't make functional difference and you will need custom pretty-print. something like
import json
import textwrap

# Dump with indent=2, then strip one indent level from everything between the
# outermost braces so the first nesting level sits at column zero.
spam = {"a": 1, "b": "2",
        "list": [{"c": 3, "d": 4}]}
first, *middle, last = json.dumps(spam, indent=2).splitlines()
eggs = '\n'.join([first, textwrap.dedent('\n'.join(middle)), last])
print(eggs)
with open('spam.json', 'w') as f:
    f.write(eggs)
output
{
"a": 1,
"b": "2",
"list": [
{
"c": 3,
"d": 4
}
]
}

Extracting a subset of attributes with JSONPath2 from two nodes of a json document

I have the following JSON document:
{
"A": "A_VALUE",
"B": {
"C": [
{
"D": {
"E": "E_VALUE1",
"F": "F_VALUE1",
"G": "G_VALUE1"
},
"H": ["01", "23" ]
},
{
"D": {
"E": "E_VALUE2",
"F": "F_VALUE2",
"G": "G_VALUE3"
},
"H": ["45", "67" ]
}
]
}
}
and I would like to extract field H using a jsonpath2 expression where I specify a value for E field,
for example :
$..C[?(#.D.G="G_VALUE1")].H[1]
The code I use to parse this is the following ( jsonpath version 0.4.3 ):
from jsonpath2.path import Path
s='{ "A": "A_VALUE", "B": { "C": [ { "D": { "E": "E_VALUE1", "F": "F_VALUE1", "G": "G_VALUE1" }, "H": ["01", "23" ] }, { "D": { "E": "E_VALUE2", "F": "F_VALUE2", "G": "G_VALUE3" }, "H": ["45", "67" ] } ] } }"'
p = Path.parse_str("$..C[?(#.D.E=\"E_VALUE1\")].H[1]")
print ([m.current_value for m in p.match(s)])
output
[]
Now, if I use the JsonPath evaluator on https://jsonpath.com/ I obtain the following result, which is not exactly what I need:
$..C[?(#.D.E="E_VALUE1")].H[1]
output
[23,67]
But If I change the expression this way than it works and I obtain what I need;
$..C[?(#.D.E=="E_VALUE1")].H[1]
output
[23]
Same results with other online evaluator such as https://codebeautify.org/jsonpath-tester
So what would be the correct jsonpath expression I should use with jsonpath2 api in order to correctly extract the two required fields ?
You have to use [*] to access individual objects inside an array. This code works -
import json

from jsonpath2.path import Path

s='{ "A": "A_VALUE", "B": { "C": [ { "D": { "E": "E_VALUE1", "F": "F_VALUE1", "G": "G_VALUE1" }, "H": ["01", "23" ] }, { "D": { "E": "E_VALUE2", "F": "F_VALUE2", "G": "G_VALUE3" }, "H": ["45", "67" ] } ] } }'
# Path.match walks a parsed Python object, so decode the JSON text first.
jso = json.loads(s)
# [*] steps into each object of the C array before the filter tests D.E.
p = Path.parse_str('$..C[*][?(#.D.E="E_VALUE1")].H[1]')
print(*[match.current_value for match in p.match(jso)])  # prints: 23
You can refer to this example from the jsonpath2 docs
You should use the == syntax.
Full disclosure: I've never heard of jsonpath before coming across your question, but being somewhat familiar with XPath, I figured I would read about this tool. I came across a site that can evaluate your expression using different implementations: http://jsonpath.herokuapp.com. The net result was that your expression with = could not be parsed by 3 of the 4 implementations. Moreover, the Goessner implementation returned results that you weren't expecting (all C elements matched and the result was [23,67]). With the == boolean expression, 3 of the 4 implementations provided the expected result of [23]. The Nebhale implementation again complained about the expression.

Convert a CSV column to nested field in json

I have a csv file with some as the columns in the format x;y;z. I am using pandas to read this data, do some pre-processing and convert to a list of json objects using to_json/to_dict methods of pandas. While converting these special columns, the json object for that column should be of the format {x: {y: {z: value}}}. There could be different columns like x:y:z and x:y:a and these 2 have to be merged together into a single object in the resultant record json in the format i.e., {x: {y: {z: value1, a: value2}}}
CSV:
Id,Name,X;Y;Z,X;Y;A,X;B;Z
101,Adam,1,2,3
102,John,4,5,6
103,Sara,7,8,9
Output:
[
{
"Id":101,
"Name":"Adam",
"X":{
"Y":{
"Z":1,
"A":2
},
"B":{
"Z":3
}
}
},
{
"Id":102,
"Name":"John",
"X":{
"Y":{
"Z":4,
"A":5
},
"B":{
"Z":6
}
}
},
{
"Id":103,
"Name":"Sara",
"X":{
"Y":{
"Z":7,
"A":8
},
"B":{
"Z":9
}
}
}
]
I found it easier to use pandas to dump the data as a dict, then use a recursive function to iterate through the keys. Where I encounter a key that contains a ';', I split the key on this delimiter and recursively create the nested dicts. When I reach the last element of the split key, I store the original value there and remove the original flat key from the dict.
import pandas as pd
from io import StringIO
import json

def split_key_to_nested_dict(original_dict, original_key, nested_dict, nested_keys):
    """Move original_dict[original_key] into nested_dict under the chain nested_keys.

    Creates one nested dict per key segment; at the last segment the original
    value is stored and the flat 'x;y;z' key is deleted from original_dict.
    """
    if nested_keys[0] not in nested_dict:
        nested_dict[nested_keys[0]] = {}
    if len(nested_keys) == 1:
        nested_dict[nested_keys[0]] = original_dict[original_key]
        del original_dict[original_key]
    else:
        split_key_to_nested_dict(original_dict, original_key, nested_dict[nested_keys[0]], nested_keys[1:])

csv_data = StringIO("""Id,Name,X;Y;Z,X;Y;A,X;B;Z
101,Adam,1,2,3
102,John,4,5,6
103,Sara,7,8,9""")
# pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# read_csv keeps "Id" as an ordinary column, so the old index re-insert
# (df.insert(0, df.index.name, df.index)) is no longer needed.
df = pd.read_csv(csv_data)
dict_data = df.to_dict('records')
for data in dict_data:
    # Snapshot the keys: the helper deletes flat 'x;y;z' keys while we iterate.
    keys = list(data.keys())
    for key in keys:
        if ';' in key:
            nested_keys = key.split(';')
            split_key_to_nested_dict(data, key, data, nested_keys)
print(json.dumps(dict_data))
OUTPUT
[{"Id": 101, "Name": "Adam", "X": {"Y": {"Z": 1, "A": 2}, "B": {"Z": 3}}}, {"Id": 102, "Name": "John", "X": {"Y": {"Z": 4, "A": 5}, "B": {"Z": 6}}}, {"Id": 103, "Name": "Sara", "X": {"Y": {"Z": 7, "A": 8}, "B": {"Z": 9}}}]
FORMATTED OUTPUT
[
{
"Id": 101,
"Name": "Adam",
"X": {
"Y": {
"Z": 1,
"A": 2
},
"B": {
"Z": 3
}
}
},
{
"Id": 102,
"Name": "John",
"X": {
"Y": {
"Z": 4,
"A": 5
},
"B": {
"Z": 6
}
}
},
{
"Id": 103,
"Name": "Sara",
"X": {
"Y": {
"Z": 7,
"A": 8
},
"B": {
"Z": 9
}
}
}
]

Categories

Resources