ETL by parsing JSON dynamically, Python - python

I am new to Python.
I want to read the auth column from PostgreSQL, which contains JSON. I need to parse it and extract the relevant API credentials. Using these, I want to fetch the data, which is again JSON, but this time deeply nested, and the set of objects can differ from one response to another. From this JSON I want to collect all the keys and insert them as rows of the sourceColumnNames column in the source table. The target may have fewer columns than the source, for example only a and d from the source, mapped to name and PostalCode.
I am wondering how I can achieve this. It looks like it should be done with something like Scala case classes (target and source model classes), but it needs to be done in Python. How?
Data in AuthColumn is
{ "url": "https://api.myUrl.com/v2",
"headers": {
"Authorization": "TheSecretAccessToken2022",
"Content-Type": "application/json"
},
"data": {
"query": "{ boards{ items{ name column_values {a b c d} } } }"
} }
I need to parse it to get credentials and execute the query.
Then it will return some JSON which I need to parse.
This JSON could be like this
{
"data": {
"boards": [{
"name": "DP",
"id": "123",
"description": null,
"items": [{
"name": "TheColumn",
"column_values": [{
"a": "PDs",
"b": "PDs",
"c": "CI",
"d": "PV"
}, {
"a": "SLUD",
"b": "SLUD",
"c": "d",
"d": "MFO"
}, {
"a": "ST",
"b": "ST",
"c": "CI",
"d": "UC"
}, {
"a": "c",
"b": "c",
"c": "CI",
"d": "NC"
}, {
"a": "OP",
"b": "op",
"c": "CI",
"d": "0 days"
}, {
"a": "OPd",
"b": "OPd",
"c": "CI",
"d": "2022-02-25"
}, {
"a": "CD",
"b": "cd",
"c": "d",
"d": "2022-02-25"
}, {
"a": "cld",
"b": "cld",
"c": "d",
"d": "2022-04-22"
}, {
"a": "SoDce",
"b": "soDce",
"c": "CI",
"d": ""
}, {
"a": "MOD",
"b": "MOD",
"c": "date",
"d": ""
}, {
"a": "PP",
"b": "PP",
"c": "nuUDic",
"d": "625000"
}, {
"a": "UD",
"b": "UD",
"c": "nuUDic",
"d": ""
}, {
"a": "PAVSP",
"b": "PAVSP",
"c": "neUDic",
"d": ""
}, {
"a": "LendeUD",
"b": "lendeUD",
"c": "CI",
"d": "TBD"
}, {
"a": "ESP",
"b": "ESP",
"c": "CI",
"d": ""
}, {
"a": "ac",
"b": "ac",
"c": "CI",
"d": "Chicago"
}, {
"a": "SLd",
"b": "SLd",
"c": "CI",
"d": ""
}, {
"a": "UA",
"b": "UA",
"c": "CI",
"d": ""
}, {
"a": "UD",
"b": "UD",
"c": "CI",
"d": ""
}, {
"a": "R?",
"b": "R",
"c": "CI",
"d": ""
}, {
"a": "DDE",
"b": "DDE",
"c": "CI",
"d": ""
}, {
"a": "SOD",
"b": "SOD",
"c": "CI",
"d": ""
}, {
"a": "NOS",
"b": "NOS",
"c": "d",
"d": ""
}]
}, {
"name": "BBB",
"column_values": [{
"a": "PeUDs",
"b": "PeUDs",
"c": "CI",
"d": "PV"
}, {
"a": "SLUD",
"b": "SLUD",
"c": "d",
"d": "Ddd"
}, {
"a": "ST",
"b": "ST",
"c": "CI",
"d": "UC"
}, {
"a": "c",
"b": "c",
"c": "CI",
"d": "NC"
}, {
"a": "OP",
"b": "op",
"c": "CI",
"d": "0 days"
}, {
"a": "OPd",
"b": "OPd",
"c": "CI",
"d": "2022-02-23"
}, {
"a": "CD",
"b": "cd",
"c": "d",
"d": "2022-02-23"
}, {
"a": "cld",
"b": "cld",
"c": "d",
"d": "2022-03-04"
}, {
"a": "SoDce",
"b": "soDce",
"c": "CI",
"d": ""
}, {
"a": "MOD",
"b": "MOD",
"c": "date",
"d": ""
}, {
"a": "PP",
"b": "PP",
"c": "nuUDic",
"d": "3200"
}, {
"a": "UD",
"b": "UD",
"c": "numeic",
"d": ""
}, {
"a": "PDVSP",
"b": "PDVSP",
"c": "nueUDic",
"d": ""
}, {
"a": "ESP",
"b": "ESP",
"c": "CI",
"d": ""
}, {
"a": "ac",
"b": "ac",
"c": "CI",
"d": "Chicago a"
}, {
"a": "SLd",
"b": "SLd",
"c": "CI",
"d": ""
}, {
"a": "UA",
"b": "UA",
"c": "CI",
"d": ""
}, {
"a": "UD",
"b": "UD",
"c": "CI",
"d": ""
}, {
"a": "R?",
"b": "R",
"c": "CI",
"d": ""
}, {
"a": "DDE",
"b": "DDE",
"c": "CI",
"d": "DooU"
}, {
"a": "SOD",
"b": "SOD",
"c": "CI",
"d": ""
}, {
"a": "IU",
"b": "IU",
"c": "CI",
"d": ""
},{ "a": "DD",
"b": "DD",
"c": "CI",
"d": ""
}, {
"a": "LOS",
"b": "LOS",
"c": "num",
"d": ""
}, {
"a": "NOS",
"b": "NOS",
"c": "d",
"d": ""
}] }] }] }}
Now, I want to parse this JSON, get its keys, and insert them into the columnNames column of the Meta Data table
as
sourceColumnNames
name
id
description
items_name
a
b
c
d
Then I will query auth, get creds, and get values based on these source columns.
So far,
I have parsed JSON by json in python using index.
import json
# NOTE(review): indentation was lost in this listing, so the extent of the
# `with` block and of each loop body below cannot be recovered from the text.
# Load the saved API response from disk into a Python object.
with open('path/file.json') as myJson:
read_myjson = json.load(myJson)
# Drill into the response with hard-coded keys and indexes. Only the first
# board (boards[0]) and its first item (items[0]) are ever inspected.
read_data = read_myjson['data']
read_board = read_myjson['data']['boards']
board_name = read_myjson['data']['boards'][0]['name']
board_id = read_myjson['data']['boards'][0]['id']
board_description = read_myjson['data']['boards'][0]['description']
board_items = read_myjson['data']['boards'][0]['items']
board_items_name = read_myjson['data']['boards'][0]['items'][0]['name']
board_items_columnValues = read_myjson['data']['boards'][0]['items'][0]['column_values']
# The next four read keys a/b/c/d of the first column_values entry only.
board_items_columnValues_title = read_myjson['data']['boards'][0]['items'][0]['column_values'][0]['a']
board_items_columnValues_id = read_myjson['data']['boards'][0]['items'][0]['column_values'][0]['b']
board_items_columnValues_type = read_myjson['data']['boards'][0]['items'][0]['column_values'][0]['c']
board_items_columnValues_text = read_myjson['data']['boards'][0]['items'][0]['column_values'][0]['d']
# for loop on Header
# Iterates the top-level keys of the response; headerKey/headerValue are
# overwritten on every pass, so they end up holding the last pair.
print("printing Header loop : ")
for key, val in read_myjson.items():
print(key, ":::", val)
headerKey = key
headerValue = val
# Same pattern one level down, over the entries of read_myjson['data'].
print("printing data loop : it gives board key and its value")
for key, val in read_data.items():
# print(key, ":::", val)
datakey = key
dataValue = val
# print(datakey, "::::", dataValue)
# Walks every key/value of every item of the first board; nothing is
# accumulated, the two names just hold the most recent pair.
print(" items loop")
# for key, val in read_board.items():
for item in board_items:
for key, val in item.items():
# print(key, ":::", val)
compDataAsKey = key
compDataAsValue = val
# Collects every key and every value found in the column_values entries of
# the first item of the first board (repeated keys are appended each time).
print(" Items_column_values loop")
columnKeys = []
columnValues = []
for items in board_items_columnValues:
for key, val in items.items():
# print(key, ":", val)
# compColumnKey = key
# compColumnValue = val
columnKeys.append(key)
columnValues.append(val)
I have also tried dataclasses in python but cant actually map the class to json parse etc.
import json
import orjson, dataclasses
with open('path/AuthJsonSample.json') as myJson:
read_myjson = json.load(myJson)
@dataclasses.dataclass
class AuthData:
    """Connection details stored as JSON in the auth column.

    The fields mirror the three top-level keys of the auth document
    (``url``, ``headers``, ``data``), so an instance can be built directly
    from the parsed JSON with ``AuthData(**parsed)``.
    """

    # Endpoint the query is sent to.
    url: str
    # "headers" and "data" are JSON objects in the sample auth document, so
    # they are typed as dicts rather than strings.
    headers: dict
    data: dict
How can I make this etl pipeline?

Related

how to save multi level dict per line?

i have this dict
dd = {
"A": {"a": {"1": "b", "2": "f"}, "z": ["z", "q"]},
"B": {"b": {"1": "c", "2": "g"}, "z": ["x", "p"]},
"C": {"c": {"1": "d", "2": "h"}, "z": ["y", "o"]},
}
and i wanna have it formated in one line like this in a file i used
with open('file.json', 'w') as file: json.dump(dd, file, indent=1)
# result
{
"A": {
"a": {
"1": "b",
"2": "f"
},
"z": [
"z",
"q"
]
},
"B": {
"b": {
"1": "c",
"2": "g"
},
"z": [
"x",
"p"
]
},
"C": {
"c": {
"1": "d",
"2": "h"
},
"z": [
"y",
"o"
]
}
}
I also tried the following, but it wrote every entry as one quoted string, with Python-style single quotes around the nested dicts and lists:
with open('file.json', 'w') as file: file.write('{\n' +',\n'.join(json.dumps(f"{i}: {dd[i]}") for i in dd) +'\n}')
# result
{
"A: {'a': {'1': 'b', '2': 'f'}, 'z': ['z', 'q']}",
"B: {'b': {'1': 'c', '2': 'g'}, 'z': ['x', 'p']}",
"C: {'c': {'1': 'd', '2': 'h'}, 'z': ['y', 'o']}"
}
the result i wanna is
{
"A": {"a": {"1": "b", "2": "f"}, "z": ["z", "q"]},
"B": {"b": {"1": "c", "2": "g"}, "z": ["x", "p"]},
"C": {"c": {"1": "d", "2": "h"}, "z": ["y", "o"]},
}
how do i print the json content one line per dict while all inside is one line too?
i plan to read it using json.load
Stdlib json module does not really support that, but you should be able to write a function which does similar pretty easily. Something like:
import json
def my_dumps(dd):
    """Serialize *dd* as JSON with one top-level key per line.

    Every top-level key/value pair is rendered compactly on its own line, so
    the result is still valid JSON that ``json.load``/``json.loads`` can read.

    Args:
        dd: A mapping whose keys and values are JSON-serializable.

    Returns:
        The JSON text; ``"{}"`` for an empty mapping.
    """
    if not dd:
        # Avoid emitting a pair of braces around a stray blank line.
        return "{}"
    # Dump each pair as a one-entry object and strip the surrounding braces,
    # so json does all quoting/escaping of both the key and the value.
    lines = [json.dumps({k: v})[1:-1] for k, v in dd.items()]
    return "{\n" + ",\n".join(lines) + "\n}"
If all you wanted was to wrap json to some more human-friendly line width, without totally spacing out everything like using indent option does, then another option might be using textwrap:
>>> print("\n".join(textwrap.wrap(json.dumps(dd), 51)))
{"A": {"a": {"1": "b", "2": "f"}, "z": ["z", "q"]},
"B": {"b": {"1": "c", "2": "g"}, "z": ["x", "p"]},
"C": {"c": {"1": "d", "2": "h"}, "z": ["y", "o"]}}
import json

# One line of output per top-level key of dd. Each pair is dumped as a
# one-entry JSON object with its braces stripped, so json handles the quoting
# and escaping. Building the text with str(value).replace("'", '"') would
# corrupt any string containing an apostrophe and would write
# True/False/None instead of true/false/null.
entries = [json.dumps({key: dd[key]})[1:-1] for key in dd]
# Joining with ",\n" puts the separator only between entries, so there is no
# trailing comma to trim afterwards (and an empty dd still gives valid JSON).
x = ["{\n", ",\n".join(entries), "\n}"]
with open('file.json', 'w') as file:
    file.writelines(x)
Image of the output :-

How to matching two Dictionaries after comparing the index values

I want the key in the resulting dict to be the key in dict1 (i.e. k1) and the value in the resulting dict is the key from dict2 (k2) that has a value v2 equal to the value for k1 in dict1 i.e. v1 (v2==v1)
First dict1
dict1 = {
"a": "121",
"b": "132",
"c": "312",
"d": "434",
"e": "564",
"f": "663",
}
The second one is -
dict2 = {
"a": "312",
"b": "121",
"c": "564",
"d": "663",
"e": "434",
"f": "132",
}
The result should look like this -
Results = {
"a": "b",
"b": "f",
"c": "a",
"d": "e",
"e": "c",
"f": "d",
}
Dict is key-value pair. I would like to compare the value of dict1 with the value of dict2 and print the key of dict2
You need to create a new dict that swaps the keys and values of dict2. Note that your requirement implies that the values in dict2 are unique, so they can serve as keys.
Then
# Sample inputs: both dicts use the same keys and the same set of values,
# but the values are assigned to different keys.
dict1 = {"0": "1", "1": "4", "2": "5", "3": "6", "4": "7", "5": "8"}
dict2 = {"0": "6", "1": "8", "2": "4", "3": "1", "4": "5", "5": "7"}

# Reverse lookup for dict2 (value -> key). Should a value occur more than
# once, the last key holding it wins.
dict3 = dict(zip(dict2.values(), dict2.keys()))

# For each key of dict1, look up which dict2 key carries the same value;
# the entry is None when dict2 has no such value.
result = dict(zip(dict1.keys(), map(dict3.get, dict1.values())))
print(result)

How do I turn JSON Objects into a dict?

Working on a freshwater fish conservation project. I scraped a JSON file that looks like this:
{
"fish": [
{
"id": 0,
"n": "NO INFORMATION",
"a": "NONE",
"i": "none.png"
},
{
"id": 1,
"n": "Hampala barb",
"a": "Hampala macrolepidota",
"i": "hampala.png"
},
{
"id": 2,
"n": "Giant snakehead",
"a": "Channa micropeltes",
"i": "toman.png"
},
{
"id": 3,
"n": "Clown featherback",
"a": "Chitala ornata",
"i": "belida.png"
}
]
}
And I'm trying to extract the keys "id" and "a" into a python dictionary like this:
fish_id = {
0 : "NONE",
1 : "Hampala macrolepidota",
2 : "Channa micropeltes",
3 : "Chitala ornata"
}
import json
# The scraped document, embedded as JSON text.
data = """{
"fish": [
{
"id": 0,
"n": "NO INFORMATION",
"a": "NONE",
"i": "none.png"
},
{
"id": 1,
"n": "Hampala barb",
"a": "Hampala macrolepidota",
"i": "hampala.png"
},
{
"id": 2,
"n": "Giant snakehead",
"a": "Channa micropeltes",
"i": "toman.png"
},
{
"id": 3,
"n": "Clown featherback",
"a": "Chitala ornata",
"i": "belida.png"
}
]
}"""

# Decode the text, then map every fish's "id" to its "a" field. Looking the
# fields up by key keeps this independent of how many fish there are and of
# the order of the keys inside each object.
data_dict = json.loads(data)
fish_id = {item["id"]: item["a"] for item in data_dict["fish"]}
print(fish_id)
First create a fish.json file and get your JSON file;
with open('fish.json') as json_file:
data = json.load(json_file)
Then, take your fishes;
fish1 = data['fish'][0]
fish2 = data['fish'][1]
fish3 = data['fish'][2]
fish4 = data['fish'][3]
After that take only values for each, because you want to create a dictionary only from values;
value_list1=list(fish1.values())
value_list2=list(fish2.values())
value_list3=list(fish3.values())
value_list4=list(fish4.values())
Finally, create fish_id dictionary;
# Build the mapping from positional entries of each value list: index 0 is the
# first value of the fish object ("id" in the sample file) and index 2 is the
# third ("a"). The f-strings convert both the key and the value to strings.
# NOTE(review): this relies on the key order inside each JSON object and is
# hard-coded to exactly four fish.
fish_id = {
f"{value_list1[0]}" : f"{value_list1[2]}",
f"{value_list2[0]}" : f"{value_list2[2]}",
f"{value_list3[0]}" : f"{value_list3[2]}",
f"{value_list4[0]}" : f"{value_list4[2]}",
}
if you run;
print(fish_id)
The result will look like the output below (note that the keys are strings here). A for loop over data['fish'] would be more effective than handling each fish by hand.
{'0': 'NONE', '1': 'Hampala macrolepidota', '2': 'Channa micropeltes', '3': 'Chitala ornata'}

How to custom indent json dump?

I use indent = 2, but I want the first level of indentation to be zero. For example:
Partial Code
json.dump(json_data, json_file, indent=2)
Output
{
  "a": 1,
  "b": "2",
  "list": [
    {
      "c": 3,
      "d": 4,
    }
  ]
}
What I want instead
{
"a": 1,
"b": "2",
"list": [
  {
    "c": 3,
    "d": 4,
  }
]
}
As stated in the comments, it doesn't make functional difference and you will need custom pretty-print. something like
import json
import textwrap
spam = {"a": 1, "b": "2", "list": [{"c": 3, "d": 4}]}

# Pretty-print with indent=2, then remove one indentation level from
# everything between the outermost pair of brackets. Nested levels keep
# their indentation relative to the first level.
pretty = json.dumps(spam, indent=2).splitlines()
if len(pretty) > 1:
    opening, *inner, closing = pretty
    eggs = '\n'.join([opening, textwrap.dedent('\n'.join(inner)), closing])
else:
    # An empty container or a scalar is dumped on a single line; taking the
    # first and last line separately would duplicate it.
    eggs = pretty[0]
print(eggs)
with open('spam.json', 'w') as f:
    f.write(eggs)
output
{
"a": 1,
"b": "2",
"list": [
{
"c": 3,
"d": 4
}
]
}

Extracting a subset of attributes with JSONPath2 from two nodes of a json document

I have the following JSON document:
{
"A": "A_VALUE",
"B": {
"C": [
{
"D": {
"E": "E_VALUE1",
"F": "F_VALUE1",
"G": "G_VALUE1"
},
"H": ["01", "23" ]
},
{
"D": {
"E": "E_VALUE2",
"F": "F_VALUE2",
"G": "G_VALUE3"
},
"H": ["45", "67" ]
}
]
}
}
and I would like to extract field H using a jsonpath2 expression where I specify a value for E field,
for example :
$..C[?(@.D.G="G_VALUE1")].H[1]
The code I use to parse this is the following ( jsonpath version 0.4.3 ):
from jsonpath2.path import Path
s='{ "A": "A_VALUE", "B": { "C": [ { "D": { "E": "E_VALUE1", "F": "F_VALUE1", "G": "G_VALUE1" }, "H": ["01", "23" ] }, { "D": { "E": "E_VALUE2", "F": "F_VALUE2", "G": "G_VALUE3" }, "H": ["45", "67" ] } ] } }"'
p = Path.parse_str("$..C[?(#.D.E=\"E_VALUE1\")].H[1]")
print ([m.current_value for m in p.match(s)])
output
[]
Now, if I use the JsonPath evaluator on https://jsonpath.com/ I obtain the following result, which is not exactly what I need:
$..C[?(@.D.E="E_VALUE1")].H[1]
output
[23,67]
But if I change the expression this way, then it works and I obtain what I need:
$..C[?(@.D.E=="E_VALUE1")].H[1]
output
[23]
Same results with other online evaluator such as https://codebeautify.org/jsonpath-tester
So what would be the correct jsonpath expression I should use with jsonpath2 api in order to correctly extract the two required fields ?
You have to use [*] to access individual objects inside an array. This code works -
from jsonpath2.path import Path
import json
# The document from the question, as JSON text.
s='{ "A": "A_VALUE", "B": { "C": [ { "D": { "E": "E_VALUE1", "F": "F_VALUE1", "G": "G_VALUE1" }, "H": ["01", "23" ] }, { "D": { "E": "E_VALUE2", "F": "F_VALUE2", "G": "G_VALUE3" }, "H": ["45", "67" ] } ] } }'
# Decode the text first: the path is matched against the parsed Python
# object, not against the raw string.
jso = json.loads(s)
# C[*] visits each object in the array so the filter is applied per element;
# "@" refers to the current node inside the filter expression.
p = Path.parse_str('$..C[*][?(@.D.E="E_VALUE1")].H[1]')
print(*[m.current_value for m in p.match(jso)])  # 23
You can refer to this example from the jsonpath2 docs
You should use the == syntax.
Full disclosure: I had never heard of jsonpath before coming across your question, but being somewhat familiar with XPath, I figured I would read about this tool. I came across a site that can evaluate your expression using different implementations: http://jsonpath.herokuapp.com. The net result was that your expression with = could not be parsed by 3 of the 4 implementations. Moreover, the Goessner implementation returned results that you weren't expecting (all C elements matched and the result was [23,67]). With the == boolean expression, 3 of the 4 implementations provided the expected result of [23]. The Nebhale implementation again complained about the expression.

Categories

Resources