Extract events data from GA4 via bigquery in ADF synapse delta table - python

We need to extract the events table from GA4 through bigquery (not connecting via Google API directly as it limits both - the number of rows & number of dimensions/metrics), however as there are several nested columns, the ADF reads data in the given format:
{
"v": [{
"v": {
"f": [{
"v": "firebase_conversion"
}, {
"v": {
"f": [{
"v": null
}, {
"v": "0"
}, {
"v": null
}, {
"v": null
}]
}
}]
}
}, {
"v": {
"f": [{
"v": "ga_session_id"
}, {
"v": {
"f": [{
"v": null
}, {
"v": "123"
}, {
"v": null
}, {
"v": null
}]
}
}]
}
}, {
"v": {
"f": [{
"v": "engaged_session_event"
}, {
"v": {
"f": [{
"v": null
}, {
"v": "1"
}, {
"v": null
}, {
"v": null
}]
}
}]
}
}, {
"v": {
"f": [{
"v": "ga_session_number"
}, {
"v": {
"f": [{
"v": null
}, {
"v": "9"
}, {
"v": null
}, {
"v": null
}]
}
}]
}
}, {
"v": {
"f": [{
"v": "page_referrer"
}, {
"v": {
"f": [{
"v": "ABC"
}, {
"v": null
}, {
"v": null
}, {
"v": null
}]
}
}]
}
}, {
"v": {
"f": [{
"v": "page_title"
}, {
"v": {
"f": [{
"v": "ABC"
}, {
"v": null
}, {
"v": null
}, {
"v": null
}]
}
}]
}
}, {
"v": {
"f": [{
"v": "page_location"
}, {
"v": {
"f": [{
"v": "xyz"
}, {
"v": null
}, {
"v": null
}, {
"v": null
}]
}
}]
}
}, {
"v": {
"f": [{
"v": "session_engaged"
}, {
"v": {
"f": [{
"v": null
}, {
"v": "1"
}, {
"v": null
}, {
"v": null
}]
}
}]
}
}]
}
Unnesting is a problem, as there are several columns with such a data structure, and unnesting will increase the number of rows (3.5mn records become 40mn). The plan is to extract the data as-is and use Azure Functions with Python to flatten it as JSON, but the null values are creating trouble there as well.
Can someone suggest the best way to get the data on a daily basis, without extrapolation, into the data lake in the desired format?

Related

Generate Hash Value for Nested JSON Object

I have two nested JSON objects with different order of elements and would like to generate hash value for both. Now, I'm comparing these two hash values and it needs to be same. How can I achieve this mechanism?
First JSON Object
{
"X":{
"Y":[
{
"A":"1",
"B":{
"b1":"2",
"b2":"2"
}
},
{
"C":"4",
"D":{
"d1":"5",
"d2":"6"
}
},
],
"Z":[
{
"E":{
"e1":"7",
"e2":"8"
},
"F":"9"
}
]
}
}
Second JSON Object
{
"X":{
"Y":[
{
"C":"4",
"D":{
"d1":"5",
"d2":"6"
}
},
{
"A":"1",
"B":{
"b1":"2",
"b2":"2"
}
},
],
"Z":[
{
"E":{
"e1":"7",
"e2":"8"
},
"F":"9"
}
]
}
}
So, here goal is I want to generate same hash value for both JSON object. How can I achieve this in Python or Golang?
The JSON object is unsorted, as is the map[string]interface{}, so you should sort the maps. However, sorting could be complicated and time consuming.
Instead of using the objects I would like to use the JSON as string, this way the JSON can be Unmarshaled and the string can be sorted to create the SHA value.
// json1 is the first sample document: the two elements of "Y" appear in
// A/B then C/D order. The raw-string contents must stay byte-for-byte as
// written, since the example compares the strings themselves.
var json1 = `
{
"X": {
"Y": [
{
"A": "1",
"B": {
"b1": "2",
"b2": "2"
}
},
{
"C": "4",
"D": {
"d1": "5",
"d2": "6"
}
}
],
"Z": [
{
"E": {
"e1": "7",
"e2": "8"
},
"F": "9"
}
]
}
}
`
// json2 is semantically the same document as json1 except that the two
// elements of "Y" are listed in the opposite (C/D then A/B) order.
var json2 = `
{
"X": {
"Y": [
{
"C": "4",
"D": {
"d1": "5",
"d2": "6"
}
},
{
"A": "1",
"B": {
"b1": "2",
"b2": "2"
}
}
],
"Z": [
{
"E": {
"e1": "7",
"e2": "8"
},
"F": "9"
}
]
}
}
`
// JSON is a typed mirror of the json1/json2 documents so both can be
// unmarshaled into one strongly-typed value for comparison.
// NOTE(review): the Y element type merges the two observed element shapes
// (A/B and C/D) into a single struct; fields absent from a given array
// element are simply left at their zero values after Unmarshal.
type JSON struct {
	X struct {
		Y []struct {
			A string `json:"A"`
			B struct {
				B1 string `json:"b1"`
				B2 string `json:"b2"`
			} `json:"B"`
			C string `json:"C"`
			D struct {
				D1 string `json:"d1"`
				D2 string `json:"d2"`
			} `json:"D"`
		} `json:"Y"`
		Z []struct {
			E struct {
				E1 string `json:"e1"`
				E2 string `json:"e2"`
			} `json:"E"`
			F string `json:"F"`
		} `json:"Z"`
	} `json:"X"`
}
// SortString returns w with its characters rearranged into ascending
// order; any two anagrams therefore normalize to the same string.
func SortString(w string) string {
	chars := strings.Split(w, "")
	sort.Sort(sort.StringSlice(chars))
	return strings.Join(chars, "")
}
// main demonstrates three comparisons of json1 and json2:
//  1. DeepEqual on untyped interface{} values — false, because the "Y"
//     array elements appear in a different order in the two documents.
//  2. DeepEqual on the typed JSON structs — false, for the same reason.
//  3. DeepEqual on the character-sorted strings — true.
// NOTE(review): step 3 only proves the two strings are anagrams of each
// other; any two distinct documents with the same multiset of characters
// would also compare equal, so this is a weak basis for a hash/equality
// check. Unmarshal error values are ignored throughout.
func main() {
	var v1, v2 interface{}
	json.Unmarshal([]byte(json1), &v1)
	json.Unmarshal([]byte(json2), &v2)
	fmt.Println(reflect.DeepEqual(v1, v2))
	var m1, m2 JSON
	json.Unmarshal([]byte(json1), &m1)
	json.Unmarshal([]byte(json2), &m2)
	fmt.Println(reflect.DeepEqual(m1, m2))
	// The globals are overwritten with their sorted forms here.
	json1 = SortString(json1)
	json2 = SortString(json2)
	fmt.Println(reflect.DeepEqual(json1, json2))
}
Keep in mind that the objects are unsorted, so you should evaluate if creating a JSONSort function is important (considering that JSON could be different each time).
go run json.go
false
false
true

How do I turn JSON Objects into a dict?

Working on a freshwater fish conservation project. I scraped a JSON file that looks like this:
{
"fish": [
{
"id": 0,
"n": "NO INFORMATION",
"a": "NONE",
"i": "none.png"
},
{
"id": 1,
"n": "Hampala barb",
"a": "Hampala macrolepidota",
"i": "hampala.png"
},
{
"id": 2,
"n": "Giant snakehead",
"a": "Channa micropeltes",
"i": "toman.png"
},
{
"id": 3,
"n": "Clown featherback",
"a": "Chitala ornata",
"i": "belida.png"
}
]
}
And I'm trying to extract the keys "id" and "a" into a python dictionary like this:
fish_id = {
0 : "NONE",
1 : "Hampala macrolepidota",
2 : "Channa micropeltes",
3 : "Chitala ornata"
}
import json

# Raw JSON scraped for the freshwater fish project.
data = """{
"fish": [
{
"id": 0,
"n": "NO INFORMATION",
"a": "NONE",
"i": "none.png"
},
{
"id": 1,
"n": "Hampala barb",
"a": "Hampala macrolepidota",
"i": "hampala.png"
},
{
"id": 2,
"n": "Giant snakehead",
"a": "Channa micropeltes",
"i": "toman.png"
},
{
"id": 3,
"n": "Clown featherback",
"a": "Chitala ornata",
"i": "belida.png"
}
]
}"""

# Map each record's "id" to its scientific name ("a").
parsed = json.loads(data)
fish_id = {record["id"]: record["a"] for record in parsed["fish"]}
print(fish_id)
First create a fish.json file and get your JSON file;
# Load the scraped data from fish.json in the working directory;
# json.load parses directly from the open file object.
with open('fish.json') as json_file:
    data = json.load(json_file)
Then, take your fishes;
# Pull out the four fish records individually (list order matches their ids).
fish1, fish2, fish3, fish4 = data['fish'][:4]
After that take only values for each, because you want to create a dictionary only from values;
# Keep only each record's values, preserving the key declaration order
# (id, n, a, i).
value_list1, value_list2, value_list3, value_list4 = (
    list(record.values()) for record in (fish1, fish2, fish3, fish4)
)
Finally, create fish_id dictionary;
# Build the id -> scientific-name mapping. Note that both the keys and
# the values are turned into strings here (f"{x}" formats like str(x)).
fish_id = {
    str(values[0]): str(values[2])
    for values in (value_list1, value_list2, value_list3, value_list4)
}
if you run;
print(fish_id)
The result will look like the output below, but using a for loop would be more effective.
{'0': 'NONE', '1': 'Hampala macrolepidota', '2': 'Channa micropeltes', '3': 'Chitala ornata'}

reformatting a json file (python)

I have a json file which is split into 3 sections..
{
"columns": {
"0": "Account Number",
"1": "Airport",
"2": "Terminal",
},
"rows": [
[
[
{
"v": "1234 "
},
{
"v": "LHR - London Heathrow"
},
{
"v": "T3"
}
]
]
]
,"types": [
{
"0": "TEXT",
"1": "TEXT",
"2": "TEXT"
}
]
}
what i want it to be like is this :
{
"Account Number" : "1234",
"Airport" : "LHR - London Heathrow",
"Terminal" : "T3"
}
How can I achieve this, please?
Dependencies
import json
import ast
Reading Json as file
# Parse with ast.literal_eval rather than json.load: the asker's data has
# a trailing comma (after "Terminal"), which strict JSON parsers reject
# but which is valid in a Python dict literal.
with open("file.json") as f:
    data = ast.literal_eval(f.read())
Reading Json as String
# The asker's JSON as a string. It contains a trailing comma after
# "Terminal", so json.loads would raise; ast.literal_eval accepts it
# because the text is also a valid Python dict/list literal.
askersString = """{
"columns": {
"0": "Account Number",
"1": "Airport",
"2": "Terminal",
},
"rows": [
[
[
{
"v": "1234 "
},
{
"v": "LHR - London Heathrow"
},
{
"v": "T3"
}
]
]
]
,"types": [
{
"0": "TEXT",
"1": "TEXT",
"2": "TEXT"
}
]
}"""
data = ast.literal_eval(askersString)
Creating new json
# Pair each column label with the matching cell's "v" value. The cells for
# the (single) row live two list levels deep under "rows".
columns = data["columns"]
cells = data["rows"][0][0]
newJson = {label: cells[int(index)]["v"] for index, label in columns.items()}
updatedJson = json.dumps(newJson, indent=4)
print(updatedJson)
Output
{
"Account Number": "1234 ",
"Airport": "LHR - London Heathrow",
"Terminal": "T3"
}

Extract specific data from JSON data from Google Cloud Vision

I am quite new to Raspberry Pi and Python coding but I was successful in configuring Google Cloud Vision. However the JSON dump looks like:
{
"responses": [
{
"faceAnnotations": [
{
"angerLikelihood": "UNLIKELY",
"blurredLikelihood": "VERY_UNLIKELY",
"boundingPoly": {
"vertices": [
{
"x": 129
},
{
"x": 370
},
{
"x": 370,
"y": 240
},
{
"x": 129,
"y": 240
}
]
},
"detectionConfidence": 0.99543685,
"fdBoundingPoly": {
"vertices": [
{
"x": 162,
"y": 24
},
{
"x": 337,
"y": 24
},
{
"x": 337,
"y": 199
},
{
"x": 162,
"y": 199
}
]
},
"headwearLikelihood": "VERY_UNLIKELY",
"joyLikelihood": "VERY_UNLIKELY",
"landmarkingConfidence": 0.77542377,
"landmarks": [
{
"position": {
"x": 210.93373,
"y": 92.71409,
"z": -0.00025338508
},
"type": "LEFT_EYE"
},
{
"position": {
"x": 280.00177,
"y": 82.57283,
"z": 0.49017733
},
"type": "RIGHT_EYE"
},
{
"position": {
"x": 182.08047,
"y": 77.89372,
"z": 6.825161
},
"type": "LEFT_OF_LEFT_EYEBROW"
},
{
"position": {
"x": 225.82335,
"y": 72.88091,
"z": -13.963233
},
"type": "RIGHT_OF_LEFT_EYEBROW"
},
{
"position": {
"x": 260.4491,
"y": 66.19005,
"z": -13.798634
},
"type": "LEFT_OF_RIGHT_EYEBROW"
},
{
"position": {
"x": 303.87503,
"y": 59.69522,
"z": 7.8336163
},
"type": "RIGHT_OF_RIGHT_EYEBROW"
},
{
"position": {
"x": 244.57729,
"y": 83.701904,
"z": -15.022567
},
"type": "MIDPOINT_BETWEEN_EYES"
},
{
"position": {
"x": 251.58353,
"y": 124.68004,
"z": -36.52176
},
"type": "NOSE_TIP"
},
{
"position": {
"x": 255.39096,
"y": 151.87607,
"z": -19.560472
},
"type": "UPPER_LIP"
},
{
"position": {
"x": 259.96045,
"y": 178.62886,
"z": -14.095398
},
"type": "LOWER_LIP"
},
{
"position": {
"x": 232.35422,
"y": 167.2542,
"z": -1.0750997
},
"type": "MOUTH_LEFT"
},
{
"position": {
"x": 284.49316,
"y": 159.06075,
"z": -0.078973025
},
"type": "MOUTH_RIGHT"
},
{
"position": {
"x": 256.94714,
"y": 163.11235,
"z": -14.0897665
},
"type": "MOUTH_CENTER"
},
{
"position": {
"x": 274.47885,
"y": 125.8553,
"z": -7.8479633
},
"type": "NOSE_BOTTOM_RIGHT"
},
{
"position": {
"x": 231.2164,
"y": 132.60686,
"z": -8.418254
},
"type": "NOSE_BOTTOM_LEFT"
},
{
"position": {
"x": 252.96692,
"y": 135.81783,
"z": -19.805998
},
"type": "NOSE_BOTTOM_CENTER"
},
{
"position": {
"x": 208.6943,
"y": 86.72571,
"z": -4.8503814
},
"type": "LEFT_EYE_TOP_BOUNDARY"
},
{
"position": {
"x": 223.4354,
"y": 90.71454,
"z": 0.42966545
},
"type": "LEFT_EYE_RIGHT_CORNER"
},
{
"position": {
"x": 210.67189,
"y": 96.09362,
"z": -0.62435865
},
"type": "LEFT_EYE_BOTTOM_BOUNDARY"
},
{
"position": {
"x": 195.00711,
"y": 93.783226,
"z": 6.6310787
},
"type": "LEFT_EYE_LEFT_CORNER"
},
{
"position": {
"x": 208.30045,
"y": 91.73073,
"z": -1.7749802
},
"type": "LEFT_EYE_PUPIL"
},
{
"position": {
"x": 280.8329,
"y": 75.722244,
"z": -4.3266015
},
"type": "RIGHT_EYE_TOP_BOUNDARY"
},
{
"position": {
"x": 295.9134,
"y": 78.8241,
"z": 7.3644505
},
"type": "RIGHT_EYE_RIGHT_CORNER"
},
{
"position": {
"x": 281.82813,
"y": 85.56999,
"z": -0.09711724
},
"type": "RIGHT_EYE_BOTTOM_BOUNDARY"
},
{
"position": {
"x": 266.6147,
"y": 83.689865,
"z": 0.6850431
},
"type": "RIGHT_EYE_LEFT_CORNER"
},
{
"position": {
"x": 282.31485,
"y": 80.471725,
"z": -1.3341979
},
"type": "RIGHT_EYE_PUPIL"
},
{
"position": {
"x": 202.4563,
"y": 66.06882,
"z": -8.493092
},
"type": "LEFT_EYEBROW_UPPER_MIDPOINT"
},
{
"position": {
"x": 280.76108,
"y": 54.08935,
"z": -7.895889
},
"type": "RIGHT_EYEBROW_UPPER_MIDPOINT"
},
{
"position": {
"x": 168.31839,
"y": 134.46411,
"z": 89.73161
},
"type": "LEFT_EAR_TRAGION"
},
{
"position": {
"x": 332.23724,
"y": 109.35637,
"z": 90.81501
},
"type": "RIGHT_EAR_TRAGION"
},
{
"position": {
"x": 242.81676,
"y": 67.845825,
"z": -16.629877
},
"type": "FOREHEAD_GLABELLA"
},
{
"position": {
"x": 264.32065,
"y": 208.95119,
"z": -4.0186276
},
"type": "CHIN_GNATHION"
},
{
"position": {
"x": 183.4723,
"y": 179.30655,
"z": 59.87147
},
"type": "CHIN_LEFT_GONION"
},
{
"position": {
"x": 331.6927,
"y": 156.69931,
"z": 60.93835
},
"type": "CHIN_RIGHT_GONION"
}
],
"panAngle": 0.41165036,
"rollAngle": -8.687789,
"sorrowLikelihood": "VERY_UNLIKELY",
"surpriseLikelihood": "VERY_UNLIKELY",
"tiltAngle": 0.2050134,
"underExposedLikelihood": "POSSIBLE"
}
]
}
]
}
Yes, it's an eyesore to look at. I am only wanting to extract the likelihood. Preferably in this format:
Anger likelihood is UNLIKELY
Joy likelihood is VERY_UNLIKELY
Sorrow likelihood is VERY_UNLIKELY
Surprise likelihood is VERY_UNLIKELY
Python code can be found here:
https://github.com/DexterInd/GoogleVisionTutorials/blob/master/camera-vision-face.py
I answered my own question in perhaps the noobiest way:
# Print the four likelihood fields from the first face annotation.
# Fixed: the original used Python 2 print statements ('print "x",'),
# which are a syntax error under Python 3. print()'s default separator
# reproduces the original "label: value" spacing exactly.
face = response['responses'][0]['faceAnnotations'][0]
print("Anger likelihood is:", face['angerLikelihood'])
print("Joy likelihood is:", face['joyLikelihood'])
print("Sorrow likelihood is:", face['sorrowLikelihood'])
print("Surprise likelihood is:", face['surpriseLikelihood'])
Came out looking like:
Anger likelihood is: VERY_UNLIKELY
Joy likelihood is: VERY_LIKELY
Sorrow likelihood is: VERY_UNLIKELY
Surprise likelihood is: VERY_UNLIKELY
You can go with dictionary comprehensions. Given that you have your response in variable result, the following code will output exactly what you want.
import json

# Collect every *Likelihood field of the first face annotation, stripping
# the 10-character "Likelihood" suffix from each key for display.
annotations = json.loads(result)['responses'][0]['faceAnnotations'][0]
likelihood = {
    key[:-10].capitalize(): level
    for key, level in annotations.items()
    if 'Likelihood' in key
}
print('\n'.join(
    '{} likelihood is {}'.format(emotion, level)
    for emotion, level in likelihood.items()
))
Keep in mind that this code works correctly if there is only one item in both responses and faceAnnotations arrays. If there's more, the code will handle only the first items. It's also kinda ugly.
In len(attr) - 10, 10 is the length of word "Likelihood".

Dynamic (Changing) JSON to Hive Schema using UDF

I am having a JSON file with below structure:
{
"A": {
"AId": {
"AId": "123",
"idType": "XYZ"
},
"fN": "RfN",
"oN": "ON",
"mail": [
"abc#kml.com",
"xyz#kml.com"
],
"ph": [
{
"nu": "999-999-9999",
"t": "Of",
"ext": "1234"
},
{
"nu": "999-999-9999",
"t": "Of",
"ext": "1234"
}
],
"add": {
"addLines": [
"Addr Line 1",
"Addr Line 2"
],
"c": "C",
"sC": "S"
},
"c": [
{
"cT": "CT",
"cN": "9999"
}
],
"serId": "XXX"
},
"int": {
"endTS": null,
"cId": {
"cId": "null",
"cC": "null"
},
"cmpgn": null,
"sTC": null,
"cCID": {
"tIC": "null",
"tC": "null",
"cC": []
},
"int": "Un",
"rep": [],
"pp": "null",
"cf": {
"a": 1234,
"b": 1234
},
"iA": {
"sId": {
"s": "null",
"sId": "null"
},
"cId": "null",
"lId": "null"
},
"sRequest": null,
"vBu": "VBU",
"fId": "FId",
"k": [
"k"
],
"eng": [
{
"EC": "E_CODE::12345",
"cT": "2011-01-28T23:12:12.666Z",
"up": null,
"rep": {
"rep": {
"type": "B",
"id": "ID"
},
"fullName": "FullName"
}
}
]
}
}
Few Points:
From the above structure need to create hive schema.
The JSON structure can change dynamically; for each change in the JSON structure, the Hive schema needs to be regenerated.
I tried using Python's json library, but it was not of much use: I was not able to obtain the tag names, which could be used as the field names of the Hive schema.
I want to automate the process of generating the Hive schema from the JSON.
Exploring Python JSON Encoder, Decoder class; to parse the JSON and put own logic to create Hive schema out of it. But there is no good example available to use JSON Encoder, Decoder class.
Finally, I want to put everything in the form of a Python UDF. I am fine with any Java UDF alternative too.
Note: The above JSON can be structured using http://jsonlint.com/

Categories

Resources