Efficient way to calculate metrics from nested JSON, via Python?

What is the most efficient way to calculate metrics from nested JSON via Python?
Given the JSON blob below, how can I calculate the user (i.e. profileId) with the most events, without using the pandas library and without multiple nested for loops? I am having trouble writing code that does not rely on O(N²) behavior.
{
  "kind": "admin#reports#activities",
  "etag": "\"5g8\"",
  "nextPageToken": "A:1651795128914034:-4002873813067783265:151219070090:C02f6wppb",
  "items": [
    {
      "kind": "admin#reports#activity",
      "id": {
        "time": "2022-05-05T23:59:39.421Z",
        "uniqueQualifier": "5526793068617678141",
        "applicationName": "token",
        "customerId": "cds"
      },
      "etag": "\"jkYcURYoi8\"",
      "actor": {
        "email": "blah#blah.net",
        "profileId": "1323"
      },
      "ipAddress": "107.178.193.87",
      "events": [
        {
          "type": "auth",
          "name": "activity",
          "parameters": [
            { "name": "api_name", "value": "admin" },
            { "name": "method_name", "value": "directory.users.list" },
            { "name": "client_id", "value": "722230783769-dsta4bi9fkom72qcu0t34aj3qpcoqloq.apps.googleusercontent.com" },
            { "name": "num_response_bytes", "intValue": "7158" },
            { "name": "product_bucket", "value": "GSUITE_ADMIN" },
            { "name": "app_name", "value": "Untitled project" },
            { "name": "client_type", "value": "WEB" }
          ]
        }
      ]
    },
    {
      "kind": "admin#reports#activity",
      "id": {
        "time": "2022-05-05T23:58:48.914Z",
        "uniqueQualifier": "-4002873813067783265",
        "applicationName": "token",
        "customerId": "df"
      },
      "etag": "\"5T53xK7dpLei95RNoKZd9uz5Xb8LJpBJb72fi2HaNYM/9DTdB8t7uixvUbjo4LUEg53_gf0\"",
      "actor": {
        "email": "blah.blah#bebe.net",
        "profileId": "1324"
      },
      "ipAddress": "54.80.168.30",
      "events": [
        {
          "type": "auth",
          "name": "activity",
          "parameters": [
            { "name": "api_name", "value": "gmail" },
            { "name": "method_name", "value": "gmail.users.messages.list" },
            { "name": "client_id", "value": "927538837578.apps.googleusercontent.com" },
            { "name": "num_response_bytes", "intValue": "2" },
            { "name": "product_bucket", "value": "GMAIL" },
            { "name": "app_name", "value": "Zapier" },
            { "name": "client_type", "value": "WEB" }
          ]
        }
      ]
    }
  ]
}

This would be a good place to start:
data = ...  # your parsed JSON as a dict
ids = [x['actor']['profileId'] for x in data['items']]
print(ids)
Output:
['1323', '1324']
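That gets you the profile IDs; to answer the "most events" part without nested loops, you can tally event counts per profileId in a single pass with collections.Counter. A minimal sketch, assuming data is the parsed blob from above:

from collections import Counter

event_counts = Counter()
for item in data['items']:
    # one pass over the items, so the tally is O(N) in the number of items
    event_counts[item['actor']['profileId']] += len(item.get('events', []))

# profileId with the most events
top_profile, top_count = event_counts.most_common(1)[0]
print(top_profile, top_count)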

Related

Query an elasticsearch index by an attribute, with a given range?

I want to query my index so that it matches whenever a particular attribute called sitename shows up, but I only want the data from a certain time range. I thought it might be something like the below, but I'm unsure:
{
  "query": {
    "range": {
      "timestamp": {
        "gte": "now-1h/h",
        "lt": "now/h"
      }
    },
    "match": { "sitename": "HARB00ZAF0" }
  }
}
You're almost there, but you need to leverage a bool query to combine the two clauses:
{
  "query": {
    "bool": {
      "filter": [
        {
          "range": {
            "timestamp": {
              "gte": "now-1h/h",
              "lt": "now/h"
            }
          }
        }
      ],
      "must": [
        {
          "match": {
            "sitename": "HARB00ZAF0"
          }
        }
      ]
    }
  }
}
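As a rule of thumb, filter clauses don't affect relevance scoring and can be cached, while must clauses contribute to the score. If you're sending the query from Python, here is a sketch using requests (the host and index name are placeholder assumptions):

import requests

query = {
    "query": {
        "bool": {
            "filter": [
                {"range": {"timestamp": {"gte": "now-1h/h", "lt": "now/h"}}}
            ],
            "must": [
                {"match": {"sitename": "HARB00ZAF0"}}
            ]
        }
    }
}

# POST the query body to the index's _search endpoint
resp = requests.post("http://localhost:9200/my-index/_search", json=query)
print(resp.json()["hits"])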

PySpark - Convert a heterogeneous array JSON array to Spark dataframe and flatten it

I have streaming data coming in as a JSON array, and I want to flatten it out into a single row of a Spark DataFrame using Python.
Here is what the JSON data looks like:
{
  "event": [
    {
      "name": "QuizAnswer",
      "count": 1
    }
  ],
  "custom": {
    "dimensions": [
      { "title": "Are you:" },
      { "question_id": "5965" },
      { "option_id": "19029" },
      { "option_title": "Non-binary" },
      { "item": "Non-binary" },
      { "tab_index": "3" },
      { "tab_count": "4" },
      { "tab_initial_index": "4" },
      { "page": "home" },
      { "environment": "testing" },
      { "page_count": "0" },
      { "widget_version": "2.2.44" },
      { "session_count": "1" },
      { "quiz_settings_id": "1020" },
      { "quiz_session": "6e5a3b5c-9961-4c1b-a2af-3374bbeccede" },
      { "shopify_customer_id": "noid" },
      { "cart_token": "" },
      { "app_version": "2.2.44" },
      { "shop_name": "safety-valve.myshopify.com" }
    ],
    "metrics": []
  }
}
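Since each element of "dimensions" holds exactly one key/value pair, one workable approach is to merge them into a single flat dict in plain Python and hand Spark the finished row. A minimal sketch, assuming a running SparkSession and that raw_json holds the event string shown above:

import json
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

payload = json.loads(raw_json)  # raw_json: the JSON event shown above

flat = {}
for d in payload["custom"]["dimensions"]:
    flat.update(d)  # each element contributes one key/value pair
flat["event_name"] = payload["event"][0]["name"]
flat["event_count"] = payload["event"][0]["count"]

df = spark.createDataFrame([flat])  # one flattened row
df.show(truncate=False)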

Getting the track's ID from a playlist with Spotify's API in Python

I'm trying to use Spotify's API to get all the track IDs from a playlist.
I think I just don't really know how to work with JSON in Python, as this is my first time using APIs.
This is the JSON I get when requesting a random playlist:
{
"items":[
{
"added_at":"2020-02-20T19:08:11Z",
"added_by":{
},
"is_local":False,
"primary_color":"None",
"track":{
"album":{
"album_type":"single",
"artists":[
{
"external_urls":{
"spotify":"https://open.spotify.com/artist/1k5UEOU4igPC0NoHjEekha"
},
"href":"https://api.spotify.com/v1/artists/1k5UEOU4igPC0NoHjEekha",
"id":"1k5UEOU4igPC0NoHjEekha",
"name":"Milkoi",
"type":"artist",
"uri":"spotify:artist:1k5UEOU4igPC0NoHjEekha"
},
{
"external_urls":{
"spotify":"https://open.spotify.com/artist/3U2oWd07HPgl60o8RBMG4P"
},
"href":"https://api.spotify.com/v1/artists/3U2oWd07HPgl60o8RBMG4P",
"id":"3U2oWd07HPgl60o8RBMG4P",
"name":"Miraie",
"type":"artist",
"uri":"spotify:artist:3U2oWd07HPgl60o8RBMG4P"
}
],
"available_markets":[ ],
"external_urls":{
"spotify":"https://open.spotify.com/album/69Y9i1D5TyQGxWdqFNRIhC"
},
"href":"https://api.spotify.com/v1/albums/69Y9i1D5TyQGxWdqFNRIhC",
"id":"69Y9i1D5TyQGxWdqFNRIhC",
"images":[
{
"height":640,
"url":"https://i.scdn.co/image/4ea41b9dde13c6cb31fff8fe3c5ee90076370885",
"width":640
},
{
"height":300,
"url":"https://i.scdn.co/image/6edf03567c0379d246c750147fd31a74574e4e27",
"width":300
},
{
"height":64,
"url":"https://i.scdn.co/image/fbbc5cca3adbaf433f43917012939c3e2c35c5eb",
"width":64
}
],
"name":"ミユキ",
"release_date":"2018-09-30",
"release_date_precision":"day",
"total_tracks":1,
"type":"album",
"uri":"spotify:album:69Y9i1D5TyQGxWdqFNRIhC"
},
"artists":[
{
"external_urls":{
"spotify":"https://open.spotify.com/artist/1k5UEOU4igPC0NoHjEekha"
},
"href":"https://api.spotify.com/v1/artists/1k5UEOU4igPC0NoHjEekha",
"id":"1k5UEOU4igPC0NoHjEekha",
"name":"Milkoi",
"type":"artist",
"uri":"spotify:artist:1k5UEOU4igPC0NoHjEekha"
},
{
"external_urls":{
"spotify":"https://open.spotify.com/artist/3U2oWd07HPgl60o8RBMG4P"
},
"href":"https://api.spotify.com/v1/artists/3U2oWd07HPgl60o8RBMG4P",
"id":"3U2oWd07HPgl60o8RBMG4P",
"name":"Miraie",
"type":"artist",
"uri":"spotify:artist:3U2oWd07HPgl60o8RBMG4P"
}
],
"available_markets":[ ],
"disc_number":1,
"duration_ms":211090,
"episode":False,
"explicit":False,
"external_ids":{
"isrc":"QM42K1817396"
},
"external_urls":{
"spotify":"https://open.spotify.com/track/77xwKl9jpVLO6VmNlwGwtm"
},
"href":"https://api.spotify.com/v1/tracks/77xwKl9jpVLO6VmNlwGwtm",
"id":"77xwKl9jpVLO6VmNlwGwtm",
"is_local":False,
"name":"ミユキ",
"popularity":43,
"preview_url":"https://p.scdn.co/mp3-preview/45e0b6cf4f358f5fbf6bebc1f019e67a780fa3f8?cid=2cd60e0da58b47518a61cec560d21ccd",
"track":True,
"track_number":1,
"type":"track",
"uri":"spotify:track:77xwKl9jpVLO6VmNlwGwtm"
},
"video_thumbnail":{
"url":"None"
}
},
{
"added_at":"2020-02-20T19:08:21Z",
"added_by":{
"href":"https://api.spotify.com/v1/users/akqpr9b7ycor7uw08afmc3hx4",
"id":"akqpr9b7ycor7uw08afmc3hx4",
"type":"user",
"uri":"spotify:user:akqpr9b7ycor7uw08afmc3hx4"
},
"is_local":False,
"primary_color":"None",
"track":{
"album":{
"album_type":"album",
"artists":[
{
"external_urls":{
"spotify":"https://open.spotify.com/artist/24HASvYQG1OvEFRWVWmOfx"
},
"href":"https://api.spotify.com/v1/artists/24HASvYQG1OvEFRWVWmOfx",
"id":"24HASvYQG1OvEFRWVWmOfx",
"name":"Kano",
"type":"artist",
"uri":"spotify:artist:24HASvYQG1OvEFRWVWmOfx"
}
],
"available_markets":[ ],
"external_urls":{
"spotify":"https://open.spotify.com/album/72sG7hFVmyFlxg9e7PfV0K"
},
"href":"https://api.spotify.com/v1/albums/72sG7hFVmyFlxg9e7PfV0K",
"id":"72sG7hFVmyFlxg9e7PfV0K",
"images":[
{
"height":640,
"url":"https://i.scdn.co/image/ab67616d0000b27327dfa5f6ab057a4ec5c53235",
"width":640
},
{
"height":300,
"url":"https://i.scdn.co/image/ab67616d00001e0227dfa5f6ab057a4ec5c53235",
"width":300
},
{
"height":64,
"url":"https://i.scdn.co/image/ab67616d0000485127dfa5f6ab057a4ec5c53235",
"width":64
}
],
"name":"rye",
"release_date":"2018-12-19",
"release_date_precision":"day",
"total_tracks":14,
"type":"album",
"uri":"spotify:album:72sG7hFVmyFlxg9e7PfV0K"
},
"artists":[
{
"external_urls":{
"spotify":"https://open.spotify.com/artist/24HASvYQG1OvEFRWVWmOfx"
},
"href":"https://api.spotify.com/v1/artists/24HASvYQG1OvEFRWVWmOfx",
"id":"24HASvYQG1OvEFRWVWmOfx",
"name":"Kano",
"type":"artist",
"uri":"spotify:artist:24HASvYQG1OvEFRWVWmOfx"
}
],
"available_markets":[ ],
"disc_number":2,
"duration_ms":222249,
"episode":False,
"explicit":False,
"external_ids":{
"isrc":"JPTE01809900"
},
"external_urls":{
"spotify":"https://open.spotify.com/track/6c9llTTjTcLgoHbKaJVw4f"
},
"href":"https://api.spotify.com/v1/tracks/6c9llTTjTcLgoHbKaJVw4f",
"id":"6c9llTTjTcLgoHbKaJVw4f",
"is_local":False,
"name":"六兆年と一夜物語",
"popularity":39,
"preview_url":"https://p.scdn.co/mp3-preview/3421753cafdf34dc1e34bba479f048ebd613f39f?cid=2cd60e0da58b47518a61cec560d21ccd",
"track":True,
"track_number":3,
"type":"track",
"uri":"spotify:track:6c9llTTjTcLgoHbKaJVw4f"
},
"video_thumbnail":{
"url":"None"
}
}
],
"limit":100,
"next":"None",
"offset":0,
"previous":"None",
"total":11
}
The ID is located at "items" -> "track" -> "id" for each track.
I'm trying to get the IDs of all the tracks in the playlist so that I can put them in a list.
If someone could help me, that would be great. Thanks in advance.
I'm using Python 3.
What I would do is parse the response string into a dictionary like so (if you haven't already):
resp_dict = json.loads(your_response_string)
Then you could loop through and append all the track IDs to a new list:
newList = []
for x in resp_dict['items']:
    newList.append(x['track']['id'])

# To see if it worked
for item in newList:
    print(item)
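Equivalently, as a one-line list comprehension:

track_ids = [item['track']['id'] for item in resp_dict['items']]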
Let me know if that makes sense or you need some further explanation :)

How to merge multiple JSON files in Python

I have had to create multiple JSON files while processing a corpus (using GNRD http://gnrd.globalnames.org/ for scientific name extraction). I now want to use these JSON files to annotate said corpus as a whole.
I am trying to merge the multiple JSON files in Python. The contents of each JSON file are arrays of just scientificName (key) and the name found (value). Below is an example of one of the shorter files:
{
"file":"biodiversity_trophic_9.txt",
"names":[
{
"scientificName":"Bufo"
},
{
"scientificName":"Eleutherodactylus talamancae"
},
{
"scientificName":"E. punctariolus"
},
{
"scientificName":"Norops lionotus"
},
{
"scientificName":"Centrolenella prosoblepon"
},
{
"scientificName":"Sibon annulatus"
},
{
"scientificName":"Colostethus flotator"
},
{
"scientificName":"C. inguinalis"
},
{
"scientificName":"Eleutherodactylus"
},
{
"scientificName":"Hyla columba"
},
{
"scientificName":"Bufo haematiticus"
},
{
"scientificName":"S. annulatus"
},
{
"scientificName":"Leptodeira septentrionalis"
},
{
"scientificName":"Imantodes cenchoa"
},
{
"scientificName":"Oxybelis brevirostris"
},
{
"scientificName":"Cressa"
},
{
"scientificName":"Coloma"
},
{
"scientificName":"Perlidae"
},
{
"scientificName":"Hydropsychidae"
},
{
"scientificName":"Hyla"
},
{
"scientificName":"Norops"
},
{
"scientificName":"Hyla colymbiphyllum"
},
{
"scientificName":"Colostethus inguinalis"
},
{
"scientificName":"Oxybelis"
},
{
"scientificName":"Rana warszewitschii"
},
{
"scientificName":"R. warszewitschii"
},
{
"scientificName":"Rhyacophilidae"
},
{
"scientificName":"Daphnia magna"
},
{
"scientificName":"Hyla colymba"
},
{
"scientificName":"Centrolenella"
},
{
"scientificName":"Orconectes nais"
},
{
"scientificName":"Orconectes neglectus"
},
{
"scientificName":"Campostoma anomalum"
},
{
"scientificName":"Caridina"
},
{
"scientificName":"Decapoda"
},
{
"scientificName":"Atyidae"
},
{
"scientificName":"Cerastoderma edule"
},
{
"scientificName":"Rana aurora"
},
{
"scientificName":"Riffle"
},
{
"scientificName":"Calopterygidae"
},
{
"scientificName":"Elmidae"
},
{
"scientificName":"Gyrinidae"
},
{
"scientificName":"Gerridae"
},
{
"scientificName":"Naucoridae"
},
{
"scientificName":"Oligochaeta"
},
{
"scientificName":"Veliidae"
},
{
"scientificName":"Libellulidae"
},
{
"scientificName":"Philopotamidae"
},
{
"scientificName":"Ephemeroptera"
},
{
"scientificName":"Psephenidae"
},
{
"scientificName":"Baetidae"
},
{
"scientificName":"Corduliidae"
},
{
"scientificName":"Zygoptera"
},
{
"scientificName":"B. buto"
},
{
"scientificName":"C. euknemos"
},
{
"scientificName":"C. ilex"
},
{
"scientificName":"E. padi noblei"
},
{
"scientificName":"E. padi"
},
{
"scientificName":"E. bufo"
},
{
"scientificName":"E. butoni"
},
{
"scientificName":"E. crassi"
},
{
"scientificName":"E. cruentus"
},
{
"scientificName":"H. colymbiphyllum"
},
{
"scientificName":"N. aterina"
},
{
"scientificName":"S. ilex"
},
{
"scientificName":"Anisoptera"
},
{
"scientificName":"Riffle delta"
}
],
"total":67,
"status":200,
"unique":true,
"engines":[
"TaxonFinder",
"NetiNeti"
],
"verbatim":false,
"input_url":null,
"token_url":"http://gnrd.globalnames.org/name_finder.html?token=2rtc4e70st",
"parameters":{
"engine":0,
"return_content":false,
"best_match_only":false,
"data_source_ids":[
],
"detect_language":true,
"all_data_sources":false,
"preferred_data_sources":[
]
},
"execution_time":{
"total_duration":3.1727607250213623,
"find_names_duration":1.9656541347503662,
"text_preparation_duration":1.000107765197754
},
"english_detected":true
}
The issue I have is that there may be duplicates across the files, which I want to remove (otherwise I could just concatenate the files, I guess). The answers I have found elsewhere deal with merging extra keys and values to extend the arrays themselves.
Can anyone give me guidance on how to overcome this issue?
If I understand correctly, you want to collect all "scientificName" values from the "names" elements of a batch of files. If I'm wrong, you should give an expected output to make things easier to understand.
I'd do something like this:
import json

all_names = set()  # use a set to avoid duplicates

# put all your files in there
for filename in ('file1.json', 'file2.json', ...):
    try:
        with open(filename, 'rt') as finput:
            data = json.load(finput)
        for name in data.get('names', []):
            all_names.add(name.get('scientificName'))
    except Exception as exc:
        print("Skipped file {} because exception {}".format(filename, str(exc)))

print(all_names)
And in case you want output in a format similar to the initial files, add:
from pprint import pprint

pprint({"names": [{"scientificName": name} for name in all_names],
        "total": len(all_names)})

Filter timestamp with mongoDb

Suppose this JSON export from my MongoDB:
{
"_id":{
"$oid":"51ca002d9e67460354bb0089"
},
"node":{
"id":"1",
"components":[
{
"sensors":[
{
"name":"backup-job-name",
"value":"Job_2"
},
{
"name":"backup-job-id",
"value":"187"
},
{
"name":"backup-start-date",
"value":"1372138227"
},
{
"name":"backup-stop-date",
"value":"1372138235"
},
{
"name":"backup-nb-errors",
"value":"0"
},
{
"name":"backup-nb-warnings",
"value":"0"
},
{
"name":"backup-source-files",
"value":"402"
},
{
"name":"backup-source-bytes",
"value":"168571449"
},
{
"name":"backup-status",
"value":null
}
],
"type":"backup",
"serial":"1-backup-backup-job-name-Job_2"
},
{
"sensors":[
{
"name":"backup-job-name",
"value":"Test_bckp"
},
{
"name":"backup-job-id",
"value":""
},
{
"name":"backup-start-date",
"value":"0"
},
{
"name":"backup-stop-date",
"value":"0"
},
{
"name":"backup-nb-errors",
"value":"0"
},
{
"name":"backup-nb-warnings",
"value":"0"
},
{
"name":"backup-source-files",
"value":"0"
},
{
"name":"backup-source-bytes",
"value":"0"
},
{
"name":"backup-status",
"value":null
}
],
"type":"backup",
"serial":"1-backup-backup-job-name-Test_bckp"
}
]
},
"timestamp":1372192813
}
I work with Python and I'd like to get the documents where "backup-start-date" (stored as a timestamp) is greater than a given value.
I've tried:
collection.find({
    'node.components.type': 'backup',
    'node.components.sensors': {'name': 'backup-start-date',
                                'value': {'$gte': ts_to_compare_with}}})
but I get no documents in the result. Is my query wrong?
There are two things here: you need $elemMatch to match a sensor document inside the components array, and the sensor values are stored as strings, so $gte would compare them lexicographically rather than numerically.
If you convert the values to ints, then you can use $gte and query like so:
db.test.find({
    'node.components.type': 'backup',
    'node.components.sensors': {
        $elemMatch: {'name': 'backup-start-date',
                     'value': {'$gte': 168571445}}}})
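The same query from Python with PyMongo looks like this (a sketch: it assumes the values have already been converted to ints and that collection is your pymongo collection object):

ts_to_compare_with = 1372138000  # hypothetical threshold

cursor = collection.find({
    'node.components.type': 'backup',
    'node.components.sensors': {
        '$elemMatch': {
            'name': 'backup-start-date',
            'value': {'$gte': ts_to_compare_with},
        }
    }
})
for doc in cursor:
    print(doc['_id'])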
