Suppose this JSON export from my MongoDB:
{
"_id":{
"$oid":"51ca002d9e67460354bb0089"
},
"node":{
"id":"1",
"components":[
{
"sensors":[
{
"name":"backup-job-name",
"value":"Job_2"
},
{
"name":"backup-job-id",
"value":"187"
},
{
"name":"backup-start-date",
"value":"1372138227"
},
{
"name":"backup-stop-date",
"value":"1372138235"
},
{
"name":"backup-nb-errors",
"value":"0"
},
{
"name":"backup-nb-warnings",
"value":"0"
},
{
"name":"backup-source-files",
"value":"402"
},
{
"name":"backup-source-bytes",
"value":"168571449"
},
{
"name":"backup-status",
"value":null
}
],
"type":"backup",
"serial":"1-backup-backup-job-name-Job_2"
},
{
"sensors":[
{
"name":"backup-job-name",
"value":"Test_bckp"
},
{
"name":"backup-job-id",
"value":""
},
{
"name":"backup-start-date",
"value":"0"
},
{
"name":"backup-stop-date",
"value":"0"
},
{
"name":"backup-nb-errors",
"value":"0"
},
{
"name":"backup-nb-warnings",
"value":"0"
},
{
"name":"backup-source-files",
"value":"0"
},
{
"name":"backup-source-bytes",
"value":"0"
},
{
"name":"backup-status",
"value":null
}
],
"type":"backup",
"serial":"1-backup-backup-job-name-Test_bckp"
}
]
},
"timestamp":1372192813
}
I work with Python and I'd like to get documents where "backup-start-date" (stored as a timestamp) is greater than a given value.
I've tried with:
collection.find({
    'node.components.type': 'backup',
    'node.components.sensors': {'name': 'backup-start-date',
                                'value': {'$gte': ts_to_compare_with}}})
but I don't get any documents in the result. Is my query wrong?
There are two things here: you want to match a single sensor document inside the components array, which requires $elemMatch, and $gte compares numerically only against numbers and dates, while your values are stored as strings.
If you convert the data to ints, then you can use $gte and query like so:
db.test.find({
'node.components.type': 'backup',
'node.components.sensors': {
$elemMatch: {'name': 'backup-start-date',
'value': {'$gte': 168571445}}}})
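In pymongo the same query might look like the sketch below, assuming the sensor values have already been converted to integers; the database and collection names here are hypothetical:
from pymongo import MongoClient

client = MongoClient()                  # assumes a local MongoDB instance
collection = client['mydb']['test']     # hypothetical database/collection names

ts_to_compare_with = 1372138000         # example threshold

cursor = collection.find({
    'node.components.type': 'backup',
    'node.components.sensors': {
        '$elemMatch': {
            'name': 'backup-start-date',
            'value': {'$gte': ts_to_compare_with}
        }
    }
})
for doc in cursor:
    print(doc['_id'])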
Related
What is the most efficient way to calculate metrics from nested JSON, via python?
Given the JSON blob below, how can I calculate the user (i.e. profileId) with the most events, without using the pandas library and without multiple nested for loops? I am having trouble writing code that would not rely on O(N²).
{
"kind":"admin#reports#activities",
"etag":"\"5g8\"",
"nextPageToken":"A:1651795128914034:-4002873813067783265:151219070090:C02f6wppb",
"items":[
{
"kind":"admin#reports#activity",
"id":{
"time":"2022-05-05T23:59:39.421Z",
"uniqueQualifier":"5526793068617678141",
"applicationName":"token",
"customerId":"cds"
},
"etag":"\"jkYcURYoi8\"",
"actor":{
"email":"blah#blah.net",
"profileId":"1323"
},
"ipAddress":"107.178.193.87",
"events":[
{
"type":"auth",
"name":"activity",
"parameters":[
{
"name":"api_name",
"value":"admin"
},
{
"name":"method_name",
"value":"directory.users.list"
},
{
"name":"client_id",
"value":"722230783769-dsta4bi9fkom72qcu0t34aj3qpcoqloq.apps.googleusercontent.com"
},
{
"name":"num_response_bytes",
"intValue":"7158"
},
{
"name":"product_bucket",
"value":"GSUITE_ADMIN"
},
{
"name":"app_name",
"value":"Untitled project"
},
{
"name":"client_type",
"value":"WEB"
}
]
}
]
},
{
"kind":"admin#reports#activity",
"id":{
"time":"2022-05-05T23:58:48.914Z",
"uniqueQualifier":"-4002873813067783265",
"applicationName":"token",
"customerId":"df"
},
"etag":"\"5T53xK7dpLei95RNoKZd9uz5Xb8LJpBJb72fi2HaNYM/9DTdB8t7uixvUbjo4LUEg53_gf0\"",
"actor":{
"email":"blah.blah#bebe.net",
"profileId":"1324"
},
"ipAddress":"54.80.168.30",
"events":[
{
"type":"auth",
"name":"activity",
"parameters":[
{
"name":"api_name",
"value":"gmail"
},
{
"name":"method_name",
"value":"gmail.users.messages.list"
},
{
"name":"client_id",
"value":"927538837578.apps.googleusercontent.com"
},
{
"name":"num_response_bytes",
"intValue":"2"
},
{
"name":"product_bucket",
"value":"GMAIL"
},
{
"name":"app_name",
"value":"Zapier"
},
{
"name":"client_type",
"value":"WEB"
}
]
}
]
}
]
}
This would be a good place to start:
data = ...  # your dict
ids = [x['actor']['profileId'] for x in data['items']]
print(ids)
Output:
['1323', '1324']
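To actually find the profile with the most events in a single pass (no pandas, no nested loops over the whole blob), a minimal Counter-based sketch, assuming data is the parsed dict from above:
from collections import Counter

# count events per profileId in one pass over items
event_counts = Counter()
for item in data['items']:
    profile_id = item['actor']['profileId']
    event_counts[profile_id] += len(item.get('events', []))

# profile with the most events
top_profile, top_count = event_counts.most_common(1)[0]
print(top_profile, top_count)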
I have streaming data coming in as a JSON array and I want to flatten it out into a single row in a Spark DataFrame using Python.
Here is what the JSON data looks like:
{
"event": [
{
"name": "QuizAnswer",
"count": 1
}
],
"custom": {
"dimensions": [
{
"title": "Are you:"
},
{
"question_id": "5965"
},
{
"option_id": "19029"
},
{
"option_title": "Non-binary"
},
{
"item": "Non-binary"
},
{
"tab_index": "3"
},
{
"tab_count": "4"
},
{
"tab_initial_index": "4"
},
{
"page": "home"
},
{
"environment": "testing"
},
{
"page_count": "0"
},
{
"widget_version": "2.2.44"
},
{
"session_count": "1"
},
{
"quiz_settings_id": "1020"
},
{
"quiz_session": "6e5a3b5c-9961-4c1b-a2af-3374bbeccede"
},
{
"shopify_customer_id": "noid"
},
{
"cart_token": ""
},
{
"app_version": "2.2.44"
},
{
"shop_name": "safety-valve.myshopify.com"
}
],
"metrics": []
}
}
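One possible approach, sketched below under the assumption that each incoming record has been parsed into a Python dict: merge the single-key dicts under custom.dimensions into one flat dict and build a one-row DataFrame from it (the record shown is a truncated version of the data above):
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# truncated example record; in practice this would be json.loads() of one incoming message
record = {
    "event": [{"name": "QuizAnswer", "count": 1}],
    "custom": {
        "dimensions": [
            {"title": "Are you:"},
            {"question_id": "5965"},
            {"page": "home"},
        ],
        "metrics": [],
    },
}

# custom.dimensions is a list of single-key dicts; merge them into one flat dict
flat = {}
for d in record["custom"]["dimensions"]:
    flat.update(d)

# pull the event fields in as columns as well
flat["event_name"] = record["event"][0]["name"]
flat["event_count"] = record["event"][0]["count"]

# one flattened row
df = spark.createDataFrame([flat])
df.show()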
I was working on the query you can see below, and Python's IDE gives me a syntax error on the "$group" line. The error is "invalid syntax", but I cannot find where the problem is.
ID="01"
yearInicial="2018"
monthInicial="02"
dayInicial="11"
yearFinal="2018"
monthFinal="06"
dayFinal="22"
total=0
totalDias=0
result=list(db.alojamientos.aggregate( [ {'$match': { '$and': [ {"location.thcod":ID}, { "prices.year": { '$gte': yearInicial } },
{ "prices.year": { '$lte': yearFinal } }, {"prices.months.month": { '$gte': monthInicial } },
{ "prices.months.month": { '$lte': monthFinal } }, { "prices.months.days.day": { '$gte': dayInicial } },
{ "prices.months.days.day": { '$lte': dayFinal } }]}},
{'$unwind':"$prices"},
{'$unwind':"$prices.months"},
{'$unwind':"$prices.months.days"},
{'$unwind':"$prices.months.days.csvs"},
{'$unwind':"$prices.months.days.csvs.price"},
{'$group':{'_id':{ID, 'day':"$prices.months.days.day",'month': "$prices.months.month",'year':"$prices.year"},total:{'$sum':"$prices.months.days.csvs.price.price"}, count: { '$sum': 1 }
}}]))
Thank you in advance.
Delete the 3rd { in this line: {'$group':{'_id':{ID, ....
It should look like {'$group':{'_id':ID, .... Note also that the field names total and count must be quoted strings ('total', 'count'); as bare names, Python treats them as variables.
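For illustration, the corrected stage could look like this (keeping the field paths from the question; if grouping per day/month/year was intended, '_id' can instead be a sub-document with quoted keys):
{'$group': {
    '_id': ID,
    'total': {'$sum': "$prices.months.days.csvs.price.price"},
    'count': {'$sum': 1}
}}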
I have had to create multiple JSON files for reasons of processing a corpus (using GNRD http://gnrd.globalnames.org/ for scientific name extraction). I now want to use these JSON files to annotate said corpus as a whole.
I am trying to merge the multiple JSON files in Python. The contents of each JSON file are arrays of just scientificName (key) and the name found (value). Below is an example of one of the shorter files:
{
"file":"biodiversity_trophic_9.txt",
"names":[
{
"scientificName":"Bufo"
},
{
"scientificName":"Eleutherodactylus talamancae"
},
{
"scientificName":"E. punctariolus"
},
{
"scientificName":"Norops lionotus"
},
{
"scientificName":"Centrolenella prosoblepon"
},
{
"scientificName":"Sibon annulatus"
},
{
"scientificName":"Colostethus flotator"
},
{
"scientificName":"C. inguinalis"
},
{
"scientificName":"Eleutherodactylus"
},
{
"scientificName":"Hyla columba"
},
{
"scientificName":"Bufo haematiticus"
},
{
"scientificName":"S. annulatus"
},
{
"scientificName":"Leptodeira septentrionalis"
},
{
"scientificName":"Imantodes cenchoa"
},
{
"scientificName":"Oxybelis brevirostris"
},
{
"scientificName":"Cressa"
},
{
"scientificName":"Coloma"
},
{
"scientificName":"Perlidae"
},
{
"scientificName":"Hydropsychidae"
},
{
"scientificName":"Hyla"
},
{
"scientificName":"Norops"
},
{
"scientificName":"Hyla colymbiphyllum"
},
{
"scientificName":"Colostethus inguinalis"
},
{
"scientificName":"Oxybelis"
},
{
"scientificName":"Rana warszewitschii"
},
{
"scientificName":"R. warszewitschii"
},
{
"scientificName":"Rhyacophilidae"
},
{
"scientificName":"Daphnia magna"
},
{
"scientificName":"Hyla colymba"
},
{
"scientificName":"Centrolenella"
},
{
"scientificName":"Orconectes nais"
},
{
"scientificName":"Orconectes neglectus"
},
{
"scientificName":"Campostoma anomalum"
},
{
"scientificName":"Caridina"
},
{
"scientificName":"Decapoda"
},
{
"scientificName":"Atyidae"
},
{
"scientificName":"Cerastoderma edule"
},
{
"scientificName":"Rana aurora"
},
{
"scientificName":"Riffle"
},
{
"scientificName":"Calopterygidae"
},
{
"scientificName":"Elmidae"
},
{
"scientificName":"Gyrinidae"
},
{
"scientificName":"Gerridae"
},
{
"scientificName":"Naucoridae"
},
{
"scientificName":"Oligochaeta"
},
{
"scientificName":"Veliidae"
},
{
"scientificName":"Libellulidae"
},
{
"scientificName":"Philopotamidae"
},
{
"scientificName":"Ephemeroptera"
},
{
"scientificName":"Psephenidae"
},
{
"scientificName":"Baetidae"
},
{
"scientificName":"Corduliidae"
},
{
"scientificName":"Zygoptera"
},
{
"scientificName":"B. buto"
},
{
"scientificName":"C. euknemos"
},
{
"scientificName":"C. ilex"
},
{
"scientificName":"E. padi noblei"
},
{
"scientificName":"E. padi"
},
{
"scientificName":"E. bufo"
},
{
"scientificName":"E. butoni"
},
{
"scientificName":"E. crassi"
},
{
"scientificName":"E. cruentus"
},
{
"scientificName":"H. colymbiphyllum"
},
{
"scientificName":"N. aterina"
},
{
"scientificName":"S. ilex"
},
{
"scientificName":"Anisoptera"
},
{
"scientificName":"Riffle delta"
}
],
"total":67,
"status":200,
"unique":true,
"engines":[
"TaxonFinder",
"NetiNeti"
],
"verbatim":false,
"input_url":null,
"token_url":"http://gnrd.globalnames.org/name_finder.html?token=2rtc4e70st",
"parameters":{
"engine":0,
"return_content":false,
"best_match_only":false,
"data_source_ids":[
],
"detect_language":true,
"all_data_sources":false,
"preferred_data_sources":[
]
},
"execution_time":{
"total_duration":3.1727607250213623,
"find_names_duration":1.9656541347503662,
"text_preparation_duration":1.000107765197754
},
"english_detected":true
}
The issue I have is that there may be duplicates across the files, which I want to remove (otherwise I could just concatenate the files, I guess). The questions I have seen elsewhere refer to merging extra keys and values to extend the arrays themselves.
Can anyone give me guidance on how to overcome this issue?
If I understand correctly, you want to get all "scientificName" values in the "names" elements of a batch of files. If I'm wrong, you should give an expected output to make things easier to understand.
I'd do something like this:
import json

all_names = set()  # use a set to avoid duplicates

# list all your files here
for filename in ('file1.json', 'file2.json'):
    try:
        with open(filename, 'rt') as finput:
            data = json.load(finput)
            for name in data.get('names'):
                all_names.add(name.get('scientificName'))
    except Exception as exc:
        print("Skipped file {} because of exception {}".format(filename, str(exc)))

print(all_names)
And in case you want a format similar to the initial files, add:
from pprint import pprint

pprint({"names": [{"scientificName": name} for name in all_names],
        "total": len(all_names)})
test_cursor = db.command({
"aggregate": "New_layout",
"pipeline": [
{ "$match": { "$and": [
{ "FIRST_DATE": { "$gte": new_date } },
{ "CHAIN_ID": { "$ne": "" } }
] } },
{ "$unwind": { "path": "$ENTERS", "includeArrayIndex": "Date" } },
{ "$project": {
"_id": 0,
"SITE_ID": "$SITE_ID",
"CHAIN_ID": "$CHAIN_ID",
"SEGMENT_ID": "$SEGMENT_ID",
"ZIP": "$ZIP",
"ZIP3": "$ZIP3",
"MARKET_ID": "$MARKET_ID",
"REGION": "$REGION",
"MALL_CODE": "$MALL_CODE",
"MALL_AREA": "$MALL_AREA",
"MALL_NAME": "$MALL_NAME",
"FIRST_DATE": "$FIRST_DATE",
"MARKET_AREA": "$MARKET_AREA",
"REGION_AREA": "$REGION_AREA",
"ZIP_AREA": "$ZIP_AREA",
"ZIP3_AREA": "$ZIP3_AREA",
"DATE": "$Date",
"ENTERS": "$ENTERS"
} }
],
"allowDiskUse": bool(1),
"cursor": {}
})
asd=list(test_cursor)
The contents of the cursor are as below:
[u'cursor', u'ok', u'waitedMS']
However with an $out statement, the output collection has the expected contents.
I am running pymongo v3.2.2 and MongoDB 3.2. I was told this problem occurs with v3.0 or earlier, but I am not able to figure it out.
You should use aggregate() instead of command(). db.command() returns the raw command response document, so list() over it yields just its keys (cursor, ok, waitedMS), whereas collection.aggregate() returns a CommandCursor over the result documents.
test_cursor = db.New_layout.aggregate([
{ "$match": { "$and": [
{ "FIRST_DATE": { "$gte": new_date } },
{ "CHAIN_ID": { "$ne": "" } }
] } },
{ "$unwind": { "path": "$ENTERS", "includeArrayIndex": "Date" } },
{ "$project": {
"_id": 0,
"SITE_ID": "$SITE_ID",
"CHAIN_ID": "$CHAIN_ID",
"SEGMENT_ID": "$SEGMENT_ID",
"ZIP": "$ZIP",
"ZIP3": "$ZIP3",
"MARKET_ID": "$MARKET_ID",
"REGION": "$REGION",
"MALL_CODE": "$MALL_CODE",
"MALL_AREA": "$MALL_AREA",
"MALL_NAME": "$MALL_NAME",
"FIRST_DATE": "$FIRST_DATE",
"MARKET_AREA": "$MARKET_AREA",
"REGION_AREA": "$REGION_AREA",
"ZIP_AREA": "$ZIP_AREA",
"ZIP3_AREA": "$ZIP3_AREA",
"DATE": "$Date",
"ENTERS": "$ENTERS"
} }
],
allowDiskUse=True)
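The cursor returned by aggregate() yields the projected documents directly, so the original list() call now behaves as expected:
asd = list(test_cursor)   # list of result documents rather than response keys
print(len(asd))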