Iterate a Python dict and convert Mongo ObjectId values into strings - python

This is my dict structure; I cannot simply iterate over the entire dict to find every ObjectId value.
My input data:
{
    "_id" : ObjectId("5671947d29c23846797d836a"),
    "event_version" : "1.0",
    "event_time" : ISODate("2015-12-16T16:42:37.501Z"),
    "event_name" : "Create_Assignment",
    "user_id" : "admin",
    "tenant" : "Demo_Tenant",
    "sourceIPAddress" : "",
    "user_agent" : "",
    "request_parameters" : {
        "username" : "admin",
        "status" : "active",
        "first_name" : "",
        "last_name" : "",
        "is_deleted" : false,
        "updated_by" : "admin",
        "roles" : [
            {
                "_ref" : {
                    "$ref" : "role",
                    "$id" : ObjectId("5671947d29c23846797d8362")
                },
                "_cls" : "Role"
            },
            {
                "_ref" : {
                    "$ref" : "role",
                    "$id" : ObjectId("5671947d29c23846797d8366")
                },
                "_cls" : "Role"
            }
        ]
    }
}
I have tried:
def todict(self, data, obj=None):
    for key in data:
        if isinstance(data[key], ObjectId):
            print '>>>>>>>>>>', data[key]
            obj[key] = str(data[key])
        else:
            if not isinstance(data[key], (str, unicode, list, datetime, bool)):
                self.todict(data[key], obj)
            else:
                obj[key] = data[key]
    return obj
But it doesn't work properly. I need a recursive function that converts all ObjectId values into str.
Expected JSON:
{
    "_id" : "5671947d29c23846797d836a",
    "event_version" : "1.0",
    "event_time" : ISODate("2015-12-16T16:42:37.501Z"),
    "event_name" : "Create_Assignment",
    "user_id" : "admin",
    "tenant" : "Demo_Tenant",
    "sourceIPAddress" : "",
    "user_agent" : "",
    "request_parameters" : {
        "username" : "admin",
        "status" : "active",
        "first_name" : "",
        "last_name" : "",
        "is_deleted" : false,
        "updated_by" : "admin",
        "roles" : [
            {
                "_ref" : {
                    "$ref" : "role",
                    "$id" : "5671947d29c23846797d8362"
                },
                "_cls" : "Role"
            },
            {
                "_ref" : {
                    "$ref" : "role",
                    "$id" : "5671947d29c23846797d8366"
                },
                "_cls" : "Role"
            }
        ]
    }
}

Never too late, hope this helps someone...
from bson import ObjectId

# Recursively convert all ObjectId values to strings in a dict (or list)
def obj_to_str(data):
    if isinstance(data, dict):
        return {obj_to_str(key): obj_to_str(value) for key, value in data.items()}
    elif isinstance(data, list):
        return [obj_to_str(element) for element in data]
    elif isinstance(data, ObjectId):
        return str(data)
    else:
        return data
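For example, a usage sketch, where doc stands in for the document shown above (e.g. as returned by pymongo's find_one()):

import json

converted = obj_to_str(doc)
print(converted['_id'])  # '5671947d29c23846797d836a'
# datetime values are left untouched, so pass default=str when serializing:
print(json.dumps(converted, default=str, indent=2))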

You can try this:
...
from bson import ObjectId
...

def todict(self, data, obj=None):
    for key in data:
        if isinstance(data[key], ObjectId):
            obj[key] = str(data[key])
        ... ...

Related

Mongodb join two collections giving duplicate results

def get(self):
    res = json.loads(dumps(
        self.devices_col.aggregate([
            {"$lookup": {
                "from": "participants",
                "localField": "_id.docgroupid",
                "foreignField": "device_id",
                "as": "participants"
            }},
            {"$unwind": "$participants"}
        ])
    ))
    return res
participants document
{
    "_id" : ObjectId("5f7230502930714468ed892c"),
    "hash" : "83a84e8bf170114cffcc3b1e178d6468",
    "name" : "BOMW0000029529",
    "persona_id" : "i123",
    "command" : "start",
    "va_info" : [
        {
            "device_id" : "5f722a742930714468ed8929",
            "automation_config" : "",
            "status" : "false",
            "remote_path" : "/datadrive/gatewayfolder",
            "version" : "1.3.0.9",
            "latest_va_version" : "1.3.1.2",
            "version_updated_on" : "",
            "latest_va_build_number" : "20200525",
            "last_connected_on" : "02/08/2020 11:25:55",
            "last_seen_on" : "02/08/2020 11:25:55",
            "last_activity_processed_on" : "02/07/2020 11:25:55"
        }
    ],
    "inclusions" : [
        "myfinancewnscom",
        "OUTLOOK",
        "jp2launcher",
        "EXCEL"
    ],
    "created_by" : "",
    "created_on" : "",
    "modified_by" : "",
    "modified_on" : ""
}
devices document
{
    "_id" : ObjectId("5f722a742930714468ed8929"),
    "name" : "",
    "unique_id" : "u168381",
    "os" : {
        "version" : "6.2.9200.0",
        "name" : "Microsoft Windows 10 Home",
        "locale" : {
            "geo_location" : null,
            "time_zone" : "IST",
            "day_light_saving_support" : false
        },
        "culture" : {
            "name" : "en-US",
            "LCID" : "1032",
            "language" : "English (United States)"
        },
        "browser" : [
            {
                "name" : "IE",
                "value" : "9.11.17763.0"
            },
            {
                "name" : "Chrome",
                "value" : "84.0.4147.105"
            },
            {
                "name" : "Firefox",
                "value" : "Not Found"
            }
        ]
    },
    "created_by" : "",
    "created_on" : "",
    "modified_by" : "",
    "modified_on" : ISODate("2020-07-21T06:08:50.876Z")
}
Here is my data, and above is my piece of Python code; I am using the pymongo client to query MongoDB.
In the code I am trying to join the two collections (devices and participants) on device_id (which is inside participants).
I have only two records in each collection, but the output gives me four results, with each record duplicated.
Please have a look at where I am going wrong.
It doesn't double, it multiplies: number of devices * number of participants.
In your pipeline you join the collections as:
{"$lookup": {
"from": "participants",
"localField": "_id.docgroupid",
"foreignField": "device_id",
"as": "participants"
}
}
There is no _id.docgroupid field in devices and no top-level device_id field in participants, so the stage makes a perfect match of every participant to every device.
After the lookup stage, the participants field holds the whole participants collection. When you unwind it, you see the same parent document repeated once per participant. Even though the _id values of the documents are the same, they are not identical duplicates; they differ in the participants field.
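If the intent is to join devices._id against the device_id stored inside each participant's va_info array, a pipeline-style $lookup is one way to express it. This is a sketch, not a drop-in fix: it assumes MongoDB 4.0+ for $toString, which is needed because va_info.device_id is stored as a string while devices._id is an ObjectId:

self.devices_col.aggregate([
    {"$lookup": {
        "from": "participants",
        # expose this device's _id, converted to a string, to the sub-pipeline
        "let": {"dev_id": {"$toString": "$_id"}},
        "pipeline": [
            # keep only participants whose va_info array references this device
            {"$match": {"$expr": {"$in": ["$$dev_id", "$va_info.device_id"]}}}
        ],
        "as": "participants"
    }},
    {"$unwind": "$participants"}
])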

Python - regex to get dynamic usernames from JSON text

I am trying to extract the value of 'login' from a dump of JSON that I have in the form of text (response.text).
Here's the string:
{
    "name":"master",
    "commit":{
        "sha":"adc3208a9ac76262250a",
        "commit":{
            "author":{
                "name":"root",
                "email":"dan.ja#foo.ca",
                "date":"2018-02-26T20:14:41Z"
            },
            "committer":{
                "name":"GitHub Enterprise",
                "date":"2018-02-26T20:14:41Z"
            },
            "message":"Update README.md",
            "tree":{
                "sha":"3e4710d0e021a0a7",
                "comment_count":0,
                "verification":{
                    "verified":false,
                    "reason":"unsigned",
                    "signature":null,
                    "payload":null
                }
            },
            "author":{
                "login":"kyle",
                "id":5
            }
I am just trying to pull the value 'kyle' from the login in the last line. The value 'kyle' can change, as it can be a different login each time; thus I need the string in "login":"string".
Here's what I have right now, but it only gets me "login":
/"login"[^\a]*"/g
Never parse JSON with regex, use a JSON parser.
With jq:
Input file:
{
    "commit" : {
        "commit" : {
            "tree" : {
                "verification" : {
                    "payload" : null,
                    "verified" : false,
                    "signature" : null,
                    "reason" : "unsigned"
                },
                "sha" : "3e4710d0e021a0a7",
                "comment_count" : 0
            },
            "author" : {
                "id" : 5,
                "login" : "kyle"
            },
            "committer" : {
                "name" : "GitHub Enterprise",
                "date" : "2018-02-26T20:14:41Z"
            },
            "message" : "Update README.md"
        },
        "sha" : "adc3208a9ac76262250a"
    },
    "name" : "master"
}
Command:
$ jq '.commit.commit.author.login' file.json
Or via a Python script:
#!/usr/bin/env python3
import json

string = """
{
    "commit" : {
        "commit" : {
            "tree" : {
                "verification" : {
                    "payload" : null,
                    "verified" : false,
                    "signature" : null,
                    "reason" : "unsigned"
                },
                "sha" : "3e4710d0e021a0a7",
                "comment_count" : 0
            },
            "author" : {
                "id" : 5,
                "login" : "kyle"
            },
            "committer" : {
                "name" : "GitHub Enterprise",
                "date" : "2018-02-26T20:14:41Z"
            },
            "message" : "Update README.md"
        },
        "sha" : "adc3208a9ac76262250a"
    },
    "name" : "master"
}
"""

j = json.loads(string)
print(j['commit']['commit']['author']['login'])
Output:
"kyle"
(That is jq's output; the Python script prints kyle without the quotes, since print writes the bare string.)

PyMongo counting number of elements by date

I have a DB with my users:
{
    "_id": {
        "$oid": "5a0decadefcb09087c08a868"
    },
    "user_id": "5b232a5a-b333-4320-ba63-722b9e167ef3",
    "email": "email#email.com",
    "password": "***",
    "registration_date": {
        "$date": "2017-11-16T19:53:17.946Z"
    },
    "type": "user"
},
{
    "_id": {
        "$oid": "5a0ded3aefcb090887d7f4fb"
    },
    "user_id": "0054bbde-3ba0-490f-8d54-ffaf72958888",
    "email": "second#gmail.com",
    "password": "***",
    "registration_date": {
        "$date": "2017-11-16T19:55:38.194Z"
    },
    "type": "user"
}
I want to count users by each date (registration_date) and get something like this:
01.01.2017 – 10
01.02.2017 – 20
01.03.2017 – 15
...
I'm trying this code, but it doesn't work:
def registrations_by_date(self):
    users = self.users_db.aggregate([
        {'$group': {
            '_id': {'registration_date': '$date'},
            'count': {'$sum': 1}
        }},
    ])
    return users
What am I doing wrong? How do I get this data?
If the date in your schema is an ISODate,
then the aggregate query below will work. The date is formatted before grouping so that the time component is not taken into account while grouping the data. Sample data:
{
    "_id" : "5a0decadefcb09087c08a868",
    "user_id" : "5b232a5a-b333-4320-ba63-722b9e167ef3",
    "email" : "email#email.com",
    "password" : "***",
    "registration_date" : ISODate("2017-11-16T19:53:17.946Z"),
    "type" : "user"
}
{
    "_id" : "5a0ded3aefcb090887d7f4fb",
    "user_id" : "0054bbde-3ba0-490f-8d54-ffaf72958888",
    "email" : "second#gmail.com",
    "password" : "***",
    "registration_date" : ISODate("2017-11-16T19:55:38.194Z"),
    "type" : "user"
}
The aggregation query to get the result is
db.userReg.aggregate([
    {$project: {
        formattedRegDate: {
            "$dateToString": {format: "%Y-%m-%d", date: "$registration_date"}
        }
    }},
    {$group: {_id: "$formattedRegDate", count: {$sum: 1}}}
]);
and the result is
{ "_id" : "2017-11-16", "count" : 2 }
If the date in your schema is a String,
then use the approach below.
Sample Data
{
    "_id" : "5a0decadefcb09087c08a868",
    "user_id" : "5b232a5a-b333-4320-ba63-722b9e167ef3",
    "email" : "email#email.com",
    "password" : "***",
    "registration_date" : "2017-11-16T19:53:17.946Z",
    "type" : "user"
}
{
    "_id" : "5a0ded3aefcb090887d7f4fb",
    "user_id" : "0054bbde-3ba0-490f-8d54-ffaf72958888",
    "email" : "second#gmail.com",
    "password" : "***",
    "registration_date" : "2017-11-16T19:55:38.194Z",
    "type" : "user"
}
Query
db.userReg.aggregate([{
    $group: {
        _id: {date: {"$substr": ["$registration_date", 0, 10]}},
        count: {$sum: 1}
    }
}]);
and the result is
{ "_id" : { "date" : "2017-11-16" }, "count" : 2 }
It seems you have an extra comma:
db.userReg.aggregate([
    {$group: {_id: "$registration_date", count: {$sum: 1}}}
])
This gives the correct result (based on the records on my machine):
{ "_id" : ISODate("2017-11-15T19:55:38.194Z"), "count" : 1.0 }
{ "_id" : ISODate("2017-11-16T19:55:38.194Z"), "count" : 2.0 }

Tranquility server would not send data to druid

I'm using imply-2.2.3. Here is my Tranquility Server configuration:
{
    "dataSources" : [
        {
            "spec" : {
                "dataSchema" : {
                    "dataSource" : "tutorial-tranquility-server",
                    "parser" : {
                        "type" : "string",
                        "parseSpec" : {
                            "timestampSpec" : {
                                "column" : "timestamp",
                                "format" : "auto"
                            },
                            "dimensionsSpec" : {
                                "dimensions" : [],
                                "dimensionExclusions" : [
                                    "timestamp",
                                    "value"
                                ]
                            },
                            "format" : "json"
                        }
                    },
                    "granularitySpec" : {
                        "type" : "uniform",
                        "segmentGranularity" : "hour",
                        "queryGranularity" : "none"
                    },
                    "metricsSpec" : [
                        {
                            "type" : "count",
                            "name" : "count"
                        },
                        {
                            "name" : "value_sum",
                            "type" : "doubleSum",
                            "fieldName" : "value"
                        },
                        {
                            "fieldName" : "value",
                            "name" : "value_min",
                            "type" : "doubleMin"
                        },
                        {
                            "type" : "doubleMax",
                            "name" : "value_max",
                            "fieldName" : "value"
                        }
                    ]
                },
                "ioConfig" : {
                    "type" : "realtime"
                },
                "tuningConfig" : {
                    "type" : "realtime",
                    "maxRowsInMemory" : "50000",
                    "windowPeriod" : "PT10M"
                }
            },
            "properties" : {
                "task.partitions" : "1",
                "task.replicants" : "1"
            }
        },
        {
            "spec": {
                "dataSchema" : {
                    "dataSource" : "test",
                    "parser" : {
                        "type" : "string",
                        "parseSpec" : {
                            "timestampSpec" : {
                                "column" : "timestamp",
                                "format" : "auto"
                            },
                            "dimensionsSpec" : {
                                "dimensions" : [
                                    "a"
                                ]
                            },
                            "format" : "json"
                        }
                    },
                    "granularitySpec" : {
                        "type" : "uniform",
                        "segmentGranularity" : "hour",
                        "queryGranularity" : "none"
                    },
                    "metricsSpec" : [
                        {
                            "type" : "count",
                            "name" : "count"
                        },
                        {
                            "type": "doubleSum",
                            "name": "b",
                            "fieldName": "b"
                        }
                    ]
                },
                "ioConfig" : {
                    "type" : "realtime"
                },
                "tuningConfig" : {
                    "type" : "realtime",
                    "maxRowsInMemory" : "50000",
                    "windowPeriod" : "P1Y"
                }
            },
            "properties": {
                "task.partitions" : "1",
                "task.replicants" : "1"
            }
        }
    ],
    "properties" : {
        "zookeeper.connect" : "localhost",
        "druid.discovery.curator.path" : "/druid/discovery",
        "druid.selectors.indexing.serviceName" : "druid/overlord",
        "http.port" : "8200",
        "http.threads" : "40",
        "serialization.format" : "smile",
        "druidBeam.taskLocator": "overlord"
    }
}
I have trouble sending data to the second datasource, test, specifically. I tried to send the data below to Druid with Python requests:
{'b': 7, 'timestamp': '2017-01-20T03:32:54.586415', 'a': 't'}
The response I receive:
b'{"result":{"received":1,"sent":0}}'
If you read my config file you will notice that I set the window period to one year; I would like to send data with a large time span to Druid using Tranquility Server. Is there something wrong with my config or data?
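For reference, a sketch of how the event would be posted with requests; /v1/post/{dataSource} is Tranquility Server's standard HTTP endpoint, and 8200 is the http.port from the config above:

import requests

event = {'b': 7, 'timestamp': '2017-01-20T03:32:54.586415', 'a': 't'}
resp = requests.post('http://localhost:8200/v1/post/test', json=[event])
print(resp.content)  # e.g. b'{"result":{"received":1,"sent":0}}'

A response of received 1, sent 0 means the server accepted the event but dropped it; Tranquility typically drops events whose timestamps fall outside the configured windowPeriod relative to the current time.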

Python json schema that extracts parameters

I need to parse requests to a single URL that arrive as JSON, but in several different formats. For example, some note the timestamp as a timestamp attribute, others as unixtime, etc. So I want to create JSON schemas for all types of requests that not only validate the incoming JSON but also extract its parameters from specified places. Is there a library that can do that?
Example:
If I could define a schema that would look something like this
schema = {
    "type" : "object",
    "properties" : {
        "price" : {
            "type" : "number",
            "mapped_name": "product_price"
        },
        "name" : {
            "type" : "string",
            "mapped_name": "product_name"
        },
        "added_at": {
            "type" : "int",
            "mapped_name": "timestamp"
        },
    },
}
and then apply it to a dict
request = {
    "name" : "Eggs",
    "price" : 34.99,
    "added_at": 1234567
}
by some magical function
params = validate_and_extract(request, schema)
I want params to contain the mapped values:
{"product_name": "Eggs", "product_price": 34.99, "timestamp": 1234567}
So this is the module I'm looking for, and it should support nested dicts in the request, not just flat dicts.
The following code may help. It supports nested dicts as well.
def valid_type(type_name, obj):
    if type_name == "number":
        return isinstance(obj, (int, float))
    if type_name == "int":
        return isinstance(obj, int)
    if type_name == "float":
        return isinstance(obj, float)
    if type_name == "string":
        return isinstance(obj, str)
    return False

def validate_and_extract(request, schema):
    '''Validate request (dict) against the schema (dict).

    Validation is limited to naming and type information.
    No check is done to ensure all elements in the schema
    are present in the request. This could be enhanced by
    specifying mandatory/optional/conditional information
    within the schema and subsequently checking for that.
    '''
    out = {}
    for k, v in request.items():
        if k not in schema['properties']:
            print("Key '{}' not in schema ... skipping.".format(k))
            continue
        if schema['properties'][k]['type'] == 'object':
            v = validate_and_extract(v, schema['properties'][k])
        elif not valid_type(schema['properties'][k]['type'], v):
            print("Wrong type for '{}' ... skipping.".format(k))
            continue
        out[schema['properties'][k]['mapped_name']] = v
    return out
# Sample Data 1
schema1 = {
    "type" : "object",
    "properties" : {
        "price" : {
            "type" : "number",
            "mapped_name": "product_price"
        },
        "name" : {
            "type" : "string",
            "mapped_name": "product_name"
        },
        "added_at": {
            "type" : "int",
            "mapped_name": "timestamp"
        },
    },
}

request1 = {
    "name" : "Eggs",
    "price" : 34.99,
    "added_at": 1234567
}

# Sample Data 2: containing a nested dict
schema2 = {
    "type" : "object",
    "properties" : {
        "price" : {
            "type" : "number",
            "mapped_name": "product_price"
        },
        "name" : {
            "type" : "string",
            "mapped_name": "product_name"
        },
        "added_at": {
            "type" : "int",
            "mapped_name": "timestamp"
        },
        "discount": {
            "type" : "object",
            "mapped_name": "offer",
            "properties" : {
                "percent": {
                    "type" : "int",
                    "mapped_name": "percentage"
                },
                "last_date": {
                    "type" : "string",
                    "mapped_name": "end_date"
                },
            }
        },
    },
}

request2 = {
    "name" : "Eggs",
    "price" : 34.99,
    "added_at": 1234567,
    "discount" : {
        "percent" : 40,
        "last_date" : "2016-09-25"
    }
}

params = validate_and_extract(request1, schema1)
print(params)

params = validate_and_extract(request2, schema2)
print(params)
Output from running this:
{'timestamp': 1234567, 'product_name': 'Eggs', 'product_price': 34.99}
{'offer': {'percentage': 40, 'end_date': '2016-09-25'}, 'timestamp': 1234567, 'product_name': 'Eggs', 'product_price': 34.99}
See http://json-schema.org
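As for an off-the-shelf library: the validation half alone can be delegated to the jsonschema package (pip install jsonschema), while the mapped_name extraction has to stay custom, since standard JSON Schema has no renaming feature. A minimal sketch:

from jsonschema import validate, ValidationError

std_schema = {
    "type": "object",
    "properties": {
        "price": {"type": "number"},
        "name": {"type": "string"},
        "added_at": {"type": "integer"},
    },
}

request = {"name": "Eggs", "price": 34.99, "added_at": 1234567}
try:
    validate(instance=request, schema=std_schema)
except ValidationError as e:
    print("Invalid request:", e.message)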
