How to use distinct with pipeline in mongodb using python

I have data like this:
{ "_id": "1234gbrghr",
"Device" : "samsung",
"UserId" : "12654",
"Month" : "july"
},
{ "_id": "1278gbrghr",
"Device" : "nokia",
"UserId" : "87654",
"Month" : "july"
},
{ "_id": "1234gbrghr",
"Device" : "samsung",
"UserId" : "12654",
"Month" : "july"
}
I need to get the number of distinct users per device in the month of July. For example, if a user (UserId) used a samsung device twice or more in July, it should count as one for samsung.
For this I used the query below to get the total number of users in July, but I need the number of distinct users:
pipeline1 = [
    {'$match': {'Month': 'july'}},
    {'$group': {'_id': '$Device', 'count': {'$sum': 1}}}
]
data = db.command('aggregate', 'collection', pipeline=pipeline1)

You will need to group on device and user first instead. You can do that with the following pipeline operator:
{'$group':{'_id' : { d: '$Device', u: '$UserId' } } }
And then, secondly, you need to count the number of users per device (like you already had, but slightly modified):
{ '$group': { '_id' : '$_id.d', 'count': { '$sum' : 1 } } }
With the following dataset:
{ "_id" : "1234gbrghr", "Device" : "samsung", "UserId" : "12654", "Month" : "july" }
{ "_id" : "1278gbrghr", "Device" : "nokia", "UserId" : "87654", "Month" : "july" }
{ "_id" : "1239gbrghr", "Device" : "samsung", "UserId" : "12654", "Month" : "july" }
{ "_id" : "1238gbrghr", "Device" : "samsung", "UserId" : "12653", "Month" : "july" }
And the following aggregate command:
db.so.aggregate( [
    { '$match' : { 'Month' : 'july' } },
    { '$group' : {
        '_id' : { d: '$Device', u: '$UserId' },
        'count' : { '$sum' : 1 }
    } },
    { '$group': {
        '_id' : '$_id.d',
        'count': { '$sum' : 1 }
    } }
] );
This outputs:
{
    "result" : [
        {
            "_id" : "nokia",
            "count" : 1
        },
        {
            "_id" : "samsung",
            "count" : 2
        }
    ],
    "ok" : 1
}
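For completeness, the same two-stage pipeline run from Python; a minimal PyMongo sketch, assuming a local server and the collection name 'collection' carried over from the question:
from pymongo import MongoClient

client = MongoClient()  # assumes a local MongoDB instance
db = client['test']     # database name assumed for illustration

pipeline = [
    {'$match': {'Month': 'july'}},
    # First group: collapse to one document per (Device, UserId) pair.
    {'$group': {'_id': {'d': '$Device', 'u': '$UserId'}}},
    # Second group: count the distinct users per device.
    {'$group': {'_id': '$_id.d', 'count': {'$sum': 1}}},
]

for row in db['collection'].aggregate(pipeline):
    print(row['_id'], row['count'])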

Related

Using Python to build a special data container

My English is poor.
I have a piece of JSON data with the structure shown below, and I want to use Python to aggregate it.
If the service.name is the same, the records need to be archived together, and duplicate "url.path" values need to be removed.
I don't know which data structure to use to store the result. A list? A dict?
Can anyone help, please? Thanks.
```
[
{
"_source" : {
"error" : {
"exception" : [
{
"handled" : false,
"type" : "Abp.UI.UserFriendlyException",
"message" : "未发现该用户的WeiXinUserRelation,粉丝编号447519"
}
]
},
"trace" : {
"id" : "a3e3796ca145b448829d0d0f96661e67"
},
"#timestamp" : "2021-06-21T06:57:52.603Z",
"service" : {
"name" : "Lonsid_AAA_Web_Host"
}, "url" : {
"path" : "/product/getAAA" }
}
},
{
"_source" : {
"error" : {
"exception" : [
{
"handled" : false,
"type" : "Abp.UI.UserFriendlyException",
"message" : "未发现该用户的WeiXinUserRelation,粉丝编号447519"
}
]
},
"trace" : {
"id" : "a3e3796ca145b448829d0d0f96661e67"
},
"#timestamp" : "2021-06-21T06:57:52.603Z",
"service" : {
"name" : "Lonsid_BBB_Web_Host"
}, "url" : {
"path" : "/product/getBBB" }
}
},
{
"_source" : {
"error" : {
"exception" : [
{
"handled" : false,
"type" : "Abp.UI.UserFriendlyException",
"message" : "未发现该用户的WeiXinUserRelation,粉丝编号447519"
}
]
},
"trace" : {
"id" : "a3e3796ca145b448829d0d0f96661e67"
},
"#timestamp" : "2021-06-21T06:57:52.603Z",
"service" : {
"name" : "Lonsid_AAA_Web_Host"
}, "url" : {
"path" : "/product/getAAA" }
}
}
]
```
This should get you started. It builds a cache of all the service names seen so far, and drops any item whose service name has already been seen.
import pprint
import json
json_data = """[
{
"_source" : {
"error" : {
"exception" : [
{
"handled" : false,
"type" : "Abp.UI.UserFriendlyException",
"message" : "未发现该用户的WeiXinUserRelation,粉丝编号447519"
}
]
},
"trace" : {
"id" : "a3e3796ca145b448829d0d0f96661e67"
},
"#timestamp" : "2021-06-21T06:57:52.603Z",
"service" : {
"name" : "Lonsid_AAA_Web_Host"
}, "url" : {
"path" : "/product/getAAA" }
}
},
{
"_source" : {
"error" : {
"exception" : [
{
"handled" : false,
"type" : "Abp.UI.UserFriendlyException",
"message" : "未发现该用户的WeiXinUserRelation,粉丝编号447519"
}
]
},
"trace" : {
"id" : "a3e3796ca145b448829d0d0f96661e67"
},
"#timestamp" : "2021-06-21T06:57:52.603Z",
"service" : {
"name" : "Lonsid_BBB_Web_Host"
}, "url" : {
"path" : "/product/getBBB" }
}
},
{
"_source" : {
"error" : {
"exception" : [
{
"handled" : false,
"type" : "Abp.UI.UserFriendlyException",
"message" : "未发现该用户的WeiXinUserRelation,粉丝编号447519"
}
]
},
"trace" : {
"id" : "a3e3796ca145b448829d0d0f96661e67"
},
"#timestamp" : "2021-06-21T06:57:52.603Z",
"service" : {
"name" : "Lonsid_AAA_Web_Host"
}, "url" : {
"path" : "/product/getAAA" }
}
}
]"""
data = json.loads(json_data)

# Keep the first item seen for each service name; later duplicates are dropped.
cache = {}
for item in data:
    name = item['_source']['service']['name']
    if name not in cache:
        cache[name] = item
pprint.pprint(list(cache.values()))
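The question also asks to remove duplicate "url.path" values within each service. A sketch of one way to extend the cache idea, reusing the data list parsed above: group items by service name and skip any path already seen for that service.
from collections import defaultdict

grouped = defaultdict(list)    # service name -> list of unique items
seen_paths = defaultdict(set)  # service name -> url.paths already kept
for item in data:
    name = item['_source']['service']['name']
    path = item['_source']['url']['path']
    if path not in seen_paths[name]:
        seen_paths[name].add(path)
        grouped[name].append(item)

pprint.pprint(dict(grouped))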

PyMongo counting number of elements by date

I have a DB with my users:
{
"_id": {
"$oid": "5a0decadefcb09087c08a868"
},
"user_id": "5b232a5a-b333-4320-ba63-722b9e167ef3",
"email": "email#email.com",
"password": "***",
"registration_date": {
"$date": "2017-11-16T19:53:17.946Z"
},
"type": "user"
},
{
"_id": {
"$oid": "5a0ded3aefcb090887d7f4fb"
},
"user_id": "0054bbde-3ba0-490f-8d54-ffaf72958888",
"email": "second#gmail.com",
"password": "***",
"registration_date": {
"$date": "2017-11-16T19:55:38.194Z"
},
"type": "user"
}
I want to count users by each date (registration_date) and get something like this:
01.01.2017 – 10
01.02.2017 – 20
01.03.2017 – 15
...
I'm trying this code, but it doesn't work:
def registrations_by_date(self):
    users = self.users_db.aggregate([
        {'$group': {
            '_id': {'registration_date': '$date'},
            'count': {'$sum': 1}
        }},
    ])
    return users
return users
What am I doing wrong? How do I get this data?
If the date in your schema is of type ISODate,
then the aggregate query below will work. The date is formatted before grouping so that the time-of-day part of the timestamp is not taken into account while grouping the data.
{
"_id" : "5a0decadefcb09087c08a868",
"user_id" : "5b232a5a-b333-4320-ba63-722b9e167ef3",
"email" : "email#email.com",
"password" : "***",
"registration_date" : ISODate("2017-11-16T19:53:17.946Z"),
"type" : "user"
}
{
"_id" : "5a0ded3aefcb090887d7f4fb",
"user_id" : "0054bbde-3ba0-490f-8d54-ffaf72958888",
"email" : "second#gmail.com",
"password" : "***",
"registration_date" : ISODate("2017-11-16T19:55:38.194Z"),
"type" : "user"
}
The aggregation query to get the result is
db.userReg.aggregate([
    {$project:
        { formattedRegDate:
            { "$dateToString": {format: "%Y-%m-%d", date: "$registration_date"}}
        }
    },
    {$group: {_id: "$formattedRegDate", count: {$sum: 1}}}
]);
and the result is
{ "_id" : "2017-11-16", "count" : 2 }
If the date in your schema is stored as a String,
then the approach below can be used.
Sample Data
{
"_id" : "5a0decadefcb09087c08a868",
"user_id" : "5b232a5a-b333-4320-ba63-722b9e167ef3",
"email" : "email#email.com",
"password" : "***",
"registration_date" : "2017-11-16T19:53:17.946Z",
"type" : "user"
}
{
"_id" : "5a0ded3aefcb090887d7f4fb",
"user_id" : "0054bbde-3ba0-490f-8d54-ffaf72958888",
"email" : "second#gmail.com",
"password" : "***",
"registration_date" : "2017-11-16T19:55:38.194Z",
"type" : "user"
}
Query
db.userReg.aggregate([{
    $group: { _id: { date: {"$substr": ["$registration_date", 0, 10]}},
        count: {$sum: 1}
    }
}]);
and the result is
{ "_id" : { "date" : "2017-11-16" }, "count" : 2 }
It seems you have an extra comma:
db.userReg.aggregate([
{$group: {_id: "$registration_date", count: {$sum:1}}}
])
This gives the correct result (based on the records on my machine):
{
    "_id" : ISODate("2017-11-15T19:55:38.194Z"),
    "count" : 1.0
}
{
    "_id" : ISODate("2017-11-16T19:55:38.194Z"),
    "count" : 2.0
}

Tranquility server would not send data to druid

I'm using imply-2.2.3. Here is my tranquility server configuration:
{
"dataSources" : [
{
"spec" : {
"dataSchema" : {
"dataSource" : "tutorial-tranquility-server",
"parser" : {
"type" : "string",
"parseSpec" : {
"timestampSpec" : {
"column" : "timestamp",
"format" : "auto"
},
"dimensionsSpec" : {
"dimensions" : [],
"dimensionExclusions" : [
"timestamp",
"value"
]
},
"format" : "json"
}
},
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "hour",
"queryGranularity" : "none"
},
"metricsSpec" : [
{
"type" : "count",
"name" : "count"
},
{
"name" : "value_sum",
"type" : "doubleSum",
"fieldName" : "value"
},
{
"fieldName" : "value",
"name" : "value_min",
"type" : "doubleMin"
},
{
"type" : "doubleMax",
"name" : "value_max",
"fieldName" : "value"
}
]
},
"ioConfig" : {
"type" : "realtime"
},
"tuningConfig" : {
"type" : "realtime",
"maxRowsInMemory" : "50000",
"windowPeriod" : "PT10M"
}
},
"properties" : {
"task.partitions" : "1",
"task.replicants" : "1"
}
},
{
"spec": {
"dataSchema" : {
"dataSource" : "test",
"parser" : {
"type" : "string",
"parseSpec" : {
"timestampSpec" : {
"column" : "timestamp",
"format" : "auto"
},
"dimensionsSpec" : {
"dimensions" : [
"a"
]
},
"format" : "json"
}
},
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "hour",
"queryGranularity" : "none"
},
"metricsSpec" : [
{
"type" : "count",
"name" : "count"
},
{
"type": "doubleSum",
"name": "b",
"fieldName": "b"
}
]
},
"ioConfig" : {
"type" : "realtime"
},
"tuningConfig" : {
"type" : "realtime",
"maxRowsInMemory" : "50000",
"windowPeriod" : "P1Y"
}
},
"properties": {
"task.partitions" : "1",
"task.replicants" : "1"
}
}
],
"properties" : {
"zookeeper.connect" : "localhost",
"druid.discovery.curator.path" : "/druid/discovery",
"druid.selectors.indexing.serviceName" : "druid/overlord",
"http.port" : "8200",
"http.threads" : "40",
"serialization.format" : "smile",
"druidBeam.taskLocator": "overlord"
}
}
I have trouble sending data to the second datasource, test, specifically. I tried to send the below data to druid with python requests:
{'b': 7, 'timestamp': '2017-01-20T03:32:54.586415', 'a': 't'}
The response I receive:
b'{"result":{"received":1,"sent":0}}'
If you read my config file you will notice that I set the window period to one year. I would like to send data with a large time span to druid using tranquility server. Is there something wrong with my config or data?
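For reference, the POST itself would look something like this; a minimal sketch, assuming Tranquility Server's /v1/post/<dataSource> endpoint on the http.port (8200) from the config above:
import json
import requests

event = {'b': 7, 'timestamp': '2017-01-20T03:32:54.586415', 'a': 't'}

# POST a single JSON event to the "test" dataSource.
resp = requests.post(
    'http://localhost:8200/v1/post/test',
    data=json.dumps(event),
    headers={'Content-Type': 'application/json'},
)
print(resp.content)  # e.g. b'{"result":{"received":1,"sent":0}}'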

Why is elasticsearch not able to retrieve a document by document id?

I am new to elasticsearch. I have created a new index using the following request:
req = {
"settings": {
"analysis": {
"analyzer": {
"hinglish_analyzer": {
"type": "custom",
"tokenizer": "standard",
"char_filter": [
"html_strip"
],
"filter": [
"lowercase",
"asciifolding",
"hinglish-token-filter"
]
}
}
}
},
"mappings" : {
"p_ss__user" : {
"properties" : {
"age" : {
"type": "integer"
},
"first_name" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"gender" : {
"type" : "long"
},
"is_alive" : {
"type" : "boolean"
},
"last_name" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"marital_status" : {
"type" : "long"
},
"user_gotra" : {
"properties" : {
"Gotra" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"id" : {
"type" : "long"
},
"kuldevi" : {
"properties" : {
"Kuldevi" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"id" : {
"type" : "long"
}
}
}
}
},
"user_village" : {
"properties" : {
"areaOrVillageName" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"id" : {
"type" : "long"
},
"tehsil" : {
"properties" : {
"city" : {
"properties" : {
"cityName" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"id" : {
"type" : "long"
},
"state" : {
"properties" : {
"country" : {
"properties" : {
"countryCode" : {
"type" : "text"
},
"countryName" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"id" : {
"type" : "long"
}
}
},
"id" : {
"type" : "long"
},
"stateCode" : {
"type" : "text"
},
"stateName" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
}
}
}
}
},
"id" : {
"type" : "long"
},
"tehsilName" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
}
}
},
"zipcode" : {
"type" : "text"
}
}
},
"username" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
}
}
}
}
}
Here, 'hinglish-token-filter' is my custom token filter, which I have written and which works perfectly fine.
Now, I have created a document in elasticsearch with the help of a Python script (here I pass my own value for the _id field in the request), which looks like the one given below:
{
"username" : "Gopi_Chand",
"first_name" : "Gopi Chand",
"last_name" : "",
"gender" : 2,
"age" : 44,
"user_gotra" : {
"Gotra" : "Thanak",
"kuldevi" : {
"Kuldevi" : "Maa Jagdambaa",
"id" : 1
},
"id" : 1,
"kulrishi" : {
"Rishi" : "Parashar",
"id" : 1
}
},
"user_village" : {
"areaOrVillageName" : "Sanatra",
"tehsil" : {
"city" : {
"state" : {
"country" : {
"countryName" : "India",
"id" : 1,
"countryCode" : "IND"
},
"stateName" : "Rajasthan",
"id" : 1
},
"cityName" : "Barmer (Meru)",
"id" : 1
},
"tehsilName" : "Baitu",
"id" : 1
},
"id" : 1,
"zipcode" : ""
},
"marital_status" : 1,
"is_alive" : true
}
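The indexing script is not shown in the question; a minimal sketch of how such a document might be created with an explicit id using elasticsearch-py (index and type names taken from the mapping above; the doc_type argument matches the pre-7.x API in use here):
from elasticsearch import Elasticsearch

es = Elasticsearch(['http://localhost:9200'])

doc = {
    'username': 'Gopi_Chand',
    'first_name': 'Gopi Chand',
    # ... remaining fields as in the document above ...
}

# Index with an explicit _id instead of letting elasticsearch generate one.
es.index(index='users', doc_type='p_ss__user', id='3222', body=doc)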
The document is successfully stored in elasticsearch with the id that I have passed, along with the other values.
But the problem comes when I try to retrieve the document by the id that I have set:
http://localhost:9200/users/p_s_s__user/3222/
It gives me the following response:
{"_index":"users","_type":"p_s_s__user","_id":"3222","found":false}
But when I try the following query:
http://localhost:9200/users/_search?pretty=true
it shows me my document, as seen below:
{
"took" : 13,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : 1.0,
"hits" : [
{
"_index" : "users",
"_type" : "p_ss__user",
"_id" : "3222",
"_score" : 1.0,
"_source" : {
"username" : "Gopi_Chand",
"first_name" : "Gopi Chand",
"last_name" : "",
"gender" : 2,
"age" : 44,
"user_gotra" : {
"Gotra" : "Thanak",
"kuldevi" : {
"Kuldevi" : "Maa Jagdambaa",
"id" : 1
},
"id" : 1,
"kulrishi" : {
"Rishi" : "Parashar",
"id" : 1
}
},
"user_village" : {
"areaOrVillageName" : "Sanatra",
"tehsil" : {
"city" : {
"state" : {
"country" : {
"countryName" : "India",
"id" : 1,
"countryCode" : "IND"
},
"stateName" : "Rajasthan",
"id" : 1
},
"cityName" : "Barmer (Meru)",
"id" : 1
},
"tehsilName" : "Baitu",
"id" : 1
},
"id" : 1,
"zipcode" : ""
},
"marital_status" : 1,
"is_alive" : true
}
}
]
}
}
Can you help me figure out what I have done wrong? Moreover, other queries, such as match queries, are not working either.
Thanks in advance.

Mongodb query with regex

I am learning python with mongodb in tornado.
{
"_id" : ObjectId("566fb466d82f0769fbb0fb10"),
"Pid" : "1",
"Registration" : "1234",
"Date" : "11-11-2015",
"status" : "booked"
}
{
"_id" : ObjectId("566fb47cd82f0769fbb0fb11"),
"Pid" : "1",
"Registration" : "1234",
"Date" : "13-11-2015",
"status" : "AOs"
}
{
"_id" : ObjectId("566fb48bd82f0769fbb0fb12"),
"Pid" : "1",
"Registration" : "1234",
"Date" : "14-11-2015",
"status" : "NA"
}
{
"_id" : ObjectId("566fb4a3d82f0769fbb0fb13"),
"Pid" : "1",
"Registration" : "1234",
"Date" : "16-12-2015",
"status" : "AOr"
}
If I pass only the month and year, I need all the records for that month and year.
I tried doing this:
db.cal.aggregate({$project:{"monthyear": {$substr: ["$Date",0,2]}}})
the output is:
{ "_id" : ObjectId("566fb466d82f0769fbb0fb10"), "monthyear" : "11" }
{ "_id" : ObjectId("566fb47cd82f0769fbb0fb11"), "monthyear" : "13" }
{ "_id" : ObjectId("566fb48bd82f0769fbb0fb12"), "monthyear" : "14" }
{ "_id" : ObjectId("566fb4a3d82f0769fbb0fb13"), "monthyear" : "16" }
but I need the below output:
{
    "_id" : ObjectId("566fb4a3d82f0769fbb0fb13"),
    "Pid" : "1",
    "Registration" : "1234",
    "Date" : "16-12-2015",
    "status" : "AOr"
}
Please help.
What about this:
db.cal.find({"Date":{"$regex":"12-2015"}})
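Since the question is about Python, a PyMongo equivalent might look like this; a sketch, assuming a local server, and anchoring the pattern to the end of the string so that "12-2015" matches the month-year suffix of the DD-MM-YYYY dates rather than a day:
from pymongo import MongoClient

client = MongoClient()  # assumes a local MongoDB instance
db = client['test']     # database name assumed for illustration

# "$" anchors the match to the end of the Date string, i.e. month 12 of 2015.
for doc in db.cal.find({'Date': {'$regex': '12-2015$'}}):
    print(doc)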
