MongoDB - querying a large collection

MongoDB - querying a large collection - python

I have about 40 million documents in my MongoDB collection (e.g. db.large_collection).
I want to get all the distinct user_id values.
I have created an index on the field user_id, but when I try to execute the query, it returns an error.
> db.large_collection.count()
39894523
> db.clean_tweets4.getIndexes()
[
{
"v" : 1,
"key" : {
"_id" : 1
},
"name" : "_id_",
"ns" : "sampled_tourist.clean_tweets4"
},
{
"v" : 1,
"key" : {
"user_id" : 1
},
"name" : "user_id_1",
"ns" : "sampled_tourist.clean_tweets4"
},
{
"v" : 1,
"key" : {
"coordinates" : 1
},
"name" : "coordinates_1",
"ns" : "sampled_tourist.clean_tweets4"
},
{
"v" : 1,
"key" : {
"timestamp_ms" : 1
},
"name" : "timestamp_ms_1",
"ns" : "sampled_tourist.clean_tweets4"
}
]
But when I run
db.clean_tweets4.find({},{user_id:1})
{ "_id" : ObjectId("5790f9a178776f4b56ede2be"), "user_id" : NumberLong("2246342226") }
{ "_id" : ObjectId("5790f9a178776f4b56ede2bf"), "user_id" : NumberLong("2289817236") }
{ "_id" : ObjectId("5790f9a178776f4b56ede2c0"), "user_id" : 1904381486 }
{ "_id" : ObjectId("5790f9a178776f4b56ede2c1"), "user_id" : NumberLong("3044032705") }
{ "_id" : ObjectId("5790f9a178776f4b56ede2c2"), "user_id" : NumberLong("3407958364") }
{ "_id" : ObjectId("5790f9d278776f4b56ee4af2"), "user_id" : 1566025975 }
{ "_id" : ObjectId("5790f7ab78776f4b56ea55c6"), "user_id" : 15857879 }
{ "_id" : ObjectId("5790f9a178776f4b56ede28f"), "user_id" : NumberLong("3394102511") }
{ "_id" : ObjectId("5790f9a178776f4b56ede293"), "user_id" : 1376377652 }
{ "_id" : ObjectId("5790f9a178776f4b56ede294"), "user_id" : 352385989 }
{ "_id" : ObjectId("5790f9a178776f4b56ede295"), "user_id" : NumberLong("2383622643") }
{ "_id" : ObjectId("5790f9a178776f4b56ede29c"), "user_id" : 152362163 }
{ "_id" : ObjectId("5790f9a178776f4b56ede2a0"), "user_id" : 1446113954 }
{ "_id" : ObjectId("5790f9a178776f4b56ede2a1"), "user_id" : 1893437088 }
{ "_id" : ObjectId("5790f9a178776f4b56ede2a2"), "user_id" : 67121578 }
{ "_id" : ObjectId("5790f9a178776f4b56ede2a3"), "user_id" : 1714137770 }
{ "_id" : ObjectId("5790f9a178776f4b56ede2a4"), "user_id" : 52806609 }
Thanks!

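find({}) means it returns every document, because the filter is empty.
To filter on the field, pass it in the query; note that this matches only documents whose user_id equals 1 and does not return the distinct values (for those, use db.collection.distinct("user_id"), or an aggregation with $group on "$user_id" if the result exceeds the 16 MB BSON document limit):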
db.collection.find({user_id:1});

Related

How to access an element from a JSON object

How can I access each element of the JSON object using BeautifulSoup or regex?
For example, look up "timestamp" and get the output 1561238146781.
{
"timestamp" : 1561238146781,
"context" : {
"external_urls" : {
"spotify" : "https://open.spotify.com/playlist/4rwZyPeww8O7ZPnQqHD2q"
},
"href" : "https://api.spotify.com/v1/playlists/4rwZyPqew8O7ZPnQqHD2q",
"type" : "playlist",
"uri" :
"spotify:user:3sui5bqpr8a0i2t:playlist:4rwZyPqewO7ZPnQqHD2q"
},
"progress_ms" : 115728,
"item" : {
"album" : {
"album_type" : "album",
"artists" : [ {
"external_urls" : {
"spotify" :
"https://open.spotify.com/artist/5eAWCfyUhZqwHtBdNk56l1"
},
"href" :
"https://api.spotify.com/v1/artists/5eAWCfyeZtHHtBdNk56l1",
"id" : "5eAWCfyUhZtHHtBdNk56l1",
"name" : "System Of A Down",
"type" : "artist",
"uri" : "spotify:artist:5eAWewyUhZtHHtBdNk56l1"
} ],
"available_markets" : [ ],
"external_urls" : {
"spotify" : "https://open.spotify.com/album/4DR0Gds7w2GJyQnFVa4jAB"
},
"href" : "https://api.spotify.com/v1/albums/4DR0ewwsdJyQnFVa4jAB",
"id" : "4DR0GWo7w2ewyQnFVa4jAB",
"images" : [ {
"height" : 640,
"url" :
"https://i.scdn.co/image/932e185b217ew6caasd837dbe30d54028de9cfc",
"width" : 615
}, {
"height" : 300,
"url" :
"https://i.scdn.co/image/30de1d4e1ew38sd89573893d8494fd6a66",
"width" : 288
}, {
"height" : 64,
"url" :
"https://i.scdn.co/image/1682cd0e8ew8bf87sdc4cd1e01ce24cd165b",
"width" : 62
} ],
"name" : "Toxicity",
"release_date" : "2001-01-01",
"release_date_precision" : "day",
"total_tracks" : 14,
"type" : "album",
"uri" : "spotify:album:4DR0GewdGJyQnFVa4jAB"
},
"artists" : [ {
"external_urls" : {
"spotify" : "https://open.spotify.com/artist/5eAWCsdUweHtBdNk56l1"
},
"href" : "https://api.spotify.com/v1/artists/5eAWCfewhdsHtBdNk56l1",
"id" : "5eAWCfyUhZtHHtBewk56l1",
"name" : "System Of A Down",
"type" : "artist",
"uri" : "spotify:artist:5eAWCfyUsdtHHtBdNk56l1"
} ],
"available_markets" : [ ],
"disc_number" : 1,
"duration_ms" : 235599,
"explicit" : false,
"external_ids" : {
"isrc" : "USSM10107264"
},
"external_urls" : {
"spotify" : "https://open.spotify.com/track/1twBtsdaZiy7HWPG025QGuP"
},
"href" : "https://api.spotify.com/v1/tracks/1twBt7aZiy7HWPG025QGuP",
"id" : "1twBt7aZiy7HWweG025QGuP",
"is_local" : false,
"name" : "Aerials",
"popularity" : 9,
"preview_url" : null,
"track_number" : 14,
"type" : "track",
"uri" : "spotify:track:1twBt7aZieWPG025QGuP"
},
"currently_playing_type" : "track",
"actions" : {
"disallows" : {
"resuming" : true
}
},
"is_playing" : true
}
Or look up "is_playing" and get true.
I've tried turning the whole structure into a list (one entry per element) and getting the element from the list, but I've realized that this is hardly useful.

Save your JSON data to a file named data.json, then run this code:
import json
with open('data.json',"r") as f:
data = json.load(f)
print(data["is_playing"])
output
True

PyMongo counting number of elements by date

I have a database with my users:
{
"_id": {
"$oid": "5a0decadefcb09087c08a868"
},
"user_id": "5b232a5a-b333-4320-ba63-722b9e167ef3",
"email": "email#email.com",
"password": "***",
"registration_date": {
"$date": "2017-11-16T19:53:17.946Z"
},
"type": "user"
},
{
"_id": {
"$oid": "5a0ded3aefcb090887d7f4fb"
},
"user_id": "0054bbde-3ba0-490f-8d54-ffaf72958888",
"email": "second#gmail.com",
"password": "***",
"registration_date": {
"$date": "2017-11-16T19:55:38.194Z"
},
"type": "user"
}
I want to count the users registered on each date (registration_date) and get something like this:
01.01.2017 – 10
01.02.2017 – 20
01.03.2017 – 15
...
I'm trying that code, but it doesn't work:
def registrations_by_date(self):
users = self.users_db.aggregate([
{'$group': {
'_id': {'registration_date':'$date'},
'count': {'$sum':1}
}},
])
return users
What am I doing wrong? How can I get this data?

If the date in your schema is stored as an ISODate,
then the aggregate query below will work. The date is formatted before grouping, so that the time-of-day part of the timestamp is ignored when grouping the data. Sample data:
{
"_id" : "5a0decadefcb09087c08a868",
"user_id" : "5b232a5a-b333-4320-ba63-722b9e167ef3",
"email" : "email#email.com",
"password" : "***",
"registration_date" : ISODate("2017-11-16T19:53:17.946Z"),
"type" : "user"
}
{
"_id" : "5a0ded3aefcb090887d7f4fb",
"user_id" : "0054bbde-3ba0-490f-8d54-ffaf72958888",
"email" : "second#gmail.com",
"password" : "***",
"registration_date" : ISODate("2017-11-16T19:55:38.194Z"),
"type" : "user"
}
The aggregation query to get the result is
db.userReg.aggregate([
{$project:
{ formattedRegDate:
{ "$dateToString": {format:"%Y-%m-%d", date:"$registration_date"}}
}
},
{$group:{_id:"$formattedRegDate", count:{$sum:1}}}]);
and the result is
{ "_id" : "2017-11-16", "count" : 2 }
If the date in your schema is stored as a String,
then the approach below should be used instead.
Sample Data
{
"_id" : "5a0decadefcb09087c08a868",
"user_id" : "5b232a5a-b333-4320-ba63-722b9e167ef3",
"email" : "email#email.com",
"password" : "***",
"registration_date" : "2017-11-16T19:53:17.946Z",
"type" : "user"
}
{
"_id" : "5a0ded3aefcb090887d7f4fb",
"user_id" : "0054bbde-3ba0-490f-8d54-ffaf72958888",
"email" : "second#gmail.com",
"password" : "***",
"registration_date" : "2017-11-16T19:55:38.194Z",
"type" : "user"
}
Query
db.userReg.aggregate([{
$group:{ _id: { date: {"$substr":["$registration_date", 0, 10]}},
count:{$sum:1}
}
}]);
and the result is
{ "_id" : { "date" : "2017-11-16" }, "count" : 2 }

It seems you have an extra comma (,) in your pipeline. Try:
db.userReg.aggregate([
{$group: {_id: "$registration_date", count: {$sum:1}}}
])
This gives the correct result (based on the records on my machine):
{
"_id" : ISODate("2017-11-15T19:55:38.194Z"),
"count" : 1.0 }
{
"_id" : ISODate("2017-11-16T19:55:38.194Z"),
"count" : 2.0 }

Why elasticsearch is not able to retrieve document by document id?

I am new to Elasticsearch. I have created a new index using the following REST API request:
req = {
"settings": {
"analysis": {
"analyzer": {
"hinglish_analyzer": {
"type": "custom",
"tokenizer": "standard",
"char_filter": [
"html_strip"
],
"filter": [
"lowercase",
"asciifolding",
"hinglish-token-filter"
]
}
}
}
},
"mappings" : {
"p_ss__user" : {
"properties" : {
"age" : {
"type": "integer"
},
"first_name" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"gender" : {
"type" : "long"
},
"is_alive" : {
"type" : "boolean"
},
"last_name" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"marital_status" : {
"type" : "long"
},
"user_gotra" : {
"properties" : {
"Gotra" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"id" : {
"type" : "long"
},
"kuldevi" : {
"properties" : {
"Kuldevi" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"id" : {
"type" : "long"
}
}
}
}
},
"user_village" : {
"properties" : {
"areaOrVillageName" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"id" : {
"type" : "long"
},
"tehsil" : {
"properties" : {
"city" : {
"properties" : {
"cityName" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"id" : {
"type" : "long"
},
"state" : {
"properties" : {
"country" : {
"properties" : {
"countryCode" : {
"type" : "text"
},
"countryName" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"id" : {
"type" : "long"
}
}
},
"id" : {
"type" : "long"
},
"stateCode" : {
"type" : "text"
},
"stateName" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
}
}
}
}
},
"id" : {
"type" : "long"
},
"tehsilName" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
}
}
},
"zipcode" : {
"type" : "text"
}
}
},
"username" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
}
}
}
}
}
Here, 'hinglish-token-filter' is a custom token filter that I have written myself, and it works perfectly fine.
Next, I created a document in Elasticsearch with the help of a Python script (passing my own value for _id in the request); the document looks like this:
{
"username" : "Gopi_Chand",
"first_name" : "Gopi Chand",
"last_name" : "",
"gender" : 2,
"age" : 44,
"user_gotra" : {
"Gotra" : "Thanak",
"kuldevi" : {
"Kuldevi" : "Maa Jagdambaa",
"id" : 1
},
"id" : 1,
"kulrishi" : {
"Rishi" : "Parashar",
"id" : 1
}
},
"user_village" : {
"areaOrVillageName" : "Sanatra",
"tehsil" : {
"city" : {
"state" : {
"country" : {
"countryName" : "India",
"id" : 1,
"countryCode" : "IND"
},
"stateName" : "Rajasthan",
"id" : 1
},
"cityName" : "Barmer (Meru)",
"id" : 1
},
"tehsilName" : "Baitu",
"id" : 1
},
"id" : 1,
"zipcode" : ""
},
"marital_status" : 1,
"is_alive" : true
}
The document is successfully stored in Elasticsearch with the id that I passed, along with the other values.
But the problem comes when I try to retrieve the document by the id that I set:
http://localhost:9200/users/p_s_s__user/3222/
It gives me following response :-
{"_index":"users","_type":"p_s_s__user","_id":"3222","found":false}
But when I try following query :-
http://localhost:9200/users/_search?pretty=true
it shows me my document, as shown below :-
{
"took" : 13,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : 1.0,
"hits" : [
{
"_index" : "users",
"_type" : "p_ss__user",
"_id" : "3222",
"_score" : 1.0,
"_source" : {
"username" : "Gopi_Chand",
"first_name" : "Gopi Chand",
"last_name" : "",
"gender" : 2,
"age" : 44,
"user_gotra" : {
"Gotra" : "Thanak",
"kuldevi" : {
"Kuldevi" : "Maa Jagdambaa",
"id" : 1
},
"id" : 1,
"kulrishi" : {
"Rishi" : "Parashar",
"id" : 1
}
},
"user_village" : {
"areaOrVillageName" : "Sanatra",
"tehsil" : {
"city" : {
"state" : {
"country" : {
"countryName" : "India",
"id" : 1,
"countryCode" : "IND"
},
"stateName" : "Rajasthan",
"id" : 1
},
"cityName" : "Barmer (Meru)",
"id" : 1
},
"tehsilName" : "Baitu",
"id" : 1
},
"id" : 1,
"zipcode" : ""
},
"marital_status" : 1,
"is_alive" : true
}
}
]
}
}
Can you help me figure out what I have done wrong? Moreover, other queries, such as "match" queries, are also not working.
Thanks in advance.

Mongodb query with regex

I am learning python with mongodb in tornado.
{
"_id" : ObjectId("566fb466d82f0769fbb0fb10"),
"Pid" : "1",
"Registration" : "1234",
"Date" : "11-11-2015",
"status" : "booked"
}
{
"_id" : ObjectId("566fb47cd82f0769fbb0fb11"),
"Pid" : "1",
"Registration" : "1234",
"Date" : "13-11-2015",
"status" : "AOs"
}
{
"_id" : ObjectId("566fb48bd82f0769fbb0fb12"),
"Pid" : "1",
"Registration" : "1234",
"Date" : "14-11-2015",
"status" : "NA"
}
{
"_id" : ObjectId("566fb4a3d82f0769fbb0fb13"),
"Pid" : "1",
"Registration" : "1234",
"Date" : "16-12-2015",
"status" : "AOr"
}
If I pass only the month and year, I need all the complete records for the given month and year.
I tried doing this:
db.cal.aggregate({$project:{"monthyear": {$substr: ["$Date",0,2]}}})
the output is:
{ "_id" : ObjectId("566fb466d82f0769fbb0fb10"), "monthyear" : "11" }
{ "_id" : ObjectId("566fb47cd82f0769fbb0fb11"), "monthyear" : "13" }
{ "_id" : ObjectId("566fb48bd82f0769fbb0fb12"), "monthyear" : "14" }
{ "_id" : ObjectId("566fb4a3d82f0769fbb0fb13"), "monthyear" : "16" }
but I need the below output:
{
"_id" : ObjectId("566fb4a3d82f0769fbb0fb13")
"Pid" : "1",
"Registration" : "1234",
"Date" : "16-12-2015",
"status" : "AOr"
}
Please, help

What about this:
db.cal.find({"Date":{"$regex":"12-2015"}})

How to use distinct with pipeline in mongodb using python

i have data like this
{ "_id": "1234gbrghr",
"Device" : "samsung",
"UserId" : "12654",
"Month" : "july"
},
{ "_id": "1278gbrghr",
"Device" : "nokia",
"UserId" : "87654",
"Month" : "july"
},
{ "_id": "1234gbrghr",
"Device" : "samsung",
"UserId" : "12654",
"Month" : "july"
}
I need to get the number of distinct users for a particular device in the month of July. For example, if a user (UserId) used a samsung device twice or more in the month of July, then that user should be counted only once for samsung.
For this I used the following query, which gets the total number of users per device in the month of July, but I need the distinct number of users:
pipeline1 = [
{'$match':{'Month':'july'}},
{'$group':{'_id' : '$Device', 'count' : { '$sum' : 1 }}}
]
data = db.command('aggregate', 'collection', pipeline=pipeline1);

You will need to group on device and user first. You can do that with the following pipeline operator:
{'$group':{'_id' : { d: '$Device', u: '$UserId' } } }
And then, secondly, you need to count the number of distinct users per device (like you already had, but slightly modified):
{ '$group': { '_id' : '$_id.d', 'count': { '$sum' : 1 } } }
With the following dataset:
{ "_id" : "1234gbrghr", "Device" : "samsung", "UserId" : "12654", "Month" : "july" }
{ "_id" : "1278gbrghr", "Device" : "nokia", "UserId" : "87654", "Month" : "july" }
{ "_id" : "1239gbrghr", "Device" : "samsung", "UserId" : "12654", "Month" : "july" }
{ "_id" : "1238gbrghr", "Device" : "samsung", "UserId" : "12653", "Month" : "july" }
And the following aggregate command:
db.so.aggregate( [
{ '$match' : {'Month' : 'july' } },
{ '$group' : {
'_id' : { d: '$Device', u: '$UserId' },
'count' : { '$sum' : 1 }
} },
{ '$group': {
'_id' : '$_id.d',
'count': { '$sum' : 1 }
} }
] );
This outputs:
{
"result" : [
{
"_id" : "nokia",
"count" : 1
},
{
"_id" : "samsung",
"count" : 2
}
],
"ok" : 1
}

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

MongoDB - querying a large collection - python

find({}); means it returns everything. This is what you want. db.collection.find({user_id:1});

Related

How to acces element from json object

PyMongo counting number of elements by date

Why elasticsearch is not able to retrieve document by document id?

Mongodb query with regex

How to use distinct with pipeline in mongodb using python

Categories

Resources