Why is Elasticsearch not able to retrieve a document by document ID? - python

I am new to elasticsearch. I have created a new index, using following REST API:-
req = {
"settings": {
"analysis": {
"analyzer": {
"hinglish_analyzer": {
"type": "custom",
"tokenizer": "standard",
"char_filter": [
"html_strip"
],
"filter": [
"lowercase",
"asciifolding",
"hinglish-token-filter"
]
}
}
}
},
"mappings" : {
"p_ss__user" : {
"properties" : {
"age" : {
"type": "integer"
},
"first_name" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"gender" : {
"type" : "long"
},
"is_alive" : {
"type" : "boolean"
},
"last_name" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"marital_status" : {
"type" : "long"
},
"user_gotra" : {
"properties" : {
"Gotra" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"id" : {
"type" : "long"
},
"kuldevi" : {
"properties" : {
"Kuldevi" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"id" : {
"type" : "long"
}
}
}
}
},
"user_village" : {
"properties" : {
"areaOrVillageName" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"id" : {
"type" : "long"
},
"tehsil" : {
"properties" : {
"city" : {
"properties" : {
"cityName" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"id" : {
"type" : "long"
},
"state" : {
"properties" : {
"country" : {
"properties" : {
"countryCode" : {
"type" : "text"
},
"countryName" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
},
"id" : {
"type" : "long"
}
}
},
"id" : {
"type" : "long"
},
"stateCode" : {
"type" : "text"
},
"stateName" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
}
}
}
}
},
"id" : {
"type" : "long"
},
"tehsilName" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
}
}
},
"zipcode" : {
"type" : "text"
}
}
},
"username" : {
"type" : "text",
"analyzer": "hinglish_analyzer"
}
}
}
}
}
Here, 'hinglish-token-filter' is my custom token filter, which I have written and is perfectly fine.
Now, I have created a document in elasticsearch with the help of python script(here I pass my own value of _id variable in the request), which looks like given below :-
{
"username" : "Gopi_Chand",
"first_name" : "Gopi Chand",
"last_name" : "",
"gender" : 2,
"age" : 44,
"user_gotra" : {
"Gotra" : "Thanak",
"kuldevi" : {
"Kuldevi" : "Maa Jagdambaa",
"id" : 1
},
"id" : 1,
"kulrishi" : {
"Rishi" : "Parashar",
"id" : 1
}
},
"user_village" : {
"areaOrVillageName" : "Sanatra",
"tehsil" : {
"city" : {
"state" : {
"country" : {
"countryName" : "India",
"id" : 1,
"countryCode" : "IND"
},
"stateName" : "Rajasthan",
"id" : 1
},
"cityName" : "Barmer (Meru)",
"id" : 1
},
"tehsilName" : "Baitu",
"id" : 1
},
"id" : 1,
"zipcode" : ""
},
"marital_status" : 1,
"is_alive" : true
}
The document is successfully getting stored in the elasticsearch with the Id that I have passed, along with other values.
But the problem comes when I try to retrieve the document with the id, that I have set :-
http://localhost:9200/users/p_s_s__user/3222/
It gives me following response :-
{"_index":"users","_type":"p_s_s__user","_id":"3222","found":false}
But when I try following query :-
http://localhost:9200/users/_search?pretty=true
it shows me my document, as shown below :-
{
"took" : 13,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : 1.0,
"hits" : [
{
"_index" : "users",
"_type" : "p_ss__user",
"_id" : "3222",
"_score" : 1.0,
"_source" : {
"username" : "Gopi_Chand",
"first_name" : "Gopi Chand",
"last_name" : "",
"gender" : 2,
"age" : 44,
"user_gotra" : {
"Gotra" : "Thanak",
"kuldevi" : {
"Kuldevi" : "Maa Jagdambaa",
"id" : 1
},
"id" : 1,
"kulrishi" : {
"Rishi" : "Parashar",
"id" : 1
}
},
"user_village" : {
"areaOrVillageName" : "Sanatra",
"tehsil" : {
"city" : {
"state" : {
"country" : {
"countryName" : "India",
"id" : 1,
"countryCode" : "IND"
},
"stateName" : "Rajasthan",
"id" : 1
},
"cityName" : "Barmer (Meru)",
"id" : 1
},
"tehsilName" : "Baitu",
"id" : 1
},
"id" : 1,
"zipcode" : ""
},
"marital_status" : 1,
"is_alive" : true
}
}
]
}
}
Can you help me out with what I have done wrong? Moreover, other queries, such as "match" queries, are also not working.
Thanks in advance.

Related

Using Python to build a special data container

My English is poor.
I have a piece of json data with such a structure, and I want to use python to aggregate this data, as shown in the figure.
If the service.name is the same, then it needs to be archived, and the duplicate "url.path" needs to be removed
I don't know which data structure to use to store the data — a list? A dict?
Can anyone help, please? Thanks.
```
{
[
{
"_source" : {
"error" : {
"exception" : [
{
"handled" : false,
"type" : "Abp.UI.UserFriendlyException",
"message" : "未发现该用户的WeiXinUserRelation,粉丝编号447519"
}
]
},
"trace" : {
"id" : "a3e3796ca145b448829d0d0f96661e67"
},
"#timestamp" : "2021-06-21T06:57:52.603Z",
"service" : {
"name" : "Lonsid_AAA_Web_Host"
}, "url" : {
"path" : "/product/getAAA" }
}
},
{
"_source" : {
"error" : {
"exception" : [
{
"handled" : false,
"type" : "Abp.UI.UserFriendlyException",
"message" : "未发现该用户的WeiXinUserRelation,粉丝编号447519"
}
]
},
"trace" : {
"id" : "a3e3796ca145b448829d0d0f96661e67"
},
"#timestamp" : "2021-06-21T06:57:52.603Z",
"service" : {
"name" : "Lonsid_BBB_Web_Host"
}, "url" : {
"path" : "/product/getBBB" }
}
},
{
"_source" : {
"error" : {
"exception" : [
{
"handled" : false,
"type" : "Abp.UI.UserFriendlyException",
"message" : "未发现该用户的WeiXinUserRelation,粉丝编号447519"
}
]
},
"trace" : {
"id" : "a3e3796ca145b448829d0d0f96661e67"
},
"#timestamp" : "2021-06-21T06:57:52.603Z",
"service" : {
"name" : "Lonsid_AAA_Web_Host"
}, "url" : {
"path" : "/product/getAAA" }
}
} ] }
```
This should get you started. It builds a cache of all the known past names, and drops anything that is previously seen.
import pprint
import json

# Raw input: a JSON array of log-style records, each wrapping its payload
# in an Elasticsearch-like "_source" envelope.
json_data = """[
{
"_source" : {
"error" : {
"exception" : [
{
"handled" : false,
"type" : "Abp.UI.UserFriendlyException",
"message" : "未发现该用户的WeiXinUserRelation,粉丝编号447519"
}
]
},
"trace" : {
"id" : "a3e3796ca145b448829d0d0f96661e67"
},
"#timestamp" : "2021-06-21T06:57:52.603Z",
"service" : {
"name" : "Lonsid_AAA_Web_Host"
}, "url" : {
"path" : "/product/getAAA" }
}
},
{
"_source" : {
"error" : {
"exception" : [
{
"handled" : false,
"type" : "Abp.UI.UserFriendlyException",
"message" : "未发现该用户的WeiXinUserRelation,粉丝编号447519"
}
]
},
"trace" : {
"id" : "a3e3796ca145b448829d0d0f96661e67"
},
"#timestamp" : "2021-06-21T06:57:52.603Z",
"service" : {
"name" : "Lonsid_BBB_Web_Host"
}, "url" : {
"path" : "/product/getBBB" }
}
},
{
"_source" : {
"error" : {
"exception" : [
{
"handled" : false,
"type" : "Abp.UI.UserFriendlyException",
"message" : "未发现该用户的WeiXinUserRelation,粉丝编号447519"
}
]
},
"trace" : {
"id" : "a3e3796ca145b448829d0d0f96661e67"
},
"#timestamp" : "2021-06-21T06:57:52.603Z",
"service" : {
"name" : "Lonsid_AAA_Web_Host"
}, "url" : {
"path" : "/product/getAAA" }
}
}
]"""

# Decode the JSON text into a list of dicts.
data = json.loads(json_data)

# De-duplicate by service name: the first record seen for each
# service.name wins; later records with the same name are dropped.
cache = {}
for record in data:
    service_name = record["_source"]["service"]["name"]
    # setdefault only stores when the key is absent, preserving first-wins.
    cache.setdefault(service_name, record)

pprint.pprint(list(cache.values()))

How to access an element from a JSON object

How can I access each element of the JSON object using BeautifulSoup or regex?
For example, call timestamp and get the output 1561238146781:
{
"timestamp" : 1561238146781,
"context" : {
"external_urls" : {
"spotify" : "https://open.spotify.com/playlist/4rwZyPeww8O7ZPnQqHD2q"
},
"href" : "https://api.spotify.com/v1/playlists/4rwZyPqew8O7ZPnQqHD2q",
"type" : "playlist",
"uri" :
"spotify:user:3sui5bqpr8a0i2t:playlist:4rwZyPqewO7ZPnQqHD2q"
},
"progress_ms" : 115728,
"item" : {
"album" : {
"album_type" : "album",
"artists" : [ {
"external_urls" : {
"spotify" :
"https://open.spotify.com/artist/5eAWCfyUhZqwHtBdNk56l1"
},
"href" :
"https://api.spotify.com/v1/artists/5eAWCfyeZtHHtBdNk56l1",
"id" : "5eAWCfyUhZtHHtBdNk56l1",
"name" : "System Of A Down",
"type" : "artist",
"uri" : "spotify:artist:5eAWewyUhZtHHtBdNk56l1"
} ],
"available_markets" : [ ],
"external_urls" : {
"spotify" : "https://open.spotify.com/album/4DR0Gds7w2GJyQnFVa4jAB"
},
"href" : "https://api.spotify.com/v1/albums/4DR0ewwsdJyQnFVa4jAB",
"id" : "4DR0GWo7w2ewyQnFVa4jAB",
"images" : [ {
"height" : 640,
"url" :
"https://i.scdn.co/image/932e185b217ew6caasd837dbe30d54028de9cfc",
"width" : 615
}, {
"height" : 300,
"url" :
"https://i.scdn.co/image/30de1d4e1ew38sd89573893d8494fd6a66",
"width" : 288
}, {
"height" : 64,
"url" :
"https://i.scdn.co/image/1682cd0e8ew8bf87sdc4cd1e01ce24cd165b",
"width" : 62
} ],
"name" : "Toxicity",
"release_date" : "2001-01-01",
"release_date_precision" : "day",
"total_tracks" : 14,
"type" : "album",
"uri" : "spotify:album:4DR0GewdGJyQnFVa4jAB"
},
"artists" : [ {
"external_urls" : {
"spotify" : "https://open.spotify.com/artist/5eAWCsdUweHtBdNk56l1"
},
"href" : "https://api.spotify.com/v1/artists/5eAWCfewhdsHtBdNk56l1",
"id" : "5eAWCfyUhZtHHtBewk56l1",
"name" : "System Of A Down",
"type" : "artist",
"uri" : "spotify:artist:5eAWCfyUsdtHHtBdNk56l1"
} ],
"available_markets" : [ ],
"disc_number" : 1,
"duration_ms" : 235599,
"explicit" : false,
"external_ids" : {
"isrc" : "USSM10107264"
},
"external_urls" : {
"spotify" : "https://open.spotify.com/track/1twBtsdaZiy7HWPG025QGuP"
},
"href" : "https://api.spotify.com/v1/tracks/1twBt7aZiy7HWPG025QGuP",
"id" : "1twBt7aZiy7HWweG025QGuP",
"is_local" : false,
"name" : "Aerials",
"popularity" : 9,
"preview_url" : null,
"track_number" : 14,
"type" : "track",
"uri" : "spotify:track:1twBt7aZieWPG025QGuP"
},
"currently_playing_type" : "track",
"actions" : {
"disallows" : {
"resuming" : true
}
},
"is_playing" : true
}
Call "is_playing" and get true.
I've tried making the whole array a list (each element) and getting each element from the list, but I've realized that this is hardly useful.
Save your JSON data into a file named data.json, then execute this code:
# Load the saved response from data.json and print a single top-level field.
import json

with open('data.json') as handle:
    data = json.load(handle)

# Top-level keys of the decoded JSON are accessed directly on the dict.
print(data["is_playing"])
output
True

Tranquility server would not send data to druid

I'm using imply-2.2.3. Here is my tranquility server configuration:
{
"dataSources" : [
{
"spec" : {
"dataSchema" : {
"dataSource" : "tutorial-tranquility-server",
"parser" : {
"type" : "string",
"parseSpec" : {
"timestampSpec" : {
"column" : "timestamp",
"format" : "auto"
},
"dimensionsSpec" : {
"dimensions" : [],
"dimensionExclusions" : [
"timestamp",
"value"
]
},
"format" : "json"
}
},
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "hour",
"queryGranularity" : "none"
},
"metricsSpec" : [
{
"type" : "count",
"name" : "count"
},
{
"name" : "value_sum",
"type" : "doubleSum",
"fieldName" : "value"
},
{
"fieldName" : "value",
"name" : "value_min",
"type" : "doubleMin"
},
{
"type" : "doubleMax",
"name" : "value_max",
"fieldName" : "value"
}
]
},
"ioConfig" : {
"type" : "realtime"
},
"tuningConfig" : {
"type" : "realtime",
"maxRowsInMemory" : "50000",
"windowPeriod" : "PT10M"
}
},
"properties" : {
"task.partitions" : "1",
"task.replicants" : "1"
}
},
{
"spec": {
"dataSchema" : {
"dataSource" : "test",
"parser" : {
"type" : "string",
"parseSpec" : {
"timestampSpec" : {
"column" : "timestamp",
"format" : "auto"
},
"dimensionsSpec" : {
"dimensions" : [
"a"
]
},
"format" : "json"
}
},
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "hour",
"queryGranularity" : "none"
},
"metricsSpec" : [
{
"type" : "count",
"name" : "count"
},
{
"type": "doubleSum",
"name": "b",
"fieldName": "b"
}
]
},
"ioConfig" : {
"type" : "realtime"
},
"tuningConfig" : {
"type" : "realtime",
"maxRowsInMemory" : "50000",
"windowPeriod" : "P1Y"
}
},
"properties": {
"task.partitions" : "1",
"task.replicants" : "1"
}
}
],
"properties" : {
"zookeeper.connect" : "localhost",
"druid.discovery.curator.path" : "/druid/discovery",
"druid.selectors.indexing.serviceName" : "druid/overlord",
"http.port" : "8200",
"http.threads" : "40",
"serialization.format" : "smile",
"druidBeam.taskLocator": "overlord"
}
}
I have trouble sending data to the second datasource, test, specifically. I tried to send the data below to Druid with Python requests:
{'b': 7, 'timestamp': '2017-01-20T03:32:54.586415', 'a': 't'}
The response I receive:
b'{"result":{"received":1,"sent":0}}'
If you read my config file you will notice that I set window period to one year. I would like to send data in with a large time span to druid using tranquility server. Is there something wrong with my config or data?

Why does MongoDB return unfiltered results?

I'm doing a filtered query on a MongoDB collection, but the results don't match the filter criteria.
I'm using the following query:
db.url_repository.find({"p":0,"pr":0}).limit(10)
But my results are
{ "_id" : ObjectId("568742bb60d99e329a115b7a"), "pr" : 0, "p" : 1, "u" : "http://www.blabla", "type" : "regular", "ct" : ISODate("2016-01-02T05:23:39.586Z"), "dt" : ISODate("2016-08-20T14:59:48.759Z") }
{ "_id" : ObjectId("568742bb60d99e329a115b7b"), "pr" : 0, "p" : 1, "u" : "http://www.blabla", "type" : "regular", "ct" : ISODate("2016-01-02T05:23:39.586Z"), "dt" : ISODate("2016-08-20T15:00:00.392Z") }
{ "_id" : ObjectId("568742bb60d99e329a115b7c"), "pr" : 0, "p" : 1, "u" : "http://www.blabla", "type" : "regular", "ct" : ISODate("2016-01-02T05:23:39.586Z"), "dt" : ISODate("2016-08-20T15:00:12.537Z") }
{ "_id" : ObjectId("568742bb60d99e329a115b7d"), "pr" : 0, "p" : 1, "u" : "http://www.blabla/", "type" : "regular", "ct" : ISODate("2016-01-02T05:23:39.587Z"), "dt" : ISODate("2016-08-20T15:00:15.887Z") }
As you can see, the filter is not being obeyed: the results include documents with "p" : 1.
These results are returning on MongoDB shell but also pymongo (Python library) returns right results. I couldn't find what can be the reason for this.
Thank you very much for your support.
Edit: explain() results are also below
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "crawlerdb.url_repository",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"p" : {
"$eq" : 0
}
},
{
"pr" : {
"$eq" : 0
}
}
]
},
"winningPlan" : {
"stage" : "LIMIT",
"limitAmount" : 10,
"inputStage" : {
"stage" : "FETCH",
"filter" : {
"pr" : {
"$eq" : 0
}
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"p" : 1
},
"indexName" : "is_processed",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"p" : [
"[0.0, 0.0]"
]
}
}
}
},
"rejectedPlans" : [
{
"stage" : "LIMIT",
"limitAmount" : 10,
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"p" : 1,
"pr" : -1
},
"indexName" : "url_is_processed_priority",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"p" : [
"[0.0, 0.0]"
],
"pr" : [
"[0.0, 0.0]"
]
}
}
}
}
]
},
"serverInfo" : {
"host" : "barcelona",
"port" : 27017,
"version" : "3.2.7",
"gitVersion" : "4249c1d2b5999ebbf1fdf3bc0e0e3b3ff5c0aaf2"
},
"ok" : 1
}

How to use distinct with pipeline in mongodb using python

I have data like this:
{ "_id": "1234gbrghr",
"Device" : "samsung",
"UserId" : "12654",
"Month" : "july"
},
{ "_id": "1278gbrghr",
"Device" : "nokia",
"UserId" : "87654",
"Month" : "july"
},
{ "_id": "1234gbrghr",
"Device" : "samsung",
"UserId" : "12654",
"Month" : "july"
}
I need to get the number of distinct users for a particular device in the month of July. For example, if a user (UserId) used a Samsung device twice or more in July, it is counted as one for Samsung.
For this I used the query below to get the total number of users in July, but I need the number of distinct users instead:
# Aggregation pipeline: filter to July records, then count documents per device.
match_stage = {"$match": {"Month": "july"}}
group_stage = {"$group": {"_id": "$Device", "count": {"$sum": 1}}}
pipeline1 = [match_stage, group_stage]

# Run the aggregation against the 'collection' collection
# (db is the pymongo database handle defined elsewhere).
data = db.command('aggregate', 'collection', pipeline=pipeline1)
You will need to group on device and user instead first. You can do that with the following pipeline operator:
{'$group':{'_id' : { d: '$Device', u: '$UserId' } } }
And then secondly you need to count the number of distinct users per device (like you already had, but slightly modified):
{ '$group': { '_id' : '$_id.d', 'count': { '$sum' : 1 } } }
With the following dataset:
{ "_id" : "1234gbrghr", "Device" : "samsung", "UserId" : "12654", "Month" : "july" }
{ "_id" : "1278gbrghr", "Device" : "nokia", "UserId" : "87654", "Month" : "july" }
{ "_id" : "1239gbrghr", "Device" : "samsung", "UserId" : "12654", "Month" : "july" }
{ "_id" : "1238gbrghr", "Device" : "samsung", "UserId" : "12653", "Month" : "july" }
And the following aggregate command:
db.so.aggregate( [
{ '$match' : {'Month' : 'july' } },
{ '$group' : {
'_id' : { d: '$Device', u: '$UserId' },
'count' : { '$sum' : 1 }
} },
{ '$group': {
'_id' : '$_id.d',
'count': { '$sum' : 1 }
} }
] );
This outputs:
{
"result" : [
{
"_id" : "nokia",
"count" : 1
},
{
"_id" : "samsung",
"count" : 2
}
],
"ok" : 1
}

Categories

Resources