change JSON elasticsearch query to python - python

I have been using basic queries in python for elasticsearch like this:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
es = Elasticsearch()
def someView(request):
s = Search().query("regexp", title_en="someword.*")
response = s.execute()
I would like to combine a query to check if someword exists in either of the fields "title_en" or "text_en"
Any idea how to accomplish this?
In this link I saw an example of a bool query using JSON, but I don't understand how something similar could be done with Python code.
{
"query": {
"bool" : {
"must" : {
"term" : { "user" : "kimchy" }
},
"filter": {
"term" : { "tag" : "tech" }
},
"must_not" : {
"range" : {
"age" : { "gte" : 10, "lte" : 20 }
}
},
"should" : [
{ "term" : { "tag" : "wow" } },
{ "term" : { "tag" : "elasticsearch" } }
],
"minimum_should_match" : 1,
"boost" : 1.0
}
}
}

Figured out how to express OR in the query by combining Q objects (imported with `from elasticsearch_dsl import Q`) using the | operator:
q = Q("regexp", text_en='someword.*') | Q("regexp", title_en='someword.*')
c = Search().query(q)
response = c.execute()

Related

Rank records on the basis of a field value in Elasticsearch

I have a field distribution in the record schema that looks like this:
...
"distribution": {
"properties": {
"availability": {
"type": "keyword"
}
}
}
...
I want to rank the records with distribution.availability == "ondemand" lower than other records.
I looked in the Elasticsearch docs but can't find a way to reduce the scores of this type of record at index time so that they appear lower in search results.
How can I achieve this? Any pointers to related sources would be enough as well.
More Info:
I was completely omitting these ondemand records with help of python client in query-time like this:
from elasticsearch_dsl.query import Q
_query = Q("query_string", query=query_string) & ~Q('match', **{'availability.keyword': 'ondemand'})
Now, I want to include these records but I want to place them lower than other records.
If it is not possible to implement something like this in index-time, please suggest how can I achieve this in query-time with python client.
After applying the suggestion from llermaly, the python client query looks like this:
boosting_query = Q(
"boosting",
positive=Q("match_all"),
negative=Q(
"bool", filter=[Q({"term": {"distribution.availability.keyword": "ondemand"}})]
),
negative_boost=0.5,
)
if query_string:
_query = Q("query_string", query=query_string) & boosting_query
else:
_query = Q() & boosting_query
EDIT2 : elasticsearch-dsl-py version of boosting query
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from elasticsearch_dsl import Q
client = Elasticsearch()
q = Q('boosting', positive=Q("match_all"), negative=Q('bool', filter=[Q({"term": {"test.available.keyword": "ondemand"}})]), negative_boost=0.5)
s = Search(using=client, index="test_parths007").query(q)
response = s.execute()
print(response)
for hit in response:
print(hit.meta.score, hit.test.available)
EDIT : Just read you need to do it on index time.
Elasticsearch deprecated index time boosting on 5.0
https://www.elastic.co/guide/en/elasticsearch/reference/7.11/mapping-boost.html
You can use a Boosting query to achieve that on query time.
Ingest Documents
POST test_parths007/_doc
{
"name": "doc1",
"test": {
"available": "ondemand"
}
}
POST test_parths007/_doc
{
"name": "doc1",
"test": {
"available": "higherscore"
}
}
POST test_parths007/_doc
{
"name": "doc2",
"test": {
"available": "higherscore"
}
}
Query (query time)
POST test_parths007/_search
{
"query": {
"boosting": {
"positive": {
"match_all": {}
},
"negative": {
"term": {
"test.available.keyword": "ondemand"
}
},
"negative_boost": 0.5
}
}
}
Response
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "test_parths007",
"_type" : "_doc",
"_id" : "VMdY7XcB50NMsuQPelRx",
"_score" : 1.0,
"_source" : {
"name" : "doc2",
"test" : {
"available" : "higherscore"
}
}
},
{
"_index" : "test_parths007",
"_type" : "_doc",
"_id" : "Vcda7XcB50NMsuQPiVRB",
"_score" : 1.0,
"_source" : {
"name" : "doc1",
"test" : {
"available" : "higherscore"
}
}
},
{
"_index" : "test_parths007",
"_type" : "_doc",
"_id" : "U8dY7XcB50NMsuQPdlTo",
"_score" : 0.5,
"_source" : {
"name" : "doc1",
"test" : {
"available" : "ondemand"
}
}
}
]
}
}
For more advanced manipulation you can check the Function Score Query

Basic request to mongodb with pymongo

I need to get all objects inside "posts" that have "published: true"
with pymongo. I've already tried many variants, but this is all I have managed to do:
for elt in db[collection].find({}, {"posts"}):
print(elt)
And it'll show all "posts". I've tried something like this:
for elt in db[collection].find({}, {"posts", {"published": {"$eq": True}}}):
print(elt)
But it doesn't work. Help, I'm trying for 3 days already =\
What you want to be doing is to use the aggregate $filter like so:
db[collection].aggregate([
{
"$match": { // only fetch documents with such posts
"posts.published": {"$eq": True}
}
},
{
"$project": {
"posts": {
"$filter": {
"input": "$posts",
"as": "post",
"cond": {"$eq": ["$$post.published", True]}
}
}
}
}
])
Note that the current structure returned will be:
[
{posts: [post1, post2]},
{posts: [post3, post4]}
]
If you want to retrieve it as a list of posts you'll need to add an $unwind stage to flatten the array.
The query options are quite limited you can do it with $elemMatch (projection) or with the $ operator but both of these return only the first post that matches the condition which is not what you want.
------- EDIT --------
Realizing posts is actually an object and not an array, you'll have to turn the object to an array, iterate over to filter and then restore the structure like so:
db.collection.aggregate([
{
$project: {
"posts": {
"$arrayToObject": {
$filter: {
input: {
"$objectToArray": "$posts"
},
as: "post",
cond: {
$eq: [
"$$post.v.published",
true
]
}
}
}
}
}
}
])
Mongo Playground
I assumed that your document looks like this:
{
"_id" : ObjectId("5f8570f8afdefd2cfe7473a7"),
"posts" : {
"a" : {
"p" : false,
"name" : "abhishek"
},
"k" : {
"p" : true,
"name" : "jack"
},
"c" : {
"p" : true,
"name" : "abhinav"
}
}}
You can try the following query but the result format will be a bit different, adding that for clarification,
db.getCollection('temp2').aggregate([
{
$project: {
subPost: { $objectToArray: "$posts" }
}
},
{
'$unwind' : '$subPost'
},
{
'$match' : {'subPost.v.p':true}
},
{
'$group': {_id:'$_id', subPosts: { $push: { subPost: "$subPost"} }}
}
])
result format,
{
"_id" : ObjectId("5f8570f8afdefd2cfe7473a7"),
"subPosts" : [
{
"subPost" : {
"k" : "k",
"v" : {
"p" : true,
"name" : "jack"
}
}
},
{
"subPost" : {
"k" : "c",
"v" : {
"p" : true,
"name" : "abhinav"
}
}
}
]
}

Python - After i insert geo data into elastic search, how do I search the data?

I am using python-elasticsearch to insert geo data into the engine as below, but what function or method can I use to search for my data? Can you give an example, please?
mappings = {
"doc": {
"properties": {
"geo": {
"properties": {
"location": {"type": "geo_point"}
}
}
}
}
}
es.indices.create(index='geodata', body=mappings)
# ...
es_entries['geo'] = {'location':str(data['_longitude_'])+","+str(data['_latitude_'])}
# ...
es.index(index="geodata", doc_type="doc", body=es_entries)
You can use the Geo Distance Query:
{
"query": {
"bool" : {
"must" : {
"match_all" : {}
},
"filter" : {
"geo_distance" : {
"distance" : "10km",
"geo.location" : {
"lat" : 10,
"lon" : -10
}
}
}
}
}
}
You can use both es.search and elasticsearch.helpers.scan, for example:
res = es.search(index='geodata', body= { ... }) # put the above dictionary in the `body`

Query DSL not working in pyes search

I am trying to use a custom query DSL to get results using the pyes library. I have query DSL that works when I use the command line
curl -XGET localhost:9200/test_index/_search -d '{
"query": {
"function_score": {
"query": {
"match_all": {}
},
"field_value_factor": {
"field": "starred",
"modifier": "none",
"factor": 2
}
}
},
"aggs" : {
"types" : {
"filters" : {
"filters" : {
"category1" : { "type" : { "value" : "category1"}},
"category2" : { "type" : { "value" : "category2"}},
"category3" : { "type" : { "value" : "category3"}},
"category4": { "type" : { "value" : "category4"}},
"category5" : { "type" : { "value" : "category5"}}
}
},
"aggs": {
"topFoundHits": {
"top_hits": {
"size": 5
}
}
}
}
}
}'
The idea here is to search across many categorized documents for all documents matching a particular string query. Then using aggregations I want to find the top five resulting documents by category. Starred items are boosted so that they show up above other search results.
This works great when I enter the command as listed above directly in terminal but it doesn't work when I try to put it in pyes. I'm not sure what the best way is to do it. The documentation for the pyes library is really confusing for me to translate this totally into pyes objects.
I'm trying to do the following:
query_dsl = self.get_text_index_query_dsl()
resulting_docs = conn.search(query=query_dsl)
(where self.get_text_index_query_dsl returns the query DSL dict above)
Searching as is gives me a:
ElasticSearchException: QueryParsingException[[test_index] No query registered for [query]]; }]
If I remove the parent "query" mapping and try:
query_dsl = {
"function_score": {
"query": {
"match_all": {}
},
"field_value_factor": {
"field": "starred",
"modifier": "none",
"factor": 2
}
},
"aggs" : {
"types" : {
"filters" : {
"filters" : {
"category1" : { "type" : { "value" : "category1"}},
"category2" : { "type" : { "value" : "category2"}},
"category3" : { "type" : { "value" : "category3"}},
"category4": { "type" : { "value" : "category4"}},
"category5" : { "type" : { "value" : "category5"}}
}
},
"aggs": {
"topFoundHits": {
"top_hits": {
"size": 5
}
}
}
}
}
}
This also errors out with: ElasticSearchException: ElasticsearchParseException[Expected field name but got START_OBJECT "aggs"]; }]
These errors in addition to the fact that pyes doesn't seem to have a 'topFoundHits' functionality yet (I think) are holding me up.
Any ideas why this is happening and how to fix it?
Thank you so much!
I got this working by using the elasticsearch-dsl library, which lets you use your regular query DSL JSON syntax: http://elasticsearch-dsl.readthedocs.org/en/latest/.

In elasticsearch-py, how different should the search command be if I used custom tokenization during indexing?

I am using elasticsearch-py to index tweets (originally in JSON format). In order to preserve special characters like hashtags, user targets and emoticons, I specified a special mapping while creating the index. This is what it looks like:
from elasticsearch import Elasticsearch
import sys,json
es = Elasticsearch()
es.indices.create(
index='ecommercetweets',
body={
"settings" : {
"index" : {
"number_of_shards" : 1,
"number_of_replicas" : 1
},
"analysis" : {
"filter" : {
"tweet_filter" : {
"type" : "word_delimiter",
"type_table": ["# => ALPHA", "# => ALPHA", ":) => ALPHA", ":( => ALPHA"]
}
},
"analyzer" : {
"tweet_analyzer" : {
"type" : "custom",
"tokenizer" : "whitespace",
"filter" : ["lowercase", "tweet_filter"]
}
}
}
},
"mappings" : {
"tweet" : {
"properties" : {
"text" : {
"analyzer" : "tweet_analyzer"
}
}
}
}
},
ignore=400
)
fin = open(sys.argv[1],"r")
count = 0
for line in fin:
jsonLine = json.loads(line)
doc = {
'tweetId' : jsonLine["id"],
'text' : jsonLine["text"],
'userId' : jsonLine["user"]["id"],
'favorite_count' : jsonLine["favorite_count"],
'retweet_count' :jsonLine["retweet_count"],
'language': jsonLine["lang"],
'dateTime':jsonLine["created_at"],
'location':jsonLine["place"]
}
es.index(index='ecommercetweets', doc_type='tweet', id=count, body=doc)
count+=1
I am searching using this command:
results1 = es.search(index='ecommercetweets',q="text:delivery")
results2 = es.search(index='ecommercetweets',q="text:#delivery")
Both return the same number of hits, although I am pretty sure this should not be the case for the data I am using.
Am I going wrong with the search command?
One way you can deal with it is to use a term query (or term filter). This should do it:
es.search(index='ecommercetweets',body={
"query": {
"term": {
"text": {
"value": "#delivery"
}
}
}
})
Here is some code I used to play around with it:
http://sense.qbox.io/gist/fe61f0cd92b465276b261100cbe7f4778002a96d

Categories

Resources