I need to parse requests to a single URL that come in as JSON, but in several different formats. For example, some have the timestamp noted as a timestamp attribute, others as unixtime, etc. So I want to create JSON schemas for all the request types that not only validate the incoming JSON but also extract its parameters from the specified places. Is there a library that can do that?
Example:
If I could define a schema that would look something like this
schema = {
    "type" : "object",
    "properties" : {
        "price" : {
            "type" : "number",
            "mapped_name": "product_price"
        },
        "name" : {
            "type" : "string",
            "mapped_name": "product_name"
        },
        "added_at" : {
            "type" : "int",
            "mapped_name": "timestamp"
        },
    },
}
and then apply it to a dict
request = {
    "name" : "Eggs",
    "price" : 34.99,
    "added_at" : 1234567
}
by some magical function
params = validate_and_extract(request, schema)
I want params to hold the mapped values:
{"product_name": "Eggs", "product_price": 34.99, "timestamp": 1234567}
so this is the module I'm looking for. It should also support nested dicts in the request, not just flat dicts.
The following code may help. It supports nested dicts as well.
def valid_type(type_name, obj):
    if type_name == "number":
        return isinstance(obj, (int, float))
    if type_name == "int":
        return isinstance(obj, int)
    if type_name == "float":
        return isinstance(obj, float)
    if type_name == "string":
        return isinstance(obj, str)
    return False  # unknown type names fail validation

def validate_and_extract(request, schema):
    ''' Validate request (dict) against the schema (dict).
    Validation is limited to naming and type information.
    No check is done to ensure all elements in the schema
    are present in the request. This could be enhanced by
    specifying mandatory/optional/conditional information
    within the schema and subsequently checking for that.
    '''
    out = {}
    for k, v in request.items():
        if k not in schema['properties']:
            print("Key '{}' not in schema ... skipping.".format(k))
            continue
        if schema['properties'][k]['type'] == 'object':
            v = validate_and_extract(v, schema['properties'][k])
        elif not valid_type(schema['properties'][k]['type'], v):
            print("Wrong type for '{}' ... skipping.".format(k))
            continue
        out[schema['properties'][k]['mapped_name']] = v
    return out
# Sample Data 1
schema1 = {
    "type" : "object",
    "properties" : {
        "price" : {
            "type" : "number",
            "mapped_name": "product_price"
        },
        "name" : {
            "type" : "string",
            "mapped_name": "product_name"
        },
        "added_at" : {
            "type" : "int",
            "mapped_name": "timestamp"
        },
    },
}

request1 = {
    "name" : "Eggs",
    "price" : 34.99,
    "added_at" : 1234567
}

# Sample Data 2: containing nested dict
schema2 = {
    "type" : "object",
    "properties" : {
        "price" : {
            "type" : "number",
            "mapped_name": "product_price"
        },
        "name" : {
            "type" : "string",
            "mapped_name": "product_name"
        },
        "added_at" : {
            "type" : "int",
            "mapped_name": "timestamp"
        },
        "discount" : {
            "type" : "object",
            "mapped_name": "offer",
            "properties" : {
                "percent" : {
                    "type" : "int",
                    "mapped_name": "percentage"
                },
                "last_date" : {
                    "type" : "string",
                    "mapped_name": "end_date"
                },
            }
        },
    },
}

request2 = {
    "name" : "Eggs",
    "price" : 34.99,
    "added_at" : 1234567,
    "discount" : {
        "percent" : 40,
        "last_date" : "2016-09-25"
    }
}
params = validate_and_extract(request1, schema1)
print(params)
params = validate_and_extract(request2, schema2)
print(params)
Output from running this:
{'timestamp': 1234567, 'product_name': 'Eggs', 'product_price': 34.99}
{'offer': {'percentage': 40, 'end_date': '2016-09-25'}, 'timestamp': 1234567, 'product_name': 'Eggs', 'product_price': 34.99}
See http://json-schema.org
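Note: the jsonschema package (pip install jsonschema) covers the validation half of this out of the box, but it has no notion of "mapped_name", so the extraction step has to be layered on top. A minimal, flat-dict sketch, assuming the name mapping is kept next to (not inside) the schema; validate_and_map is a hypothetical helper, not a library function:

from jsonschema import validate  # raises jsonschema.ValidationError on mismatch

schema = {
    "type": "object",
    "properties": {
        "price": {"type": "number"},
        "name": {"type": "string"},
        "added_at": {"type": "integer"},
    },
}
mapping = {"price": "product_price", "name": "product_name", "added_at": "timestamp"}

def validate_and_map(request, schema, mapping):
    # Validate first, then rename keys according to the separate mapping
    validate(instance=request, schema=schema)
    return {mapping.get(k, k): v for k, v in request.items()}

print(validate_and_map({"name": "Eggs", "price": 34.99, "added_at": 1234567}, schema, mapping))
# {'product_name': 'Eggs', 'product_price': 34.99, 'timestamp': 1234567}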
This doesn't look like a Python question.
I have a table in OpenSearch in which the type of every field is "text".
The query (q1) I am running in OpenSearch looks like this. I am not getting any output from it, but when I run query q2 I do get output.
q1 = {
    "size": 10,
    "query": {
        "bool": {
            "must": [
                {"multi_match": {"query": "cen", "fields": ["name", "alias"], "fuzziness": "AUTO"}}
            ],
            "filter": [
                {"match_phrase": {"category": "Specialty"}},
                {"match_phrase": {"prov_type": "A"}},
                {"match_phrase": {"prov_type": "C"}}
            ]
        }
    }
}

q2 = {
    "size": 10,
    "query": {
        "bool": {
            "must": [
                {"multi_match": {"query": "cen", "fields": ["name", "alias"], "fuzziness": "AUTO"}}
            ],
            "filter": [
                {"match_phrase": {"category": "Specialty"}},
                {"match_phrase": {"prov_type": "A"}}
            ]
        }
    }
}
Now I want to apply multiple filter values on prov_type. I have also tried a terms query with prov_type as a list like ['A', 'B'].
Can anyone please explain how to apply multiple filter values for a single column in OpenSearch/Elasticsearch? The datatype of every field is text.
I have already tried this: How to filter with multiple fields and values in elasticsearch?
Mapping for the index
GET index/_mapping
{
  "spec_proc_comb_exp" : {
    "mappings" : {
      "properties" : {
        "alias" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "category" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "name" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "prov_type" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "specialty_code" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        }
      }
    }
  }
}
Please let me know in case you need any more information.
You can use a should query to filter your data with an OR condition.
Should: The clause (query) should appear in the matching document.
GET test_allergy/_search
{
  "query": {
    "bool": {
      "should": [
        {
          "term": {
            "prov_type": "A"
          }
        },
        {
          "term": {
            "prov_type": "C"
          }
        }
      ],
      "minimum_should_match": 1
    }
  }
}
Note: You can set minimum_should_match as a number or percentage.
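A further, untested sketch: since the mapping above gives every field a .keyword subfield, a terms query against prov_type.keyword should act as the same OR in one clause. (A term query on the analyzed prov_type text field can miss, likely because the standard analyzer lowercases tokens at index time, so "A" is stored as "a".) In the same Python-dict style as q1 and q2:

# Sketch, not verified against this index: terms is an OR over the listed
# values, and .keyword avoids the lowercasing applied to the text field.
q3 = {
    "size": 10,
    "query": {
        "bool": {
            "must": [
                {"multi_match": {"query": "cen", "fields": ["name", "alias"], "fuzziness": "AUTO"}}
            ],
            "filter": [
                {"match_phrase": {"category": "Specialty"}},
                {"terms": {"prov_type.keyword": ["A", "C"]}}
            ]
        }
    }
}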
I have a JSON string in the following format that I get from an API, and I need to reformat it so that I can check the difference between two lookups (i.e. which settings differ between the modules):
{ "modules": [
{
"name": "A1",
"bar" : "AA",
"settings" :[
{
"name" : "set1",
"value" : "1"
},
{
"name" : "set2",
"value" : "2"
}
]
},
{
"name": "A2",
"bar" : "DD",
"settings" :[
{
"name" : "set1",
"value" : "A21"
}
]
},
{
"name": "A1",
"settings" :[
{
"name" : "set3",
"value" : "1"
}
]
}
]
}
and I need to get it into a dictionary of the format

'A1' : {
    'bar' : 'AA',
    'settings': {
        'set1' : '1',
        'set2' : '2',
        'set3' : '1'
    }
}....
is there any nicer, easier way to do this than the following, assuming I have read the string from above into a dictionary json_dict?

modules_a = {module['name']: {'bar': module.get('bar'), 'settings': {}} for module in json_dict['modules']}
for module in json_dict['modules']:
    modules_a[module['name']]['settings'].update({s['name']: s['value'] for s in module['settings']})
You have some errors in the input; you missed a comma after "bar". Here is a more readable version:
# First, merge together the modules with the same names
concatenated_json = {'modules': []}
reference_dict = dict()
for module in json_dict["modules"]:
    # Check whether this module name has already been mentioned
    if reference_dict.get(module["name"]) is None:
        # Module has not been mentioned yet; add it and note its reference
        concatenated_json['modules'].append(module)
        reference_dict[module["name"]] = module
    else:
        # Append to the settings of a previously mentioned module
        reference_dict[module["name"]]["settings"] += module["settings"]

# Format the dict in the required way
modules_a = {
    module["name"]: {
        "bar": module["bar"],
        "settings": {
            setting["name"]: setting["value"] for setting in module["settings"]
        }
    }
    for module in concatenated_json["modules"]
}
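For the sample input above, this should produce (derived by hand from the code, not executed):

print(modules_a)
# {'A1': {'bar': 'AA', 'settings': {'set1': '1', 'set2': '2', 'set3': '1'}},
#  'A2': {'bar': 'DD', 'settings': {'set1': 'A21'}}}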
Here's a way to do it, although I'm not sure what you meant about "resorting".
# Preallocate result.
modules_a = {module['name']: {'settings': []} for module in json_dict['modules']}
for module in json_dict['modules']:
    obj = modules_a[module['name']]
    obj.update({k: v for k, v in module.items() if k != 'settings'})
    # Accumulate 'settings' in a list.
    obj['settings'].extend([{setting['name']: setting['value']}
                            for setting in module['settings']])
import json
print(json.dumps(modules_a, indent=4))
Result:
{
    "A1": {
        "settings": [
            {
                "set1": "1"
            },
            {
                "set2": "2"
            },
            {
                "set3": "1"
            }
        ],
        "bar": "AA",
        "name": "A1"
    },
    "A2": {
        "settings": [
            {
                "set1": "A21"
            }
        ],
        "bar": "DD",
        "name": "A2"
    }
}
I have a field distribution in a record schema that looks like this:
...
"distribution": {
"properties": {
"availability": {
"type": "keyword"
}
}
}
...
I want to rank the records with distribution.availability == "ondemand" lower than other records.
I looked in the Elasticsearch docs but can't find a way to reduce the scores of these records at index time so that they appear lower in search results.
How can I achieve this? Any pointers to related sources would be enough as well.
More Info:
I was completely omitting these ondemand records with the help of the Python client at query time, like this:
from elasticsearch_dsl.query import Q
_query = Q("query_string", query=query_string) & ~Q('match', **{'availability.keyword': 'ondemand'})
Now, I want to include these records but I want to place them lower than other records.
If it is not possible to implement something like this at index time, please suggest how I can achieve it at query time with the Python client.
After applying the suggestion from llermaly, the Python client query looks like this:
boosting_query = Q(
    "boosting",
    positive=Q("match_all"),
    negative=Q(
        "bool", filter=[Q({"term": {"distribution.availability.keyword": "ondemand"}})]
    ),
    negative_boost=0.5,
)

if query_string:
    _query = Q("query_string", query=query_string) & boosting_query
else:
    _query = Q() & boosting_query
EDIT2: elasticsearch-dsl-py version of the boosting query
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from elasticsearch_dsl import Q

client = Elasticsearch()
q = Q('boosting', positive=Q("match_all"), negative=Q('bool', filter=[Q({"term": {"test.available.keyword": "ondemand"}})]), negative_boost=0.5)
s = Search(using=client, index="test_parths007").query(q)
response = s.execute()
print(response)
for hit in response:
    print(hit.meta.score, hit.test.available)
EDIT: Just read that you need to do it at index time.
Elasticsearch deprecated index-time boosting in 5.0:
https://www.elastic.co/guide/en/elasticsearch/reference/7.11/mapping-boost.html
You can use a Boosting query to achieve that at query time.
Ingest Documents
POST test_parths007/_doc
{
  "name": "doc1",
  "test": {
    "available": "ondemand"
  }
}

POST test_parths007/_doc
{
  "name": "doc1",
  "test": {
    "available": "higherscore"
  }
}

POST test_parths007/_doc
{
  "name": "doc2",
  "test": {
    "available": "higherscore"
  }
}
Query (query time)
POST test_parths007/_search
{
  "query": {
    "boosting": {
      "positive": {
        "match_all": {}
      },
      "negative": {
        "term": {
          "test.available.keyword": "ondemand"
        }
      },
      "negative_boost": 0.5
    }
  }
}
Response
{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 3,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "test_parths007",
        "_type" : "_doc",
        "_id" : "VMdY7XcB50NMsuQPelRx",
        "_score" : 1.0,
        "_source" : {
          "name" : "doc2",
          "test" : {
            "available" : "higherscore"
          }
        }
      },
      {
        "_index" : "test_parths007",
        "_type" : "_doc",
        "_id" : "Vcda7XcB50NMsuQPiVRB",
        "_score" : 1.0,
        "_source" : {
          "name" : "doc1",
          "test" : {
            "available" : "higherscore"
          }
        }
      },
      {
        "_index" : "test_parths007",
        "_type" : "_doc",
        "_id" : "U8dY7XcB50NMsuQPdlTo",
        "_score" : 0.5,
        "_source" : {
          "name" : "doc1",
          "test" : {
            "available" : "ondemand"
          }
        }
      }
    ]
  }
}
For more advanced manipulation you can check the Function Score Query
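As a rough, untested sketch (reusing the test.available.keyword field from the example above), the same demotion could be expressed with function_score, where a weight below 1 multiplies the score of matching documents:

# Untested sketch of the same demotion via function_score; field name
# reused from the boosting example above, adjust to your mapping.
function_score_body = {
    "query": {
        "function_score": {
            "query": {"match_all": {}},
            "functions": [
                {
                    "filter": {"term": {"test.available.keyword": "ondemand"}},
                    "weight": 0.5
                }
            ],
            "boost_mode": "multiply"
        }
    }
}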
I'm using imply-2.2.3. Here is my tranquility server configuration:
{
  "dataSources" : [
    {
      "spec" : {
        "dataSchema" : {
          "dataSource" : "tutorial-tranquility-server",
          "parser" : {
            "type" : "string",
            "parseSpec" : {
              "timestampSpec" : {
                "column" : "timestamp",
                "format" : "auto"
              },
              "dimensionsSpec" : {
                "dimensions" : [],
                "dimensionExclusions" : [
                  "timestamp",
                  "value"
                ]
              },
              "format" : "json"
            }
          },
          "granularitySpec" : {
            "type" : "uniform",
            "segmentGranularity" : "hour",
            "queryGranularity" : "none"
          },
          "metricsSpec" : [
            {
              "type" : "count",
              "name" : "count"
            },
            {
              "name" : "value_sum",
              "type" : "doubleSum",
              "fieldName" : "value"
            },
            {
              "fieldName" : "value",
              "name" : "value_min",
              "type" : "doubleMin"
            },
            {
              "type" : "doubleMax",
              "name" : "value_max",
              "fieldName" : "value"
            }
          ]
        },
        "ioConfig" : {
          "type" : "realtime"
        },
        "tuningConfig" : {
          "type" : "realtime",
          "maxRowsInMemory" : "50000",
          "windowPeriod" : "PT10M"
        }
      },
      "properties" : {
        "task.partitions" : "1",
        "task.replicants" : "1"
      }
    },
    {
      "spec": {
        "dataSchema" : {
          "dataSource" : "test",
          "parser" : {
            "type" : "string",
            "parseSpec" : {
              "timestampSpec" : {
                "column" : "timestamp",
                "format" : "auto"
              },
              "dimensionsSpec" : {
                "dimensions" : [
                  "a"
                ]
              },
              "format" : "json"
            }
          },
          "granularitySpec" : {
            "type" : "uniform",
            "segmentGranularity" : "hour",
            "queryGranularity" : "none"
          },
          "metricsSpec" : [
            {
              "type" : "count",
              "name" : "count"
            },
            {
              "type": "doubleSum",
              "name": "b",
              "fieldName": "b"
            }
          ]
        },
        "ioConfig" : {
          "type" : "realtime"
        },
        "tuningConfig" : {
          "type" : "realtime",
          "maxRowsInMemory" : "50000",
          "windowPeriod" : "P1Y"
        }
      },
      "properties": {
        "task.partitions" : "1",
        "task.replicants" : "1"
      }
    }
  ],
  "properties" : {
    "zookeeper.connect" : "localhost",
    "druid.discovery.curator.path" : "/druid/discovery",
    "druid.selectors.indexing.serviceName" : "druid/overlord",
    "http.port" : "8200",
    "http.threads" : "40",
    "serialization.format" : "smile",
    "druidBeam.taskLocator": "overlord"
  }
}
I have trouble sending data to the second datasource, test, specifically. I tried to send the data below to Druid with Python requests:
{'b': 7, 'timestamp': '2017-01-20T03:32:54.586415', 'a': 't'}
The response I receive:
b'{"result":{"received":1,"sent":0}}'
If you read my config file, you will notice that I set the window period to one year. I would like to send data with a large time span to Druid using the tranquility server. Is there something wrong with my config or data?
This is my dict structure; I cannot iterate over the entire dict to find every ObjectId value.
My input data:
{
    "_id" : ObjectId("5671947d29c23846797d836a"),
    "event_version" : "1.0",
    "event_time" : ISODate("2015-12-16T16:42:37.501Z"),
    "event_name" : "Create_Assignment",
    "user_id" : "admin",
    "tenant" : "Demo_Tenant",
    "sourceIPAddress" : "",
    "user_agent" : "",
    "request_parameters" : {
        "username" : "admin",
        "status" : "active",
        "first_name" : "",
        "last_name" : "",
        "is_deleted" : false,
        "updated_by" : "admin",
        "roles" : [
            {
                "_ref" : {
                    "$ref" : "role",
                    "$id" : ObjectId("5671947d29c23846797d8362")
                },
                "_cls" : "Role"
            },
            {
                "_ref" : {
                    "$ref" : "role",
                    "$id" : ObjectId("5671947d29c23846797d8366")
                },
                "_cls" : "Role"
            }
        ]
    }
}
I have tried:
def todict(self, data, obj=None):
    for key in data:
        if isinstance(data[key], ObjectId):
            print '>>>>>>>>>>', data[key]
            obj[key] = str(data[key])
        else:
            if not isinstance(data[key], (str, unicode, list, datetime, bool)):
                self.todict(data[key], obj)
            else:
                obj[key] = data[key]
    return obj
But it doesn't work properly. I need a recursive function to convert all ObjectId values into str.
Expected JSON:
{
    "_id" : "5671947d29c23846797d836a",
    "event_version" : "1.0",
    "event_time" : ISODate("2015-12-16T16:42:37.501Z"),
    "event_name" : "Create_Assignment",
    "user_id" : "admin",
    "tenant" : "Demo_Tenant",
    "sourceIPAddress" : "",
    "user_agent" : "",
    "request_parameters" : {
        "username" : "admin",
        "status" : "active",
        "first_name" : "",
        "last_name" : "",
        "is_deleted" : false,
        "updated_by" : "admin",
        "roles" : [
            {
                "_ref" : {
                    "$ref" : "role",
                    "$id" : "5671947d29c23846797d8362"
                },
                "_cls" : "Role"
            },
            {
                "_ref" : {
                    "$ref" : "role",
                    "$id" : "5671947d29c23846797d8366"
                },
                "_cls" : "Role"
            }
        ]
    }
}
Never too late, hope this helps someone...
from bson import ObjectId
# convert recursively all ObjectIds to strings in a dictionary
def obj_to_str(data):
    if isinstance(data, dict):
        return {obj_to_str(key): obj_to_str(value) for key, value in data.items()}
    elif isinstance(data, list):
        return [obj_to_str(element) for element in data]
    elif isinstance(data, ObjectId):
        return str(data)
    else:
        return data
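A quick usage sketch (document abbreviated from the question above; not executed):

import json
from bson import ObjectId

doc = {
    "_id": ObjectId("5671947d29c23846797d836a"),
    "request_parameters": {
        "roles": [{"_ref": {"$ref": "role", "$id": ObjectId("5671947d29c23846797d8362")}}]
    }
}

clean = obj_to_str(doc)
# Every ObjectId is now a plain string, so the result is JSON-serializable
print(json.dumps(clean, indent=4))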
You can try this:
...
from bson import ObjectId
...

def todict(self, data, obj=None):
    for key in data:
        if isinstance(data[key], ObjectId):
            obj[key] = str(data[key])
... ...