python join sub-dictionaries - python

I'm trying to join sub-Dicts in python so that a valid json is composed
what I have is:
{
'ctx/language': 'en',
'ctx/territory': 'DE',
'composer_name': 'openEHR2study',
'Allergies': {
'adverse_reaction-allergy': [{
'reaction_event_summary': {
'clinical_impact': [{
'|code': 'at0035'
}
]
}
}, {
'recorded': ['2020-05-14T00:00:00.000Z']
}, {
'reaction_event_summary': {
'certainty': [{
'|code': 'at0024'
}
]
}
}, {
'substance_agent': ['s']
}, {
'reaction_reported': ['true']
}, {
'comment': ['c']
}
]
}
}
What I would like is a join over "reaction_event_summary" like this:
{
'ctx/language': 'en',
'ctx/territory': 'DE',
'composer_name': 'openEHR2study',
'Allergies': {
'adverse_reaction-allergy': [{
'reaction_event_summary': {
'clinical_impact': [{
'|code': 'at0035'
}
],
'certainty': [{
'|code': 'at0024'
}
]
}
}, {
'recorded': ['2020-05-14T00:00:00.000Z']
}, {
'substance_agent': ['s']
}, {
'reaction_reported': ['true']
}, {
'comment': ['c']
}
]
}
}
I have no Idea how I should loop through the json/list and dicts to get this done.

Here is a rough attempt; please check whether it works for you. It walks the original list and builds the merged result in a deep copy of the dictionary.
import copy
import json

val = {
    'ctx/language': 'en',
    'ctx/territory': 'DE',
    'composer_name': 'openEHR2study',
    'Allergies': {
        'adverse_reaction-allergy': [
            {'reaction_event_summary': {'clinical_impact': [{'|code': 'at0035'}]}},
            {'recorded': ['2020-05-14T00:00:00.000Z']},
            {'reaction_event_summary': {'certainty': [{'|code': 'at0024'}]}},
            {'substance_agent': ['s']},
            {'reaction_reported': ['true']},
            {'comment': ['c']},
        ],
    },
}


def merge_reaction_summaries(entries, key='reaction_event_summary'):
    """Collapse every entry carrying ``key`` into a single merged entry.

    Parameters:
        entries: list of dicts, e.g. the 'adverse_reaction-allergy' list.
        key: name of the sub-dictionary to join across entries.

    Returns:
        A new list. The first entry that has a non-empty ``key`` stays at its
        original position and receives the sub-keys of every later entry that
        also has ``key``; those later entries are dropped. All other entries
        are carried over in their original order.

    Only the ``key`` sub-dictionary of a matching entry is carried over, so
    entries are expected to hold a single top-level key, as in the sample.
    Everything placed in the result is deep-copied, so ``entries`` is left
    untouched.
    """
    merged = None  # the sub-dictionary that accumulates all joined sub-keys
    result = []
    for entry in entries:
        summary = entry.get(key)
        if not summary:
            result.append(copy.deepcopy(entry))
        elif merged is None:
            # First occurrence: keep a handle on it instead of assuming it
            # ends up at index 0 of the result.
            merged = copy.deepcopy(summary)
            result.append({key: merged})
        else:
            # Later occurrences: fold in all of their sub-keys, not just a
            # hard-coded one.
            merged.update(copy.deepcopy(summary))
    return result


val1 = copy.deepcopy(val)
val1['Allergies']['adverse_reaction-allergy'] = merge_reaction_summaries(
    val['Allergies']['adverse_reaction-allergy'])
print(json.dumps(val1, indent=2))
Example output
{
"ctx/language": "en",
"ctx/territory": "DE",
"composer_name": "openEHR2study",
"Allergies": {
"adverse_reaction-allergy": [
{
"reaction_event_summary": {
"clinical_impact": [
{
"|code": "at0035"
}
],
"certainty": [
{
"|code": "at0024"
}
]
}
},
{
"recorded": [
"2020-05-14T00:00:00.000Z"
]
},
{
"substance_agent": [
"s"
]
},
{
"reaction_reported": [
"true"
]
},
{
"comment": [
"c"
]
}
]
}
}

Related

How to construct a path using jmespath to get values

I am having problems getting some values using jmespath.search().
Just to put it in context, I am downloading all the information from my request in a CSV file. I then upload this as a JSON and using JMESPath, I wish to get the values.
I want to get the #value where '_instrumentIdScheme': 'mhi:MHILIST'
json fixed:
[
{
"_fpmlVersion": "5-6",
"header": {
"messageType": "PrevDayCloseBond",
"sendTo": [
{
"#value": "Anvil"
}
],
"creationTimestamp": "2021-09-28T06:00:00.000Z"
},
"m:asOfDate": {
"#value": "2021-09-28T00:00:00.000Z"
},
"_xmlns": "http://www.fpml.org/FpML-5/reporting",
"_xmlns:m": "urn:com.mizuho.bdm",
"_xmlns:mhi": "urn:com.mizuho.bdm.mhi",
"_xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
"_xsi:schemaLocation": "http://www.fpml.org/FpML-5/reporting http://svc-bdmentity01p:8080/schema/7.2.0/com/mizuho/bdm/fpml/fpml-5-6-reporting.xsd urn:com.mizuho.bdm http://svc-bdmentity01p:8080/schema/7.2.0/com/mizuho/bdm/fpml/mizuho-fpml.xsd urn:com.mizuho.bdm.mhi http://svc-bdmentity01p:8080/schema/7.2.0/com/mizuho/bdm/mhi/fpml/mhi-fpml.xsd",
"m:assetPricing": [
{
"m:pricingSource": [
{
"#value": "LON-XEN-BBG"
},
{
"#value": "BGN",
"_pricingSourceScheme": "mizuho:bloomberg-source"
}
],
"m:instrumentId": [
{
"#value": "100001380992",
"_instrumentIdScheme": "mhi:MHILIST"
},
{
"#value": "100001380992",
"_instrumentIdScheme": "mhsa:instrument-id"
}
],
"m:currency": {
"#value": "USD"
},
"m:price": [
{
"value": 140.78125,
"measureType": {
"#value": "Bid Price",
"_assetMeasureScheme": "mizuho:price-type"
}
},
{
"value": 140.875,
"measureType": {
"#value": "Mid Price",
"_assetMeasureScheme": "mizuho:price-type"
}
},
{
"value": 140.96875,
"measureType": {
"#value": "Offer Price",
"_assetMeasureScheme": "mizuho:price-type"
}
}
]
}
],
"m:pricingDate": "2021-09-28T00:00:00.000Z"
}
]
Replace all single quotes with double quotes so the document is valid JSON.
To select every #value that satisfies the condition:
def flatten(container):
    """Yield the leaf items of an arbitrarily nested list/tuple structure.

    Parameters:
        container: an iterable whose items may themselves be lists or tuples.

    Yields:
        Every item that is not a list or tuple, in depth-first, left-to-right
        order. Other iterables (strings, dicts, ...) are yielded as-is and are
        not descended into.
    """
    for item in container:
        if isinstance(item, (list, tuple)):
            # Delegate to the nested sequence's own generator.
            yield from flatten(item)
        else:
            yield item
str = """
[
{
"_fpmlVersion": "5-6",
"header": {
"messageType": "PrevDayCloseBond",
"sendTo": [
{
"#value": "Anvil"
}
],
"creationTimestamp": "2021-09-28T06:00:00.000Z"
},
"m:asOfDate": {
"#value": "2021-09-28T00:00:00.000Z"
},
"_xmlns": "http://www.fpml.org/FpML-5/reporting",
"_xmlns:m": "urn:com.mizuho.bdm",
"_xmlns:mhi": "urn:com.mizuho.bdm.mhi",
"_xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
"_xsi:schemaLocation": "http://www.fpml.org/FpML-5/reporting http://svc-bdmentity01p:8080/schema/7.2.0/com/mizuho/bdm/fpml/fpml-5-6-reporting.xsd urn:com.mizuho.bdm http://svc-bdmentity01p:8080/schema/7.2.0/com/mizuho/bdm/fpml/mizuho-fpml.xsd urn:com.mizuho.bdm.mhi http://svc-bdmentity01p:8080/schema/7.2.0/com/mizuho/bdm/mhi/fpml/mhi-fpml.xsd",
"m:assetPricing": [
{
"m:pricingSource": [
{
"#value": "LON-XEN-BBG"
},
{
"#value": "BGN",
"_pricingSourceScheme": "mizuho:bloomberg-source"
}
],
"m:instrumentId": [
{
"#value": "100001380992",
"_instrumentIdScheme": "mhi:MHILIST"
},
{
"#value": "100001380992",
"_instrumentIdScheme": "mhsa:instrument-id"
}
],
"m:currency": {
"#value": "USD"
},
"m:price": [
{
"value": 140.78125,
"measureType": {
"#value": "Bid Price",
"_assetMeasureScheme": "mizuho:price-type"
}
},
{
"value": 140.875,
"measureType": {
"#value": "Mid Price",
"_assetMeasureScheme": "mizuho:price-type"
}
},
{
"value": 140.96875,
"measureType": {
"#value": "Offer Price",
"_assetMeasureScheme": "mizuho:price-type"
}
}
]
}
],
"m:pricingDate": "2021-09-28T00:00:00.000Z"
}
]
"""
import json

import jmespath

# `str` is the JSON text assigned above (the name shadows the builtin).
# json.loads ignores whitespace between tokens, so the text needs no
# newline/tab stripping first. The parsed document gets its own name rather
# than rebinding the variable that holds the raw text.
document = json.loads(str)
valueslist = jmespath.search('[]["m:assetPricing"][][]."m:instrumentId"[?"_instrumentIdScheme" == `mhi:MHILIST`].["#value"]', document)
# The search result is nested; flatten() (defined above) reduces it to a flat
# list of values.
values = list(flatten(valueslist))
print(values)
result:
['100001380992']

How to remove parent json element in python3 if child object is empty

I'm trying to move data from SQL to Mongo. Here is the challenge I'm facing: if any child object is empty, I want to remove its parent element as well. I want this removal to propagate up to the insurance field.
Here is what I tried:
def remove_empty_elements(jsonData):
if(isinstance(jsonData, list) or isinstance(jsonData,dict)):
for elem in list(jsonData):
if not isinstance(elem, dict) and isinstance(jsonData[elem], list) and elem:
jsonData[elem] = [x for x in jsonData[elem] if x]
if(len(jsonData[elem])==0):
del jsonData[elem]
elif not isinstance(elem, dict) and isinstance(jsonData[elem], dict) and not jsonData[elem]:
del jsonData[elem]
else:
pass
return jsonData
sample data
{
"_id": "30546c62-8ea0-4f1a-a239-cc7508041a7b",
"IsActive": "True",
"name": "Pixel 3",
"phone": [
{
"Bill": 145,
"phonetype": "xyz",
"insurance": [
{
"year_one_claims": [
{
"2020": 200
},
{
},
{
},
{
},
{
}
]
},
{
"year_two_claims": [
{
},
{
},
{
},
{
},
{
}
]
},
]
}
],
"Provider": {
"agent": "aaadd",
}
}
Results should look like that
{
"_id": "30546c62-8ea0-4f1a-a239-cc7508041a7b",
"IsActive": "True",
"name": "Pixel 3",
"phone": [
{
"Bill": 145,
"phonetype": "xyz",
"insurance": [
{
"year_one_claims": [
{
"2020": 200
},
]
},
]
}
],
"Provider": {
"agent": "aaadd",
}
}
Your if statements are kind of confusing. I think you are looking for a recursion:
import json
# define which elements you want to remove:
to_be_deleted = [[], {}, "", None]


def remove_empty_elements(jsonData):
    """Recursively strip empty values from a decoded JSON structure.

    Parameters:
        jsonData: any decoded JSON value (dict, list, or scalar).

    Returns:
        A new list/dict in which every element or value that is equal to one
        of ``to_be_deleted`` *after* its own children were cleaned has been
        dropped. Because children are cleaned first, a parent that ends up
        empty is removed as well. Scalars are returned unchanged; values such
        as ``0`` and ``False`` are kept because they do not compare equal to
        any entry of ``to_be_deleted``.
    """
    if isinstance(jsonData, list):
        # The walrus keeps the cleaned child so it is computed only once.
        jsonData = [new_elem for elem in jsonData
                    if (new_elem := remove_empty_elements(elem)) not in to_be_deleted]
    elif isinstance(jsonData, dict):
        jsonData = {key: new_value for key, value in jsonData.items()
                    if (new_value := remove_empty_elements(value)) not in to_be_deleted}
    return jsonData
print(json.dumps(remove_empty_elements(jsonData), indent=4))
Edit/Note: from Python 3.8 you can use assignment expressions (:=) in comprehensions
Output:
{
"_id": "30546c62-8ea0-4f1a-a239-cc7508041a7b",
"IsActive": "True",
"name": "Pixel 3",
"phone": [
{
"Bill": 145,
"phonetype": "xyz",
"insurance": [
{
"year_one_claims": [
{
"2020": 200
}
]
}
]
}
],
"Provider": {
"agent": "aaadd"
}
}
Try out this:
data = {
    "_id": "30546c62-8ea0-4f1a-a239-cc7508041a7b",
    "IsActive": "True",
    "name": "Pixel 3",
    "phone": [
        {
            "Bill": 145,
            "phonetype": "xyz",
            "insurance": [
                {"year_one_claims": [{"2020": 200}, {}, {}, {}, {}]},
                {"year_two_claims": [{}, {}, {}, {}, {}]},
            ],
        },
    ],
    "Provider": {
        "agent": "aaadd",
    },
}


def prune_empty_claims(record):
    """Drop empty claim objects, then emptied claim lists, then emptied insurance entries.

    Parameters:
        record: dict with a 'phone' list whose items each hold an 'insurance'
            list of dicts mapping a claims key to a list of claim dicts.

    Returns:
        The same ``record`` object, modified in place.
    """
    for phn_data in record['phone']:
        kept = []
        for ins in phn_data['insurance']:
            # Snapshot the items because keys may be deleted inside the loop.
            for key, claims in list(ins.items()):
                claims[:] = [claim for claim in claims if claim]
                if not claims:
                    del ins[key]
            # Keep the entry only if it still holds at least one claims list.
            if ins:
                kept.append(ins)
        # Rebuild the list afterwards rather than removing while iterating,
        # which would skip the entry following each removed one.
        phn_data['insurance'][:] = kept
    return record


prune_empty_claims(data)
print(data)
Output:
{
'_id': '30546c62-8ea0-4f1a-a239-cc7508041a7b',
'IsActive': 'True',
'name': 'Pixel 3',
'phone': [{
'Bill': 145,
'phonetype': 'xyz',
'insurance': [{
'year_one_claims': [{
'2020': 200
}]
}]
}],
'Provider': {
'agent': 'aaadd'
}
}

Parse the complex JSON in Python without storing in File

I'm trying to parse the following JSON data without storing it in a file, using Python.
{
"select": {
"value": "s_name"
},
"from": "student",
"where": {
"in": [
"s_id",
{
"select": {
"value": "s_id"
},
"from": "student_course",
"where": {
"in": [
"c_id",
{
"select": {
"value": "c_id"
},
"from": "course",
"where": {
"or": [
{
"and": [
{
"eq": [
"c_name",
{
"literal": "DSA"
}
]
},
{
"eq": [
"c_name",
{
"literal": "dbms"
}
]
}
]
},
{
"eq": [
"c_name",
{
"literal": "algorithm"
}
]
}
]
}
}
]
}
}
]
}
}
I'm using the following code:
import json
x = "JSON Data which is shared above"
y = json.dumps(x)
jsonDict = json.loads(y)
print (jsonDict['where'])
And not sure, how to proceed further, could you please advise, how it can be done?
I want to fetch the value of all objects, especially where clause.
json.dumps() takes an object and encodes it into a JSON string. But you are trying to take a JSON string and decode it into an object (a dict in this case). The method you should be applying against x therefore is json.loads(). You can then convert the resulting dict back into a JSON string, y, with json.dumps():
import json
x = """{
"select": {
"value": "s_name"
},
"from": "student",
"where": {
"in": [
"s_id",
{
"select": {
"value": "s_id"
},
"from": "student_course",
"where": {
"in": [
"c_id",
{
"select": {
"value": "c_id"
},
"from": "course",
"where": {
"or": [
{
"and": [
{
"eq": [
"c_name",
{
"literal": "DSA"
}
]
},
{
"eq": [
"c_name",
{
"literal": "dbms"
}
]
}
]
},
{
"eq": [
"c_name",
{
"literal": "algorithm"
}
]
}
]
}
}
]
}
}
]
}
}"""
# Decode the JSON text held in x into Python objects (the top level here is a dict).
jsonDict = json.loads(x) # from string to a dict
# The decoded object supports normal dict indexing, so the nested 'where' clause can be read directly.
print(jsonDict['where'])
# Re-encode the decoded object as a JSON string.
y = json.dumps(jsonDict) # from dict back to a string
Prints:
{'in': ['s_id', {'select': {'value': 's_id'}, 'from': 'student_course', 'where': {'in': ['c_id', {'select': {'value': 'c_id'}, 'from': 'course', 'where': {'or': [{'and': [{'eq': ['c_name', {'literal': 'DSA'}]}, {'eq': ['c_name', {'literal': 'dbms'}]}]}, {'eq': ['c_name', {'literal': 'algorithm'}]}]}}]}}]}

How to create a dynamic query from the dictionary by checking the length

The input is a list of single-key dictionaries; samples of length 2 and 3 are shown below. The query needs to be generated dynamically, with one clause per entry.
a = [{'data': 'abc'}, {'prod': 'def'}]
if len(a) = 2:
#below query has to generate
"query": {
"bool": {
"should": [
{
"query_string": {
"query": "*abc*",
"fields": [
"data"
]
}
},
{
"query_string": {
"query": "*def*",
"fields": [
"prod"
]
}
}
]
}
}
}
a = [{'data': 'abc'}, {'prod': 'def'},{'email': '#gmail'}]
if len(a) = 3
#below is the query
"query": {
"bool": {
"should": [
{
"query_string": {
"query": "*abc*",
"fields": [
"data"
]
}
},
{
"query_string": {
"query": "*def*",
"fields": [
"prod"
]
}
},
{
"query_string": {
"query": "*#gmail.com*",
"fields": [
"email"
]
}
}
]
}
}
}
Basically, each time an entry is added to the list, a matching clause such as {"query_string": {"query": "*#gmail.com*","fields": ["email"]}} should be added to the query as well.
Using a simple iteration.
Ex:
a = [{'data': 'abc'}, {'prod': 'def'}]


def build_should_query(filters):
    """Build a bool/should query with one query_string clause per filter.

    Parameters:
        filters: list of dicts; each key is a field name and each value is the
            text to search for in that field.

    Returns:
        A dict of the form ``{"query": {"bool": {"should": [...]}}}`` holding
        one ``query_string`` clause per (field, value) pair, with the value
        wrapped in ``*`` wildcards. An empty ``filters`` list gives an empty
        ``should`` list.
    """
    clauses = [
        {"query_string": {"query": f"*{value}*", "fields": [field]}}
        for item in filters
        for field, value in item.items()
    ]
    return {"query": {"bool": {"should": clauses}}}


result = build_should_query(a)
print(result)
Output:
{'query': {'bool': {'should': [{'query_string': {'fields': ['data'],
'query': '*abc*'}},
{'query_string': {'fields': ['prod'],
'query': '*def*'}}]}}}

Set difference between unions of specific subfields

I have a large collection that can be modeled more or less as the one created by the following code:
import string
from random import randint, random, choice
documents = []
for i in range(100):
letters = choice(string.letters[0:15])
documents.append({'hgvs_id': "".join([str(randint(0,9)), letters]),
'sample_id': "CDE",
'number': i*random()*50 - 30 })
documents.append({'hgvs_id': "".join([str(randint(0,9)), letters]),
'sample_id': 'ABC',
'number': i*random()*50 - 30 })
documents.append({'hgvs_id': "".join([str(randint(0,9)), letters]),
'sample_id': 'GEF',
'number': i*random()*50 - 30 })
for i in range(10): # add some unique values for sample_id 'ABC'
letters = choice(string.letters[0:15])
documents.append({'hgvs_id': "55" + letters,
'sample_id': 'ABC',
'number': i*random()*50 - 30 })
collection.insert_many(documents)
I am trying to retrieve the unique hgvs_id's that occur within documents that have a specific sample_id (ABC here) but not in documents containing the other two. Usually, there will be many more sample_id than just three.
It sounds pretty simple, but so far I have been unsuccessful. Given the size of the collection I'm working with (~30GB), I've been trying to use the aggregate framework as follows:
sample_1 = collection.aggregate(
[
{'$group':
{
'_id': '$hgvs_id',
#'sample_id' : {"addToSet": '$hgvs_id'},
'matchedDocuments':
{'$push':
{
'id': '$_id',
'sample_name': "$sample_id",
'hgvs_ids': "$hgvs_id"
}
},
}
},
{'$match': {
"$and": [
{'matchedDocuments': {"$elemMatch": {'sample_name': 'ABC'}}},
# Some other operation????
]
}
}
]) #, allowDiskUse=True) may be needed
This returns (understandably) all the hgvs_id's having sample_id equal ABC. Any leads would be more than appreciated.
If it's the only sample_id in the "set" of grouped values then the $size will be one:
With MongoDB 3.4 you can use $in in combination:
[
{ "$group": {
"_id": "$hgvs_id",
"samples": { "$addToSet": "$sample_id" }
}},
{ "$redact": {
"$cond": {
"if": {
"$and": [
{ "$in": [ "ABC", "$samples" ] },
{ "$eq": [ { "$size": "$samples" }, 1 ] }
]
},
"then": "$$KEEP",
"else": "$$PRUNE"
}
}}
]
Otherwise use $setIntersection which is just a little longer in syntax:
[
{ "$group": {
"_id": "$hgvs_id",
"samples": { "$addToSet": "$sample_id" }
}},
{ "$redact": {
"$cond": {
"if": {
"$and": [
{ "$eq": [ { "$size": { "$setIntersection": [ "$samples", ["ABC"] ] } }, 1 ] },
{ "$eq": [ { "$size": "$samples" }, 1 ] }
]
},
"then": "$$KEEP",
"else": "$$PRUNE"
}
}}
]
Or probably in the simplest form for all versions supporting aggregation anyway:
[
{ "$group": {
"_id": "$hgvs_id",
"samples": { "$addToSet": "$sample_id" }
}},
{ "$match": {
"$and": [{ "samples": "ABC" },{ "samples": { "$size": 1 } }]
}}
]
The same principle applies to any number of arguments, in that the "set" produced must match the size of the arguments given as well as contain the specific value.

Categories

Resources