Python groupby/convert a triple join table to nested dictionary - python

From a SQL stored procedure that performs a join on 3 tables I get the data below.
data = [
{"service_order_number": "ABC", "item_id": 0, "ticket_id": 10},
{"service_order_number": "ABC", "item_id": 0, "ticket_id": 11},
{"service_order_number": "ABC", "item_id": 1, "ticket_id": 12},
{"service_order_number": "DEF", "item_id": 3, "ticket_id": 13},
{"service_order_number": "DEF", "item_id": 3, "ticket_id": 14},
{"service_order_number": "DEF", "item_id": 3, "ticket_id": 15}]
I would like to group the data on service_order_number and item_id to return a list of dicts like below.
[
{
"service_order_number": "ABC",
"line_items": [
{
"item_id": 0,
"tickets": [
{
"ticket_id": 10
},
{
"ticket_id": 11
}
]
},
{
"item_id": 1,
"tickets": [
{
"ticket_id": 12
}
]
}
]
},
{
"service_order_number": "DEF",
"line_items": [
{
"item_id": 3,
"tickets": [
{
"ticket_id": 13
},
{
"ticket_id": 14
},
{
"ticket_id": 15
}
]
}
]
}
]
The hierarchy would be service_order_number > item_id > ticket_id
Is there an easy way to convert this data into my desired structure?

Here is a possible solution using defaultdict()
import json
from collections import defaultdict

# Two-level grouping: service_order_number -> item_id -> list of ticket dicts.
my_dict = defaultdict(lambda: defaultdict(list))
for row in data:
    my_dict[row['service_order_number']][row['item_id']].append(
        {'ticket_id': row['ticket_id']}
    )

# Flatten the nested defaultdicts into the requested list-of-dicts shape.
res = []
for so_number, item_group in my_dict.items():
    line_items = [
        {'item_id': iid, 'tickets': tickets}
        for iid, tickets in item_group.items()
    ]
    res.append({'service_order_number': so_number, 'line_items': line_items})
print(json.dumps(res, indent=1, default=int))
[
{
"service_order_number": "ABC",
"line_items": [
{
"item_id": 0,
"tickets": [
{
"ticket_id": 10
},
{
"ticket_id": 11
}
]
},
{
"item_id": 1,
"tickets": [
{
"ticket_id": 12
}
]
}
]
},
{
"service_order_number": "DEF",
"line_items": [
{
"item_id": 3,
"tickets": [
{
"ticket_id": 13
},
{
"ticket_id": 14
},
{
"ticket_id": 15
}
]
}
]
}
]

Related

Convert sql join data into list of dictionaries on certain same key

From a sql stored proc that performs a join on two tables I get the data below.
[
{"service_order_number": "ABC", "vendor_id": 0, "recipient_id": 0, "item_id": 0, "part_number": "string", "part_description": "string"},
{"service_order_number": "ABC", "vendor_id": 0, "recipient_id": 0, "item_id": 1, "part_number": "string", "part_description": "string"},
{"service_order_number": "DEF", "vendor_id": 0, "recipient_id": 0, "item_id": 2, "part_number": "string", "part_description": "string"},
{"service_order_number": "DEF", "vendor_id": 0, "recipient_id": 0, "item_id": 3, "part_number": "string", "part_description": "string"}
]
What would be the best way to convert this data into the below format? Is it possible on the python side? Or is there something other than a join I can perform to get data back in this format?
[{
"service_order_number": "ABC",
"vendor_id": 0,
"recipient_id": 0,
"items": [
{
"item_id": 0,
"part_number": "string",
"part_description": "string",
},
{
"item_id": 1,
"part_number": "string",
"part_description": "string",
}
]
},
{"service_order_number": "DEF",
"vendor_id": 0,
"recipient_id": 0,
"items": [
{
"item_id": 2,
"part_number": "string",
"part_description": "string",
},
{
"item_id": 3,
"part_number": "string",
"part_description": "string",
}
]
}]
Here is a possible solution using defaultdict()
from collections import defaultdict

# Bucket the rows by service_order_number, then flatten each bucket into one
# dict carrying the shared header fields plus an "items" list.
grouped_data = defaultdict(list)
for row in data:
    grouped_data[row['service_order_number']].append(row)

res = []
for so_number, rows in grouped_data.items():
    header = rows[0]  # vendor/recipient are identical within a group
    res.append({
        'service_order_number': so_number,
        'vendor_id': header['vendor_id'],
        'recipient_id': header['recipient_id'],
        'items': [
            {
                'item_id': row['item_id'],
                'part_number': row['part_number'],
                'part_description': row['part_description'],
            }
            for row in rows
        ],
    })
print(res)
If you don't need the original data afterwards, you can use dict.pop to create common keys to group over and populate a dictionary in a loop. Note that this code destroys the original data; you'll only have res in the end.
# Group by popping the shared keys out of each record.
# NOTE: destructive — dict.pop removes the grouping keys from every dict in
# my_data, so only `res` is usable afterwards.
res = {}
keys = ['service_order_number', 'vendor_id', 'recipient_id']
for record in my_data:
    group_key = tuple(record.pop(k) for k in keys)
    if group_key not in res:
        res[group_key] = dict(zip(keys, group_key))
    # whatever key-value pairs remain are nested under 'items'
    res[group_key].setdefault('items', []).append(record)
res = list(res.values())
which outputs
[{'service_order_number': 'ABC',
'vendor_id': 0,
'recipient_id': 0,
'items': [{'item_id': 0, 'part_number': 'string', 'part_description': 'string'},
{'item_id': 1, 'part_number': 'string', 'part_description': 'string'}]},
{'service_order_number': 'DEF',
'vendor_id': 0,
'recipient_id': 0,
'items': [{'item_id': 2, 'part_number': 'string', 'part_description': 'string'},
{'item_id': 3, 'part_number': 'string', 'part_description': 'string'}]}]

Python groupby/convert join table to triple nested dictionary

From a SQL stored procedure that performs a join on 3 tables I get the data below.
data = [
{"so_number": "ABC", "po_status": "OPEN", "item_id": 0, "part_number": "XTZ", "ticket_id": 10, "ticket_month": "JUNE"},
{"so_number": "ABC", "po_status": "OPEN", "item_id": 0, "part_number": "XTZ", "ticket_id": 11, "ticket_month": "JUNE"},
{"so_number": "ABC", "po_status": "OPEN", "item_id": 1, "part_number": "XTY", "ticket_id": 12, "ticket_month": "JUNE"},
{"so_number": "DEF", "po_status": "OPEN", "item_id": 3, "part_number": "XTU", "ticket_id": 13, "ticket_month": "JUNE"},
{"so_number": "DEF", "po_status": "OPEN", "item_id": 3, "part_number": "XTU", "ticket_id": 14, "ticket_month": "JUNE"},
{"so_number": "DEF", "po_status": "OPEN", "item_id": 3, "part_number": "XTU", "ticket_id": 15, "ticket_month": "JUNE"}]
I would like to group the data on so_number and item_id to return a list of dicts like below.
[
{
"so_number ": "ABC",
"po_status": "OPEN",
"line_items": [
{
"item_id": 0,
"part_number": "XTZ",
"tickets": [
{
"ticket_id": 10,
"ticket_month": "JUNE"
},
{
"ticket_id": 11,
"ticket_month": "JUNE"
}
]
},
{
"item_id": 1,
"part_number": "XTY",
"tickets": [
{
"ticket_id": 12,
"ticket_month": "JUNE"
}
]
}
]
},
{
"so_number ": "DEF",
"po_status": "OPEN",
"line_items": [
{
"item_id": 3,
"part_number": "XTU"
"tickets": [
{
"ticket_id": 13,
"ticket_month": "JUNE"
},
{
"ticket_id": 14,
"ticket_month": "JUNE"
},
{
"ticket_id": 15,
"ticket_month": "JUNE"
}
]
}
]
}
]
I wanted to know if there was an efficient way of doing this. I am open to using pandas as well.
I thought about accessing the 3 sql tables through a loop and creating this list of dicts but it will probably not be best practice or efficient.
Given the nested structure, you could use groupby in loops:
import pandas as pd
import json

data = [
    {"so_number": "ABC", "po_status": "OPEN", "item_id": 0, "part_number": "XTZ", "ticket_id": 10, "ticket_month": "JUNE"},
    {"so_number": "ABC", "po_status": "OPEN", "item_id": 0, "part_number": "XTZ", "ticket_id": 11, "ticket_month": "JUNE"},
    {"so_number": "ABC", "po_status": "OPEN", "item_id": 1, "part_number": "XTY", "ticket_id": 12, "ticket_month": "JUNE"},
    {"so_number": "DEF", "po_status": "OPEN", "item_id": 3, "part_number": "XTU", "ticket_id": 13, "ticket_month": "JUNE"},
    {"so_number": "DEF", "po_status": "OPEN", "item_id": 3, "part_number": "XTU", "ticket_id": 14, "ticket_month": "JUNE"},
    {"so_number": "DEF", "po_status": "OPEN", "item_id": 3, "part_number": "XTU", "ticket_id": 15, "ticket_month": "JUNE"}]

df = pd.DataFrame(data)

# Build the nested structure with two groupby passes: orders on the outside,
# line items inside each order; the ticket columns become record dicts.
res = []
for (so_number, status), order_rows in df.groupby(["so_number", "po_status"]):
    order = {"so_number ": so_number,
             "po_status": status,
             "line_items": []
             }
    for (item, part), item_rows in order_rows.groupby(["item_id", "part_number"]):
        order["line_items"].append(
            {"item_id": item,
             "part_number": part,
             "tickets": item_rows[["ticket_id", "ticket_month"]].to_dict(orient="records")
             })
    res.append(order)
print(json.dumps(res, indent=2, default=int))
Output:
[
{
"so_number ": "ABC",
"po_status": "OPEN",
"line_items": [
{
"item_id": 0,
"part_number": "XTZ",
"tickets": [
{
"ticket_id": 10,
"ticket_month": "JUNE"
},
{
"ticket_id": 11,
"ticket_month": "JUNE"
}
]
},
{
"item_id": 1,
"part_number": "XTY",
"tickets": [
{
"ticket_id": 12,
"ticket_month": "JUNE"
}
]
}
]
},
{
"so_number ": "DEF",
"po_status": "OPEN",
"line_items": [
{
"item_id": 3,
"part_number": "XTU",
"tickets": [
{
"ticket_id": 13,
"ticket_month": "JUNE"
},
{
"ticket_id": 14,
"ticket_month": "JUNE"
},
{
"ticket_id": 15,
"ticket_month": "JUNE"
}
]
}
]
}
]
Edit following your comment: you will still have to define the grouping keys. But you can do it only once and keep all other keys at the last level:
# Same nesting as above, but the grouping keys are declared once and every
# column not used for grouping automatically falls through to the ticket level.
res = []
lvl1 = ["so_number", "po_status"]
lvl2 = ["item_id", "part_number"]
for outer_vals, outer_grp in df.groupby(lvl1):
    order = dict(zip(lvl1, outer_vals))
    order["line_items"] = []
    for inner_vals, inner_grp in outer_grp.groupby(lvl2):
        line = dict(zip(lvl2, inner_vals))
        # everything except the grouping columns becomes ticket records
        line["tickets"] = inner_grp.drop(columns=lvl1 + lvl2).to_dict(orient="records")
        order["line_items"].append(line)
    res.append(order)

How to merge list of dictionaries by unique key value

I want to merge list of dictionary provided below with unique channel and zrepcode.
sample input:
[
{
"channel": 1,
"zrepcode": "123456",
"turn": 7833.9
},
{
"channel": 1,
"zrepcode": "123456",
"pipeline": 324
},
{
"channel": 1,
"zrepcode": "123456",
"inv_bal": 941.16
},
{
"channel": 1,
"zrepcode": "123456",
"display": 341
},
{
"channel": 3,
"zrepcode": "123456",
"display": 941.16
},
{
"channel": 3,
"zrepcode": "123456",
"turn": 7935.01
},
{
"channel": 3,
"zrepcode": "123456",
"pipeline": 0
},
{
"channel": 3,
"zrepcode": "123456",
"inv_bal": 341
},
{
"channel": 3,
"zrepcode": "789789",
"display": 941.16
},
{
"channel": 3,
"zrepcode": "789789",
"turn": 7935.01
},
{
"channel": 3,
"zrepcode": "789789",
"pipeline": 0
},
{
"channel": 3,
"zrepcode": "789789",
"inv_bal": 341
}
]
Sample output:
[
{'channel': 1, 'zrepcode': '123456', 'turn': 7833.9, 'pipeline': 324.0,'display': 341,'inv_bal': 941.16},
{'channel': 3, 'zrepcode': '123456', 'turn': 7935.01, 'pipeline': 0.0, 'display': 941.16, 'inv_bal': 341.0},
{'channel': 3, 'zrepcode': '789789', 'turn': 7935.01, 'pipeline': 0.0, 'display': 941.16, 'inv_bal': 341.0}
]
Easily solved with our good friend collections.defaultdict:
import collections

# Fold all partial records that share (channel, zrepcode) into one dict:
# each record's extra field (turn/pipeline/inv_bal/display) accumulates via
# update() on the per-key dict.
by_key = collections.defaultdict(dict)
for record in data:  # data is the list of dicts from the post
    merge_key = (record.get("channel"), record.get("zrepcode"))
    by_key[merge_key].update(record)
print(list(by_key.values()))
This outputs
[
{'channel': 1, 'zrepcode': '123456', 'turn': 7833.9, 'pipeline': 324, 'inv_bal': 941.16, 'display': 341},
{'channel': 3, 'zrepcode': '123456', 'display': 941.16, 'turn': 7935.01, 'pipeline': 0, 'inv_bal': 341},
{'channel': 3, 'zrepcode': '789789', 'display': 941.16, 'turn': 7935.01, 'pipeline': 0, 'inv_bal': 341},
]

Is there an efficient way to compare each key, value pair of a dictionary in a many to one comparison

Idea is to compare N number of dictionaries with a single standard dictionary where each key, value pair comparison has a different conditional rule.
Eg.,
Standard dictionary -
{'ram': 16,
'storage': [512, 1, 2],
'manufacturers': ['Dell', 'Apple', 'Asus', 'Alienware'],
'year': 2018,
'drives': ['A', 'B', 'C', 'D', 'E']
}
List of dictionaries -
{'ram': 8,
'storage': 1,
'manufacturers': 'Apple',
'year': 2018,
'drives': ['C', 'D', 'E']
},
{'ram': 16,
'storage': 4,
'manufacturers': 'Asus',
'year': 2021,
'drives': ['F', 'G','H']
},
{'ram': 4,
'storage': 2,
'manufacturers': 'ACER',
'year': 2016,
'drives': ['F', 'G', 'H']
}
Conditions-
'ram' > 8
if 'ram' >=8 then 'storage' >= 2 else 1
'manufactures' in ['Dell', 'Apple', 'Asus', 'Alienware']
'year' >= 2018
if 'year' > 2018 then 'drives' in ['A', 'B', 'C', 'D', 'E'] else ['F', 'G', 'H']
So the expected output is to display all the non-matching ones with non-matching values and none/null for the matching values.
Expected Output -
{'ram': 8,
'storage': 1,
'manufacturers': None,
'year': None,
'drives': ['C', 'D', 'E']
},
{'ram': None,
'storage': None,
'manufacturers': None,
'year': None,
'drives': ['F','G','H']
},
{'ram': 4,
'storage': 2,
'manufacturers': 'ACER',
'year': 2016,
'drives': None
}
While working with MongoDB I encountered this problem where each document in a data collection should be compared with a standard collection. Any MongoDB direct query would also be very helpful.
To achieve the conditions along using MongoDB Aggregation, use the below Query:
// Project each inventory document against the (hard-coded) standard values:
// a field becomes null when it satisfies its rule (i.e. it "matches"),
// otherwise its offending value is kept for reporting.
// NOTE(review): the standard values are inlined here; the $lookup variant
// below parameterizes them via "let" instead.
db.collection.aggregate([
{
"$project": {
// rule: ram > 8  -> matching values are nulled out
"ram": {
"$cond": {
"if": {
"$gt": [
"$ram",
8
]
},
"then": null,
"else": "$ram",
}
},
// rule: ram >= 8 AND storage >= 2
"storage": {
"$cond": {
"if": {
"$and": [
{
"$gte": [
"$ram",
8
]
},
{
"$gte": [
"$storage",
2
]
},
],
},
"then": null,
"else": "$storage",
}
},
// rule: manufacturer must be in the allowed list
"manufacturers": {
"$cond": {
"if": {
"$in": [
"$manufacturers",
[
"Dell",
"Apple",
"Asus",
"Alienware"
],
]
},
"then": null,
"else": "$manufacturers",
}
},
// rule: year >= 2018
"year": {
"$cond": {
"if": {
"$gte": [
"$year",
2018
]
},
"then": null,
"else": "$year",
}
},
// rule: when year > 2018, report only the drives inside the standard set
"drives": {
"$cond": {
"if": {
"$gt": [
"$year",
2018
]
},
"then": {
"$setIntersection": [
"$drives",
[
"A",
"B",
"C",
"D",
"E"
]
]
},
"else": "$drives",
}
},
}
}
])
Mongo Playground Sample Execution
You can combine this with for loop in Python
# For every document in the standard collection, run the comparison pipeline
# against the list collection and print the projected result.
# NOTE(review): this issues one aggregate round-trip per standard document
# (an N+1 pattern); the $lookup variant below does it in a single query.
for std_doc in std_col.find({}, {
"ram": 1,
"storage": 1,
"manufacturers": 1,
"year": 1,
"drives": 1,
}):
print(list(list_col.aggregate([
{
"$project": {
# rule: ram > 8 -> matching values become None
"ram": {
"$cond": {
"if": {
"$gt": [
"$ram",
8
]
},
"then": None,
"else": "$ram",
}
},
# rule: ram >= 8 AND storage >= 2
"storage": {
"$cond": {
"if": {
"$and": [
{
"$gte": [
"$ram",
8
]
},
{
"$gte": [
"$storage",
2
]
},
],
},
"then": None,
"else": "$storage",
}
},
# rule: manufacturer must be in the allowed list
"manufacturers": {
"$cond": {
"if": {
"$in": [
"$manufacturers",
[
"Dell",
"Apple",
"Asus",
"Alienware"
],
]
},
"then": None,
"else": "$manufacturers",
}
},
# rule: year >= 2018
"year": {
"$cond": {
"if": {
"$gte": [
"$year",
2018
]
},
"then": None,
"else": "$year",
}
},
# rule: when year > 2018, keep only drives inside the standard set
"drives": {
"$cond": {
"if": {
"$gt": [
"$year",
2018
]
},
"then": {
"$setIntersection": [
"$drives",
[
"A",
"B",
"C",
"D",
"E"
]
]
},
"else": "$drives",
}
},
}
}
])))
The most optimized solution is to perform a lookup, but this varies based on your requirement:
// Single-query variant: $lookup joins each standard document to the
// inventory collection, exposing the standard's fields to the inner pipeline
// through "let" variables ($$cmpRam etc.) instead of hard-coded constants.
db.std_col.aggregate([
{
"$lookup": {
"from": "dict_col",
// bind the standard document's values for use inside the sub-pipeline
"let": {
"cmpRam": "$ram",
"cmpStorage": "$storage",
"cmpManufacturers": "$manufacturers",
"cmpYear": "$year",
"cmpDrives": "$drives",
},
"pipeline": [
{
"$project": {
// matching values are nulled; non-matching values are reported
"ram": {
"$cond": {
"if": {
"$gt": [
"$ram",
"$$cmpRam",
]
},
"then": null,
"else": "$ram",
}
},
"storage": {
"$cond": {
"if": {
"$and": [
{
"$gte": [
"$ram",
"$$cmpRam"
]
},
{
"$gte": [
"$storage",
"$$cmpStorage"
]
},
],
},
"then": null,
"else": "$storage",
}
},
"manufacturers": {
"$cond": {
"if": {
"$in": [
"$manufacturers",
"$$cmpManufacturers",
]
},
"then": null,
"else": "$manufacturers",
}
},
"year": {
"$cond": {
"if": {
"$gte": [
"$year",
"$$cmpYear",
]
},
"then": null,
"else": "$year",
}
},
"drives": {
"$cond": {
"if": {
"$gt": [
"$year",
"$$cmpYear"
]
},
"then": {
"$setIntersection": [
"$drives",
"$$cmpDrives"
]
},
"else": "$drives",
}
},
}
},
],
// each standard doc gains the per-inventory comparison results here
"as": "inventory_docs"
}
}
])
Mongo Playground Sample Execution

How to map the dictionary values to another dictionary

I have dictionary which is below
{
"aggregations": {
"A": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{ "key": "ADL", "doc_count": 1 },
{ "key": "SDD", "doc_count": 1 },
{ "key": "JJD", "doc_count": 1 }
]
},
"B": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{ "key": "ABC", "doc_count": 1 },
{ "key": "CDE", "doc_count": 1 },
{ "key": "FGH", "doc_count": 1 }
]
},
"C": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{ "key": "XYX", "doc_count": 1 },
{ "key": "NXS", "doc_count": 1 }
]
}
}
}
aggregations.keys will be aggregationfilters.fieldName
aggregations.buckets.key will be aggregationfilters.values.title
aggregationfilters.values.paragraph is null everytime
aggregations.buckets.doc_count will be aggregationfilters.values.count
Basically I need to extract aggregations.keys and aggregations.bucket values and put into different dictionary.
Need to write a general code structure to do that.
I cannot do it with .pop (rename) on the dictionary
My expected out
{
"aggregationfilters": [
{
"name": "ABC",
"fieldName": "A",
"values": [
{ "title": "ADL", "paragraph": null, "count": 1 },
{ "title": "SDD", "paragraph": null, "count": 1 },
{ "title": "JJD", "paragraph": null, "count": 1 }
]
}, {
"name": "CDE",
"fieldName": "B",
"values": [
{ "title": "ABC", "paragraph": null, "count": 1 },
{ "title": "CDE", "paragraph": null, "count": 1 },
{ "title": "FGH", "paragraph": null, "count": 1 }
]
}, {
"name": "FGH",
"fieldName": "C",
"values": [
{ "title": "XYX", "paragraph": null, "count": 1 },
{ "title": "NXS", "paragraph": null, "count": 1 }
]
}
]
}
Well, this works, but even with my best effort this still doesn't look that clean.
import json
# Raw Elasticsearch-style aggregation response to be converted.
source = {
"aggregations": {
"A": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{"key": "ADL", "doc_count": 1},
{"key": "SDD", "doc_count": 1},
{"key": "JJD", "doc_count": 1},
],
},
"B": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{"key": "ABC", "doc_count": 1},
{"key": "CDE", "doc_count": 1},
{"key": "FGH", "doc_count": 1},
],
},
"C": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{"key": "XYX", "doc_count": 1}, {"key": "NXS", "doc_count": 1}],
},
}
}
# Key renames applied during conversion (old name -> new name).
convert_map = {
"buckets": "values",
"doc_count": "count",
"key": "title",
}
# Keys removed from every aggregation entry before conversion.
remove_map = {"sum_other_doc_count", "doc_count_error_upper_bound"}
# Keys added per entry; the "VAL_" placeholder is replaced by the entry key.
add_map = {"name": "Changed VAL_", "fieldName": "VAL_"}
def converting_generator(
    source_: dict, convert_map_: dict, remove_map_: set, add_map_: dict
):
    """Yield one converted dict per entry of *source_*.

    For each (key, inner dict) pair:
      - drop the keys listed in *remove_map_*,
      - add each *add_map_* entry, substituting "VAL_" in the value with
        the entry's key,
      - rename keys per *convert_map_*, applied recursively so nested
        bucket dicts are converted too (e.g. "key" -> "title").

    Fixes over the dump/``str.replace``/load round-trip: renaming touches
    dictionary *keys* only, so string values that happen to contain a
    mapped name (e.g. a bucket whose value is literally ``"key"``) are no
    longer corrupted, and *source_*'s inner dicts are not mutated.

    :param source_: mapping of aggregation name -> aggregation payload.
    :param convert_map_: exact key renames, old -> new.
    :param remove_map_: keys to discard from each payload.
    :param add_map_: extra keys; "VAL_" in values is replaced by the entry key.
    :returns: generator of converted dicts (insertion order preserved).
    """
    variable_identifier = "VAL_"

    def rename_keys(obj):
        # Rebuild dicts/lists recursively with converted key names; leave
        # scalar values untouched.
        if isinstance(obj, dict):
            return {convert_map_.get(k, k): rename_keys(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [rename_keys(item) for item in obj]
        return obj

    for key, inner_dic in source_.items():
        # Copy minus the removed keys — the caller's data stays intact.
        converted = {k: v for k, v in inner_dic.items() if k not in remove_map_}
        for add_key, add_val in add_map_.items():
            converted[add_key] = add_val.replace(variable_identifier, key)
        yield rename_keys(converted)
# Run the conversion and wrap the results under the expected top-level key.
converted = {
    "aggregation_filters": list(
        converting_generator(source["aggregations"], convert_map, remove_map, add_map)
    )
}
# The generator never emits "paragraph"; patch it in as an always-null field.
for filter_entry in converted["aggregation_filters"]:
    for value_entry in filter_entry["values"]:
        value_entry["paragraph"] = None
print(json.dumps(converted, indent=2))
Output:
{
"aggregation_filters": [
{
"values": [
{
"title": "ADL",
"count": 1,
"paragraph": null
},
{
"title": "SDD",
"count": 1,
"paragraph": null
},
{
"title": "JJD",
"count": 1,
"paragraph": null
}
],
"name": "Changed A",
"fieldName": "A"
},
{
"values": [
{
"title": "ABC",
"count": 1,
"paragraph": null
},
{
"title": "CDE",
"count": 1,
"paragraph": null
},
{
"title": "FGH",
"count": 1,
"paragraph": null
}
],
"name": "Changed B",
"fieldName": "B"
},
{
"values": [
{
"title": "XYX",
"count": 1,
"paragraph": null
},
{
"title": "NXS",
"count": 1,
"paragraph": null
}
],
"name": "Changed C",
"fieldName": "C"
}
]
}
Always show your code; ideally working code, to show that you've put at least that much effort into the problem.
I didn't mind this time, as it felt like puzzle solving, but others might.

Categories

Resources