I have a JSON input file that looks like this:
{"nodes": [
{"properties": {
"id": "rootNode",
"name": "Bertina Dunmore"},
"nodes": [
{"properties": {
"id": 1,
"name": "Gwenneth Rylett",
"parent_id": "rootNode"},
"nodes": [
{"properties": {
"id": 11,
"name": "Joell Waye",
"parent_id": 1}},
{"properties": {
"id": 12,
"name": "Stan Willcox",
"parent_id": 1}}]},
{"properties": {
"id": 2,
"name": "Delbert Dukesbury",
"parent_id": "rootNode"},
"nodes": [
{"properties": {
"id": 21,
"name": "Cecil McKeever",
"parent_id": 2}},
{"properties": {
"id": 22,
"name": "Joy Obee",
"parent_id": 2}}]}]}]}
I want to get the nested properties dictionaries into a (flat) list of dictionaries. Creating a recursive function that will read these dictionaries is easy:
def get_node(nodes):
    """Recursively walk a list of node dicts, printing each 'properties' dict.

    Children live under the optional 'nodes' key; leaves simply omit it.
    Returns nothing — this version only prints.
    """
    for entry in nodes:
        print(entry['properties'])
        if 'nodes' in entry:
            get_node(entry['nodes'])
Now, I'm struggling to append these to a single list:
def get_node(nodes):
    # NOTE(review): this list is re-initialized on EVERY call, including the
    # recursive ones — each recursion level builds its own private list.
    prop_list = []
    for node in nodes:
        print(node['properties'])
        prop_list.append(node['properties'])
        if 'nodes' in node.keys():
            # NOTE(review): the list returned by the recursive call is
            # discarded here; only the top-level call's appends survive,
            # which is why just the root's properties come back.
            get_node(node['nodes'])
    return prop_list
This returns [{'id': 'rootNode', 'name': 'Bertina Dunmore'}], even though all properties dictionaries are printed. I suspect that this is because I'm not handling the function scope properly.
Can someone please help me get my head around this?
Your problem is that every time you call get_node, the list you append to is initialized again. You can avoid this by passing the accumulator list into the recursive function.
Moreover, I think it would be nice to use a dataclass to deal with this problem:
from dataclasses import dataclass
from typing import Union


# FIX: the decorator was shown as the comment '#dataclass' (markdown mangling).
# Without '@dataclass' the class gets no generated __init__, and
# Property(**val) below raises TypeError.
@dataclass
class Property:
    """Flattened 'properties' record of a single node."""
    id: int  # NOTE: the root node actually carries a str id ('rootNode')
    name: str
    parent_id: Union[str, None] = None


def explore_json(data, properties: list = None):
    """Recursively collect every 'properties' dict in *data* as Property objects.

    Args:
        data: a (possibly nested) dict with optional 'nodes' / 'properties' keys.
        properties: accumulator shared across recursive calls; created on the
            first call (None default avoids the mutable-default pitfall).

    Returns:
        The flat list of Property instances, in depth-first order.
    """
    if properties is None:
        properties = []
    for key, val in data.items():
        if key == "nodes":
            for node in val:
                explore_json(node, properties)
        elif key == "properties":
            properties.append(Property(**val))
    return properties
explore_json(data)
output
[Property(id='rootNode', name='Bertina Dunmore', parent_id=None),
Property(id=1, name='Gwenneth Rylett', parent_id='rootNode'),
Property(id=11, name='Joell Waye', parent_id=1),
Property(id=12, name='Stan Willcox', parent_id=1),
Property(id=2, name='Delbert Dukesbury', parent_id='rootNode'),
Property(id=21, name='Cecil McKeever', parent_id=2),
Property(id=22, name='Joy Obee', parent_id=2)]
You need to combine the prop_list returned by the recursive call with the prop_list in the current scope. For example,
def get_node(nodes):
    """Flatten the nested node tree into one list of 'properties' dicts.

    Each child subtree's result is folded into the caller's list, so the
    top-level call returns every properties dict in depth-first order.
    Also prints each properties dict as it is visited.
    """
    collected = []
    for entry in nodes:
        props = entry['properties']
        print(props)
        collected.append(props)
        if 'nodes' in entry:
            collected += get_node(entry['nodes'])
    return collected
With that:
def get_node(prop_list, nodes):
    """Append every nested 'properties' dict to *prop_list*, in place.

    The accumulator travels down the recursion, so one shared list collects
    results from every level; each visited dict is also printed.
    """
    for entry in nodes:
        props = entry['properties']
        print(props)
        prop_list.append(props)
        if 'nodes' in entry:
            get_node(prop_list, entry['nodes'])
You can just do:
prop_list = []
get_node(prop_list, <yourdictnodes>)
Should alter prop_list into:
{'id': 'rootNode', 'name': 'Bertina Dunmore'}
{'id': 1, 'name': 'Gwenneth Rylett', 'parent_id': 'rootNode'}
{'id': 11, 'name': 'Joell Waye', 'parent_id': 1}
{'id': 12, 'name': 'Stan Willcox', 'parent_id': 1}
{'id': 2, 'name': 'Delbert Dukesbury', 'parent_id': 'rootNode'}
{'id': 21, 'name': 'Cecil McKeever', 'parent_id': 2}
{'id': 22, 'name': 'Joy Obee', 'parent_id': 2}
Related
I'm struggling with updating a value in a json object.
import json

userBoard = ''  # see example below; is loaded in a separate function


# NOTE(review): this was presumably the Flask route decorator
# '@app.get("/setItem")' before formatting mangled '@' into '#' — confirm.
#app.get("/setItem")
def setItem():
    id = request.args.get('itemId')  # NOTE(review): shadows the builtin `id`
    id = int(id[2:])  # is for instance 2
    # BUG under discussion: json.loads builds a brand-new list of dicts.
    # The mutation below changes that throwaway copy only; it is never
    # serialized back into session['userBoard'], so the stored value
    # remains 'false'.
    for item in json.loads(session['userBoard']):
        if item['id'] == id:
            item['solved']='true'
        else:
            print('Nothing found!')
    return('OK')
Example of the json:
[{"id": 1, "name": "t1", "solved": "false"}, {"id": 2, "name": "t2", "solved": "false"}, {"id": 3, "name": "t3"}]
However, when I check the printout of the userBoard, the value is still 'false'. Does anyone have any idea? Does this need to be serialized somehow? Tried many things but it didn't work out...
Many thanks!
One could say the question is somehow specific and is lacking some information to provide a simple answer. So I am going to make some assumptions and propose a solution.
First, id and input are python built-ins and should not be used as variable names. I will use these strings with a _ prefix on purpose, so that you could still use these names in a safer way.
import json
from typing import List

json_ex = '[{"id": 1, "name": "t1", "solved": "false"}, {"id": 2, "name": "t2", "solved": "false"}, {"id": 3, "name": "t3"}]'
_id = 2 # for now a constant for demonstration purposes


def setItem(_input: List[dict]):
    """Mark the entry whose 'id' equals the module-level _id as solved.

    Mutates the dicts in *_input* in place and reports, per item, whether
    it was the one updated.
    """
    for item in _input:
        this_id = item['id']
        if this_id == _id:
            item['solved'] = 'true'
            print(f'Updated item id {this_id}')
        else:
            print('Nothing found!')


json_ex_parsed = json.loads(json_ex) # this is now a list of dictionaries
setItem(json_ex_parsed)
Output:
Nothing found!
Updated item id 2
Nothing found!
The contents of json_ex_parsed before applying setItem:
[{'id': 1, 'name': 't1', 'solved': 'false'},
{'id': 2, 'name': 't2', 'solved': 'false'},
{'id': 3, 'name': 't3'}]
and after:
[{'id': 1, 'name': 't1', 'solved': 'false'},
{'id': 2, 'name': 't2', 'solved': 'true'}, # note here has been updated
{'id': 3, 'name': 't3'}]
I am trying to devise a logic in python for the given scenario -
I have a list of multiple dictionaries, my main goal is to get the unique list of dictionary based on the id key.
non_unique = [
{"id": 1, "name": "A", "items": ["blah1"]},
{"id": 1, "name": "A", "items": ["blah2"]}
]
I can get the unique list of dictionaries by this dictionary comprehension:
list({v["id"]: v for v in non_unique}.values())
But I am unable to fit a logic in the dictionary comprehension to concatenate the value of items key. My expected output is:
[{"id": 1, "name": "A", "items": ["blah1", "blah2"]}]
Sometimes a simple for loop is much clearer than a dict or list comprehension... In your case I would simply use:
from operator import itemgetter

non_unique = [{'id': 1, "name": "A", "items": ["blah1"]},
              {'id': 1, "name": "A", "items": ["blah2"]},
              {'id': 2, "name": "A", "items": ["blah2"]},
              {'id': 2, "name": "B", "items": ["blah1"]},
              ]

# Merge records sharing an 'id': concatenate their items, and accumulate
# distinct names comma-separated.
result = {}
for uniq in non_unique:
    # 'uid' instead of 'id' — avoid shadowing the builtin
    uid, items, name = itemgetter('id', 'items', 'name')(uniq)
    if uid in result:
        result[uid]["items"] += items
        # FIX: split on "," — names are accumulated comma-separated, so the
        # original whitespace split() compared against strings like "A,B"
        # and re-appended names that were already present.
        if name not in result[uid]["name"].split(","):
            result[uid]["name"] += ",{}".format(name)
    else:
        result[uid] = uniq

unique_list = list(result.values())
print(unique_list)
Output :
[{'id': 1, 'name': 'A', 'items': ['blah1', 'blah2']}, {'id': 2, 'name': 'A,B', 'items': ['blah2', 'blah1']}]
EDIT
As suggested in the comments: I added a simple check for the name, appending it to the merged names only if it does not already exist.
I also add the itemgetter for getting a more "clear" code.
You can use this method.
non_unique = [
    {'id': 1, 'name': "A", 'items': ["blah1"]},
    {'id': 1, 'name': "A", 'items': ["blah2"]}
]

# Merge entries that share an 'id' into the first entry seen with that id.
dic = []
for candidate in non_unique:
    # first already-collected record with the same id, if any
    match = next((seen for seen in dic if seen['id'] == candidate['id']), None)
    if match is None:
        dic.append(candidate)
        continue
    if candidate['name'] not in match['name']:
        match['name'] += candidate['name']
    if candidate['items'] not in match['items']:
        match['items'] += candidate['items']
print(dic)
Output - [{'id': 1, 'name': 'A', 'items': ['blah1', 'blah2']}]
I'm trying to expand field mappings in a Table mapped by my AWS Glue crawler to a nested dictionary in Python. But, I can't find any Spark/Hive parsers to deserialize the
var_type = 'struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:string>>,name:string>,loc_name:string>'
string located in table_schema['Table']['StorageDescriptor']['Columns'] to a Python dict.
How to dump the table definition in Glue:
import boto3

# Fetch the Glue table definition; its StorageDescriptor.Columns entries
# carry the embedded Hive type strings to be expanded.
# NOTE(review): selected_db / selected_table are assumed to be defined by
# the caller's surrounding code — confirm.
client = boto3.client('glue')
client.get_table(DatabaseName=selected_db, Name=selected_table)
Response:
table_schema = {'Table': {'Name': 'asdfasdf',
'DatabaseName': 'asdfasdf',
'Owner': 'owner',
'CreateTime': datetime.datetime(2019, 7, 29, 13, 20, 13, tzinfo=tzlocal()),
'UpdateTime': datetime.datetime(2019, 7, 29, 13, 20, 13, tzinfo=tzlocal()),
'LastAccessTime': datetime.datetime(2019, 7, 29, 13, 20, 13, tzinfo=tzlocal()),
'Retention': 0,
'StorageDescriptor': {'Columns': [{'Name': 'version', 'Type': 'int'},
{'Name': 'payload',
'Type': 'struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:string>>,name:string>,loc_name:string>'},
{'Name': 'origin', 'Type': 'string'}],
'Location': 's3://asdfasdf/',
'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
'OutputFormat': 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
'Compressed': False,
'NumberOfBuckets': -1,
'SerdeInfo': {'SerializationLibrary': 'org.openx.data.jsonserde.JsonSerDe',
'Parameters': {'paths': 'origin,payload,version'}},
'BucketColumns': [],
'SortColumns': [],
'Parameters': {'CrawlerSchemaDeserializerVersion': '1.0',
'CrawlerSchemaSerializerVersion': '1.0',
'UPDATED_BY_CRAWLER': 'asdfasdf',
'averageRecordSize': '799',
'classification': 'json',
'compressionType': 'none',
'objectCount': '94',
'recordCount': '92171',
'sizeKey': '74221058',
'typeOfData': 'file'},
'StoredAsSubDirectories': False},
'PartitionKeys': [{'Name': 'partition_0', 'Type': 'string'},
{'Name': 'partition_1', 'Type': 'string'},
{'Name': 'partition_2', 'Type': 'string'}],
'TableType': 'EXTERNAL_TABLE',
'Parameters': {'CrawlerSchemaDeserializerVersion': '1.0',
'CrawlerSchemaSerializerVersion': '1.0',
'UPDATED_BY_CRAWLER': 'asdfasdf',
'averageRecordSize': '799',
'classification': 'json',
'compressionType': 'none',
'objectCount': '94',
'recordCount': '92171',
'sizeKey': '74221058',
'typeOfData': 'file'},
'CreatedBy': 'arn:aws:sts::asdfasdf'},
'ResponseMetadata': {'RequestId': 'asdfasdf',
'HTTPStatusCode': 200,
'HTTPHeaders': {'date': 'Thu, 01 Aug 2019 16:23:06 GMT',
'content-type': 'application/x-amz-json-1.1',
'content-length': '3471',
'connection': 'keep-alive',
'x-amzn-requestid': 'asdfasdf'},
'RetryAttempts': 0}}
Goal would be a python dictionary and values for each field type, vs. the embedded string. E.g.
expand_function('struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:string>>,name:string>,loc_name:string>')
returns
{
'loc_lat':'double',
'service_handler':'string',
'ip_address':'string',
'device':'bigint',
'source':{'id':'string',
'contacts': {
'admin': {
'email':'string',
'name':'string'
},
'name':'string'
},
'loc_name':'string'
}
Thanks!
The accepted answer doesn't handle arrays.
This solution does:
import json
import re


def _hive_struct_to_json(hive_str):
    """
    Expands embedded Hive struct strings to Python dictionaries

    Args:
        Hive struct format as string
    Returns
        JSON object
    """
    # Each match peels off: (text before operator)(operator)(remainder).
    # Operators: the openers 'struct<' / 'array<' and the single
    # characters ':', ',', '>'.
    r = re.compile(r'(.*?)(struct<|array<|[:,>])(.*)')
    root = dict()
    to_parse = hive_str
    parents = []       # stack of enclosing containers, popped on '>'
    curr_elem = root   # container currently being filled
    key = None         # struct field name waiting for its value
    while to_parse:
        left, operator, to_parse = r.match(to_parse).groups()
        if operator == 'struct<' or operator == 'array<':
            parents.append(curr_elem)
            new_elem = dict() if operator == 'struct<' else list()
            if key:
                # opener follows 'name:' — attach under that field name
                curr_elem[key] = new_elem
                curr_elem = new_elem
            elif isinstance(curr_elem, list):
                # opener directly inside an array — append anonymously
                curr_elem.append(new_elem)
                curr_elem = new_elem
            # NOTE: with no pending key and a dict current element (the
            # outermost 'struct<'), curr_elem intentionally stays `root`
            # so top-level fields land directly in it.
            key = None
        elif operator == ':':
            key = left  # text before ':' is a field name
        elif operator == ',' or operator == '>':
            if left:
                # text before ',' / '>' is a scalar type name
                if isinstance(curr_elem, dict):
                    curr_elem[key] = left
                elif isinstance(curr_elem, list):
                    curr_elem.append(left)
            if operator == '>':
                curr_elem = parents.pop()  # close the current container
    return root
# Demo input: a pretty-printed Hive struct string including nested structs
# and arrays (the cases the simpler replace-based converter cannot handle).
hive_str = '''
struct<
loc_lat:double,
service_handler:string,
ip_address:string,
device:bigint,
source:struct<
id:string,
contacts:struct<
admin:struct<
email:string,
name:array<string>
>
>,
name:string
>,
loc_name:string,
tags:array<
struct<
key:string,
value:string
>
>
>
'''
# Strip ALL whitespace so the pretty-printed string above matches the
# single-line format Glue actually returns.
hive_str = re.sub(r'[\s]+', '', hive_str).strip()
print(hive_str)
print(json.dumps(_hive_struct_to_json(hive_str), indent=2))
Prints:
struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:array<string>>>,name:string>,loc_name:string,tags:array<struct<key:string,value:string>>>
{
"loc_lat": "double",
"service_handler": "string",
"ip_address": "string",
"device": "bigint",
"source": {
"id": "string",
"contacts": {
"admin": {
"email": "string",
"name": [
"string"
]
}
},
"name": "string"
},
"loc_name": "string",
"tags": [
{
"key": "string",
"value": "string"
}
]
}
Here's a function running on the embedded Hive struct string above.
def _hive_struct_to_json(hive_struct):
"""
Expands embedded Hive struct strings to Python dictionaries
Args:
Hive struct format as string
Returns
JSON object
"""
# Convert embedded hive type definition string to JSON
hive_struct = hive_struct.replace(':', '":"')
hive_struct = hive_struct.replace(',', '","')
hive_struct = hive_struct.replace('struct<', '{"')
hive_struct = hive_struct.replace('"{"', '{"')
hive_struct = hive_struct.replace('>', '"}')
hive_struct = hive_struct.replace('}"', '}')
return json.loads(hive_struct)
hive_str = 'struct<loc_lat:double,service_handler:string,ip_address:string,device:bigint,source:struct<id:string,contacts:struct<admin:struct<email:string,name:string>>,name:string>,loc_name:string>'
print(json.dumps(_hive_struct_to_json(hive_str),indent=2))
Returns:
{
"loc_lat": "double",
"service_handler": "string",
"ip_address": "string",
"device": "bigint",
"source": {
"id": "string",
"contacts": {
"admin": {
"email": "string",
"name": "string"
}
},
"name": "string"
},
"loc_name": "string"
}
I tried to scout from some existing ways and found some helper functions from pyspark.
import pyspark.sql.types as T
from pyspark.sql import SparkSession

# Reuse pyspark's own DDL parser to turn the type string into a StructType
# tree instead of hand-rolling a parser.
# NOTE(review): _parse_datatype_string is a private pyspark helper — its
# interface may change between versions.
spark = SparkSession.builder.appName("tmp").getOrCreate()
struct_map = T._parse_datatype_string("MAP < STRING, STRUCT < year: INT, place: STRING, details: STRING >>")
struct_map is a pyspark type that in turn has nested fields to iterate over. Once you have an object like the above, performing a recursive call to flatten it should be easy. I am open to hearing opinions from others about this approach.
Closed. This question needs to be more focused. It is not currently accepting answers.
Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 3 years ago.
Improve this question
I have the object:
object = [
[{
id: 1,
name: "a",
age: 20
},
{
id: 2,
name: "b",
age: 19
}],
[{
id: 1,
address: "something",
email: "something#"
},
{
id: 2,
address: "helpppp",
email: "something"
}]
];
And I want this:
object = [
{
id: 1,
name: "a",
age: 20
address: "something",
email: "something#"
},
{
id: 2,
name: "a",
age: 19
address: "helpppp",
email: "something"
}
];
Assuming you can export the object in JSON/python dict format rather than as a Javascript object, the following will work.
pprint is used as a convenience for formatting output.
from collections import defaultdict
from pprint import pprint


def combine_list_of_list_of_dicts(source):
    """Merge dicts across sub-lists, keyed by their shared 'id' value.

    Every dict in every sub-list is folded (dict.update) into one record
    per id; records come back in first-seen id order.
    """
    merged = defaultdict(dict)
    for sublist in source:
        for record in sublist:
            merged[record["id"]].update(record)
    return list(merged.values())


source = [
    [{"id": 1, "name": "a", "age": 20},
     {"id": 2, "name": "b", "age": 19}],
    [{"id": 1, "address": "something", "email": "something#"},
     {"id": 2, "address": "helpppp", "email": "something"}],
]

result = combine_list_of_list_of_dicts(source)
pprint(result)
Gives
[{'address': 'something',
'age': 20,
'email': 'something#',
'id': 1,
'name': 'a'},
{'address': 'helpppp', 'age': 19, 'email': 'something', 'id': 2, 'name': 'b'}]
I'm not sure why you are using a list of lists; you could just as easily store each dictionary in a single list. But if you have no control over that, I would make the following suggestions:
The keys in your dictionary need to be quoted with either a single quote or double quote.
You can find the ids for your dictionary with the following code:
ids = set(sum([[y['id'] for y in x] for x in object], []))
This statement is a bit convoluted in that it uses list comprehension, sum (to get individual elements from a list of lists, and set to get unique ids. So the following should work for the example you've described:
# NOTE(review): `object` is assumed to be rebound to the list-of-lists from
# the question (it shadows the `object` builtin) — confirm before running.
flat_object = sum(object, [])  # flatten the list of lists into one list
ids = set(sum([[y['id'] for y in x] for x in object], []))  # unique ids
merged_object = list()
for obj_id in ids:
    # all dicts sharing this id, in flattened order
    same = [x for x in flat_object if x['id'] == obj_id]
    # merge later dicts into the first one (mutates the original dict)
    merged = same[0]
    for x in same[1:]:
        merged.update(x)
    merged_object.append(merged)
print(merged_object)
# output
# [{'id': 1, 'name': 'a', 'age': 20, 'address': 'something', 'email': 'something#'},
{'id': 2, 'name': 'b', 'age': 19, 'address': 'helpppp', 'email': 'something'}]
I am currently planning out a django app that allows users to not only build custom tables associated with models (e.g., a user could create a trivial custom "parking spot" table that is associated with the "employee" model without having to edit models.py), but to also build custom reports using those custom tables. The only way I can think to do this is by having a model that stores custom table data in a JSONField (I'm using Postgres as a backend so this actually works out great), and then have a reports model that allows users to build and save "SQL-like" queries that return joined datasets for their custom reports.
I've figured out how to store the custom tables and use them in my app, and I even have a loose concept on how to merge multiple JSON objects on pseudo foreign keys to be pulled into custom reports, but I have only gotten as far as creating one-to-one joins.
With the script below, if any of my dicts have multiple records on a single foreign key only the last record is used. Does anyone have any idea how I can accomplish a one-to-many join of multiple python dictionaries?
If I have these three datasets:
# Sample datasets: one employee row and one role row per user_id, but
# multiple absence rows per user_id (the one-to-many side of the join).
employees = [{"id": 1, "user_id": 303, "name": "Mike"},
             {"id": 2, "user_id": 304, "name": "James"},
             {"id": 3, "user_id": 305, "name": "David"},]
roles = [{"id": 1, "user_id": 303, "role": "Manager"},
         {"id": 2, "user_id": 304, "role": "Assistant"},
         {"id": 3, "user_id": 305, "role": "Assistant"},]
absences = [{"id": 1, "user_id": 303, "date": "2015-03-01"},
            {"id": 2, "user_id": 303, "date": "2015-03-02"},
            {"id": 3, "user_id": 303, "date": "2015-03-03"},
            {"id": 4, "user_id": 304, "date": "2015-03-15"},
            {"id": 5, "user_id": 305, "date": "2015-03-19"},]
My desired outcome on a straight join would be:
[{'date': '2015-03-01', 'role': 'Manager', 'user_id': 303, 'id': 1, 'name': 'Mike'},
{'date': '2015-03-02', 'role': 'Manager', 'user_id': 303, 'id': 1, 'name': 'Mike'},
{'date': '2015-03-03', 'role': 'Manager', 'user_id': 303, 'id': 1, 'name': 'Mike'},
{'date': '2015-03-15', 'role': 'Assistant', 'user_id': 304, 'id': 2, 'name': 'James'},
{'date': '2015-03-19', 'role': 'Assistant', 'user_id': 305, 'id': 3, 'name': 'David'}]
but since my script loops through my FROM dictionary first (in this case, employees), all I am able to get is this:
[{'date': '2015-03-03', 'role': 'Manager', 'user_id': 303, 'id': 1, 'name': 'Mike'},
{'date': '2015-03-15', 'role': 'Assistant', 'user_id': 304, 'id': 2, 'name': 'James'},
{'date': '2015-03-19', 'role': 'Assistant', 'user_id': 305, 'id': 3, 'name': 'David'}]
And here are the basics of my code:
def joiner(from_table, joins):
    """Naive SQL-style join: start FROM from_table, merge fields from joins.

    NOTE(review): this is the buggy version the question is about.
    `new_row = row` aliases (does not copy) the source dict, so every
    matching record in a joined table overwrites the same fields — a
    one-to-many join keeps only the LAST match per row.  Also,
    `report_data = from_table` below discards the accumulator entirely
    and returns the (mutated) input list.
    """
    report_data = []
    for row in from_table:
        new_row = row
        for table in joins:
            table_dict = table["table"]
            table_fk = table["fk"]
            for tdr in table_dict:
                if tdr[table_fk] == row[table_fk]:
                    for field in table["fields"]:
                        new_row[field] = tdr[field]
    report_data = from_table
    return report_data


# Join spec: which table to merge in, on which foreign key, taking which fields.
join_tables = [{"table": roles, "fk": "user_id", "fields": ["role"]},
               {"table": absences, "fk": "user_id", "fields": ["date"]},
               ]
joiner(employees, join_tables)
The simplest fix I could think of was to start with the "absences" dict as the from_table instead of employees, but then that is a Many-to-One join, which would be very limiting for my purposes.
Also, if anyone has a better idea for building user created data schemas that can be merged in custom reports using django, I'm all ears. The only other solution I can think of would be to bypass django models entirely and just have all custom tables created, updated, and queried using straight SQL.
As long as you put the longest list of dictionaries first (Can be easily modified) when you call the merge, here is a crude solution
def merge_lists(listdict1, listdict2, listdict3, joinkey):
    """Left-join listdict2 then listdict3 onto listdict1 on *joinkey*.

    Mutates the dicts of listdict1 in place (the returned list is the same
    object).  Every key of a matching dict is copied over, so later matches
    overwrite earlier ones — pass the longest list first, as the answer notes.
    """
    mergedlist = listdict1
    for base in mergedlist:
        for other in listdict2:
            if base[joinkey] == other[joinkey]:
                base.update(other)
        for other in listdict3:
            if base[joinkey] == other[joinkey]:
                base.update(other)
    return mergedlist
merge_lists(absences, employees, roles, "user_id")
[
{
"date":"2015-03-01",
"id":1,
"name":"Mike",
"role":"Manager",
"user_id":303
},
{
"date":"2015-03-02",
"id":1,
"name":"Mike",
"role":"Manager",
"user_id":303
},
{
"date":"2015-03-03",
"id":1,
"name":"Mike",
"role":"Manager",
"user_id":303
},
{
"date":"2015-03-15",
"id":2,
"name":"James",
"role":"Assistant",
"user_id":304
},
{
"date":"2015-03-19",
"id":3,
"name":"David",
"role":"Assistant",
"user_id":305
}
]