Convert string into Dict / JSON - python

I'm trying to convert the following piece of data into a dict so I can have key:value pair and have them nested :
data="Group A\n name : rey\n age : 6\n status : active\n in role : 201\n weight : 25\n interests\n Out Side : 16016\n In Side : 0\n Out : 2804\n\n name : dan\n age : 5\n status : inactive\n in role : 201\n weight : 18\n interests\n Out Side : 16016\n In Side : 0\n Out : 2804\n\n"
Problem is, not all of them have : and some of them are supposed to be grouped together (i.e part of Group A). \n line break definitely helps me in separating them.
Here's the result I'm hoping to get :
[
{
"Group A":
[
{"name": "rey", "age": "6", "status": "active"},
{"name": "dan", "age": "5", "status": "inactive"}
]
}
]
What do I have right now? I've separated some of them into dict with :
# Attempt from the question: collect every "key : value" line into one flat dict.
result = {}
for row in data.split('\n'):
# Lines without a colon are skipped entirely.
if ':' in row:
key, value = row.split(':')
# The key is stored unstripped, and a key seen again overwrites the earlier
# value, so repeated fields keep only their last occurrence.
result[key] = value.strip()
This outputs me :
{' name ': 'dan', ' weight ': '18', ' In Side ': '0', ' Out ': '2804', ' in role ': '201', ' status ': 'inactive', ' Out Side ': '16016', ' age ': '5'}
But this messes up with the existing order in which the data is shown above - and not all of it came out.
I'm kind of capturing this data from an external program and so limited to only Python version 2.7. Any ideas would be super helpful!

You should find the pattern in your data :
Your data is separated by empty lines
The inner data start with space
Using those observations you can write :
def parse(data):
    """Parse indented "key : value" text into ``{group: [record, ...]}``.

    The input is read line by line:

    * a line starting with a space belongs to the current record; it is
      stored only if it contains a colon (other indented lines are ignored);
    * an empty line closes the current record;
    * any other line starts a new group named after the stripped line.

    A group header is expected before the first record line.

    :param data: the raw text, lines separated by ``\\n``.
    :return: dict mapping each group name to a list of record dicts.
    """
    groups = {}
    cur_record = None
    cur_group = None
    for row in data.split('\n'):
        if row.startswith(' '):
            # Indented line: a field of the inner record.
            if ':' in row:
                if cur_record is None:
                    # First field after a separator: open a new record.
                    cur_record = {}
                    cur_group.append(cur_record)
                # Split on the first colon only, so values that themselves
                # contain a colon (e.g. "12:30") are kept intact.
                key, value = row.split(':', 1)
                cur_record[key.strip()] = value.strip()
        elif row == '':
            # An empty line signals the end of the current record.
            cur_record = None
        else:
            # Otherwise it is a new group header.
            cur_record = None
            cur_group = []
            groups[row.strip()] = cur_group
    return groups
>>> import json
>>> print( json.dumps(parse(data)) );
{"Group A": [{"name": "rey", ... }, {"name": "dan", ... }]}

I would use setdefault to create a new list for each group. This also works for multiple groups, in case you have Group B, C, D, ...
import json
def convert_string(data):
    """Convert "Group / key : value" text into ``{group: [record, ...]}``.

    Lines without a colon are ignored unless they contain "Group", in which
    case they start (or re-open) a group. A ``name`` field starts a new
    record in the current group. Fields whose key contains one of the
    skipped substrings are dropped; every other field is added to the most
    recently started record.

    :param data: the raw text, lines separated by ``\\n``.
    :return: dict mapping each group name to a list of record dicts.
    """
    skipped_keys = ("weight", "in role", "out", "in side")
    result = {}
    current = ""
    for line in data.split("\n"):
        if ':' not in line:
            if "Group" in line:
                current = line.strip()
                result.setdefault(current, [])
            continue
        # Split on the first colon only so values may contain colons.
        key, value = line.split(':', 1)
        key = key.strip()
        if key == "name":
            result[current].append({})
        elif any(s in key.lower() for s in skipped_keys):
            # Filter on the key, not the whole line, so a value such as
            # "logged out" does not make the field disappear.
            continue
        # [-1] is always the record opened by the latest "name" line, even
        # when the same group header appears more than once.
        result[current][-1][key] = value.strip()
    return result
# The input string is called ``data`` in the question; ``d`` was never defined.
print(json.dumps(convert_string(data), indent=4))
{
"Group A": [
{
"name": "rey",
"age": "6",
"status": "active"
},
{
"name": "dan",
"age": "5",
"status": "inactive"
}
]
}

Related

Convert find words in a string

I have an output of a command like that as an example :
Time = XX ,info =XX , mesg=XXXX
{
"description":"add by the customer"
"group": " group black "
"id" :1,
"name": "group_1"
"Num": "No-648747464598"
}
{
"description":"add by the customer"
"group": "group black "
"id" :2,
"name": "group_2"
"Num": "No-7464674846"
}
{
"description":"add by the customer"
"group": " group black "
"id" :3,
"name": "group_3"
"Num": "No-9950509505"
}
How can I use string functions such as split so that, given a name, I get the corresponding Num?
Example :
Function X (group_3):
return No-9950509505
You could enumerate the lines and then fetch the data you need like following:
def func(group_name, text=None):
    """Return the value on the line following the first line mentioning *group_name*.

    :param group_name: text to look for (substring match on each line).
    :param text: the command output to search; when omitted, the
        module-level string ``x`` is used, as in the original snippet.
    :return: the part after the first ``:`` of the following line, with
        double quotes and surrounding whitespace removed, or ``None`` when
        no line matches or the match is on the last line.
    """
    if text is None:
        text = x
    lines = iter(text.splitlines())
    for line in lines:
        if group_name in line:
            following = next(lines, None)
            if following is None:
                return None
            # Split once so a value containing ':' is not truncated, and
            # strip the blank left over after the separator.
            return following.split(':', 1)[1].replace('"', '').strip()
    return None

Create complex object in Python based on property names in dot notation

I am trying to create a complex object based on metadata I have. It is an array of attributes which I am iterating and trying to create a dict. For example below is the array:
[
"itemUniqueId",
"itemDescription",
"manufacturerInfo[0].manufacturer.value",
"manufacturerInfo[0].manufacturerPartNumber",
"attributes.noun.value",
"attributes.modifier.value",
"attributes.entityAttributes[0].attributeName",
"attributes.entityAttributes[0].attributeValue",
"attributes.entityAttributes[0].attributeUOM",
"attributes.entityAttributes[1].attributeName",
"attributes.entityAttributes[1].attributeValue",
"attributes.entityAttributes[1].attributeUOM",
]
This array should give an output as below:
{
"itemUniqueId": "",
"itemDescription": "",
"manufacturerInfo": [
{
"manufacturer": {
"value": ""
},
"manufacturerPartNumber": ""
}
],
"attributes": {
"noun": {
"value": ""
},
"modifier": {
"value": ""
},
"entityAttributes": [
{
"attributeName": "",
"attributeValue": "",
"attributeUOM": ""
},
{
"attributeName": "",
"attributeValue": "",
"attributeUOM": ""
}
]
}
}
I have written this logic but unable to get the desired output. It should work on both object and array given the metadata.
source_json = [
"itemUniqueId",
"itemDescription",
"manufacturerInfo[0].manufacturer.value",
"manufacturerInfo[0].manufacturerPartNumber",
"attributes.noun.value",
"attributes.modifier.value",
"attributes.entityAttributes[0].attributeName",
"attributes.entityAttributes[0].attributeValue",
"attributes.entityAttributes[0].attributeUOM",
"attributes.entityAttributes[1].attributeName",
"attributes.entityAttributes[1].attributeValue",
"attributes.entityAttributes[1].attributeUOM",
]
# Attempt from the question: walk each dotted path and record its segments.
for row in source_json:
propertyNames = row.split('.')
temp = ''
# NOTE(review): indentation was lost in this listing; if these three
# initialisations sit inside the row loop, nothing is carried over from one
# path to the next.
parent = {}
parentArr = []
parentObj = {}
# if len(propertyNames) > 1:
arrLength = len(propertyNames)
# zip() over a single list yields 1-tuples, so `current` is a tuple such as
# ('attributes',) rather than a string; `'[' in current` is therefore a tuple
# membership test, not a substring test.
for i, (current) in enumerate(zip(propertyNames)):
# First segment.
if i == 0:
if '[' in current:
parent[current]=parentArr
else:
parent[current] = parentObj
temp = current
# Middle segments - same assignments as the first-segment branch.
if i > 0 and i < arrLength - 1:
if '[' in current:
parent[current] = parentArr
else:
parent[current] = parentObj
temp = current
# Last segment - again the same assignments; every key is stored directly on
# `parent`, so no nesting is ever built.
if i == arrLength - 1:
if '[' in current:
parent[current] = parentArr
else:
parent[current] = parentObj
temp = current
# temp[prev][current] = ""
# finalMapping[target] = target
print(parent)
There's a similar question at Convert Dot notation string into nested Python object with Dictionaries and arrays where the accepted answer works for this question, but has unused code paths (e.g. isInArray) and caters to unconventional conversions expected by that question:
❓ "arrOne[0]": "1,2,3" → "arrOne": ["1", "2", "3"] instead of
✅ "arrOne[0]": "1,2,3" → "arrOne": ["1,2,3"] or
✅ "arrOne[0]": "1", "arrOne[1]": "2", "arrOne[2]": "3" → "arrOne": ["1", "2", "3"]
Here's a refined implementation of the branch function:
import re


def branch(tree, path, value):
    """Set *value* at the location described by *path* inside *tree*.

    :param tree: dict to update in place.
    :param path: non-empty list of path segments; a segment may carry an
        array index, e.g. ``"items[2]"``.
    :param value: value stored at the final segment.
    :return: *tree* itself, so recursive calls can assign the result back.
    """
    key = path[0]
    array_index_match = re.search(r'\[([0-9]+)\]', key)
    if array_index_match:
        # Read the index from the capture group and drop "[n]" from the key.
        array_index = int(array_index_match.group(1))
        key = key.replace(array_index_match.group(0), '')
        # Prepare the array at the key.
        if key not in tree:
            tree[key] = []
        # Pad with empty objects up to the requested index, so indexes that
        # skip ahead (e.g. [2] before [0]) no longer raise IndexError.
        while len(tree[key]) <= array_index:
            tree[key].append({})
        # Replace the object at the array index.
        if len(path) == 1:
            tree[key][array_index] = value
        else:
            tree[key][array_index] = branch(tree[key][array_index], path[1:], value)
    else:
        # Prepare the object at the key.
        if key not in tree:
            tree[key] = {}
        # Replace the object at the key.
        tree[key] = value if len(path) == 1 else branch(tree[key], path[1:], value)
    return tree
Usage:
VALUE = ''


def create_dict(attributes):
    """Build one nested structure from an iterable of dotted attribute paths.

    Each path is split on '.' and handed to ``branch`` together with the
    placeholder ``VALUE``; the dict that ``branch`` fills in is returned.
    """
    tree = {}
    for dotted_path in attributes:
        segments = dotted_path.split('.')
        branch(tree, segments, VALUE)
    return tree
source_json = [
"itemUniqueId",
"itemDescription",
"manufacturerInfo[0].manufacturer.value",
"manufacturerInfo[0].manufacturerPartNumber",
"attributes.noun.value",
"attributes.modifier.value",
"attributes.entityAttributes[0].attributeName",
"attributes.entityAttributes[0].attributeValue",
"attributes.entityAttributes[0].attributeUOM",
"attributes.entityAttributes[1].attributeName",
"attributes.entityAttributes[1].attributeValue",
"attributes.entityAttributes[1].attributeUOM",
]
assert create_dict(source_json) == {
"itemUniqueId": "",
"itemDescription": "",
"manufacturerInfo": [
{
"manufacturer": {
"value": ""
},
"manufacturerPartNumber": ""
}
],
"attributes": {
"noun": {
"value": ""
},
"modifier": {
"value": ""
},
"entityAttributes": [
{
"attributeName": "",
"attributeValue": "",
"attributeUOM": ""
},
{
"attributeName": "",
"attributeValue": "",
"attributeUOM": ""
}
]
}
}
First we iterate over the whole list and store the third-level attributes; after that we convert this structure into the desired output:
from typing import Dict, List
source_json = [
"attributes.entityAttributes[0].attributeName",
"attributes.entityAttributes[0].attributeValue",
"attributes.entityAttributes[0].attributeUOM",
"attributes.entityAttributes[1].attributeName",
"attributes.entityAttributes[1].attributeValue",
"attributes.entityAttributes[1].attributeUOM",
"attributes.entityAttributes[2].attributeName"
]
def accumulate(source: List) -> Dict:
    """Group third-level attribute names by root and by (entity, index).

    Each path is split on "."; the first part is the root key, the second is
    cut at its last "[" into an ``(entity, index_text)`` tuple, and the third
    becomes a key with an empty-string value under that tuple.
    """
    grouped = {}
    for path in source:
        parts = path.split(".")
        root, indexed, leaf = parts[0], parts[1], parts[2]
        bracket = indexed.rfind('[')
        entity_key = (indexed[:bracket], indexed[bracket + 1:-1])
        grouped.setdefault(root, {}).setdefault(entity_key, {})[leaf] = ""
    return grouped
def get_result(accumulated: Dict, required_fields: int = 3) -> Dict:
    """Turn ``{root: {(entity, index): fields}}`` into ``{root: {entity: [fields, ...]}}``.

    :param accumulated: mapping whose inner keys are ``(entity, index_text)``
        tuples and whose inner values are dicts of field names.
    :param required_fields: only records with exactly this many fields are
        kept (default 3, the size used in the example).
    :return: nested dict in which each entity maps to a list of its complete
        records, ordered by array index.
    """
    def index_order(item):
        (_, idx), _ = item
        # Order numerically ("10" after "9"); non-numeric indexes go last.
        return int(idx) if idx.isdigit() else float("inf")

    result = {}
    for root, entities in accumulated.items():
        result[root] = {}
        # Sort by the array index instead of relying on the order in which
        # the paths happened to be listed.
        for (entity, _), fields in sorted(entities.items(), key=index_order):
            records = result[root].setdefault(entity, [])
            if len(fields) == required_fields:
                records.append(fields)
    return result
print(get_result(accumulate(source_json)))
The output will be:
{
'attributes':
{
'entityAttributes':
[
{
'attributeName': '',
'attributeValue': '',
'attributeUOM': ''
},
{'attributeName': '',
'attributeValue': '',
'attributeUOM': ''
}
]
}
}
In accumulate function we store 3rd level attributes in Dict with (entityAttributes, 0) ... (entityAttributes, 2) keys.
In get_result function we convert Dict with (entityAttributes, 0) ... (entityAttributes, 2) keys to Dict from string to List.
How about something like this:
import re
import json
source_json = [
"attributes.entityAttributes[0].attributeName",
"attributes.entityAttributes[0].attributeValue",
"attributes.entityAttributes[0].attributeUOM",
"attributes.entityAttributes[1].attributeName",
"attributes.entityAttributes[1].attributeValue",
"attributes.entityAttributes[1].attributeUOM",
"attributes.entityAttributes[2].attributeName"
]
def to_object(source_json):
    """Build a nested dict from dotted/bracketed attribute paths.

    Array indexes are kept as string keys ("0", "1", ...), so arrays are
    represented as objects. Leaves are empty strings.

    :param source_json: iterable of paths such as ``"a.b[0].c"``.
    :return: the nested dict.
    """
    target = {}
    for row in source_json:
        # Drop empty pieces: a path ending in "]" otherwise yields a trailing
        # '' segment that would be stored as a key.
        keys = [part for part in re.split(r'[.\[\]]+', row) if part]
        if not keys:
            continue
        node = target
        for key in keys[:-1]:
            child = node.get(key)
            if not isinstance(child, dict):
                # Missing, or previously stored as a leaf: make it a container.
                child = node[key] = {}
            node = child
        node[keys[-1]] = ''
    return target
print(json.dumps(to_object(source_json), indent=4))
Note that this does not do exactly what you requested: it stores the array as an object with keys '0' ... '2' instead of as a list. This makes it easier to implement and also more stable. What would you expect when the input list is missing the entries for entityAttributes[0]? Should the list include an empty element, or something different? In any case, you save space by not including this element, which only works if you store the array in an object.
None of the answers provided so far strike me as very intuitive. Here's one way
to tackle the problem with three easy-to-understand functions.
Normalize inputs. First we need a function to normalize the inputs strings. Instead of rules-bearing strings like
'foo[0].bar' – where one must understand that integers
in square brackets imply a list – we want a simple tuple
of keys like ('foo', 0, 'bar').
def attribute_to_keys(a):
    """Split an attribute path into a tuple of keys.

    Brackets are rewritten to dot notation first; segments made only of
    digits become ints, all other segments stay strings.
    """
    dotted = a.replace('[', '.').replace(']', '')
    keys = []
    for part in dotted.split('.'):
        keys.append(int(part) if part.isdigit() else part)
    return tuple(keys)
Build a uniform data structure. Second, we need a function to assemble a data structure consisting of dicts
of dicts of dicts ... all the way down.
def assemble_data(attributes):
    """Merge all attribute paths into one dict-of-dicts and convert it.

    Every key returned by ``attribute_to_keys`` becomes one nesting level;
    the finished structure is passed through ``convert``.
    """
    root = {}
    for attribute in attributes:
        node = root
        for key in attribute_to_keys(attribute):
            node = node.setdefault(key, {})
    return convert(root)
# Temporary stand-in for the conversion step that assemble_data() calls.
def convert(d):
# Just a placeholder for now.
# Hands the structure back unchanged.
return d
Convert the uniform data. Third, we need to implement a real version of the placeholder. Specifically, we
need it to recursively convert the uniform data structure into our ultimate
goal having (a) empty strings at leaf nodes, and (b) lists rather than dicts
whenever the dict keys are all integers. Note that this even fills in empty
list positions with an empty string (a contingency not covered in your problem
description; adjust as needed if you want a different behavior).
def convert(d):
    """Recursively convert the uniform dict-of-dicts into its final shape.

    Falsy input (an empty dict or ``None``) becomes ``''``. A dict whose keys
    are all ints becomes a list of length ``max(key) + 1``, with missing
    positions converted from ``None``. Any other dict keeps its keys and has
    its values converted.
    """
    if not d:
        return ''
    if not all(isinstance(key, int) for key in d):
        return {key: convert(child) for key, child in d.items()}
    size = max(d) + 1
    return [convert(d.get(position)) for position in range(size)]
You can use a custom builder class which implements __getattr__ and __getitem__ to gradually build the underlying object. This building can then be triggered by using eval on each of the attribute strings (note: eval is not safe for input from untrusted sources).
The following is an example implementation:
class Builder:
    """Node that grows into a dict or a list depending on how it is accessed.

    Attribute access turns the node into a dict and returns the child node
    for that key; index access turns it into a list padded with fresh nodes
    up to the index. ``convert()`` collapses the node tree into plain dicts,
    lists and ``''`` leaves.
    """

    def __init__(self):
        # None means "leaf so far"; becomes a dict or list on first access.
        self.obj = None

    def __getattr__(self, key):
        # Only called when normal lookup fails. Refuse dunder probes (copy,
        # pickle, hasattr checks) so they do not create keys, and refuse
        # 'obj' itself so an uninitialised instance cannot recurse forever.
        if key == 'obj' or (key.startswith('__') and key.endswith('__')):
            raise AttributeError(key)
        if self.obj is None:
            self.obj = {}
        if not isinstance(self.obj, dict):
            raise AttributeError(
                "cannot access attribute %r on a list node" % (key,))
        # Create the child only when it is missing.
        if key not in self.obj:
            self.obj[key] = Builder()
        return self.obj[key]

    def __getitem__(self, index):
        if self.obj is None:
            self.obj = []
        # Pad with fresh nodes so that obj[index] exists.
        self.obj.extend(Builder() for _ in range(index + 1 - len(self.obj)))
        return self.obj[index]

    def convert(self):
        """Return the plain-Python structure built so far."""
        if self.obj is None:
            return ''
        if isinstance(self.obj, list):
            return [v.convert() for v in self.obj]
        if isinstance(self.obj, dict):
            return {k: v.convert() for k, v in self.obj.items()}
        # Explicit error instead of an assert, which -O would strip.
        raise TypeError("unexpected node content: %r" % (self.obj,))
attributes = [
'itemUniqueId',
'itemDescription',
'manufacturerInfo[0].manufacturer.value',
'manufacturerInfo[0].manufacturerPartNumber',
'attributes.noun.value',
'attributes.modifier.value',
'attributes.entityAttributes[0].attributeName',
'attributes.entityAttributes[0].attributeValue',
'attributes.entityAttributes[0].attributeUOM',
'attributes.entityAttributes[1].attributeName',
'attributes.entityAttributes[1].attributeValue',
'attributes.entityAttributes[1].attributeUOM',
]
builder = Builder()
# Evaluating "builder.<path>" walks __getattr__/__getitem__ along the path,
# which creates the nested Builder nodes as a side effect; the expression's
# value is discarded.
# eval executes arbitrary expressions - only feed it trusted attribute strings.
for attr in attributes:
eval(f'builder.{attr}')
# Collapse the Builder tree into plain dicts, lists and '' leaves.
result = builder.convert()
import json
print(json.dumps(result, indent=4))
which gives the following output:
{
"itemUniqueId": "",
"itemDescription": "",
"manufacturerInfo": [
{
"manufacturer": {
"value": ""
},
"manufacturerPartNumber": ""
}
],
"attributes": {
"noun": {
"value": ""
},
"modifier": {
"value": ""
},
"entityAttributes": [
{
"attributeName": "",
"attributeValue": "",
"attributeUOM": ""
},
{
"attributeName": "",
"attributeValue": "",
"attributeUOM": ""
}
]
}
}

How to prevent double iteration of list of dictionaries?

I am trying to create text strings by populating from a json object string.
When I iterate through the list of dictionaries, the iterator doubles the string. How do I fix this?
Code so far:
import json
data = '''{
"text": "aaa",
"text2": "bbb",
"data": [
{
"id": "1",
"text": "Red"
}, {
"id": "2",
"text": "Blue"
}
]
}'''
data_decoded = json.loads(data)
data_list = data_decoded['data']
# insertQuery is initialised once, before the loop, and is only ever appended
# to below - text added for one pair is still present when the next pair is
# processed.
insertQuery = "update "+ data_decoded['text'] +" set "
#print(insertQuery)
for pair in data_list:
for k, v in pair.items():
# `where` is assigned only when a key equals data_decoded['text2'].
if k == data_decoded['text2']:
where = ' \"' + k + '\" = \'' + v + '\''
else:
insertQuery = insertQuery + ' where \"' +k+'\" = \''+ v + '\''
query = insertQuery + where
print(query)
Output:
update aaa set where "id" = '1' where "text" = 'Red' "id" = '2'
update aaa set where "id" = '1' where "text" = 'Red' where "id" = '2' where "text" = 'Blue' "id" = '2'
My desired result is for every key value pair the code prints one sentence, like so:
update aaa set where "id" = '1' where "text" = 'Red'
update aaa set where "id" = '2' where "text" = 'Blue'
Not entirely sure, but you can just access your dictionary items rather than looping over them :)
If you use >= python 3.6
field = data_decoded['text']
# One statement per entry; join the pieces once instead of growing the
# string with += inside the loop.
query = ''.join(
    f"update {field} set where id = {pair['id']} where text = {pair['text']}\n"
    for pair in data_list
)
Otherwise:
field = data_decoded['text']
# One statement per entry; join the pieces once instead of growing the
# string with += inside the loop.
query = ''.join(
    "update {field} set where id = {id} where text = {text}\n".format(
        field=field, id=pair['id'], text=pair['text'])
    for pair in data_list
)

Extracting Json Data from List using python

I have this sample json data in a list:
# Sample records; list elements must be separated by commas.
data = [
    {"Name": "John_Doe", "Age": 25},
    {"Name": "Jane_Roe", "Age": 26},
]
I need a way to extract all the key-value pairs from an element in the list based on the 'Name' key. If my variable = 'John_Doe', then the script should only return the values related to John_Doe, i.e. only the following values:
{ "Name": "John_Doe", "Age" : 25 }
Just extract all the dictionaries with the value "John_Doe" associated with the key "Name":
print([d for d in data if d['Name'] == "John_Doe"])
# [{ "Name": "John_Doe", "Age" : 25 }]
Or with filter():
print(list(filter(lambda x : x['Name'] == "John_Doe", data)))
# [{ "Name": "John_Doe", "Age" : 25 }]
If all you need is the dict with a Name element matching John_Doe, then:
matches = [m for m in data if "Name" in m and m["Name"] == "John_Doe"]
You can unroll this list comprehension to see what it does:
matches = []
for m in data:
    # Check for the key first so entries without "Name" are skipped.
    if "Name" in m and m["Name"] == "John_Doe":
        # append is a method: call it with (), do not subscript it with [].
        matches.append(m)
def get_details(data, name):
    """Return the first dict in *data* whose "Name" equals *name*.

    :param data: iterable of dicts.
    :param name: the value to look for under the "Name" key.
    :return: the matching dict, or an empty dict when nothing matches.
    """
    for entry in data:
        # Entries without a "Name" key are skipped instead of raising KeyError.
        if "Name" in entry and entry["Name"] == name:
            return entry
    return {}
data = [{"Name": "John_Doe", "Age" : 25 },{"Name": "Jane_Roe", "Age" : 26 }]
name = "John_Doe"
get_details(data, name)
output:
{'Age': 25, 'Name': 'John_Doe'}

How to search through a nested list with a dictionary in Python?

In Python 3.6, I have a list like the one below and can't figure out how to properly search through the values. So, if I am given the search string below, I need to search through the values for both title and tags and whichever one has most matches, I would return the id for and if there were many different images (ids) with the same amount of matches, then the one whose title comes first alphabetically would be returned. Also, it is supposed to not be casesensitive. So in the code I have search as my term to search and it should return the first id value, but instead is returning different values.
image_info = [
{
"id" : "34694102243_3370955cf9_z",
"title" : "Eastern",
"flickr_user" : "Sean Davis",
"tags" : ["Los Angeles", "California", "building"]
},
{
"id" : "37198655640_b64940bd52_z",
"title" : "Spreetunnel",
"flickr_user" : "Jens-Olaf Walter",
"tags" : ["Berlin", "Germany", "tunnel", "ceiling"]
},
{
"id" : "34944112220_de5c2684e7_z",
"title" : "View from our rental",
"flickr_user" : "Doug Finney",
"tags" : ["Mexico", "ocean", "beach", "palm"]
},
{
"id" : "36140096743_df8ef41874_z",
"title" : "Someday",
"flickr_user" : "Thomas Hawk",
"tags" : ["Los Angeles", "Hollywood", "California", "Volkswagen", "Beatle", "car"]
}
]
# Attempt from the question: count, per image, how many search words hit the
# title or the tags.
my_counter = 0
search = "CAT IN BUILding"
search = search.lower().split()
matches = {}
for image in image_info:
for word in search:
word = word.lower()
# Title words are lower-cased before the comparison.
if word in image["title"].lower().split(" "):
my_counter += 1
print(my_counter)
# Tags are compared as whole strings in their original case, so the
# lower-cased word cannot match a tag such as "California".
if word in image["tags"]:
my_counter +=1
print(my_counter)
# Every image with at least one hit is recorded; no best match is selected.
if my_counter > 0:
matches[image["id"]] = my_counter
my_counter = 0
This is a variation of the code in which I pre-index the data before searching. It is a very rudimentary implementation of how CloudSearch or ElasticSearch would index and search:
import itertools
from collections import Counter
image_info = [
{
"id" : "34694102243_3370955cf9_z",
"title" : "Eastern",
"flickr_user" : "Sean Davis",
"tags" : ["Los Angeles", "California", "building"]
},
{
"id" : "37198655640_b64940bd52_z",
"title" : "Spreetunnel",
"flickr_user" : "Jens-Olaf Walter",
"tags" : ["Berlin", "Germany", "tunnel", "ceiling"]
},
{
"id" : "34944112220_de5c2684e7_z",
"title" : "View from our rental",
"flickr_user" : "Doug Finney",
"tags" : ["Mexico", "ocean", "beach", "palm"]
},
{
"id" : "36140096743_df8ef41874_z",
"title" : "Someday",
"flickr_user" : "Thomas Hawk",
"tags" : ["Los Angeles", "Hollywood", "California", "Volkswagen", "Beatle", "car"]
}
]
search = "CAT IN BUILding california"
search = set(search.lower().split())

# Building a rudimentary search index: word -> ids of the images containing it.
index = {}
titles = {}
for info in image_info:
    titles[info["id"]] = info["title"]
    bag = info["title"].lower().split()
    # Multi-word tags are split so that "los angeles" can be hit through
    # "los" as well as "angeles".
    tags = itertools.chain.from_iterable(t.lower().split() for t in info["tags"])
    for k in itertools.chain(bag, tags):
        index.setdefault(k, []).append(info["id"])
#print(index)

# Count one hit per (search word, occurrence) for every image.
hits = Counter()
for s in search:
    hits.update(index.get(s, ()))

if hits:
    # Most hits first; ties go to the title that sorts first alphabetically.
    best_id = min(hits, key=lambda image_id: (-hits[image_id], titles[image_id].lower()))
    print(best_id)
else:
    # Nothing matched: report that instead of failing on an empty result.
    print(None)
You are creating a new entry in the dictionary with matches[image["id"]] = my_counter for every image that has a match.
If you want to keep only one entry in the dictionary for the search term, holding the image id and its count, you can do it as follows. I have modified your dict and condition. Hope it helps.
search_term = "CAT IN BUILding"
search = search_term.lower().split()
# matches[search_term] holds at most one entry: {best image id: hit count}.
matches = {search_term: {}}
for image in image_info:
    title_words = image["title"].lower().split(" ")
    # Lower-case the tags as well so the comparison is case-insensitive.
    tags = [tag.lower() for tag in image["tags"]]
    my_counter = 0
    for word in search:
        if word in title_words:
            my_counter += 1
            print(my_counter)
        if word in tags:
            my_counter += 1
            print(my_counter)
    best = matches[search_term]
    # dict.values() cannot be indexed on Python 3; compare against max().
    if my_counter > 0 and (not best or my_counter > max(best.values())):
        # Drop the previous leader so only the best image remains.
        best.clear()
        best[image["id"]] = my_counter

Categories

Resources