I have CSV columns as follows; now I am trying to convert them to the name/children/size format required by D3 in JSON. There are repetitive children occurring. For example:
In name ="type" there is children ="young", size = 400000
L1 L2 L3 L4 L5 L6 Size
Type cars young young young young 40000
Type cars student US US US 10000
Type cars student UK UK UK 20000
Type cars Graduates Young India Delhi 20000
Type cars Graduates Old UK London 30000
Type Bike Undergrads CB CB UNC 6000
prime prime prime prime prime prime 600
My output I am getting is :
{
"name": "Segments",
"children": [
{
"name": "Type",
"children": [
{
"name": "cars",
"children": [
{
"name": "young",
"children": [
{
"name": "young",
"children": [
{
"name": "young",
"children": [
{
"name": "young",
"size": "40000"
}
]
}
]
}
]
},
{
"name": "student",
"children": [
{
"name": "US",
"children": [
{
"name": "US",
"children": [
{
"name": "US",
"size": "10000"
}
]
}
]
},
{
"name": "UK",
"children": [
{
"name": "UK",
"children": [
{
"name": "UK",
"size": "20000"
}
]
}
]
}
]
}
]
}
]
},
{
"name": "prime",
"children": [
{
"name": "prime",
"children": [
{
"name": "prime",
"children": [
{
"name": "prime",
"children": [
{
"name": "prime",
"children": [
{
"name": "prime",
"size": "600"
}
]
}
]
}
]
}
]
}
]
}
]
}
expecting output is:
{
"name": "Segments",
"children": [
{
"name": "Type",
"children": [
{
"name": "cars",
"children": [
{
"name": "young",
"size": "40000"
}
]
},
{
"name": "student",
"children": [
{
"name": "US",
"size": "10000"
},
{
"name": "UK",
"size": "20000"
}
]
}
]
},
{
"name": "prime",
"size": "600"
}
]
}
I am using Following code:
import json
import csv
class Node(object):
    """Tree node for building D3 flare-style name/children/size JSON."""

    def __init__(self, name, size=None):
        self.name = name        # label emitted as the "name" field
        self.children = []      # ordered list of child Node objects
        self.size = size        # leaf weight; None for internal nodes

    def child(self, cname, size=None):
        """Return the child named *cname*, creating it if it does not exist."""
        for existing in self.children:
            if existing.name == cname:
                return existing
        node = Node(cname, size)
        self.children.append(node)
        return node

    def as_dict(self):
        """Render this subtree as a plain dict: internal nodes carry
        "children", nodes with a size carry "size" instead."""
        if self.size is not None:
            return {'name': self.name, 'size': self.size}
        return {'name': self.name,
                'children': [c.as_dict() for c in self.children]}
# Build the hierarchy from the CSV: one chained child(...) call per column,
# with the last column attached as the leaf's size.
root = Node('Segments')
# NOTE(review): hard-coded absolute path -- consider parameterizing it.
with open('C:\\Users\\G01172472\\Desktop\\Book3.csv', 'r') as f:
    reader = csv.reader(f)
    p = list(reader)
for row in range(1, len(p)):  # start at 1 to skip the header row
    grp1, grp2, grp3, grp4, grp5, grp6, size = p[row]
    # NOTE(review): repeated values in a row (e.g. young,young,young) each
    # become their own nesting level -- the cause of the duplicated output.
    root.child(grp1).child(grp2).child(grp3).child(grp4).child(grp5).child(grp6, size)
print(json.dumps(root.as_dict(), indent=4))
So what you want to do first is to remove the duplicates from each row and create the children accordingly.
Here's what I changed:
# Read the whole CSV up front; row 0 is the header.
with open('C:\\Users\\G01172472\\Desktop\\Book3.csv', 'r') as f:
    reader = csv.reader(f)
    p = list(reader)

for row in range(1, len(p)):
    # Keep only the first occurrence of each value in the row, preserving order.
    temp = []
    for x in p[row]:
        if x not in temp:
            temp.append(x)
    # Walk (and lazily create) the hierarchy directly.  The original built a
    # "root.child(...).child(...)" source string and ran it through eval(),
    # which is unsafe on file-derived text and also chained one spurious
    # extra .child() call past the sized leaf.  temp[-1] is the size and
    # temp[-2] the leaf name.
    node = root
    for name in temp[:-2]:
        node = node.child(name)
    node.child(temp[-2], temp[-1])

print(json.dumps(root.as_dict(), indent=2))
Let me know if that works.
First of all you need to remove the dups from your row. This can be done as follows,
# Example row before de-duplication:
p[row] = ('Type', 'cars', 'young', 'young', 'young', 'young', 'Size')
# Drop repeated values while preserving order: set.add() returns None
# (falsy), so each element passes the filter only on its first appearance.
pp = set()
new_p_row = [el for el in p[row] if not (el in pp or pp.add(el))]
# ['Type', 'cars', 'young', 'Size']
Then add children to your root until the last two:
# Descend level by level; each child must be created on the node returned
# by the previous call -- calling root.child(r) in a loop (as originally
# written) attaches every level directly to the root instead of nesting.
node = root
for r in new_p_row[:-2]:
    node = node.child(r)
# Add the last child, with its size, to the deepest node.
node.child(new_p_row[-2], new_p_row[-1])
Related
I get an input like this:
input 1:
{
"name": "Ben",
"description": "Ben",
"attributes": [
{
"type": "Background",
"value": "Default"
},
{
"type": "Hair-color",
"value": "Brown"
}
]
}
input 2
{
"name": "Ice",
"description": "Ice",
"attributes": [
{
"type": "Background",
"value": "Green"
},
{
"type": "Hair-color",
"value": "White"
}
]
}
input 3
{
"name": "Itay",
"description": "Itay",
"attributes": [
{
"type": "Background",
"value": "Default"
},
{
"type": "Hair-color",
"value": "Brown"
}
]
}
What I want to do is count the amount of each type of background and each type of hair-color appearing.
(These are sample examples and in reality there are more types and different values)
Let's say in these examples we have 2 objects that have a background as default then I want to have a count of that like so:
Backround default count=2
hair-color brown = 2
background green = 1
hair-color white = 1
I want the most effective code because there are other aspects to the code, in addition it will run on thousands of queries not just two, so needs to run in good times too :D
My code so far:
import requests
import json
from collections import Counter
from collections import defaultdict
from time import sleep
attributes = []                 # distinct attribute types, in first-seen order
test_dict = defaultdict(list)   # type -> list of every value seen for it
# NOTE(review): min_id/max_id must be defined elsewhere -- confirm.
for i in range(min_id, max_id+1):
    api = 'api/v1/test/{}'.format(i)
    response = requests.get(api)
    item_dict = json.loads(response.text)
    for item in item_dict['attributes']:
        # NOTE(review): the sample documents use the key "type", but this
        # reads "trait_type" -- presumably the live API differs; verify.
        # Conditional-expression-for-side-effect is an anti-pattern; a plain
        # "if item[...] not in attributes: attributes.append(...)" is clearer.
        attributes.append(item["trait_type"]) if item["trait_type"] not in attributes else attributes
        test_dict[item["trait_type"]].append(item["value"])
    sleep(0.02)  # throttle the API calls
for attribute in attributes:
    print(attribute)
    print(Counter(test_dict[attribute]))
This should work for you:
def constract_data(data_dict):
    """Count occurrences of every (type, value) attribute pair.

    Args:
        data_dict: iterable of documents, each carrying an "attributes" list
            of {"type": ..., "value": ...} dicts.

    Returns:
        (output, total_count) where output is a list of
        {<type.lower()>: <value.lower()>, 'count': n} dicts in first-seen
        order, and total_count is the number of attributes examined.
    """
    # (sic) the misspelled public name is kept so existing callers work.
    output = []
    position = {}  # (type, value) -> index in output; O(1) lookup instead of
                   # rescanning the whole output list for every attribute
    total_count = 0
    for data in data_dict:
        for attribute in data["attributes"]:
            total_count += 1
            dict_key = attribute["type"].lower()
            dict_value = attribute["value"].lower()
            idx = position.get((dict_key, dict_value))
            if idx is not None:
                output[idx]['count'] += 1
            else:
                position[(dict_key, dict_value)] = len(output)
                output.append({dict_key: dict_value, 'count': 1})
    return output, total_count
def calculate_occurrence_ratio(data_dict, total_count):
    """Annotate each count dict with its percentage share of total_count.

    Mutates and returns data_dict: every entry gains a 'ratio' key such as
    '33.33%', computed from its 'count' (a missing count is treated as 0).
    """
    # The original used enumerate() but never used the index -- iterate directly.
    for data in data_dict:
        count = data.get('count', 0)
        ratio = round(((count / total_count) * 100), 2)
        data['ratio'] = f'{ratio}%'
    return data_dict
data_dict = [
{
"name":"Ice",
"description":"Ice",
"attributes":[
{
"type":"Background",
"value":"Green"
},
{
"type":"Hair-color",
"value":"White"
}
]
},
{
"name":"Ben",
"description":"Ben",
"attributes":[
{
"type":"Background",
"value":"Default"
},
{
"type":"Hair-color",
"value":"Brown"
}
]
},
{
"name":"Itay",
"description":"Itay",
"attributes":[
{
"type":"Background",
"value":"Default"
},
{
"type":"Hair-color",
"value":"Brown"
}
]
}
]
# Build the per-(type, value) counts from the sample documents, then attach
# each entry's percentage share of the grand total.
output_data, total_count = constract_data(data_dict)
output_data = calculate_occurrence_ratio(output_data, total_count)
print(output_data)
Output:
[{'background': 'green', 'count': 1, 'ratio': '16.67%'}, {'hair-color': 'white', 'count': 1, 'ratio': '16.67%'}, {'background': 'default', 'count': 2, 'ratio': '33.33%'}, {'hair-color': 'brown', 'count': 2, 'ratio': '33.33%'}]
this solution will work for you:
# Sample input: one entry per user, each with a list of type/value attributes.
list_data = [
    {
        "name": "Ice",
        "description": "Ice",
        "attributes": [
            {"type": "Background", "value": "Green"},
            {"type": "Hair-color", "value": "White"},
            {"type": "other", "value": "White"},
        ],
    },
    {
        "name": "Ben",
        "description": "Ben",
        "attributes": [
            {"type": "Background", "value": "Default"},
            {"type": "Hair-color", "value": "Brown"},
        ],
    },
    {
        "name": "Itay",
        "description": "Itay",
        "attributes": [
            {"type": "Background", "value": "Default"},
            {"type": "Hair-color", "value": "Brown"},
        ],
    },
]

# First pass: total occurrences of each attribute type (the ratio denominator).
all_count = {}
for user in list_data:
    for attr in user["attributes"]:
        all_count[attr["type"]] = all_count.get(attr["type"], 0) + 1

# Second pass: count each (type, value) pair as plain integers.  The
# original parsed the running count back out of the formatted string with
# output[type][value][0], which reads only ONE character and silently
# breaks as soon as any count reaches 10.
value_counts = {}
for user in list_data:
    for attr in user["attributes"]:
        bucket = value_counts.setdefault(attr["type"], {})
        bucket[attr["value"]] = bucket.get(attr["value"], 0) + 1

# Format the result in the original "N with: P%" shape.
output = {}
for typeu, bucket in value_counts.items():
    output[typeu] = {
        value: "{} with: {}%".format(count, int(count / all_count[typeu] * 100))
        for value, count in bucket.items()
    }
print(output)
Here is another approach:
from collections import defaultdict
# Sample documents: one dict per person, each carrying type/value attributes.
data_dict = [
    {
        "name": "Ice",
        "description": "Ice",
        "attributes": [
            {"type": "Background", "value": "Green"},
            {"type": "Hair-color", "value": "White"},
        ],
    },
    {
        "name": "Ben",
        "description": "Ben",
        "attributes": [
            {"type": "Background", "value": "Default"},
            {"type": "Hair-color", "value": "Brown"},
        ],
    },
    {
        "name": "Itay",
        "description": "Itay",
        "attributes": [
            {"type": "Background", "value": "Default"},
            {"type": "Hair-color", "value": "Brown"},
        ],
    },
]

# out[type][value] -> occurrence count; tot_count[type] -> total per type.
out = defaultdict(lambda: defaultdict(int))
tot_count = defaultdict(int)
for record in data_dict:
    for attri in record['attributes']:
        kind = attri['type']
        tot_count[kind] += 1
        out[kind][attri['value']] += 1

# One line per (type, value) with its integer share of the type's total.
for kind, value_counts in out.items():
    for value, count in value_counts.items():
        share = int(count * 100 / tot_count[kind])
        print(f'{kind.lower()} {value} count={count} ratio={share}%')
Output:
background Green count=1 ratio=33%
background Default count=2 ratio=66%
hair-color White count=1 ratio=33%
hair-color Brown count=2 ratio=66%
I have a JSON file and that is a nested JSON. I would like to remove duplicates based on two keys.
JSON example:
"books": [
{
"id": "1",
"story": {
"title": "Lonely lion"
},
"description": [
{
"release": false,
"author": [
{
"name": "John",
"main": 1
},
{
"name": "Jeroge",
"main": 0
},
{
"name": "Peter",
"main": 0
}
]
}
]
},
{
"id": "2",
"story": {
"title": "Lonely lion"
},
"description": [
{
"release": false,
"author": [
{
"name": "Jeroge",
"main": 1
},
{
"name": "Peter",
"main": 0
},
{
"name": "John",
"main": 0
}
]
}
]
},
{
"id": "3",
"story": {
"title": "Lonely lion"
},
"description": [
{
"release": false,
"author": [
{
"name": "John",
"main": 1
},
{
"name": "Jeroge",
"main": 0
}
]
}
]
}
]
Here I try to match the title and author names. For example, id 1 and id 2 are duplicates (the title is the same and the author names are the same; the author order doesn't matter, and the main attribute need not be considered). So, in the output JSON only id:1 or id:2 will remain, along with id:3. In the final output I need two files.
Output_JSON:
"books": [
{
"id": "1",
"story": {
"title": "Lonely lion"
},
"description": [
{
"release": false,
"author": [
{
"name": "John",
"main": 1
},
{
"name": "Jeroge",
"main": 0
},
{
"name": "Peter",
"main": 0
}
]
}
]
},
{
"id": "3",
"story": {
"title": "Lonely lion"
},
"description": [
{
"release": false,
"author": [
{
"name": "John",
"main": 1
},
{
"name": "Jeroge",
"main": 0
}
]
}
]
}
]
duplicatedID.csv:
1-2
The following method I tried but it is not giving correct results:
# NOTE(review): 'list' shadows the builtin -- rename (e.g. seen_signatures).
list= []
duplicate_Id = []
for data in (json_data['books'])[:]:
    elements= []
    id = data['id']  # NOTE(review): 'id' also shadows a builtin
    title = data['story']['title']
    elements.append(title)
    for i in (data['description'][0]['author']):
        name = (i['name'])
        elements.append(name)
    # NOTE(review): a signature is only ever recorded when 'list' is empty,
    # so every later book is compared solely against the FIRST book's
    # signature; books 3+ are never added, and nothing is removed from
    # json_data -- which is why the result is wrong.
    if not list:
        list.append(elements)
    else:
        for j in list:
            if set(elements) == set(j):
                duplicate_Id.append(id)
    elements = []
The general idea is to:
Get the groups identified by some function that collects duplicates.
Then return the first entry of each group, ensuring no duplicates.
Define the key function as the sorted list of authors and. As the list of authors is by definition the unique key, but may appear in any order.
import json
from itertools import groupby
# NOTE(review): json.load expects an open file object, so 'books' must be a
# file handle opened elsewhere (e.g. books = open('books.json')) -- confirm.
j = json.load(books)
def transform(books):
    """Deduplicate *books*, keeping one representative per author set.

    itertools.groupby only merges *adjacent* items with equal keys, so the
    books must be sorted by the same key first; without that, non-adjacent
    duplicates survive.  Output is ordered by the sorted author key.
    """
    ordered = sorted(books, key=getAuthors)
    groups = [list(group) for _, group in groupby(ordered, key=getAuthors)]
    return [group[0] for group in groups]

def getAuthors(book):
    """Key function: the book's author names, sorted so order is irrelevant."""
    authors = book['description'][0]['author']
    return sorted([author['name'] for author in authors])
print(transform(j['books']))
If we wanted to get the duplicates, then we do the same computation, but return any sublist with length > 1 as this is by our definition duplicated data.
def transform(books):
    """Return only the duplicated groups (same author set, more than one book).

    The books are sorted by the author key first because itertools.groupby
    only merges *adjacent* equal keys; unsorted input would miss
    non-adjacent duplicates.
    """
    ordered = sorted(books, key=getAuthors)
    groups = [list(group) for _, group in groupby(ordered, key=getAuthors)]
    return [group for group in groups if len(group) > 1]
Where j['books'] is the JSON you gave enclosed in an object.
I'm trying to export a dataFrame into a nested JSON (hierarchical) for D3.js using solution which is only for one level ( parent , children)
Any help would be appreciated. I'm new to python
My DataFrame contains 7 levels
Here is the expected solution
JSON Example:
{
"name": "World",
"children": [
{
"name": "Europe",
"children": [
{
"name": "France",
"children": [
{
"name": "Paris",
"population": 1000000
}]
}]
}]
}
and here is the python method:
def to_flare_json(df, filename):
    """Convert dataframe into nested JSON as in flare files used for D3.js"""
    flare = dict()  # NOTE(review): dead assignment -- overwritten by 'flare = d'
    d = {"name":"World", "children": []}
    # NOTE(review): only row[0] (parent) and row[1] (child) ever become
    # nodes -- child1..child5 are read but never added to the tree, which is
    # why only two levels appear despite the 7-level DataFrame.
    for index, row in df.iterrows():
        parent = row[0]
        child = row[1]
        child1 = row[2]
        child2 = row[3]
        child3 = row[4]
        child4 = row[5]
        child5 = row[6]
        child_value = row[7]
        # Collect the names already present at the top level
        key_list = []
        for item in d['children']:
            key_list.append(item['name'])
        # if 'parent' is NOT a key in flare.JSON, append it
        if not parent in key_list:
            # NOTE(review): inconsistent child keys ("name1" here, "name11"
            # below) -- presumably both were meant to be "name"; confirm.
            d['children'].append({"name": parent, "children":[{"value": child_value, "name1": child}]})
        # if parent IS a key in flare.json, add a new child to it
        else:
            d['children'][key_list.index(parent)]['children'].append({"value": child_value, "name11": child})
    flare = d
    # export the final result to a json file
    with open(filename +'.json', 'w') as outfile:
        json.dump(flare, outfile, indent=4,ensure_ascii=False)
    return ("Done")
[EDIT]
Here is a sample of my df
World Continent Region Country State City Boroughs Population
1 Europe Western Europe France Ile de France Paris 17 821964
1 Europe Western Europe France Ile de France Paris 19 821964
1 Europe Western Europe France Ile de France Paris 20 821964
The structure you want is clearly recursive so I made a recursive function to fill it:
def create_entries(df):
    """Recursively convert *df* into a flare-style hierarchy.

    The first column at each level supplies the node "Name"s; the remaining
    columns are handed to the recursion as that node's "Children".  When
    only two columns remain, each row becomes a leaf whose second column is
    stored under the column's own name (e.g. "Population").
    """
    # Base case: two columns left -> one leaf dict per row.
    if df.shape[1] == 2:
        leaf_key = df.columns[-1]
        return [
            {"Name": df.iloc[r, 0], leaf_key: df.iloc[r, 1]}
            for r in range(df.shape[0])
        ]
    # Recursive case: one node per distinct value of the first column,
    # recursing on the matching rows minus that column.
    nodes = []
    for label in set(df.iloc[:, 0]):
        subset = df.loc[df.iloc[:, 0] == label].iloc[:, 1:]
        nodes.append({"Name": label, "Children": create_entries(subset)})
    return nodes
All that's left is to create the dictionary and call the function:
# Root wrapper: drop the first column (the constant "World" level) and let
# the recursion build everything below it.
# NOTE(review): 'data' is the source DataFrame loaded elsewhere -- confirm.
mydict = {"Name": "World",
          "Children": create_entries(data.iloc[:, 1:])}
Then you just write your dict to a JSON file.
I hope my comments are explicit enough, the idea is to recursively use the first column of the dataset as the "Name" and the rest as the "Children".
Thank you Syncrossus for the answer, but this result in different branches for each boroughs or city
The result is this:
"Name": "World",
"Children": [
{
"Name": "Western Europe",
"Children": [
{
"Name": "France",
"Children": [
{
"Name": "Ile de France",
"Children": [
{
"Name": "Paris",
"Children": [
{
"Name": "17ème",
"Population": 821964
}
]
}
]
}
]
}
]
},{
"Name": "Western Europe",
"Children": [
{
"Name": "France",
"Children": [
{
"Name": "Ile de France",
"Children": [
{
"Name": "Paris",
"Children": [
{
"Name": "10ème",
"Population": 154623
}
]
}
]
}
]
}
]
}
But the desired result is this
"Name": "World",
"Children": [
{
"Continent": "Europe",
"Children": [
{
"Region": "Western Europe",
"Children": [
{
"Country": "France",
"Children": [
{
"State": "Ile De France",
"Children": [
{
"City": "Paris",
"Children": [
{
"Boroughs": "17ème",
"Population": 82194
},
{
"Boroughs": "16ème",
"Population": 99194
}
]
},
{
"City": "Saint-Denis",
"Children": [
{
"Boroughs": "10ème",
"Population": 1294
},
{
"Boroughs": "11ème",
"Population": 45367
}
]
}
]
}
]
},
{
"Country": "Belgium",
"Children": [
{
"State": "Oost-Vlaanderen",
"Children": [
{
"City": "Gent",
"Children": [
{
"Boroughs": "2ème",
"Population": 1234
},
{
"Boroughs": "4ème",
"Population": 7456
}
]
}
]
}
]
}
]
}
]
}
]
This question already has answers here:
List append is overwriting my previous values [duplicate]
(3 answers)
Closed 4 years ago.
Load JSON file to aPool list at beginning.
def LoadPoolTemplate():
    """Load the pool list from ./pool_template.json.

    Returns:
        The parsed JSON (a list of {"id": ..., "devices": [...]} dicts), or
        an empty list when the template file is missing.  The original fell
        off the end and implicitly returned None in that case, which made
        callers crash later.
    """
    sFilename = './pool_template.json'
    if not os.path.exists(sFilename):
        return []
    with open(sFilename, 'r') as fJsonFile:
        return json.load(fJsonFile)
Append aTempJson to aPool[index]['devices']
def UpdateJSON(aPool, aDevList):
    """Append one device entry per row of aDevList to its pool in aPool.

    Each row is [pool_id, device_id, high].  The device template is loaded
    from ./device_template.json and deep-copied for every row: re-using one
    dict (as the original did) appends the SAME object over and over, so
    every entry ends up showing the last row's values.

    Returns aPool (the original had no return statement, so callers got None).
    """
    sFilename = './device_template.json'
    if not os.path.exists(sFilename):
        print("No such file names " + sFilename)
        return aPool
    with open(sFilename, 'r') as fJsonFile:
        aTempJson = json.load(fJsonFile)
    for item in aDevList:
        # Fresh copy per device: a json round-trip is a cheap deep copy for
        # pure-JSON data and avoids importing the copy module.
        aDevice = json.loads(json.dumps(aTempJson))
        aDevice['id'] = item[1]
        aDevice['atti']['high'] = item[2]
        for i, pid in enumerate(aPool):
            if pid['id'] == item[0]:
                aPool[i]['devices'].append(aDevice)
                break
    return aPool
Update aPool list to JSON file
def CreateDeviceJSON(aDevice):
    """Write *aDevice* as indented JSON to the path held in gDevice."""
    with open(gDevice, 'w') as fOutfile:
        fOutfile.write(json.dumps(aDevice, indent=2))
Read list
def ReadDeviceList():
    """Read the gDevList file into a list of comma-split column lists."""
    with open(gDevList, 'r') as fList:
        return [line.strip().split(',') for line in fList]
main function
def main():
    """Load the pool template, merge in the device list, and write it out."""
    # The original initialised throwaway lists (aDevDist and an immediately
    # overwritten aDeviceJson = []); those dead assignments are dropped.
    aDeviceJson = LoadPoolTemplate()
    aDeviceList = ReadDeviceList()
    aDeviceJson = UpdateJSON(aDeviceJson, aDeviceList)
    CreateDeviceJSON(aDeviceJson)
I don't know why all the elements in devices list are the same, please help me.
JSON file output:
[
{
"id": "id1",
"devices": [
{
"atti": {
"high": "190",
"weight": "80"
},
"id": "Jordan"
},
{
"atti": {
"high": "190",
"weight": "80"
},
"id": "Jordan"
},
{
"atti": {
"high": "190",
"weight": "80"
},
"id": "Jordan"
}
]
},
{
"id": "id2",
"devices": [
{
"atti": {
"high": "190",
"weight": "80"
},
"id": "Jordan"
}
]
},
{
"id": "id3",
"devices": [
{
"atti": {
"high": "190",
"weight": "80"
},
"id": "Jordan"
}
]
}
]
Input source as following:
["id1", "apple", "167"]
["id1", "carter", "203"]
["id1", "jason", "188"]
["id2", "paul", "178"]
["id3", "Jordan", "190"]
Pool template
[
{
"id": "id1",
"devices": [
]
},
{
"id": "id2",
"devices": [
]
},
{
"id": "id3",
"devices": [
]
}
]
Here:
for item in aDevList:
aTempJson['id'] = item[1]
aTempJson['atti']['high'] = item[2]
for (i,pid) in enumerate(aPool):
if pid['id'] == item[0]:
aPool[i]['devices'].append(aTempJson)
break
you are modifying the same aTempJson dict and appending it to the list over and over. What you have to do is to create a new dict for each item.
I have a big json/dictionary with different levels of nested json arrays, I would like to flatten it, and also capture the relationship of the structure,
Part of my json looks like:
{
"name": "root",
"type": "all",
"children": [
{
"name": "properties",
"type": "feature",
"children": [
{
"name": "print",
"type": "feature",
"children": [
{
"name": "graphic print",
"type": "feature",
"inherits": true
},
{
"name": "striped print",
"type": "feature",
"inherits": true,
"children": [
{
"name": "pinstriped",
"type": "feature",
"inherits": true
},
{
"name": "light stripe",
"type": "feature",
"inherits": true
},
{
"name": "wide stripe",
"type": "feature",
"inherits": true
}
]
}
]
}
]
},
{
"name": "colours",
"type": "colour",
"children": [
{
"name": "main colours",
"type": "colour",
"children": [
{
"name": "black",
"type": "colour",
"children": [
{
"name": "light black",
"type": "colour",
"inherits": true
},
{
"name": "blue black",
"type": "colour",
"inherits": true
}
]
},
{
"name": "red",
"type": "colour",
"children": [
{
"name": "bright red",
"type": "colour",
"inherits": true
},
{
"name": "light red",
"type": "colour"
}
]
}
]
}
]
},
{
"name": "genders",
"type": "gender",
"children": [
{
"name": "female",
"type": "gender"
},
{
"name": "male",
"type": "gender"
}
]
}
]
}
The depth of nests is not all the same. I
- want all the nodes (values of "name")
- also want all its parents if the node has "Inherit" key of True value.
Something like:
But if there are better ideas on how to store this data, will be happy to accept as well!
Many Thanks!
I think this should do your need
def parse_dict_of_dict(_dict, _parent='', ret_dict=None):
    """Flatten a name/children tree into {name: parent-chain-or-None}.

    A node with "inherits": true maps to the comma-joined chain of ancestor
    names accumulated so far; any other node maps to None.

    Args:
        _dict: a node dict with "name" and optional "children"/"inherits".
        _parent: the ancestor chain accumulated so far (internal).
        ret_dict: accumulator passed down the recursion (internal).
    """
    # Fresh accumulator per top-level call: the original used ret_dict={}
    # as the default, which Python evaluates ONCE, so results from earlier
    # calls leaked into later ones (classic mutable-default-argument bug).
    if ret_dict is None:
        ret_dict = {}
    _name = _dict["name"]
    _children = _dict.get('children', None)
    _inherit = _dict.get('inherits', False)
    if _children is not None:
        if isinstance(_children, list):
            for _child in _children:
                parse_dict_of_dict(
                    _child,
                    _name + ', ' + _parent if _inherit else _name,
                    ret_dict,
                )
    ret_dict[_name] = _parent.strip(' ').strip(',') if _inherit else None
    return ret_dict
Can you elaborate more on your output?
OR you can use this function to flatten a nested JSON to a simple JSON.
def parse_dict_of_dict(_dict, _str=''):
    """Flatten arbitrarily nested dicts/lists into a single-level dict.

    Keys along the path are joined with '_' and list items contribute their
    index, e.g. {'a': {'b': 1}} -> {'a_b': '1'}.  Scalar leaf values are
    stringified; scalar list items are kept as-is (matching the original).
    """
    ret_dict = {}
    # .items() replaces the Python-2-only iteritems(); the old
    # unicode.encode fallback is likewise gone -- str() handles any value
    # in Python 3 (the rest of this file is Python 3: print(), f-strings).
    for k, v in _dict.items():
        if isinstance(v, dict):
            ret_dict.update(parse_dict_of_dict(v, _str=_str + k + '_'))
        elif isinstance(v, list):
            for index, item in enumerate(v):
                if isinstance(item, dict):
                    ret_dict.update(parse_dict_of_dict(item, _str=_str + k + '_%d_' % index))
                else:
                    # The original dropped the accumulated _str prefix here,
                    # so nested scalar list items collided across branches;
                    # the prefix is now applied consistently.
                    ret_dict[_str + k + '_%d' % index] = item
        else:
            ret_dict[_str + k] = str(v)
    return ret_dict