csv to json with column data that needs to be grouped - python

I have a CSV file in a format similar to this
order_id, customer_name, item_1_id, item_1_quantity, Item_2_id, Item_2_quantity, Item_3_id, Item_3_quantity
1, John, 4, 1, 24, 4, 16, 1
2, Paul, 8, 3, 41, 1, 33, 1
3, Andrew, 1, 1, 34, 4, 8, 2
I want to export to json, currently I am doing this.
df = pd.read_csv('simple.csv')
print ( df.to_json(orient = 'records') )
And the output is
[
{
"Item_2_id": 24,
"Item_2_quantity": 4,
"Item_3_id": 16,
"Item_3_quantity": 1,
"customer_name": "John",
"item_1_id": 4,
"item_1_quantity": 1,
"order_id": 1
},
......
However, I would like the output to be
[
{
"customer_name": "John",
"order_id": 1,
"items": [
{ "id": 4, "quantity": 1 },
{ "id": 24, "quantity": 4 },
{ "id": 16, "quantity": 1 },
]
},
......
Any suggestions on a good way to do this?
In this particular project, there will not be more than 5 times per order

Try the following:
import pandas as pd
import json
output_lst = []
##specify the first row as header
df = pd.read_csv('simple.csv', header=0)
##iterate through all the rows
for index, row in df.iterrows():
dict = {}
items_lst = []
## column_list is a list of column headers
column_list = df.columns.values
for i, col_name in enumerate(column_list):
## for the first 2 columns simply copy the value into the dictionary
if i<2:
element = row[col_name]
if isinstance(element, str):
## strip if it is a string type value
element = element.strip()
dict[col_name] = element
elif "_id" in col_name:
## i+1 is used assuming that the item_quantity comes right after the corresponding item_id for each item
item_dict = {"id":row[col_name], "quantity":row[column_list[i+1]]}
items_lst.append(item_dict)
dict["items"] = items_lst
output_lst.append(dict)
print json.dumps(output_lst)
If you run the above file with the sample.csv described in the question then you get the following output:
[
{
"order_id": 1,
"items": [
{
"id": 4,
"quantity": 1
},
{
"id": 24,
"quantity": 4
},
{
"id": 16,
"quantity": 1
}
],
" customer_name": "John"
},
{
"order_id": 2,
"items": [
{
"id": 8,
"quantity": 3
},
{
"id": 41,
"quantity": 1
},
{
"id": 33,
"quantity": 1
}
],
" customer_name": "Paul"
},
{
"order_id": 3,
"items": [
{
"id": 1,
"quantity": 1
},
{
"id": 34,
"quantity": 4
},
{
"id": 8,
"quantity": 2
}
],
" customer_name": "Andrew"
}
]

Source DF:
In [168]: df
Out[168]:
order_id customer_name item_1_id item_1_quantity Item_2_id Item_2_quantity Item_3_id Item_3_quantity
0 1 John 4 1 24 4 16 1
1 2 Paul 8 3 41 1 33 1
2 3 Andrew 1 1 34 4 8 2
Solution:
In [169]: %paste
import re
x = df[['order_id','customer_name']].copy()
x['id'] = \
pd.Series(df.loc[:, df.columns.str.contains(r'item_.*?_id',
flags=re.I)].values.tolist(),
index=df.index)
x['quantity'] = \
pd.Series(df.loc[:, df.columns.str.contains(r'item_.*?_quantity',
flags=re.I)].values.tolist(),
index=df.index)
x.to_json(orient='records')
## -- End pasted text --
Out[169]: '[{"order_id":1,"customer_name":"John","id":[4,24,16],"quantity":[1,4,1]},{"order_id":2,"customer_name":"Paul","id":[8,41,33],"qua
ntity":[3,1,1]},{"order_id":3,"customer_name":"Andrew","id":[1,34,8],"quantity":[1,4,2]}]'
Intermediate helper DF:
In [82]: x
Out[82]:
order_id customer_name id quantity
0 1 John [4, 24, 16] [1, 4, 1]
1 2 Paul [8, 41, 33] [3, 1, 1]
2 3 Andrew [1, 34, 8] [1, 4, 2]

j = df.set_index(['order_id','customer_name']) \
.groupby(lambda x: x.split('_')[-1], axis=1) \
.agg(lambda x: x.values.tolist()) \
.reset_index() \
.to_json(orient='records')
import json
Beatufied result:
In [122]: print(json.dumps(json.loads(j), indent=2))
[
{
"order_id": 1,
"customer_name": "John",
"id": [
4,
24,
16
],
"quantity": [
1,
4,
1
]
},
{
"order_id": 2,
"customer_name": "Paul",
"id": [
8,
41,
33
],
"quantity": [
3,
1,
1
]
},
{
"order_id": 3,
"customer_name": "Andrew",
"id": [
1,
34,
8
],
"quantity": [
1,
4,
2
]
}
]

Related

Join nested list to ID value

I retrieve data from my DB for a Python app and it comes in the following format (as a list, tbl):
[
{
"id": "rec2fiwnTQewTv9HC",
"createdTime": "2022-06-27T08:25:47.000Z",
"fields": {
"Num": 19,
"latitude": 31.101405,
"longitude": 36.391831,
"State": 2,
"Label": "xyz",
"Red": 0,
"Green": 255,
"Blue": 0
}
},
{
"id": "rec4y7vhgZVDHrhrQ",
"createdTime": "2022-06-27T08:25:47.000Z",
"fields": {
"Num": 30,
"latitude": 31.101405,
"longitude": 36.391831,
"State": 2,
"Label": "abc",
"Red": 0,
"Green": 255,
"Blue": 0
}
}
]
I can retrieve the values in the fields nested list by doing this:
pd.DataFrame([d['fields'] for d in tbl])
I would like to add the id field to each row of the dataframe but I can't figure out how to do this.
Try:
data = [
{
"id": "rec2fiwnTQewTv9HC",
"createdTime": "2022-06-27T08:25:47.000Z",
"fields": {
"Num": 19,
"latitude": 31.101405,
"longitude": 36.391831,
"State": 2,
"Label": "xyz",
"Red": 0,
"Green": 255,
"Blue": 0,
},
},
{
"id": "rec4y7vhgZVDHrhrQ",
"createdTime": "2022-06-27T08:25:47.000Z",
"fields": {
"Num": 30,
"latitude": 31.101405,
"longitude": 36.391831,
"State": 2,
"Label": "abc",
"Red": 0,
"Green": 255,
"Blue": 0,
},
},
]
df = pd.DataFrame([{"id": d["id"], **d["fields"]} for d in data])
print(df)
Prints:
id Num latitude longitude State Label Red Green Blue
0 rec2fiwnTQewTv9HC 19 31.101405 36.391831 2 xyz 0 255 0
1 rec4y7vhgZVDHrhrQ 30 31.101405 36.391831 2 abc 0 255 0

Turning parent-child data frame into a dictionary

I have the following data frame:
data = [
{"id": 1, "parent_id": -1, "level": 1, "name": "Company"},
{"id": 2, "parent_id": 1, "level": 2, "name": "Bakery"},
{"id": 3, "parent_id": 1, "level": 2, "name": "Frozen"},
{"id": 4, "parent_id": 2, "level": 3, "name": "Bread"},
{"id": 5, "parent_id": 2, "level": 3, "name": "Pastry"},
{"id": 6, "parent_id": 3, "level": 3, "name": "Ice Cream"},
{"id": 7, "parent_id": 3, "level": 3, "name": "Sorbet"},
]
df = pd.DataFrame(data)
that looks like this:
id parent_id level name
0 1 -1 1 Company
1 2 1 2 Bakery
2 3 1 2 Frozen
3 4 2 3 Bread
4 5 2 3 Pastry
5 6 3 3 Ice Cream
6 7 3 3 Sorbet
I'm trying to represent the data as a dictionay like this:
data = {
"Company": {
"Bakery": [
"Bread",
"Pastry",
],
"Frozen": [
"Ice Cream",
"Sorbet",
],
},
}
Heavily struggling with achieving this result, so any help is appreciated! I've tried various for-loops but getting muddled up!
This is what I came up with (this code assumes consistency between parent_ids and levels and that all parent_ids exist):
# to store the final result
result = {}
# to store references of dictionaries by their ids
by_id = {}
for d in sorted(data, key=lambda d: d['level']):
new_dict = {}
if d['parent_id'] == -1:
result[d['name']] = new_dict
else:
by_id[d['parent_id']][d['name']] = new_dict
by_id[d['id']] = new_dict
At this point:
>>> result
{'Company': {'Bakery': {'Bread': {}, 'Pastry': {}}, 'Frozen': {'Ice Cream': {}, 'Sorbet': {}}}}
Now to convert empty dictionaries to a list of items, we use a recursive function:
def transform_dicts_to_lists(r):
if any(r.values()):
for k, v in r.items():
r[k] = transform_dicts_to_lists(v)
return r
else:
return list(r.keys())
result = transform_dicts_to_lists(result)
>>> result
{'Company': {'Bakery': ['Bread', 'Pastry'], 'Frozen': ['Ice Cream', 'Sorbet']}}
You can avoid final processing if you know that the maximum level is always 3.

Take the first n dictionaries of a specific key in a sorted list

I writing a script which calculates the distance in miles between an order's shipping address and each store location for a specific chain of stores. So far, I have created a sorted list of dictionaries (sorted by order_id and then distance). It looks like this:
[
{
"order_id": 1,
"distance": 10,
"storeID": 1112
},
{
"order_id": 1,
"distance": 20,
"storeID": 1116
},
{
"order_id": 1,
"distance": 30,
"storeID": 1134
},
{
"order_id": 1,
"distance": 40,
"storeID": 1133
},
{
"order_id": 2,
"distance": 6,
"storeID": 1112
},
{
"order_id": 2,
"distance": 12,
"storeID": 1116
},
{
"order_id": 2,
"distance": 18,
"storeID": 1134
},
{
"order_id": 2,
"distance": 24,
"storeID": 1133
}
]
From here, I would like to find the two closest stores for each order_id, as well as their distances.
What I'd ultimately want to end up with is a list that looks like this:
[
{
"order_id": 1,
"closet_store_distance": 10,
"closest_store_id": 1112,
"second_closet_store_distance": 20,
"second_closest_store_id": 1116
},
{
"order_id": 2,
"closet_store_distance": 6,
"closest_store_id": 1112,
"second_closet_store_distance": 12,
"second_closest_store_id": 1116
}
]
I am unsure of how to loop through each order_id in this list and select the two closest stores. Any help is appreciated.
Try something like this, I made the assumption that the initial data was in a file called sample.txt.
import json
from operator import itemgetter
def make_order(stores, id):
return {
"order_id": id,
"closet_store_distance": stores[0][1],
"closest_store_id": stores[0][0],
"second_closet_store_distance": stores[1][1],
"second_closest_store_id": stores[1][0]
}
def main():
with open('sample.txt', 'r') as data_file:
data = json.loads(data_file.read())
id1 = {}
id2 = {}
for i in data:
if i["order_id"] == 1:
id1[i["storeID"]] = i["distance"]
else:
id2[i["storeID"]] = i["distance"]
top1 = sorted(id1.items(), key=itemgetter(1))
top2 = sorted(id2.items(), key=itemgetter(1))
with open('results.json', 'w') as result_file:
order1 = make_order(top1, 1)
order2 = make_order(top2, 2)
json.dump([order1, order2], result_file, indent=3, separators=(',', ': '))
if __name__ == '__main__':
main()
The resulting file looks like:
[
{
"second_closest_store_id": 1116,
"closet_store_distance": 10,
"closest_store_id": 1112,
"order_id": 1,
"second_closet_store_distance": 20
},
{
"second_closest_store_id": 1116,
"closet_store_distance": 6,
"closest_store_id": 1112,
"order_id": 2,
"second_closet_store_distance": 12
}
]
A nice readable answer (but using one of my free libraries.):
from PLOD import PLOD
order_store_list = [
{
"order_id": 1,
"distance": 10,
"storeID": 1112
},
{
"order_id": 1,
"distance": 20,
"storeID": 1116
},
{
"order_id": 1,
"distance": 30,
"storeID": 1134
},
{
"order_id": 1,
"distance": 40,
"storeID": 1133
},
{
"order_id": 2,
"distance": 6,
"storeID": 1112
},
{
"order_id": 2,
"distance": 12,
"storeID": 1116
},
{
"order_id": 2,
"distance": 18,
"storeID": 1134
},
{
"order_id": 2,
"distance": 24,
"storeID": 1133
}
]
#
# first, get the order_ids (place in a dictionary to ensure uniqueness)
#
order_id_keys = {}
for entry in order_store_list:
order_id_keys[entry["order_id"]] = True
#
# next, get the two closest stores per order_id
#
closest_stores = []
for order_id in order_id_keys:
top_two = PLOD(order_store_list).eq("order_id", order_id).sort("distance").returnList(limit=2)
closest_stores.append({
"order_id": order_id,
"closet_store_distance": top_two[0]["distance"],
"closest_store_id": top_two[0]["storeID"],
"second_closet_store_distance": top_two[1]["distance"],
"second_closest_store_id": top_two[1]["storeID"]
})
#
# sort by order_id again (if that is important)
#
closest_stores = PLOD(closest_stores).sort("order_id").returnList()
This example assumes the production order_store_list will fit in memory. If you are using a larger dataset, I strongly recommend using a database and python library for that database.
My PLOD library is free and open source (MIT), but requires Python 2.7. I'm about two weeks away from a Python 3.5 release. See https://pypi.python.org/pypi/PLOD/0.1.7

Create hierarchical json dump from list of dictionary in python

The table:
categories = Table("categories", metadata,
Column("id", Integer, primary_key=True),
Column("name", String),
Column("parent_id", Integer, ForeignKey("categories.id"),
CheckConstraint('id!=parent_id'), nullable=True),
)
A category can have many children, but only 1 parent. I have got the list of dictionary values as follows using CTE: eg. For id :14, parent is 13 and traversed from parent 8->10->12->13->14 where parent 8 has no parent id.
[
{
"id": 14,
"name": "cat14",
"parent_id": 13,
"path_info": [
8,
10,
12,
13,
14
]
},
{
"id": 15,
"name": "cat15",
"parent_id": 13,
"path_info": [
8,
10,
12,
13,
15
]
}
]
I would like to get the attributes of the parent also embedded as subcategories in the list as:
{
"id": 14,
"name": "cat14",
"parent_id": 13,
"subcats": [
{
"id: 8",
"name": "cat8",
"parent_id":null
},
{
"id: 10",
"name": "cat10",
"parent_id":8
},
{
"id: 12",
"name": "cat12",
"parent_id":10
},
and similarly for ids 13 and 14.....
]
},
{
"id": 15,
"name": "cat15",
"parent_id": 13,
"subcats": [
{
"id: 8",
"name": "cat8",
"parent_id":null
},
{
"id: 10",
"name": "cat10",
"parent_id":8
},
{
"id: 12",
"name": "cat12",
"parent_id":10
},
and similarly for ids 13, 14, 15.....
]
}
]
Notice that 'path_info' has been deleted from the dictionary and each id has been displayed with its details. I want json dumps with the above indented format. How to go about? Using flask 0.10, python 2.7
There is a tolerable way to do this with a few list/dict comprehensions.
lst = [{"id": 14, "name": "cat14", "parent_id": 13, "path_info": [8, 10, 12, 13, 14]}, {"id": 15, "name": "cat15", "parent_id": 13, "path_info": [8, 10, 12, 13, 15]}]
master_dct = { d['id'] : d for d in lst}
for d in lst:
d['subcats'] = [{field : master_dct[i][field] for field in ['id', 'name', 'parent_id']} \
for i in d['path_info'] if i in master_dct]
import json
with open('out.json', 'w') as f:
json.dump(lst, f)
You can perform it in python code:
Given we have a json object. I've slightly modified it - added absent nodes and wrap into an object as it is required by the specification:
{
"array": [
{
"id": 14,
"name": "cat14",
"parent_id": 13,
"path_info": [
8,
10,
12,
13,
14
]
},
{
"id": 15,
"name": "cat15",
"parent_id": 13,
"path_info": [
8,
10,
12,
13,
15
]
},
{
"id": 13,
"name": "cat13",
"parent_id": 12,
"path_info": [
8,
10,
12,
13
]
},
{
"id": 12,
"name": "cat12",
"parent_id": 10,
"path_info": [
8,
10,
12
]
},
{
"id": 10,
"name": "cat10",
"parent_id": 8,
"path_info": [
8,
10
]
},
{
"id": 8,
"name": "cat8",
"parent_id": null,
"path_info": [
8
]
}
]
}
Then you may use following code:
# load data above from file
j=json.load(open('json_file_above.json')) #
# the array with real data we need
a=j['array']
# auxiliary dict which have node identificators as keys and nodes as values
d={x['id']:x for x in a}
# here the magic begins :)
for x in a:
# add new key with list to each element
x['subcats'] = [
# compose dict element for subcats
dict(id=i, name=d[i]['name'], parent_id=d[i]['parent_id'])
for
i
in [
# we take path_info id list and
# cut off the first element - itself
y for y in x['path_info'][1:]
]
]
del x['path_info']
To be sure you are getting the thing you need:
>>> print(json.dumps(a, indent=True))
[
{
"name": "cat14",
"subcats": [
{
"name": "cat10",
"id": 10,
"parent_id": 8
},
{
"name": "cat12",
"id": 12,
"parent_id": 10
},
{
"name": "cat13",
"id": 13,
"parent_id": 12
},
{
"name": "cat14",
"id": 14,
"parent_id": 13
}
],
"id": 14,
"parent_id": 13
},
{
"name": "cat15",
"subcats": [
{
"name": "cat10",
"id": 10,
"parent_id": 8
},
{
"name": "cat12",
"id": 12,
"parent_id": 10
},
{
"name": "cat13",
"id": 13,
"parent_id": 12
},
{
"name": "cat15",
"id": 15,
"parent_id": 13
}
],
"id": 15,
"parent_id": 13
},
{
"name": "cat13",
"subcats": [
{
"name": "cat10",
"id": 10,
"parent_id": 8
},
{
"name": "cat12",
"id": 12,
"parent_id": 10
},
{
"name": "cat13",
"id": 13,
"parent_id": 12
}
],
"id": 13,
"parent_id": 12
},
{
"name": "cat12",
"subcats": [
{
"name": "cat10",
"id": 10,
"parent_id": 8
},
{
"name": "cat12",
"id": 12,
"parent_id": 10
}
],
"id": 12,
"parent_id": 10
},
{
"name": "cat10",
"subcats": [
{
"name": "cat10",
"id": 10,
"parent_id": 8
}
],
"id": 10,
"parent_id": 8
},
{
"name": "cat8",
"subcats": [],
"id": 8,
"parent_id": null
}
]
>>>
The pythonic code for this: Simple and straightforward
import json
categories = [] #input
def transform(category, child_node_id):
category['subcats'].append({
'id': child_node_id,
'name': 'cat%s' % child_node_id,
'parent_id': category['id']
})
for category in categories:
category['subcats'] = []
[transform(category, child_node_id) for child_node_id in category['path_info']]
category.pop('path_info', None)
print(json.dumps(categories, indent=4))

Count usage of foreign key in two tables in Flask-SqlAlchemy

I have three tables. One defines groups, with a groupid column, and the other two define users and events, which can each belong to a group.
class InterestGroup(db.Model):
__tablename__ = "interest_groups"
groupid = db.Column(db.Integer, primary_key=True)
groupcode = db.Column(db.String(10))
groupname = db.Column(db.String(200), unique=True)
class InterestGroupEvent(db.Model):
__tablename__ = "interest_group_events"
eventid = db.Column(db.Integer, db.ForeignKey("social_events.eventid"), primary_key=True)
groupid = db.Column(db.Integer, db.ForeignKey("interest_groups.groupid"), primary_key=True)
class InterestGroupUser(db.Model):
__tablename__ = "interest_group_users"
userid = db.Column(db.Integer, db.ForeignKey("users.userid"), primary_key=True)
groupid = db.Column(db.Integer, db.ForeignKey("interest_groups.groupid"), primary_key=True)
I want to list all the groups, with a count of how many users and how many events belong to each one, even if there aren't any.
I can get all the data I need, as follows:
IGs = InterestGroup.query.all()
IGEs = (
db.session.query(
InterestGroupEvent.groupid,
func.count(InterestGroupEvent.groupid)
)
.group_by(InterestGroupEvent.groupid)
.all()
)
IGUs = (
db.session.query(
InterestGroupUser.groupid,
func.count(InterestGroupUser.groupid)
)
.group_by(InterestGroupUser.groupid)
.all()
)
return json.dumps({
"groups": [{"groupid": IG.groupid} for IG in IGs],
"events": [{"groupid": IGE.groupid, "count": IGE[1]} for IGE in IGEs],
"users": [{"groupid": IGU.groupid, "count": IGU[1]} for IGU in IGUs]
})
which returns the following:
{
"events": [
{
"count": 2,
"groupid": 1
},
{
"count": 1,
"groupid": 2
}
],
"groups": [
{
"groupid": 1
},
{
"groupid": 2
},
{
"groupid": 3
}
],
"users": [
{
"count": 2,
"groupid": 1
},
{
"count": 1,
"groupid": 3
}
]
}
but what I want is the following:
[
{
"groupid": 1,
"eventcount": 2,
"usercount": 2
},
{
"groupid": 2,
"eventcount": 1,
"usercount": 0
},
{
"groupid": 3,
"eventcount": 0,
"usercount": 1
}
]
Obviously I could merge it manually, but I'm sure there's a way to get it direct from the database in a single query. I've tried the following:
IGs = (
db.session.query(InterestGroup.groupid,
func.count(InterestGroupEvent.groupid),
func.count(InterestGroupUser.groupid)
)
.group_by(InterestGroup.groupid)
.all()
)
return json.dumps([{"groupid": IG[0], "eventcount": IG[1], "usercount": IG[2]} for IG in IGs])
but that returns this:
[
{
"usercount": 9,
"groupid": 1,
"eventcount": 9
},
{
"usercount": 9,
"groupid": 2,
"eventcount": 9
},
{
"usercount": 9,
"groupid": 3,
"eventcount": 9
}
]
Hmm, this is close:
IGs = (
db.session.query(InterestGroup.groupid,
func.count(InterestGroupEvent.groupid),
func.count(InterestGroupUser.groupid)
)
.outerjoin(InterestGroupEvent)
.outerjoin(InterestGroupUser)
.group_by(InterestGroup.groupid)
.all()
)
return json.dumps([
{"groupid": IG[0], "eventcount": IG[1], "usercount": IG[2]} for IG in IGs
])
It returns:
[
{
"usercount": 4,
"groupid": 1,
"eventcount": 4
},
{
"usercount": 0,
"groupid": 2,
"eventcount": 1
},
{
"usercount": 1,
"groupid": 3,
"eventcount": 0
}
]
but the counts for group 1 are wrong. How should I go about it, please?
EDIT
In the absence of a better solution, this seems to work and will do for now:
IGEs = (
InterestGroup.query
.outerjoin(InterestGroupEvent)
.add_columns(func.count(InterestGroupEvent.groupid))
.group_by(InterestGroup.groupid)
.order_by(InterestGroup.groupid)
.all()
)
IGUs = (
InterestGroup.query
.outerjoin(InterestGroupUser)
.add_columns(func.count(InterestGroupUser.groupid))
.group_by(InterestGroup.groupid)
.order_by(InterestGroup.groupid)
.all()
)
results = []
for i in range(len(IGEs)):
results.append({"groupid": IGEs[i][0].groupid, "eventcount": IGEs[i][1], "usercount": IGUs[i][1]})
return json.dumps(results)
But I'd still like to know how to do it with a single query.

Categories

Resources