Related
I have a JSON file loaded and written to CSV in this way:
# Load the JSON document into `data`.
import json
import csv

with open('personal.json') as json_file:
    data = json.load(json_file)

employee_data = data['results']

# NOTE(review): the original printed `inventory_data` here even though its
# assignment was commented out, which raises a NameError; removed.

# Open the CSV for appending; newline='' stops the csv module from writing
# blank lines between rows on Windows, and `with` guarantees the file is
# closed even if a write fails.
with open('data_file.csv', 'a', newline='') as data_file:
    csv_writer = csv.writer(data_file)
    for count, emp in enumerate(employee_data):
        if count == 0:
            # Write the header row once, using the first record's keys.
            csv_writer.writerow(emp.keys())
        # Write one data row per record.
        csv_writer.writerow(emp.values())
Works perfectly; there is no problem at all. This is an example of the JSON file handled by this:
{
"count": 1100,
"next": "https://any.com/?limit=10&offset=10",
"previous": null,
"results": [
{
"list_no": "00011",
"item_no": "MZVL2256HC",
"upc_code": "",
"manufacturer_no": "MZVL2256HCHQ-00",
"manufacturer": "SAMSUNGOEM",
"category": "SSD",
"product_name": "Samsung PM9A1 - SSD - 256 GB - PCIe 4.0 x4 (NVMe)",
"price": 56.65,
"instant_rebate": "",
"instant_rebate_item_no": "",
"weight": 0.06,
"unit": 300,
"length": 14.75,
"width": 12.0,
"height": 3.75,
"package": "BULK",
"specorder": "Y",
"is_domestic_only": "N",
"inventory": {
"": 50
}
},
{
"list_no": "00012",
"item_no": "MZVL2512HC",
"upc_code": "",
"manufacturer_no": "MZVL2512HCJQ-00",
"manufacturer": "SAMSUNGOEM",
"category": "SSD",
"product_name": "Samsung PM9A1 MZVL2512HCJQ - SSD - 512 GB - PCIe 4.0 x4 (NVMe)",
"price": 70.04,
"instant_rebate": "",
"instant_rebate_item_no": "",
"weight": 0.06,
"unit": 300,
"length": 23.0,
"width": 17.0,
"height": 10.0,
"package": "BULK",
"specorder": "Y",
"is_domestic_only": "N",
"inventory": {
"": 55
}
},
{
"list_no": "00013",
"item_no": "MZVL21T0HC",
"upc_code": "",
"manufacturer_no": "MZVL21T0HCLR-00A00",
"manufacturer": "SAMSUNGOEM",
"category": "SSD",
"product_name": "Samsung PM9A1 MZVL21T0HCLR - SSD - 1 TB - PCIe 4.0 x4 (NVMe)",
"price": 105.06,
"instant_rebate": "",
"instant_rebate_item_no": "",
"weight": 0.06,
"unit": 300,
"length": 23.0,
"width": 17.0,
"height": 10.0,
"package": "BULK",
"specorder": "Y",
"is_domestic_only": "N",
"inventory": {
"": 52,
"M": 0
}
},
{
"list_no": "00014",
"item_no": "MZVL22T0HB",
"upc_code": "",
"manufacturer_no": "MZVL22T0HBLB-00A00",
"manufacturer": "SAMSUNGOEM",
"category": "SSD",
"product_name": "Samsung PM9A1 - SSD - 2 TB - PCIe 4.0 x4 (NVMe)",
"price": 187.46,
"instant_rebate": "",
"instant_rebate_item_no": "",
"weight": 0.06,
"unit": 300,
"length": 23.0,
"width": 17.0,
"height": 10.0,
"package": "BULK",
"specorder": "Y",
"is_domestic_only": "N",
"inventory": {
"": 58
}
},
{
"list_no": "00102",
"item_no": "MT48RD64A1",
"upc_code": "649528921666",
"manufacturer_no": "MTC40F2046S1RC48BA1R",
"manufacturer": "MICRON",
"category": "MEM",
"product_name": "Micron - DDR5 - module - 64 GB - DIMM 288-pin - 4800 MHz / PC5-38400 - registered",
"price": 297.67,
"instant_rebate": "",
"instant_rebate_item_no": "",
"weight": 0.09,
"unit": 100,
"length": 14.75,
"width": 12.0,
"height": 3.75,
"package": "BULK",
"specorder": "Y",
"is_domestic_only": "Y",
"inventory": {
"": 36,
"L": 0
}
},
{
"list_no": "00104",
"item_no": "MT48RD32A1",
"upc_code": "649528921598",
"manufacturer_no": "MTC20F2085S1RC48BA1R",
"manufacturer": "MICRON",
"category": "MEM",
"product_name": "Micron - DDR5 - module - 32 GB - DIMM 288-pin - 4800 MHz / PC5-38400 - registered",
"price": 164.8,
"instant_rebate": "",
"instant_rebate_item_no": "",
"weight": 0.1,
"unit": 100,
"length": 14.75,
"width": 12.0,
"height": 3.75,
"package": "BULK",
"specorder": "N",
"is_domestic_only": "N",
"inventory": {
"": 53
}
},
{
"list_no": "00105",
"item_no": "MT48RS32A2",
"upc_code": "649528921529",
"manufacturer_no": "MTC20F1045S1RC48BA2R",
"manufacturer": "MICRON",
"category": "MEM",
"product_name": "Micron - DDR5 - module - 32 GB - DIMM 288-pin - 4800 MHz / PC5-38400 - registered",
"price": 164.8,
"instant_rebate": "",
"instant_rebate_item_no": "",
"weight": 0.1,
"unit": 100,
"length": 14.75,
"width": 12.0,
"height": 3.75,
"package": "BULK",
"specorder": "N",
"is_domestic_only": "Y",
"inventory": {
"": 38
}
},
{
"list_no": "00108",
"item_no": "MT48RS16A1",
"upc_code": "649528921376",
"manufacturer_no": "MTC10F1084S1RC48BA1R",
"manufacturer": "MICRON",
"category": "MEM",
"product_name": "Micron - DDR5 - module - 16 GB - DIMM 288-pin - 4800 MHz / PC5-38400",
"price": 95.79,
"instant_rebate": "",
"instant_rebate_item_no": "",
"weight": 0.09,
"unit": 100,
"length": 14.75,
"width": 12.0,
"height": 3.75,
"package": "BULK",
"specorder": "N",
"is_domestic_only": "Y",
"inventory": {
"": 364,
"L": 50
}
},
{
"list_no": "00116",
"item_no": "MT48E32A1",
"upc_code": "649528931481",
"manufacturer_no": "MTC20C2085S1EC48BA1R",
"manufacturer": "MICRON",
"category": "MEM",
"product_name": "Micron - DDR5 - module - 32 GB - DIMM 288-pin - 4800 MHz / PC5-38400 - unbuffered",
"price": 164.8,
"instant_rebate": "",
"instant_rebate_item_no": "",
"weight": 0.09,
"unit": 100,
"length": 14.75,
"width": 12.0,
"height": 3.65,
"package": "BULK",
"specorder": "N",
"is_domestic_only": "Y",
"inventory": {
"": 32,
"N": 0,
"L": 0
}
},
{
"list_no": "00118",
"item_no": "MT48E16A1",
"upc_code": "649528931429",
"manufacturer_no": "MTC10C1084S1EC48BA1R",
"manufacturer": "MICRON",
"category": "MEM",
"product_name": "Micron - DDR5 - module - 16 GB - DIMM 288-pin - 4800 MHz / PC5-38400 - unbuffered",
"price": 95.79,
"instant_rebate": "",
"instant_rebate_item_no": "",
"weight": 0.09,
"unit": 100,
"length": 14.75,
"width": 12.0,
"height": 3.75,
"package": "BULK",
"specorder": "N",
"is_domestic_only": "Y",
"inventory": {
"": 21
}
}
]
}
The issue I have is with one column created in the csv called inventory I have some values as:
"inventory": {"": 52, "N": 57, "L": 34, "M": 15}
or
"inventory": {"": 21}
Can be clearly seen in the JSON text. What I want is that, for every record, the values inside inventory are summed and that sum is the value sent to the inventory column. Why? Because currently it writes the dictionary to the CSV literally, as I copied above, and that doesn't work for my purpose. In the first example the value in inventory would be 52+57+34+15=158, and in the second example 21, and so on for all records. How can I modify the code to do that?
Thank you
I tried to investigate how to call values. For example:
employee_data = data['results']
This variable reads the values in the JSON file as count, results, etc but after results I don’t know how to call the values as list_no, inventory, etc.
You can calculate the sum of inventory by summing all the values of the inventory dict of each entry in data['results']. To access a field such as list_no, iterate over data['results'] and, for each item i, use i['list_no']. This code should work:
import json
import csv  # was missing: csv.writer is used below and raised a NameError

# Opening JSON file and loading the data into the variable `data`.
with open('personal.json') as json_file:
    data = json.load(json_file)

employee_data = data['results']

for i in employee_data:
    # 'inventory' maps location codes (possibly empty strings) to
    # quantities; the total stock is the sum of all its values.
    sum_of_inventory = sum(i['inventory'].values())
    # Print a few fields for inspection.
    print('List_No :', i['list_no'])
    print('Inventory :', i['inventory'])
    print('Sum of Inventory :', sum_of_inventory)
    print('----------------------')
    # Store the total back on the record so it lands in the CSV.
    i['sumOfInventory'] = sum_of_inventory

# Show the augmented records.
print('new json data :')
print(employee_data)
print('')

# Open the CSV for appending; newline='' prevents blank lines between rows
# on Windows, and `with` guarantees the file is closed.
with open('data_file.csv', 'a', newline='') as data_file:
    csv_writer = csv.writer(data_file)
    for count, emp in enumerate(employee_data):
        if count == 0:
            # Writing headers of CSV file (first record's keys).
            csv_writer.writerow(emp.keys())
        # Writing data of CSV file.
        csv_writer.writerow(emp.values())
The console output must be like this :
List_No : 00011
Inventory : {'': 50}
Sum of Inventory : 50
----------------------
List_No : 00012
Inventory : {'': 55}
Sum of Inventory : 55
----------------------
List_No : 00013
Inventory : {'': 52, 'M': 0}
Sum of Inventory : 52
----------------------
List_No : 00014
Inventory : {'': 58}
Sum of Inventory : 58
----------------------
List_No : 00102
Inventory : {'': 36, 'L': 0}
Sum of Inventory : 36
----------------------
List_No : 00104
Inventory : {'': 53}
Sum of Inventory : 53
----------------------
List_No : 00105
Inventory : {'': 38}
Sum of Inventory : 38
----------------------
List_No : 00108
Inventory : {'': 364, 'L': 50}
Sum of Inventory : 414
----------------------
List_No : 00116
Inventory : {'': 32, 'N': 0, 'L': 0}
Sum of Inventory : 32
----------------------
List_No : 00118
Inventory : {'': 21}
Sum of Inventory : 21
----------------------
new json data :
[{'list_no': '00011', 'item_no': 'MZVL2256HC', 'upc_code': '', 'manufacturer_no': 'MZVL2256HCHQ-00', 'manufacturer': 'SAMSUNGOEM', 'category': 'SSD', 'product_name': 'Samsung PM9A1 - SSD - 256 GB - PCIe 4.0 x4 (NVMe)', 'price': 56.65, 'instant_rebate': '', 'instant_rebate_item_no': '', 'weight': 0.06, 'unit': 300, 'length': 14.75, 'width': 12.0, 'height': 3.75, 'package': 'BULK', 'specorder': 'Y', 'is_domestic_only': 'N', 'inventory': {'': 50}, 'sumOfInventory': 50}, {'list_no': '00012', 'item_no': 'MZVL2512HC', 'upc_code': '', 'manufacturer_no': 'MZVL2512HCJQ-00', 'manufacturer': 'SAMSUNGOEM', 'category': 'SSD', 'product_name': 'Samsung PM9A1 MZVL2512HCJQ - SSD - 512 GB - PCIe 4.0 x4 (NVMe)', 'price': 70.04, 'instant_rebate': '', 'instant_rebate_item_no': '', 'weight': 0.06, 'unit': 300, 'length': 23.0, 'width': 17.0, 'height': 10.0, 'package': 'BULK', 'specorder': 'Y', 'is_domestic_only': 'N', 'inventory': {'': 55}, 'sumOfInventory': 55}, {'list_no': '00013', 'item_no': 'MZVL21T0HC', 'upc_code': '', 'manufacturer_no': 'MZVL21T0HCLR-00A00', 'manufacturer': 'SAMSUNGOEM', 'category': 'SSD', 'product_name': 'Samsung PM9A1 MZVL21T0HCLR - SSD - 1 TB - PCIe 4.0 x4 (NVMe)', 'price': 105.06, 'instant_rebate': '', 'instant_rebate_item_no': '', 'weight': 0.06, 'unit': 300, 'length': 23.0, 'width': 17.0, 'height': 10.0, 'package': 'BULK', 'specorder': 'Y', 'is_domestic_only': 'N', 'inventory': {'': 52, 'M': 0}, 'sumOfInventory': 52}, {'list_no': '00014', 'item_no': 'MZVL22T0HB', 'upc_code': '', 'manufacturer_no': 'MZVL22T0HBLB-00A00', 'manufacturer': 'SAMSUNGOEM', 'category': 'SSD', 'product_name': 'Samsung PM9A1 - SSD - 2 TB - PCIe 4.0 x4 (NVMe)', 'price': 187.46, 'instant_rebate': '', 'instant_rebate_item_no': '', 'weight': 0.06, 'unit': 300, 'length': 23.0, 'width': 17.0, 'height': 10.0, 'package': 'BULK', 'specorder': 'Y', 'is_domestic_only': 'N', 'inventory': {'': 58}, 'sumOfInventory': 58}, {'list_no': '00102', 'item_no': 'MT48RD64A1', 'upc_code': '649528921666', 
'manufacturer_no': 'MTC40F2046S1RC48BA1R', 'manufacturer': 'MICRON', 'category': 'MEM', 'product_name': 'Micron - DDR5 - module - 64 GB - DIMM 288-pin - 4800 MHz / PC5-38400 - registered', 'price': 297.67, 'instant_rebate': '', 'instant_rebate_item_no': '', 'weight': 0.09, 'unit': 100, 'length': 14.75, 'width': 12.0, 'height': 3.75, 'package': 'BULK', 'specorder': 'Y', 'is_domestic_only': 'Y', 'inventory': {'': 36, 'L': 0}, 'sumOfInventory': 36}, {'list_no': '00104', 'item_no': 'MT48RD32A1', 'upc_code': '649528921598', 'manufacturer_no': 'MTC20F2085S1RC48BA1R', 'manufacturer': 'MICRON', 'category': 'MEM', 'product_name': 'Micron - DDR5 - module - 32 GB - DIMM 288-pin - 4800 MHz / PC5-38400 - registered', 'price': 164.8, 'instant_rebate': '', 'instant_rebate_item_no': '', 'weight': 0.1, 'unit': 100, 'length': 14.75, 'width': 12.0, 'height': 3.75, 'package': 'BULK', 'specorder': 'N', 'is_domestic_only': 'N', 'inventory': {'': 53}, 'sumOfInventory': 53}, {'list_no': '00105', 'item_no': 'MT48RS32A2', 'upc_code': '649528921529', 'manufacturer_no': 'MTC20F1045S1RC48BA2R', 'manufacturer': 'MICRON', 'category': 'MEM', 'product_name': 'Micron - DDR5 - module - 32 GB - DIMM 288-pin - 4800 MHz / PC5-38400 - registered', 'price': 164.8, 'instant_rebate': '', 'instant_rebate_item_no': '', 'weight': 0.1, 'unit': 100, 'length': 14.75, 'width': 12.0, 'height': 3.75, 'package': 'BULK', 'specorder': 'N', 'is_domestic_only': 'Y', 'inventory': {'': 38}, 'sumOfInventory': 38}, {'list_no': '00108', 'item_no': 'MT48RS16A1', 'upc_code': '649528921376', 'manufacturer_no': 'MTC10F1084S1RC48BA1R', 'manufacturer': 'MICRON', 'category': 'MEM', 'product_name': 'Micron - DDR5 - module - 16 GB - DIMM 288-pin - 4800 MHz / PC5-38400', 'price': 95.79, 'instant_rebate': '', 'instant_rebate_item_no': '', 'weight': 0.09, 'unit': 100, 'length': 14.75, 'width': 12.0, 'height': 3.75, 'package': 'BULK', 'specorder': 'N', 'is_domestic_only': 'Y', 'inventory': {'': 364, 'L': 50}, 'sumOfInventory': 414}, 
{'list_no': '00116', 'item_no': 'MT48E32A1', 'upc_code': '649528931481', 'manufacturer_no': 'MTC20C2085S1EC48BA1R', 'manufacturer': 'MICRON', 'category': 'MEM', 'product_name': 'Micron - DDR5 - module - 32 GB - DIMM 288-pin - 4800 MHz / PC5-38400 - unbuffered', 'price': 164.8, 'instant_rebate': '', 'instant_rebate_item_no': '', 'weight': 0.09, 'unit': 100, 'length': 14.75, 'width': 12.0, 'height': 3.65, 'package': 'BULK', 'specorder': 'N', 'is_domestic_only': 'Y', 'inventory': {'': 32, 'N': 0, 'L': 0}, 'sumOfInventory': 32}, {'list_no': '00118', 'item_no': 'MT48E16A1', 'upc_code': '649528931429', 'manufacturer_no': 'MTC10C1084S1EC48BA1R', 'manufacturer': 'MICRON', 'category': 'MEM', 'product_name': 'Micron - DDR5 - module - 16 GB - DIMM 288-pin - 4800 MHz / PC5-38400 - unbuffered', 'price': 95.79, 'instant_rebate': '', 'instant_rebate_item_no': '', 'weight': 0.09, 'unit': 100, 'length': 14.75, 'width': 12.0, 'height': 3.75, 'package': 'BULK', 'specorder': 'N', 'is_domestic_only': 'Y', 'inventory': {'': 21}, 'sumOfInventory': 21}]
I have dictionaries in a list that already have some data and I want to add a vin number to each brand in these dictionaries.
my_brand_dict = [
{"key": {"Brand": "Tesla", "Date": "20203"}, "Total": 56},
{"key": {"Brand": "Tesla", "Date": "20207"}, "Total": 88},
{"key": {"Brand": "Audi", "Date": "202014"}, "Total": 79},
{"key": {"Brand": "Mercedes", "Date": "20201"}, "Total": 49},
]
my_vins = ["f60a0a", "#2019c0", "#a81b1b", "#468650", "#21248a", "#ff7a00"]
When Brand is Tesla add '#468650'
When Brand is Mercedes add '#2019c0'
When Brand is Toyota add '#21248a'
When Brand is Audi add '#ff7a00'
My expected output:
my_brand_dict = [
{"key": {"Brand": "Tesla", "Date": "20203"}, "Total": 56, "my_vin": "#468650"},
{"key": {"Brand": "Toyota", "Date": "20207"}, "Total": 88, "my_vin": "#21248a"},
{"key": {"Brand": "Audi", "Date": "202014"}, "Total": 79, "my_vin": "#ff7a00"},
{"key": {"Brand": "Mercedes", "Date": "20201"}, "Total": 49, "my_vin": "#2019c0"},
]
Couldn't find anything that matches what I want to achieve
Conditionally add values to dictionary
I would suggest using a dictionary instead of a list for your my_vins so that it maps brands to vins. This way you can easily get corresponding vin.
# Map each brand to its vin so every lookup is a single dict access.
my_vins = {
    "Mercedes": "#2019c0",
    "Tesla": "#468650",
    "Toyota": "#21248a",
    "Audi": "#ff7a00",
}

my_brand_dict = [
    {"key": {"Brand": "Tesla", "Date": "20203"}, "Total": 56},
    {"key": {"Brand": "Tesla", "Date": "20207"}, "Total": 88},
    {"key": {"Brand": "Audi", "Date": "202014"}, "Total": 79},
    {"key": {"Brand": "Mercedes", "Date": "20201"}, "Total": 49},
]

for entry in my_brand_dict:
    # A KeyError here means the brand has no vin defined.
    entry["my_vin"] = my_vins[entry["key"]["Brand"]]

print(my_brand_dict)
Then take care of what should happen if a brand doesn't have a vin, You can raise exception or assign a default value.
You can define a dict base Brand & my_vins. Then use the defined dict and change value in-place in the my_brand_dict like the below.
# Brand -> vin lookup table.
my_vins_dct = {
    'Tesla': '#468650',
    'Mercedes': '#2019c0',
    'Toyota': '#21248a',
    'Audi': '#ff7a00',
}

my_brand_dict = [
    {"key": {"Brand": "Tesla", "Date": "20203"}, "Total": 56},
    {"key": {"Brand": "Tesla", "Date": "20207"}, "Total": 88},
    {"key": {"Brand": "Audi", "Date": "202014"}, "Total": 79},
    {"key": {"Brand": "Mercedes", "Date": "20201"}, "Total": 49},
    {"key": {"Brand": "xxxx", "Date": "20201"}, "Total": 49},
]

for dct in my_brand_dict:
    brand = dct['key']['Brand']
    # First approach: unknown brands are simply left without a 'my_vin' key.
    if brand in my_vins_dct:
        dct['my_vin'] = my_vins_dct[brand]
    # Second approach, adding 'Not Found' instead:
    # dct['my_vin'] = my_vins_dct.get(brand, 'Brand Not Found')

print(my_brand_dict)
Output:
[
{'key': {'Brand': 'Tesla', 'Date': '20203'}, 'Total': 56, 'my_vin': '#468650'},
{'key': {'Brand': 'Tesla', 'Date': '20207'}, 'Total': 88, 'my_vin': '#468650'},
{'key': {'Brand': 'Audi', 'Date': '202014'}, 'Total': 79, 'my_vin': '#ff7a00'},
{'key': {'Brand': 'Mercedes', 'Date': '20201'}, 'Total': 49, 'my_vin': '#2019c0'},
{'key': {'Brand': 'xxxx', 'Date': '20201'}, 'Total': 49}
]
# Output Second approach
# [
# {'key': {'Brand': 'Tesla', 'Date': '20203'}, 'Total': 56, 'my_vin': '#468650'},
# {'key': {'Brand': 'Tesla', 'Date': '20207'}, 'Total': 88, 'my_vin': '#468650'},
# {'key': {'Brand': 'Audi', 'Date': '202014'}, 'Total': 79, 'my_vin': '#ff7a00'},
# {'key': {'Brand': 'Mercedes', 'Date': '20201'}, 'Total': 49, 'my_vin': '#2019c0'},
# {'key': {'Brand': 'xxxx', 'Date': '20201'}, 'Total': 49, 'my_vin': 'Brand Not Found'}
# ]
my_brand_dict = [
    {'key': {'Brand': 'Tesla', 'Date': '20203'}, 'Total': 56},
    {'key': {'Brand': 'Tesla', 'Date': '20207'}, 'Total': 88},
    {'key': {'Brand': 'Audi', 'Date': '202014'}, 'Total': 79},
    {'key': {'Brand': 'Mercedes', 'Date': '20201'}, 'Total': 49},
]
my_vins = ['f60a0a', '#2019c0', '#a81b1b', '#468650', '#21248a', '#ff7a00']

# Brand -> vin table replacing the if/elif ladder:
#   Tesla -> '#468650', Mercedes -> '#2019c0',
#   Toyota -> '#21248a', Audi -> '#ff7a00'
_brand_to_vin = {
    'Tesla': '#468650',
    'Mercedes': '#2019c0',
    'Toyota': '#21248a',
    'Audi': '#ff7a00',
}

for item in my_brand_dict:
    vin = _brand_to_vin.get(item['key']['Brand'])
    # Brands outside the table get no 'my_vin' key, same as the if/elif chain.
    if vin is not None:
        item['my_vin'] = vin

print(my_brand_dict)
This code works for me
I have a pandas dataframe that looks like this:
user_id cat_id prod_id score pref_prod
29762 9 3115 1.000000 335.0
29762 58 1335 1.000000 335.0
234894 58 1335 1.000000 335.0
413276 43 1388 1.000000 335.0
413276 58 1335 1.000000 335.0
413276 73 26 1.000000 335.0
9280593 9 137 1.000000 335.0
9280593 58 1335 1.000000 335.0
9280593 74 160 1.000000 335.0
4554542 66 1612 0.166667 197.0
4554542 66 1406 0.166767 197.0
4554542 66 2021 1.000000 197.0
I want to group this df by user_id & cat_id and convert it to json so that it looks something like this:
{
29762: {
'cat_id': {
9: [{
'prod_id': 3115,
'score': 1.0
}],
58: [{
'prod_id': 1335,
'score': 1.0
}]
},
'pref_prod': 335.0
}
234894: {
'cat_id': {
58: [{
'prod_id': 1335,
'score': 1.0
}]
},
'pref_prod': 335.0
}
413276: {
'cat_id': {
43: [{
'prod_id': 1388,
'score': 1.0,
'fav_provider': 335.0
}],
58: [{
'prod_id': 1335,
'score': 1.0,
'fav_provider': 335.0
}],
73: [{
'prod_id': 26,
'score': 1.0,
}]
},
'pref_prod': 335.0
}
4554542: {
'cat_id': {
66: [{
'prod_id': 1612,
'score': 0.166
}, {
'prod_id': 1406,
'score': 0.16
}, {
'prod_id': 2021,
'score': 1.0,
}]
},
'pref_prod': 197.0
}
}
As of now I can do
# One groupby/apply pass: result is a dict keyed by (user_id, cat_id) tuples,
# each value a list of the group's remaining columns as record dicts.
gb = df.groupby(['user_id', 'cat_id']).apply(lambda g: g.drop(['user_id', 'cat_id'], axis=1).to_dict(orient='records')).to_dict()
which gives me user_id and cat_id in tuple keys:
{
(29762, 9): [{
'prod_id': 3115,
'score': 1.0,
'pref_prod': 335.0
}],
(29762, 58): [{
'prod_id': 1335,
'score': 1.0,
'pref_prod': 335.0
}],
(234894, 58): [{
'prod_id': 1335,
'score': 1.0,
'pref_prod': 335.0
}],
(413276, 43): [{
'prod_id': 1388,
'score': 1.0,
'pref_prod': 335.0
}],
(413276, 58): [{
'prod_id': 1335,
'score': 1.0,
'pref_prod': 335.0
}],
(413276, 73): [{
'prod_id': 26,
'score': 1.0,
'pref_prod': 335.0
}],
(9280593, 9): [{
'prod_id': 137,
'score': 1.0,
'pref_prod': 335.0
}],
(9280593, 58): [{
'prod_id': 1335,
'score': 1.0,
'pref_prod': 335.0
}],
(9280593, 74): [{
'prod_id': 160,
'score': 1.0,
'pref_prod': 335.0
}],
(4554542,
66): [{
'prod_id': 1612,
'score': 0.16666666666666666,
'pref_prod': 197.0
}, {
'prod_id': 1406,
'score': 0.16676666666666665,
'pref_prod': 197.0
}, {
'prod_id': 2021,
'score': 1.0,
'pref_prod': 197.0
}]
}
How can I get the json in the desired format
I can't think of any direct way to do it with pandas only. But you can construct a new dictionary with the desired format based on gb, using a defaultdict
from collections import defaultdict
import json # just to prettyprint the resulting dictionary
# One groupby/apply pass: (user_id, cat_id) tuple -> list of record dicts.
gb = (
    df.groupby(['user_id', 'cat_id'])
      .apply(lambda g: g.drop(['user_id', 'cat_id'], axis=1).to_dict(orient='records'))
      .to_dict()
)

# Target shape: {user_id: {'cat_id': {cat_id: [records...]}, 'pref_prod': ...}}
d = defaultdict(lambda: {'cat_id': {}})
for (uid, cid), recs in gb.items():
    for rec in recs:
        # 'pref_prod' is assumed unique per (user_id, cat_id) group:
        # strip it from each record and promote it to the user level.
        fav = rec.pop('pref_prod')
        d[uid]['cat_id'][cid] = recs
        d[uid]['pref_prod'] = fav
>>> print(json.dumps(d, indent=4))
{
"29762": {
"cat_id": {
"9": [
{
"prod_id": 3115,
"score": 1.0
}
],
"58": [
{
"prod_id": 1335,
"score": 1.0
}
]
},
"pref_prod": 335.0
},
"234894": {
"cat_id": {
"58": [
{
"prod_id": 1335,
"score": 1.0
}
]
},
"pref_prod": 335.0
},
"413276": {
"cat_id": {
"43": [
{
"prod_id": 1388,
"score": 1.0
}
],
"58": [
{
"prod_id": 1335,
"score": 1.0
}
],
"73": [
{
"prod_id": 26,
"score": 1.0
}
]
},
"pref_prod": 335.0
},
"4554542": {
"cat_id": {
"66": [
{
"prod_id": 1612,
"score": 0.166667
},
{
"prod_id": 1406,
"score": 0.166767
},
{
"prod_id": 2021,
"score": 1.0
}
]
},
"pref_prod": 197.0
},
"9280593": {
"cat_id": {
"9": [
{
"prod_id": 137,
"score": 1.0
}
],
"58": [
{
"prod_id": 1335,
"score": 1.0
}
],
"74": [
{
"prod_id": 160,
"score": 1.0
}
]
},
"pref_prod": 335.0
}
}
I used a namedtuple created from a dataframe conversion to build the JSON tree. If the tree had more than one level, then I would use recursion to build it. The dataframe did not contain lists of lists, so recursion was not required.
from io import StringIO
import io
from collections import namedtuple

import pandas as pd  # was missing: `pd` is used below and raised a NameError

data = """user_id,cat_id,prod_id,score,pref_prod
29762,9,3115,1.000000,335.0
29762,58,1335,1.000000,335.0
234894,58,1335,1.000000,335.0
413276,43,1388,1.000000,335.0
413276,58,335,1.000000,335.0
413276,73,26,1.000000,335.0
9280593,9,137,1.000000,335.0
9280593,58,1335,1.000000,335.0
9280593,74,160,1.000000,335.0
4554542,66,1612,0.166667,197.0
4554542,66,1406,0.166767,197.0
4554542,66,2021,1.000000,197.0"""

# Parse the CSV text into a DataFrame.
df = pd.read_csv(io.StringIO(data), sep=',')

# Plain record type mirroring the DataFrame columns.
Record = namedtuple('Generic', ['user_id', 'cat_id', 'prod_id', 'score', 'pref_prod'])

def map_to_record(row):
    """Convert one row from df.itertuples() into a Record."""
    return Record(row.user_id, row.cat_id, row.prod_id, row.score, row.pref_prod)

my_list = list(map(map_to_record, df.itertuples()))
def named_tuple_to_json(named_tuple):
    """
    Serialize a sequence of namedtuple records to a JSON string.

    Returns a string of the form ``{"records": [{...}, {...}, ...]}``.

    The original hand-built the string with concatenation, which produced
    output that was not actually valid JSON (single quotes, trailing
    commas, and a bare ``records:[`` prefix); ``json.dumps`` guarantees
    well-formed JSON.
    """
    import json  # local import keeps this snippet self-contained

    # _asdict() turns each namedtuple into a field -> value mapping.
    return json.dumps({"records": [record._asdict() for record in named_tuple]})
# Convert the list of named-tuple records into the "json tree" string
# built by named_tuple_to_json above, then display it.
json_tree = named_tuple_to_json(my_list)
print(json_tree)
output
records:[{'user_id': 29762,'cat_id': 9,'prod_id': 3115,'score': 1.0,'pref_prod': 335.0,},{'user_id': 29762,'cat_id': 58,'prod_id': 1335,'score': 1.0,'pref_prod': 335.0,},{'user_id': 234894,'cat_id': 58,'prod_id': 1335,'score': 1.0,'pref_prod': 335.0,},{'user_id': 413276,'cat_id': 43,'prod_id': 1388,'score': 1.0,'pref_prod': 335.0,},{'user_id': 413276,'cat_id': 58,'prod_id': 335,'score': 1.0,'pref_prod': 335.0,},{'user_id': 413276,'cat_id': 73,'prod_id': 26,'score': 1.0,'pref_prod': 335.0,},{'user_id': 9280593,'cat_id': 9,'prod_id': 137,'score': 1.0,'pref_prod': 335.0,},{'user_id': 9280593,'cat_id': 58,'prod_id': 1335,'score': 1.0,'pref_prod': 335.0,},{'user_id': 9280593,'cat_id': 74,'prod_id': 160,'score': 1.0,'pref_prod': 335.0,},{'user_id': 4554542,'cat_id': 66,'prod_id': 1612,'score': 0.166667,'pref_prod': 197.0,},{'user_id': 4554542,'cat_id': 66,'prod_id': 1406,'score': 0.166767,'pref_prod': 197.0,},{'user_id': 4554542,'cat_id': 66,'prod_id': 2021,'score': 1.0,'pref_prod': 197.0,},]
I have a created a dataframe from a JSON but want to keep only the first 5 columns of the result.
Here is a part of the JSON:
{
"lat": 52.517,
"lon": 13.3889,
"timezone": "Europe/Berlin",
"timezone_offset": 7200,
"current": {
"dt": 1628156947,
"sunrise": 1628134359,
"sunset": 1628189532,
"temp": 295.54,
"feels_like": 295.43,
"pressure": 1009,
"humidity": 61,
"dew_point": 287.66,
"uvi": 4.53,
"clouds": 20,
"visibility": 10000,
"wind_speed": 3.58,
"wind_deg": 79,
"wind_gust": 4.92,
"weather": [
{
"id": 801,
"main": "Clouds",
"description": "few clouds",
"icon": "02d"
}
]
},
"hourly": [
{
"dt": 1628154000,
"temp": 295.26,
"feels_like": 295.09,
"pressure": 1009,
"humidity": 60,
"dew_point": 287.14,
"uvi": 4.01,
"clouds": 36,
"visibility": 10000,
"wind_speed": 3.6,
"wind_deg": 83,
"wind_gust": 4.76,
"weather": [
{
"id": 500,
"main": "Rain",
"description": "light rain",
"icon": "10d"
}
],
"pop": 0.49,
"rain": {
"1h": 0.52
}
},
{
"dt": 1628157600,
"temp": 295.54,
"feels_like": 295.43,
"pressure": 1009,
"humidity": 61,
"dew_point": 287.66,
"uvi": 4.53,
"clouds": 20,
"visibility": 10000,
"wind_speed": 3.76,
"wind_deg": 85,
"wind_gust": 4.91,
"weather": [
{
"id": 801,
"main": "Clouds",
"description": "few clouds",
"icon": "02d"
}
],
"pop": 0.55
},
{
"dt": 1628161200,
"temp": 295.58,
"feels_like": 295.42,
"pressure": 1009,
"humidity": 59,
"dew_point": 287.18,
"uvi": 4.9,
"clouds": 36,
"visibility": 10000,
"wind_speed": 3.58,
"wind_deg": 95,
"wind_gust": 4.73,
"weather": [
{
"id": 802,
"main": "Clouds",
"description": "scattered clouds",
"icon": "03d"
}
],
"pop": 0.59
}
]
}
I have flattened the JSON first like this:
df_history = pd.json_normalize(data_history, max_level=1)
That gave me this structure:
lat lon timezone timezone_offset hourly current.dt current.sunrise current.sunset current.temp current.feels_like ... current.humidity current.dew_point current.uvi current.clouds current.visibility current.wind_speed current.wind_deg current.wind_gust current.weather current.rain
0 52.517 13.3889 Europe/Berlin 7200 [{'dt': 1627776000, 'temp': 17.82, 'feels_like... 1627855200 1627874869 1627930649 16.36 16.4 ... 90 14.72 0 0 10000 3.13 254 11.18 [{'id': 500, 'main': 'Rain', 'description': 'l... {'1h': 0.17}
But I want to keep only the columns up to the column "hourly" and then flatten it.
I have tried this but to no avail:
# record_path='hourly' explodes the hourly list into rows, but meta names must
# be keys at the TOP level of the JSON; 'dt', 'temp' and 'humidity' live inside
# each hourly record, so they are not picked up here (errors='ignore' hides that).
df_history_small = pd.json_normalize(data_history, record_path='hourly',meta=['dt','temp', 'humidity'], errors='ignore')
What am I doing wrong? How can I achieve my goal?
my final goal it to have a dataframe that looks like this:
lat lon timezone timezone_offset timestamp temp feels_like humidity pressure
0 52.517 13.3889 Europe/Berlin 7200 08/01/2021 00:00:00 17.82 17.46 69 1005
Try:
# The first four names are top-level meta keys; the rest are per-hourly fields.
cols = ['lat', 'lon', 'timezone', 'timezone_offset',
'dt', 'temp', 'feels_like', 'humidity']
# Explode 'hourly' into one row per record, attach the four top-level meta
# columns, then select/order exactly the columns we want.
out = pd.json_normalize(data_history, ['hourly'], meta=cols[:4])[cols]
>>> out
lat lon timezone timezone_offset dt temp feels_like humidity
0 52.517 13.3889 Europe/Berlin 7200 1628154000 295.26 295.09 60
1 52.517 13.3889 Europe/Berlin 7200 1628157600 295.54 295.43 61
2 52.517 13.3889 Europe/Berlin 7200 1628161200 295.58 295.42 59
Feel free to convert dt to timestamp with:
# Fix: the normalized frame built above is named `out`, not `df`; add the
# converted column to it (epoch seconds -> pandas datetime).
out['timestamp'] = pd.to_datetime(out['dt'], unit='s')
I have a list of dicts as follows :
[
{
"status": "BV",
"max_total_duration": null,
"min_total_duration": null,
"75th_percentile": 420,
"median": 240.0,
"25th_percentile": 180,
"avg_total_duration": null
},
{
"status": "CORR",
"max_total_duration": null,
"min_total_duration": null,
"75th_percentile": 1380,
"median": 720.0,
"25th_percentile": 420,
"avg_total_duration": null
},
{
"status": "FILL",
"max_total_duration": null,
"min_total_duration": null,
"75th_percentile": 1500,
"median": 840.0,
"25th_percentile": 480,
"avg_total_duration": null
},
{
"status": "INIT",
"max_total_duration": 11280,
"min_total_duration": 120,
"75th_percentile": 720,
"median": 360.0,
"25th_percentile": 180,
"avg_total_duration": 2061
},
]
As is evident, max_total_duration, min_total_duration and avg_total_duration are null for every status except "INIT". What I would want is to remove all the null-valued entries and, for INIT — where max_total_duration, min_total_duration and avg_total_duration have real values — add them as a new dictionary in the list, as follows:
[
{
"status": "BV",
"75th_percentile": 420,
"median": 240.0,
"25th_percentile": 180,
},
{
"status": "CORR",
"75th_percentile": 1380,
"median": 720.0,
"25th_percentile": 420,
},
{
"status": "FILL",
"75th_percentile": 1500,
"median": 840.0,
"25th_percentile": 480,
},
{
"status": "INIT",
"75th_percentile": 720,
"median": 360.0,
"25th_percentile": 180,
},
{
"max_total_duration": 11280,
"min_total_duration": 120,
"avg_total_duration": 2061,
}
]
I have tried doing this by iterating over the list, and it is computationally very expensive. Is there an easier way of doing this with pandas?
# Input records: the three duration stats are None for every status
# except "INIT".
data = [
    {
        "status": "BV",
        "max_total_duration": None,
        "min_total_duration": None,
        "75th_percentile": 420,
        "median": 240.0,
        "25th_percentile": 180,
        "avg_total_duration": None
    },
    {
        "status": "CORR",
        "max_total_duration": None,
        "min_total_duration": None,
        "75th_percentile": 1380,
        "median": 720.0,
        "25th_percentile": 420,
        "avg_total_duration": None
    },
    {
        "status": "FILL",
        "max_total_duration": None,
        "min_total_duration": None,
        "75th_percentile": 1500,
        "median": 840.0,
        "25th_percentile": 480,
        "avg_total_duration": None
    },
    {
        "status": "INIT",
        "max_total_duration": 11280,
        "min_total_duration": 120,
        "75th_percentile": 720,
        "median": 360.0,
        "25th_percentile": 180,
        "avg_total_duration": 2061
    },
]

# Drop the None-valued keys from every record.  The original used
# d.iteritems() and `print final`, which are Python 2 only; this is the
# Python 3 form.  Testing `val is not None` (rather than truthiness)
# also keeps legitimate zero values.
data = [{key: val for key, val in d.items() if val is not None} for d in data]

final = []
for d in data:
    status = d.get('status')
    if status == 'INIT':
        # Split the duration fields out into their own dictionary...
        final.append({'max_total_duration': d.get('max_total_duration'),
                      'min_total_duration': d.get('min_total_duration'),
                      'avg_total_duration': d.get('avg_total_duration')})
        # ...and remove them from the INIT record itself.
        del d['max_total_duration']
        del d['min_total_duration']
        del d['avg_total_duration']
    final.append(d)

print(final)
import pandas as pd
# Build a DataFrame from the list of dicts (JSON null -> Python None -> NaN).
df = pd.DataFrame(data)
>>> df
25th_percentile 75th_percentile avg_total_duration max_total_duration \
0 180 420 NaN NaN
1 420 1380 NaN NaN
2 480 1500 NaN NaN
3 180 720 2061 11280
median min_total_duration status
0 240 NaN BV
1 720 NaN CORR
2 840 NaN FILL
3 360 120 INIT
Grabbing the percentiles part:
# Keep only the status column plus the three percentile columns.
df_percentiles = df[['status','25th_percentile','median','75th_percentile']]
>>> df_percentiles
status 25th_percentile median 75th_percentile
0 BV 180 240 420
1 CORR 420 720 1380
2 FILL 480 840 1500
3 INIT 180 360 720
Grabbing the durations part:
# Only the INIT row carries real (non-NaN) duration values.
df_durations = df[df['status'] == 'INIT'][['max_total_duration','min_total_duration','avg_total_duration']]
>>> df_durations
max_total_duration min_total_duration avg_total_duration
3 11280 120 2061
Loop and combine to list:
# Python 3: dict.values() returns a view with no .append(), so materialize
# the per-row dicts as a real list first.
summary = list(df_percentiles.T.to_dict().values())
# extend() splices the single durations record in as a plain dict; the
# original append(...values()) would have nested a list inside `summary`
# instead of the flat dict shown in the expected output.
summary.extend(df_durations.T.to_dict().values())
>>> summary
[{'25th_percentile': 180,
'75th_percentile': 420,
'median': 240.0,
'status': 'BV'},
{'25th_percentile': 420,
'75th_percentile': 1380,
'median': 720.0,
'status': 'CORR'},
{'25th_percentile': 480,
'75th_percentile': 1500,
'median': 840.0,
'status': 'FILL'},
{'25th_percentile': 180,
'75th_percentile': 720,
'median': 360.0,
'status': 'INIT'},
{'avg_total_duration': 2061.0,
'max_total_duration': 11280.0,
'min_total_duration': 120.0}]