Convert JSON to Excel with Python

I have a JSON document that I need to convert to Excel.
I'm using Python 3.8 with the xlsxwriter library.
Below is the sample JSON:
{
    "companyId": "123456",
    "companyName": "Test",
    "companyStatus": "ACTIVE",
    "document": {
        "employee": {
            "employeeId": "EM1567",
            "employeeLastName": "Test Last",
            "employeeFirstName": "Test Fist"
        },
        "expenseEntry": [
            {
                "allocation": [
                    {
                        "allocationId": "03B249B3598",
                        "journal": [
                            {
                                "journalAccountCode": "888",
                                "journalPayee": "EMPL",
                                "journalPayer": "COMP",
                                "taxGuid": [
                                    "51645A638114E"
                                ]
                            },
                            {
                                "journalAccountCode": "999",
                                "journalPayee": "EMPL",
                                "journalPayer": "EMPL",
                                "taxGuid": [
                                    "8114E51645A63"
                                ]
                            }
                        ],
                        "tax": [
                            {
                                "taxCode": "TAX123",
                                "taxSource": "SYST"
                            },
                            {
                                "taxCode": "TAX456",
                                "taxSource": "SYST"
                            }
                        ]
                    }
                ],
                "approvedAmount": 200.0,
                "entryDate": "2020-12-10",
                "entryId": "ENTRY9988"
            }
        ],
        "report": {
            "currencyCode": "USD",
            "reportCreationDate": "2020-12-10",
            "reportId": "ACA849BBB",
            "reportName": "Test Report",
            "totalApprovedAmount": 200.0
        }
    },
    "id": "c71b7d756f549"
}
And my current code:
https://repl.it/#tonyiscoming/jsontoexcel
I tried with pandas:
import pandas as pd
df = pd.json_normalize(data, max_level=5)
df.to_excel('test.xlsx', index=False)
and got a result, but it wasn't what I needed.
I tried with json_excel_converter:
from json_excel_converter import Converter
from json_excel_converter.xlsx import Writer
conv = Converter()
conv.convert(data, Writer(file='test.xlsx'))
and again the result wasn't what I needed.
This is my expectation
Would anyone please help me in this case? Thank you so much.

Here is the code you are looking for. I did this using the XlsxWriter package. First I made the header template with some cell formatting; after that, I entered the values according to your JSON.
import xlsxwriter
from itertools import zip_longest
data = [
    {
        "companyId": "123456",
        "companyName": "Test",
        "companyStatus": "ACTIVE",
        "document": {
            "employee": {
                "employeeId": "EM1567",
                "employeeLastName": "Test Last",
                "employeeFirstName": "Test Fist"
            },
            "expenseEntry": [
                {
                    "allocation": [
                        {
                            "allocationId": "03B249B3598",
                            "journal": [
                                {
                                    "journalAccountCode": "888",
                                    "journalPayee": "EMPL",
                                    "journalPayer": "COMP",
                                    "taxGuid": [
                                        "51645A638114E"
                                    ]
                                },
                                {
                                    "journalAccountCode": "999",
                                    "journalPayee": "EMPL",
                                    "journalPayer": "EMPL",
                                    "taxGuid": [
                                        "8114E51645A63"
                                    ]
                                }
                            ],
                            "tax": [
                                {
                                    "taxCode": "TAX123",
                                    "taxSource": "SYST"
                                },
                                {
                                    "taxCode": "TAX456",
                                    "taxSource": "SYST"
                                }
                            ]
                        }
                    ],
                    "approvedAmount": 200.0,
                    "entryDate": "2020-12-10",
                    "entryId": "ENTRY9988"
                }
            ],
            "report": {
                "currencyCode": "USD",
                "reportCreationDate": "2020-12-10",
                "reportId": "ACA849BBB",
                "reportName": "Test Report",
                "totalApprovedAmount": 200.0
            }
        },
        "id": "c71b7d756f549"
    }
]
xlsx_file = 'your_file_name_here.xlsx'
# define the excel file
workbook = xlsxwriter.Workbook(xlsx_file)
# create a sheet for our work, defaults to Sheet1.
worksheet = workbook.add_worksheet()
# common merge format
merge_format = workbook.add_format({'align': 'center', 'valign': 'vcenter'})
# set all column width to 20
worksheet.set_column('A:V', 20)
# column wise template creation (A-V)
worksheet.merge_range(0, 0, 4, 0, 'companyId', merge_format) # A
worksheet.merge_range(0, 1, 4, 1, 'companyName', merge_format) # B
worksheet.merge_range(0, 2, 4, 2, 'companyStatus', merge_format) # C
worksheet.merge_range(0, 3, 0, 20, 'document', merge_format) # D-U
worksheet.merge_range(1, 3, 1, 5, 'employee', merge_format) # D-F
worksheet.merge_range(2, 3, 4, 3, 'employeeId', merge_format) # D
worksheet.merge_range(2, 4, 4, 4, 'employeeLastName', merge_format) # E
worksheet.merge_range(2, 5, 4, 5, 'employeeFirstName', merge_format) # F
worksheet.merge_range(1, 6, 1, 15, 'expenseEntry', merge_format) # G-P
worksheet.merge_range(2, 6, 2, 12, 'allocation', merge_format) # G-M
worksheet.merge_range(3, 6, 4, 6, 'allocationId', merge_format) # G
worksheet.merge_range(3, 7, 3, 10, 'journal', merge_format) # H-K
worksheet.write(4, 7, 'journalAccountCode') # H
worksheet.write(4, 8, 'journalPayee') # I
worksheet.write(4, 9, 'journalPayer') # J
worksheet.write(4, 10, 'taxGuid') # K
worksheet.merge_range(3, 11, 3, 12, 'tax', merge_format) # L-M
worksheet.write(4, 11, 'taxCode') # L
worksheet.write(4, 12, 'taxSource') # M
worksheet.merge_range(2, 13, 4, 13, 'approvedAmount', merge_format) # N
worksheet.merge_range(2, 14, 4, 14, 'entryDate', merge_format) # O
worksheet.merge_range(2, 15, 4, 15, 'entryId', merge_format) # P
worksheet.merge_range(1, 16, 1, 20, 'report', merge_format) # Q-U
worksheet.merge_range(2, 16, 4, 16, 'currencyCode', merge_format) # Q
worksheet.merge_range(2, 17, 4, 17, 'reportCreationDate', merge_format) # R
worksheet.merge_range(2, 18, 4, 18, 'reportId', merge_format) # S
worksheet.merge_range(2, 19, 4, 19, 'reportName', merge_format) # T
worksheet.merge_range(2, 20, 4, 20, 'totalApprovedAmount', merge_format) # U
worksheet.merge_range(0, 21, 4, 21, 'id', merge_format) # V
# inserting data
row = 5
for obj in data:
    worksheet.write(row, 0, obj.get('companyId'))
    worksheet.write(row, 1, obj.get('companyName'))
    worksheet.write(row, 2, obj.get('companyStatus'))
    document = obj.get('document', {})
    # employee details
    employee = document.get('employee', {})
    worksheet.write(row, 3, employee.get('employeeId'))
    worksheet.write(row, 4, employee.get('employeeLastName'))
    worksheet.write(row, 5, employee.get('employeeFirstName'))
    # report details
    report = document.get('report', {})
    worksheet.write(row, 16, report.get('currencyCode'))
    worksheet.write(row, 17, report.get('reportCreationDate'))
    worksheet.write(row, 18, report.get('reportId'))
    worksheet.write(row, 19, report.get('reportName'))
    worksheet.write(row, 20, report.get('totalApprovedAmount'))
    worksheet.write(row, 21, obj.get('id'))
    # expenseEntry details
    expense_entries = document.get('expenseEntry', [])
    for expense_entry in expense_entries:
        worksheet.write(row, 13, expense_entry.get('approvedAmount'))
        worksheet.write(row, 14, expense_entry.get('entryDate'))
        worksheet.write(row, 15, expense_entry.get('entryId'))
        # allocation details
        allocations = expense_entry.get('allocation', [])
        for allocation in allocations:
            worksheet.write(row, 6, allocation.get('allocationId'))
            # journal and tax details; zip_longest pads the shorter list
            # with None, so guard each side before calling .get()
            journals = allocation.get('journal', [])
            taxes = allocation.get('tax', [])
            for journal, tax in zip_longest(journals, taxes):
                if journal is not None:
                    worksheet.write(row, 7, journal.get('journalAccountCode'))
                    worksheet.write(row, 8, journal.get('journalPayee'))
                    worksheet.write(row, 9, journal.get('journalPayer'))
                if tax is not None:
                    worksheet.write(row, 11, tax.get('taxCode'))
                    worksheet.write(row, 12, tax.get('taxSource'))
                # taxGuid details
                tax_guides = journal.get('taxGuid', []) if journal is not None else []
                if not tax_guides:
                    row = row + 1
                    continue
                for tax_guide in tax_guides:
                    worksheet.write(row, 10, tax_guide)
                    row = row + 1
# finally close the created excel file
workbook.close()
One thing: instead of creating the template in the script, you can build your own template and save it somewhere else, then take a copy of that template and just add the data with the script. That gives you the chance to design your own base template; otherwise you have to do all the formatting from the script (border formats, merged cells, etc.).
I used the zip_longest built-in function from itertools to zip the journal and tax objects. Follow the Python – Itertools.zip_longest() or Python's zip_longest Function articles for examples, or see the minimal sketch below. If you didn't understand anything from my code, please comment below.
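For quick reference, here is a minimal sketch of how zip_longest pads the shorter iterable (the list values are made up for illustration):
from itertools import zip_longest

journals = ["journal-1", "journal-2", "journal-3"]
taxes = ["tax-1"]
# zip_longest runs until the longest iterable is exhausted,
# filling the missing positions with None (or a custom fillvalue)
for journal, tax in zip_longest(journals, taxes):
    print(journal, tax)
# journal-1 tax-1
# journal-2 None
# journal-3 None
This padding with None is why the insertion loop above guards each pair before calling .get().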

Having empty cells in an Excel grid is not really "proper", which is why json_excel_converter behaves like this.
So if you want to achieve this layout, I'm afraid you'll have to develop it yourself.

Related

Drawing custom error bars when using plotly subplots

This question is closely related to an earlier one that I posted. I would like to draw confidence intervals for each bar within subplots of a figure, using the information from two columns in my data frame describing the upper and lower limit of each confidence interval. I tried to use the solution from that earlier post, but it does not seem to be applicable when one wants to use different colors and/or different rows in order to draw subplots for the figure.
For example, the following code does not produce the right confidence intervals; for instance, the CI of the 3rd bar in the second row should span from 5 to 11:
import pandas as pd
import plotly.express as px
df = pd.DataFrame(
    {"x": [0, 1, 2, 3, 0, 1, 2, 3],
     "y": [6, 10, 2, 5, 8, 9, 10, 11],
     "ci_upper": [8, 11, 2.5, 4, 9, 10, 11, 12],
     "ci_lower": [5, 9, 1.5, 3, 7, 6, 5, 10],
     "state": ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
     "color": ['0', '0', '1', '1', '0', '0', '1', '1']}
)
fig = px.bar(df, x="x", y="y", facet_row='state', color='color').update_traces(
    error_y={
        "type": "data",
        "symmetric": False,
        "array": df["ci_upper"] - df["y"],
        "arrayminus": df["y"] - df["ci_lower"],
    }
)
fig.update_yaxes(dtick=1)
fig.show(renderer='png')
It's the same technique, but the solution needs to account for there being multiple traces (4 in this example).
The facet and color of each trace are encoded in its hovertemplate. Extract these and filter the data down to the appropriate rows,
then build the instruction for the error bars as in the simpler case.
import pandas as pd
import plotly.express as px

df = pd.DataFrame(
    {
        "x": [0, 1, 2, 3, 0, 1, 2, 3],
        "y": [6, 10, 2, 5, 8, 9, 10, 11],
        "ci_upper": [8, 11, 2.5, 4, 9, 10, 11, 12],
        "ci_lower": [5, 9, 1.5, 3, 7, 6, 5, 10],
        "state": ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
        "color": ["0", "0", "1", "1", "0", "0", "1", "1"],
    }
)
fig = px.bar(df, x="x", y="y", facet_row="state", color="color")
fig.update_yaxes(dtick=1)

def error_facet(t):
    # filter data frame based on contents of hovertemplate
    d = df.query(
        " and ".join(
            [
                f"{q.split('=')[0]}==\"{q.split('=')[1]}\""
                for q in t.hovertemplate.split("<br>")[0:2]
            ]
        )
    )
    t.update(
        {
            "error_y": {
                "type": "data",
                "symmetric": False,
                "array": d["ci_upper"] - d["y"],
                "arrayminus": d["y"] - d["ci_lower"],
            }
        }
    )

fig.for_each_trace(error_facet)
fig
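For reference, each trace's hovertemplate in this figure begins with name=value pairs for the trace-defining variables, and the comprehension above turns those pairs into a DataFrame query. A standalone sketch of that string manipulation, using a hypothetical hovertemplate prefix (the exact string plotly express generates may order the pairs differently):
hovertemplate = 'color=0<br>state=foo<br>x=%{x}<br>y=%{y}'  # hypothetical example value
query = " and ".join(
    f"{q.split('=')[0]}==\"{q.split('=')[1]}\""
    for q in hovertemplate.split("<br>")[0:2]
)
print(query)  # color=="0" and state=="foo"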

Filter an array of datetime given the start and end date in pymongo

I'm having a problem when I filter an array of dates using "$gte" and "$lte" in pymongo. Here is a piece of code to better explain the problem:
import datetime
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')
db = client["AirQuality"]
demo = db["demo"]
demo.save({
    "devId": 1,
    "samples": [
        {"value": 3, "datetime": datetime.datetime(2021, 3, 4, 20, 15, 22)},
        {"value": 6, "datetime": datetime.datetime(2021, 3, 4, 22, 35, 12)},
        {"value": 2, "datetime": datetime.datetime(2021, 3, 6, 10, 15, 0)}
    ]
})
and I would like to filter the values for a particular range:
start = datetime.datetime(2021, 3, 4, 22, 0, 0)
end = datetime.datetime(2021, 3, 5, 2, 26, 49)
list(demo.find({'samples.datetime': {"$gte": start, "$lte": end}}))
the output is as follows:
[{'_id': ObjectId('604353efad253df2602dfaf9'), 'devId': 1, 'samples': [{'value': 3, 'datetime': datetime.datetime(2021, 3, 4, 20, 15, 22)}, {'value': 6, 'datetime': datetime.datetime(2021, 3, 4, 22, 35, 12)}, {'value': 2, 'datetime': datetime.datetime(2021, 3, 6, 10, 15)}]}]
but I expect:
[{'_id': ObjectId('604353efad253df2602dfaf9'), 'devId': 1, 'samples': [{'value': 6, 'datetime': datetime.datetime(2021, 3, 4, 22, 35, 12)}]}]
Where am I going wrong? Even if I apply a filter on "value" it doesn't work, so I believe the error is in the query. Thanks! 🙏
Solved with aggregation:
result = demo.aggregate([
    {
        "$project": {
            "samples": {
                "$filter": {
                    "input": "$samples",
                    "as": "item",
                    "cond": {
                        "$and": [
                            {"$gte": ["$$item.datetime", start]},
                            {"$lte": ["$$item.datetime", end]}
                        ]
                    }
                }
            }
        }
    }
])
list(result)
which returns:
[{'_id': ObjectId('604353efad253df2602dfaf9'), 'samples': [{'value': 6, 'datetime': datetime.datetime(2021, 3, 4, 22, 35, 12)}]}]
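Note that $project only keeps the fields you name (plus _id by default), which is why devId is missing from the result above. If you want to keep it, the stage can be extended like this (a sketch):
result = demo.aggregate([
    {
        "$project": {
            "devId": 1,  # explicitly keep devId alongside the filtered samples
            "samples": {
                "$filter": {
                    "input": "$samples",
                    "as": "item",
                    "cond": {"$and": [
                        {"$gte": ["$$item.datetime", start]},
                        {"$lte": ["$$item.datetime", end]}
                    ]}
                }
            }
        }
    }
])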
Use find like this; I think it will solve your problem:
list(demo.find({"$and": [{'samples.datetime': {"$gte": start}}, {'samples.datetime': {"$lte": end}}]}))
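Keep in mind that find() matches and returns whole documents, so the full samples array still comes back (which is exactly the behaviour in the question). If you only need the first matching array element per document, the $elemMatch projection operator is one option (a sketch, untested against your data):
list(demo.find(
    {'samples.datetime': {"$gte": start, "$lte": end}},
    # $elemMatch in the projection returns only the first matching element
    {'devId': 1, 'samples': {'$elemMatch': {'datetime': {"$gte": start, "$lte": end}}}}
))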

read_pickle failing stochastically

I have a dataframe that I saved to a pickle file. When I load it with read_pickle it fails with the following error on roughly 1/10th of runs:
ValueError: Level values must be unique: [Timestamp('2020-06-03 15:59:59.999999+0000', tz='UTC'), datetime.date(2020, 6, 3), datetime.date(2020, 6, 4), datetime.date(2020, 6, 5)] on level 0
What is causing this stochastic behaviour?
The issue can be reproduced with the following:
from datetime import timedelta, date
import pandas as pd
import pytz
from pandas import Timestamp

utc = pytz.UTC
data = {
    "date": [
        Timestamp("2020-06-03 15:00:00").replace(tzinfo=utc).replace(minute=59, second=59, microsecond=999999),
        Timestamp("2020-06-03 15:00:00").replace(tzinfo=utc).date(),
        Timestamp("2020-06-03 15:00:00").replace(tzinfo=utc).date(),
        Timestamp("2020-06-03 15:00:00").replace(tzinfo=utc).date() + timedelta(days=1),
        Timestamp("2020-06-03 15:00:00").replace(tzinfo=utc).date() + timedelta(days=1),
        Timestamp("2020-06-03 15:00:00").replace(tzinfo=utc).date() + timedelta(days=2),
        Timestamp("2020-06-03 15:00:00").replace(tzinfo=utc).date() + timedelta(days=2),
    ],
    "status": ["in_progress", "in_progress", "done", "in_progress", "done", "in_progress", "done"],
    "issue_count": [20, 18, 2, 14, 6, 10, 10],
    "points": [100, 90, 10, 70, 30, 50, 50],
    "stories": [0, 0, 0, 0, 0, 0, 0],
    "tasks": [100, 100, 100, 100, 100, 100, 100],
    "bugs": [0, 0, 0, 0, 0, 0, 0],
    "subtasks": [0, 0, 0, 0, 0, 0, 0],
    "assignee": ["Name", "Name", "Name", "Name", "Name", "Name", "Name"],
}
df = pd.DataFrame(data).groupby(["date", "status"]).sum()
df.to_pickle("~/failing_df.pkl")
pd.read_pickle("~/failing_df.pkl")
Try to_csv() or to_dict() instead:
# write it to csv
df.to_csv('temp.csv')
# read it from csv
df2 = pd.read_csv('temp.csv')
df2.set_index(['date', 'status'], inplace=True)
or, optionally, round-trip through a plain dict before pickling:
df_dict = df.to_dict()
# rebuild the frame from the dict, then pickle that
df2 = pd.DataFrame(df_dict)
df2.to_pickle('temp.pkl')
# unpickle it
df3 = pd.read_pickle('temp.pkl')
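The error message itself points at the underlying issue: level 0 of the pickled MultiIndex mixes a tz-aware Timestamp with plain datetime.date values. Another option is to normalize the column to a single type before grouping; a sketch reusing the data dict from the repro above (note this drops the time-of-day on the first row, so rows that then share a (date, status) key get summed together):
# normalize the mixed 'date' column to plain datetime.date values
data["date"] = [d.date() if isinstance(d, pd.Timestamp) else d for d in data["date"]]
df = pd.DataFrame(data).groupby(["date", "status"]).sum()
df.to_pickle("~/failing_df.pkl")
pd.read_pickle("~/failing_df.pkl")  # should now load consistently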

csv to json with column data that needs to be grouped

I have a CSV file in a format similar to this
order_id, customer_name, item_1_id, item_1_quantity, Item_2_id, Item_2_quantity, Item_3_id, Item_3_quantity
1, John, 4, 1, 24, 4, 16, 1
2, Paul, 8, 3, 41, 1, 33, 1
3, Andrew, 1, 1, 34, 4, 8, 2
I want to export it to JSON; currently I am doing this:
df = pd.read_csv('simple.csv')
print(df.to_json(orient='records'))
And the output is
[
{
"Item_2_id": 24,
"Item_2_quantity": 4,
"Item_3_id": 16,
"Item_3_quantity": 1,
"customer_name": "John",
"item_1_id": 4,
"item_1_quantity": 1,
"order_id": 1
},
......
However, I would like the output to be
[
{
"customer_name": "John",
"order_id": 1,
"items": [
{ "id": 4, "quantity": 1 },
{ "id": 24, "quantity": 4 },
{ "id": 16, "quantity": 1 },
]
},
......
Any suggestions on a good way to do this?
In this particular project, there will not be more than 5 items per order.
Try the following:
import pandas as pd
import json

output_lst = []
## specify the first row as header
df = pd.read_csv('simple.csv', header=0)
## iterate through all the rows
for index, row in df.iterrows():
    order_dict = {}
    items_lst = []
    ## column_list is a list of column headers
    column_list = df.columns.values
    for i, col_name in enumerate(column_list):
        ## for the first 2 columns simply copy the value into the dictionary
        if i < 2:
            element = row[col_name]
            if isinstance(element, str):
                ## strip if it is a string type value
                element = element.strip()
            order_dict[col_name] = element
        elif "_id" in col_name:
            ## i+1 is used assuming that the item quantity comes right
            ## after the corresponding item id for each item
            item_dict = {"id": row[col_name], "quantity": row[column_list[i + 1]]}
            items_lst.append(item_dict)
    order_dict["items"] = items_lst
    output_lst.append(order_dict)
print(json.dumps(output_lst))
If you run the above with the simple.csv described in the question, then you get the following output:
[
{
"order_id": 1,
"items": [
{
"id": 4,
"quantity": 1
},
{
"id": 24,
"quantity": 4
},
{
"id": 16,
"quantity": 1
}
],
" customer_name": "John"
},
{
"order_id": 2,
"items": [
{
"id": 8,
"quantity": 3
},
{
"id": 41,
"quantity": 1
},
{
"id": 33,
"quantity": 1
}
],
" customer_name": "Paul"
},
{
"order_id": 3,
"items": [
{
"id": 1,
"quantity": 1
},
{
"id": 34,
"quantity": 4
},
{
"id": 8,
"quantity": 2
}
],
" customer_name": "Andrew"
}
]
Source DF:
In [168]: df
Out[168]:
order_id customer_name item_1_id item_1_quantity Item_2_id Item_2_quantity Item_3_id Item_3_quantity
0 1 John 4 1 24 4 16 1
1 2 Paul 8 3 41 1 33 1
2 3 Andrew 1 1 34 4 8 2
Solution:
In [169]: %paste
import re
x = df[['order_id','customer_name']].copy()
x['id'] = pd.Series(
    df.loc[:, df.columns.str.contains(r'item_.*?_id', flags=re.I)].values.tolist(),
    index=df.index)
x['quantity'] = pd.Series(
    df.loc[:, df.columns.str.contains(r'item_.*?_quantity', flags=re.I)].values.tolist(),
    index=df.index)
x.to_json(orient='records')
## -- End pasted text --
Out[169]: '[{"order_id":1,"customer_name":"John","id":[4,24,16],"quantity":[1,4,1]},{"order_id":2,"customer_name":"Paul","id":[8,41,33],"quantity":[3,1,1]},{"order_id":3,"customer_name":"Andrew","id":[1,34,8],"quantity":[1,4,2]}]'
Intermediate helper DF:
In [82]: x
Out[82]:
order_id customer_name id quantity
0 1 John [4, 24, 16] [1, 4, 1]
1 2 Paul [8, 41, 33] [3, 1, 1]
2 3 Andrew [1, 34, 8] [1, 4, 2]
Alternatively, the same result as a single chain, grouping the item columns by their suffix:
j = df.set_index(['order_id','customer_name']) \
      .groupby(lambda x: x.split('_')[-1], axis=1) \
      .agg(lambda x: x.values.tolist()) \
      .reset_index() \
      .to_json(orient='records')
import json
Beautified result:
In [122]: print(json.dumps(json.loads(j), indent=2))
[
{
"order_id": 1,
"customer_name": "John",
"id": [
4,
24,
16
],
"quantity": [
1,
4,
1
]
},
{
"order_id": 2,
"customer_name": "Paul",
"id": [
8,
41,
33
],
"quantity": [
3,
1,
1
]
},
{
"order_id": 3,
"customer_name": "Andrew",
"id": [
1,
34,
8
],
"quantity": [
1,
4,
2
]
}
]
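If you need the exact "items" shape from the question (a list of {id, quantity} dicts per order rather than two parallel lists), one extra step over the helper DF x would get you there; a sketch:
# combine the parallel 'id' and 'quantity' lists into a list of dicts per order
x['items'] = [
    [{"id": i, "quantity": q} for i, q in zip(ids, quantities)]
    for ids, quantities in zip(x['id'], x['quantity'])
]
out = x[['order_id', 'customer_name', 'items']].to_json(orient='records')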

MongoDB aggregation and group by id and then date

My audit_records collection is as below:
{u'policy_holder': u'Kapil', u'_id': ObjectId('4d663451d1e7242c4b68e000'), u'audit_time': datetime.datetime(2015, 9, 6, 10, 5, 12, 474000), u'policy_ids': [u'92b7bbfa-688e9e5304d5'], u'category': u'TIManagement'}
{u'policy_holder': u'Sunil', u'_id': ObjectId('4d6634514cb5cb2c4b69e000'), u'audit_time': datetime.datetime(2015, 9, 6, 11, 5, 12, 474000), u'policy_ids': [u'92b7bbfa-688e9e5304d5'], u'category': u'PIManagement'}
{u'policy_holder': u'Edward', u'_id': ObjectId('4d6634514cb5cb2c4b65e000'), u'audit_time': datetime.datetime(2015, 8, 3, 12, 4, 2, 723000), u'policy_ids': [u'92b7ccge-688e9e5304d5'], u'category': u'TIManagement'}
I'm querying my database using an aggregation pipeline to group by policy_ids and count the number of policy holders associated with each policy_id.
My code is as below:
startdate = datetime.datetime.strptime("2015-01-06", '%Y-%m-%d')
enddate = datetime.datetime.strptime("2015-10-01", '%Y-%m-%d')
pipe = [
    {'$match': {"audit_time": {"$gt": startdate, "$lte": enddate}}},
    {'$group': {'_id': '$policy_ids', 'policy_holder': {'$sum': 1}}}
]
for data in db.audit_records.aggregate(pipeline=pipe):
    print(data)
I got:
{u'policy_holder': 2, u'_id': u'92b7bbfa-688e9e5304d5'}
{u'policy_holder': 1, u'_id': u'92b7ccge-688e9e5304d5'}
Now I want to group this whole output by date as well. Is that possible, and how?
You have to use the aggregation pipeline with $unwind followed by $group:
db.collection.aggregate([
    {$unwind: "$policy_ids"},
    {$group: {_id: {policy_id: "$policy_ids", audit_time: "$audit_time"}, sum: {$sum: 1}}}
])
I modified your documents a bit and inserted them like this:
{'policy_holder': 'Kapil', '_id': ObjectId('4d663451d1e7242c4b68e000'), 'audit_time': new Date(2015, 9, 6, 10, 5, 12, 474000), 'policy_ids': ['92b7bbfa-688e9e5304d5'], 'category': 'TIManagement'}
{'policy_holder': 'Sunil', '_id': ObjectId('4d6634514cb5cb2c4b69e000'), 'audit_time': new Date(2015, 9, 6, 11, 5, 12, 474000), 'policy_ids': ['92b7bbfa-688e9e5304d5'], 'category': 'PIManagement'}
{'policy_holder': 'Edward', '_id': ObjectId('4d6634514cb5cb2c4b65e000'), 'audit_time': new Date(2015, 8, 3, 12, 4, 2, 723000), 'policy_ids': ['92b7ccge-688e9e5304d5'], 'category': 'TIManagement'}
Updated aggregation query:
db.policy.aggregate([
    {$unwind: "$policy_ids"},
    {$group: {
        _id: {
            "policy": "$policy_ids",
            day: {$dayOfYear: "$audit_time"},
            year: {$year: "$audit_time"}
        },
        total: {$sum: 1}
    }}
])
Output:
{ "_id" : { "policy" : "92b7ccge-688e9e5304d5", "day" : 246, "year" : 2015 }, "total" : 1 }
{ "_id" : { "policy" : "92b7bbfa-688e9e5304d5", "day" : 279, "year" : 2015 }, "total" : 2 }
Hope this is what you were expecting.
