Splitting Excel Data by Groupings into Separate Workbook Sheets - python

Background:I have a large 40MB XLSX file that contains client data which is Grouped over multiple levels, like so:
Expanded -
Not Expanded (sorry about the terrible dummy data!) -
Objective: I would like to split Client A, B, C, etc. and all their respective underlying data into separate sheets (named 'Client A', etc.) in a workbook.
Question:Am I correct in assuming that there is no python library that would help with this (e.g., xlsxwriter) and that I will likely have to save into multiple pandas df before splitting and writing to the xlsx file?
Sample Data:Here is a link to some randomized sample data. In this file you will see only 1 client (the total row can be ignored) however imagine the normal file having 40 clients / groupings and sub levels.
Sample Code: this script takes the `.xlsx` file and writes each grouping to an appropriately named worksheet (e.g., 'Client A') in a new `.xlsx` workbook. The issue with this code is that, because I am basically going through and copying each cell individually, I did not think more holistically about how to ensure the Groupings/Levels would be preserved. I think this code needs a complete re-write, and I welcome feedback.
import openpyxl
from copy import copy
from openpyxl import load_workbook
columns=['A','B','C','D','E','F','G','H','I','J','K','L']


def copy_cell(ws, row, ws_row, ws1):
    """Copy cell styling for every column in ``columns`` between two rows.

    Only style attributes are transferred; cell values are left untouched.
    Source cells that report no style are skipped.

    Args:
        ws: Destination worksheet.
        row: Row number in the destination worksheet.
        ws_row: Row number in the source worksheet.
        ws1: Source worksheet.
    """
    style_attributes = (
        'font',
        'border',
        'fill',
        'number_format',
        'protection',
        'alignment',
    )
    for letter in columns:
        source = ws1[f'{letter}{ws_row}']
        target = ws[f'{letter}{row}']
        if not source.has_style:
            continue
        # Copy each style object so the two workbooks never share instances.
        for attribute in style_attributes:
            setattr(target, attribute, copy(getattr(source, attribute)))
# Load the source workbook; the grouped client data lives on the active sheet.
wb1 = openpyxl.load_workbook('annonamized_test_data_to_be_split.xlsx')
ws1 = wb1.active

# Collect the row number and name of every client row. A client row is a row
# whose column-A cell has no indent. The scan stops at the first empty
# column-A cell; that row number is stored as the end marker of the last
# client block, so ``indexs`` always has one more entry than ``clients``.
indexs = []
clients = []
index = 1
while True:
    cell = ws1['A' + str(index)]
    if cell.value is None:
        indexs.append(index)
        break
    if cell.alignment.indent == 0:
        indexs.append(index)
        clients.append(cell.value)
    index += 1

wb = openpyxl.Workbook()
headers = ['Ownership Structure', 'Fee Schedule', 'Management Style', 'Advisory Firm', 'Inception Date', 'Days in Time Period', 'Adjusted Average Daily Balance (No Div, USD)', 'Assets Billed On (USD)',
           'Effective Billing Rate', 'Billing Fees (USD)', 'Bill To Account', 'Model Type']

# The first block starts at row 1, so anything above the first client row is
# written to the first client's sheet; every later block starts on its own
# client row and ends just before the next one.
start_index = 1
for y, client in enumerate(clients):
    end_index = indexs[y + 1]
    # Excel limits sheet titles to 31 characters.
    title = str(client)[:31]
    try:
        # Use the returned sheet: openpyxl may rename a duplicate title, so
        # looking the sheet up by the requested title could hit another sheet.
        ws = wb.create_sheet(title)
    except ValueError as error:
        # Raised for titles containing characters Excel does not allow.
        print(f'Skipping client {title!r}: {error}')
        start_index = end_index
        continue
    ws.column_dimensions['A'].width = 35
    ws.append(headers)
    for row_index, i in enumerate(range(start_index, end_index), start=2):
        ws.append([ws1[col + str(i)].value for col in columns])
        copy_cell(ws, row_index, i, ws1)
    start_index = end_index
wb1.close()

# Remove the automatically created default sheets before saving. Each title is
# checked on its own so a missing 'Sheet1' no longer prevents 'Sheet' from
# being removed, and at least one sheet is always kept because a workbook
# without sheets cannot be saved.
for default_title in ('Sheet', 'Sheet1'):
    if default_title in wb.sheetnames and len(wb.sheetnames) > 1:
        wb.remove(wb[default_title])
wb.save('split_data.xlsx')
wb.close()
Please can someone point me in the right direction of a resource that might teach me how to achieve this?

from openpyxl import load_workbook
def get_client_rows(sheet):
    """Return the row numbers of all client rows in ``sheet``.

    The header row (row 1) is skipped. A row is treated as a client row when
    the alignment indent of its first cell is 0.

    Args:
        sheet: Worksheet to scan.

    Returns:
        List of 1-based row numbers, in sheet order.
    """
    return [
        row[0].row
        for row in sheet.iter_rows(min_row=2)
        if row[0].alignment.indent == 0.0
    ]
def delete_client_block(sheet, start, end):
    """Delete the rows ``start`` through ``end`` (both inclusive) from ``sheet``.

    The row-dimension entries of the deleted rows are discarded first, then
    the rows themselves are removed.
    """
    row_count = end - start + 1
    dimensions = sheet.row_dimensions
    for offset in range(row_count):
        dimensions.pop(start + offset, None)
    sheet.delete_rows(start, row_count)
def split_workbook(input_file, output_file):
    """Split a workbook so that each main group gets its own sheet.

    So as not to lose any formatting, the data sheet is copied once per
    client and every row that does not belong to that client is removed
    from the copy. The original data sheet is dropped before saving.

    Args:
        input_file: Path of the workbook to read.
        output_file: Path the split workbook is written to.
    """
    # Load outside the ``try`` so a failed load is not masked by a NameError
    # from the ``finally`` clause.
    workbook = load_workbook(input_file)
    try:
        data_sheet = workbook.active
        client_rows = get_client_rows(data_sheet)
        first_client_row = client_rows[0] if client_rows else None
        for index, client_row in enumerate(client_rows):
            # Create a new sheet for the client; Excel limits sheet titles
            # to 31 characters, so the client name is shortened to fit.
            client_sheet = workbook.copy_worksheet(data_sheet)
            client_sheet.title = data_sheet.cell(client_row, 1).value[:31]
            # Delete rows after the current client, if there are any.
            if index < len(client_rows) - 1:
                row_after_client = client_rows[index + 1]
                delete_client_block(
                    client_sheet, row_after_client, client_sheet.max_row
                )
            # Delete rows before the current client, if there are any.
            if index > 0:
                # ``end`` is an inclusive row number: the last row to delete
                # is the one directly above the client row.
                delete_client_block(
                    client_sheet, first_client_row, client_row - 1
                )
                # delete_rows() does not move row dimensions, so shift the
                # remaining ones up by the number of rows just removed.
                # Ascending order guarantees a target slot is already free.
                shift = client_row - first_client_row
                for row_index in sorted(client_sheet.row_dimensions):
                    # Leave the header row dimensions where they are.
                    if row_index >= first_client_row:
                        row_dimension = client_sheet.row_dimensions.pop(row_index)
                        new_index = row_index - shift
                        row_dimension.index = new_index
                        client_sheet.row_dimensions[new_index] = row_dimension
        # Keep the data sheet when nothing was split off: a workbook without
        # any sheet cannot be saved.
        if client_rows:
            del workbook[data_sheet.title]
        workbook.save(output_file)
    finally:
        workbook.close()
if __name__ == "__main__":
    # Alternative sample pair kept for reference:
    #   "annonamized_test_data_to_be_split.xlsx" -> "split_data.xlsx"
    input_file, output_file = (
        'partial_Q1_Client_Billing_Data.xlsx',
        "splitting_full_data.xlsx",
    )
    split_workbook(input_file, output_file)

Related

Read and Write multiple excel data into one excel file using openpyxl

I am trying to copy the data from multiple Excel files into one Excel file. I am a novice at Python and openpyxl, so I have opened each file and gone row by row to copy the cells. I want to do this with multiple files. How do I loop through the rows and columns and copy the data, given that the columns are in the same order in all the files?
import openpyxl as xl
from openpyxl import workbook
# Source workbook and the sheet holding the incident report.
incident_wb = xl.load_workbook('incident resolved yesterday.xlsx')
incident_sheet = incident_wb['Page 1']

# Destination workbook with a single, renamed sheet.
combined_wb = xl.Workbook()
combined_sheet = combined_wb.active
combined_sheet.title = "combined_sheet"

# The report has 11 columns: incident number, opened date, short description,
# requester, incident type, priority, assignment group, assigned to, updated,
# status and sub status.
COLUMN_COUNT = 11

# Copy the data into the new sheet, cell by cell, keeping row/column positions.
for row in range(1, incident_sheet.max_row + 1):
    for column in range(1, COLUMN_COUNT + 1):
        combined_sheet.cell(row, column).value = incident_sheet.cell(row, column).value

# Save once, after all rows have been copied.
combined_wb.save('combined_sheet.xlsx')
An alternative approach would be to build a list of data from multiple Excel files and then write it to another file.
As a proof of concept:
import openpyxl as xl
from openpyxl import workbook
def provide_data(workbookName, sheetName):
    """Load one sheet and return its contents as a list of rows.

    Each row is itself a list holding the values of that row's cells.
    """
    workbook = xl.load_workbook(workbookName)
    worksheet = workbook[sheetName]
    table = []
    for cells in worksheet.iter_rows():
        table.append([cell.value for cell in cells])
    return table
# The result is a list of rows, each of which is a list of cell values.
# A mapping from cells to a business object would be a better return type.
def save_data(list_of_sheets, output_file='combined_sheet.xlsx'):
    """Write the rows of several sheets, one after another, into one workbook.

    Args:
        list_of_sheets: Iterable of sheets, each an iterable of rows, each
            row being a sequence of cell values.
        output_file: Path of the workbook to write. Defaults to
            ``'combined_sheet.xlsx'``.
    """
    combined_wb = xl.Workbook()
    combined_sheet = combined_wb.active
    combined_sheet.title = "combined_sheet"
    for sheet in list_of_sheets:
        for row in sheet:
            # Rows from all sheets are appended below each other.
            combined_sheet.append(row)
    combined_wb.save(output_file)
# Read every (workbook, sheet) pair and write all their rows to one workbook.
workSheetsToCopy = [
    provide_data(file_name, sheet_name)
    for file_name, sheet_name in (
        ('incident resolved yesterday.xlsx', 'Page 1'),
        ('other.xlsx', 'Page 1'),
    )
]
save_data(workSheetsToCopy)

Search and find values in two excel sheets(xlrd to openpyxl)

# Workbook that is searched.
wrbk = xlrd.open_workbook("D:Book1.xlsx")
# Workbook providing the search terms (third column of 'Sheet4').
book_1 = xlrd.open_workbook("D:Book2.xlsx")
sh_1 = book_1.sheet_by_name('Sheet4')

# xlrd indices are 0-based; row 0 is skipped, as in the original loop.
for i in range(1, sh_1.nrows):
    concat = sh_1.cell(i, 2).value
    if not concat:
        # Nothing to search for in an empty cell.
        continue
    # Compare as text so numeric cells can be searched for as well.
    needle = str(concat)
    for sht in wrbk.sheets():
        for j in range(sht.ncols):
            for row in range(sht.nrows):
                if needle in str(sht.cell(row, j).value):
                    print(concat)
I'm using this code to find a value in one workbook and then search for that value in another workbook.
I'm using xlrd and the output is fine so far, but I can't both read and write with xlrd. I need suggestions for changing this code from xlrd to openpyxl.
This defines a function to do the search and uses a Regular Expression to do the 'contains' match. Change the print to suit.
from openpyxl import load_workbook
import re
# Open both workbooks. The trailing comments give the name each object had
# in the xlrd version of this script.
excel_file1 = 'D:Book1.xlsx'
wb1 = load_workbook(excel_file1) # wrbk: the workbook that is searched
ws1 = wb1["Sheet1"]
excel_file2 = 'D:Book2.xlsx'
wb2 = load_workbook(excel_file2) # book_1: the workbook providing the search terms
ws2 = wb2["Sheet4"] # sh_1: the sheet whose column C is scanned
# fn to search all sheets in workbook
def myfind(wb, s):
    """Print every cell in ``wb`` whose text matches the pattern ``s``.

    All worksheets are scanned column by column. Cell values are converted
    to text before matching, so numeric cells can be searched as well.

    Args:
        wb: Workbook to search.
        s: Regular expression (or compiled pattern) to look for. Values
            that are not strings are converted with ``str()``.
    """
    # Compile once instead of on every cell.
    pattern = s if hasattr(s, "search") else re.compile(str(s))
    for ws in wb.worksheets:
        for c in range(1, ws.max_column + 1):
            for r in range(1, ws.max_row + 1):
                txt = ws.cell(r, c).value
                if txt is None:
                    continue
                if pattern.search(str(txt)):
                    print("Found", s, txt, ws, r, c)
# Take every non-empty value in column C of ws2 and search for it in wb1.
for r in range(1, ws2.max_row + 1):
    s = ws2.cell(r, 3).value
    if s is not None:
        print(r, s)
        myfind(wb1, s)

xlsxwriter and pandas for reporting

I am trying to create a basic excel report.
I am trying display a dataframe as well as some custom text/titles, not part of the dataframe.
However, I can only get one or the other. I don't really understand the end of the code that is needed for the dataframe to appear (workbook = writer.book and worksheet = writer.sheets['Reports']).
Here is my code:
# NOTE(review): indentation of this snippet was lost; the statements after the
# `for` line are presumably its body - confirm against the original post.
# NOTE(review): `pd`, `xlsxwriter` and `createHeadings` are used below but are
# not imported or defined in this snippet.
writer = pd.ExcelWriter('reportTemplate.xlsx', engine='xlsxwriter')
# NOTE(review): this creates a second xlsxwriter workbook for the same file
# name that `writer` targets, so the title and headings are written to a
# different workbook object than the dataframe.
workbook = xlsxwriter.Workbook('reportTemplate.xlsx')
worksheet = workbook.add_worksheet('Reports')
# REPORT TITLE
worksheet.write('D2','Daily In-Store Report')
# NOTE(review): the next three statements repeat the three above and rebind
# `workbook` and `worksheet` to yet another workbook object.
workbook = xlsxwriter.Workbook('reportTemplate.xlsx')
worksheet = workbook.add_worksheet('Reports')
worksheet.write('D2','Daily In-Store Report')
# Report periods; each one has its own 'fy_<period>.csv' input file.
reportTimes = ['Day','Week','Period','Quarter','Year']
# Index labels the dataframe is re-indexed to, in display order.
cityList = ['ontario','bayshore','ottawa','limeridge','oshawa','scarborough','sherway','massonville','gatineau',
'quebec','anjou','dix30','Fairview','laval','mtltrust','stbruno','gcapitale','stefoy','rivieres','chicoutimi','sherbrooke','canada']
# LOOP THROUGH FILES
# 1-based worksheet row at which the next table title is written.
rowNb = 4
for time in reportTimes:
# TITLE
tableTitle = time + ' report as of ...'
worksheet.write('A'+str(rowNb),tableTitle)
rowNb += 1
headRow, secondHead = createHeadings(time)
# NOTE(review): both heading rows are written to the same cell range, so the
# second call overwrites the first.
worksheet.write_row('B' + str(rowNb), headRow)
worksheet.write_row('B' + str(rowNb), secondHead)
rowNb += 2
df = pd.read_csv('fy_' + time.lower() + '.csv')
df.set_index('legacy_id',inplace=True)
df = df.reindex(cityList)
print(df)
# NOTE(review): rowNb is not advanced by the number of rows written here.
df.to_excel(writer,sheet_name='Reports',startrow = rowNb,header=False)
# Rebinds `workbook`/`worksheet` to the objects owned by the pandas writer.
workbook = writer.book
worksheet = writer.sheets['Reports']
writer.save()
As the code is right now, it only displays the dataframe
It's not clear to me whether you're trying to write multiple sheets in one Excel file. If so, the problem may be that you're re-writing the same sheet called 'Reports' four times. Also, here are some basics to try. Put the df.to_excel() after pd.ExcelWriter(). Then remove from the for loop the last four lines. Finally, put writer.save() after the for loop ends. (This was not very clear for me when I first learned them, too. See more examples at this link.)
Edit: here's fully executing code (with stub data). One of the keys was to enable multiple writes to the worksheet using writer.sheets['Reports'] = worksheet - see this explanation.
import numpy as np
import pandas as pd

# Stub data standing in for the 'fy_<period>.csv' files.
dummy_df = pd.DataFrame(
    [[10, np.nan], [12, 42], [16, np.nan], [20, 3], [25, 16], [30, 1], [40, 19], [60, 99]],
    columns=['legacy_id', 'b'],
)
reportTimes = ['Day','Week','Period','Quarter','Year']
cityList = ['ontario','bayshore','ottawa','limeridge','oshawa','scarborough','sherway','massonville','gatineau',
            'quebec','anjou','dix30','Fairview','laval','mtltrust','stbruno','gcapitale','stefoy','rivieres','chicoutimi','sherbrooke','canada']

# The context manager saves and closes the file when the block ends.
with pd.ExcelWriter('reportTemplate.xlsx', engine='xlsxwriter') as writer:
    # Use the workbook owned by the pandas writer so that the custom text and
    # the dataframes end up in the same file.
    workbook = writer.book
    worksheet = workbook.add_worksheet('Reports')
    writer.sheets['Reports'] = worksheet  # enable multiple writes to sheet

    # REPORT TITLE
    worksheet.write('D2', 'Daily In-Store Report')

    # LOOP THROUGH FILES
    # rowNb is the 1-based worksheet row of the next thing to write.
    rowNb = 4
    for time in reportTimes:
        # TITLE
        tableTitle = time + ' report as of ...'
        worksheet.write('A' + str(rowNb), tableTitle)
        rowNb += 1

        # Placeholder headings (createHeadings(time) is not available here).
        # write_row() expects a sequence of cell values; a bare string would
        # be split into one character per cell.
        headRow, secondHead = ["dummy head row"], ["dummy second head"]
        worksheet.write_row('B' + str(rowNb), headRow)
        # The second heading goes on the next row so it does not overwrite
        # the first one.
        worksheet.write_row('B' + str(rowNb + 1), secondHead)
        rowNb += 2

        df = dummy_df.copy(deep=True)  # pd.read_csv('fy_' + time.lower() + '.csv')
        df.set_index('legacy_id', inplace=True)
        df = df.reindex(cityList)
        df.to_excel(writer, sheet_name='Reports', startrow=rowNb)

        # startrow is 0-based: the column header lands on 1-based row
        # rowNb + 1 and the data on the len(df) rows below it. Skip those
        # rows plus one blank spacer row before the next title.
        rowNb += len(df) + 3

Split excel file into multiple by excel groups

By groups I mean this expandable things:
When clicking on them we can expand some rows. In this particular case I need to extract the rows into different destinations (in any form: lists of rows or, better, other Excel files), grouped by the first (outermost) group level:
So that, for example in this case:
file1.xlsx will include all rows from 6 to 572
file2.xlsx will include rows from 573 to 627
and so on.
How to perform this? It can be VBA script, but better with some python library like openpyxl or win32com.client
# -*- coding: utf-8 -*-
import openpyxl
wb = openpyxl.load_workbook(r'path_to_xlsx_file')
ws = wb.active
range_string = ws.calculate_dimension()
print(range_string)
# Worksheet rows are numbered from 1, so walk the real row numbers instead of
# a 0-based counter; otherwise every lookup is shifted by one row.
for row_index in range(ws.min_row, ws.max_row + 1):
    dimension = ws.row_dimensions[row_index]
    print(dimension.index,          # row number the dimension belongs to
          dimension.outline_level,  # grouping depth of the row
          dimension.hidden,         # a couple of other helpful parameters
          dimension.collapsed,
          dimension.height)
I built something similar - this takes a .xlsx and splits all groupings (in this case clients) into separate worksheets. It is optimized using openpyxl's openpyxl.worksheet._read_only.ReadOnlyWorksheet -
Optimised Modes - Read-only mode: Sometimes, you will need to open or
write extremely large XLSX files, and the common routines in openpyxl
won’t be able to handle that load. Fortunately, there are two modes
that enable you to read and write unlimited amounts of data with
(near) constant memory consumption.
Script:
from openpyxl import load_workbook
from openpyxl import LXML
import time
from copy import copy
from openpyxl.cell import WriteOnlyCell
from openpyxl import Workbook
import logging
import sys
logger = logging.getLogger()


def configure_logging():
    """Send INFO-level records of the root logger to standard output.

    Each record is formatted as ``<timestamp> - <message>``.
    """
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(logging.Formatter("%(asctime)s - %(message)s"))
    stdout_handler.setLevel(logging.INFO)
    logger.addHandler(stdout_handler)
    logger.setLevel(logging.INFO)
# Characters that are stripped from worksheet titles.
INVALID_TITLE_CHARS = ["]", "[", "*", ":", "?", "/", "\\", "'"]
# Translation table that deletes every invalid character.
INVALID_TITLE_CHAR_MAP = {ord(x): "" for x in INVALID_TITLE_CHARS}


def clean_sheet_title(title):
    """Turn a cell value into a usable worksheet title.

    Surrounding whitespace is stripped, every character listed in
    ``INVALID_TITLE_CHARS`` is removed and the result is cut to 31
    characters.

    Args:
        title: Cell value to use as a title. ``None`` yields an empty
            string; values that are not strings (e.g. numbers) are
            converted with ``str()``.

    Returns:
        The cleaned title, at most 31 characters long.
    """
    if title is None:
        return ""
    cleaned = str(title).strip().translate(INVALID_TITLE_CHAR_MAP)
    return cleaned[:31]
def is_client_row(row, row_dimension):
    """Tell whether ``row`` starts a new client block.

    The decision is based on the first cell of the row: a client row has an
    alignment indent of 0. ``row_dimension`` is currently unused; testing
    ``row_dimension.outline_level == 0`` is the alternative criterion.
    """
    leading_cell = row[0]
    indent = leading_cell.alignment.indent
    return indent == 0.0
def create_write_only_cell(source_cell, target_sheet):
    """Build a write-only cell for ``target_sheet`` mirroring ``source_cell``.

    The value and data type are always taken over. If the source cell is
    styled, its font, number format and alignment are copied as well.
    """
    clone = WriteOnlyCell(target_sheet, value=source_cell.value)
    clone.data_type = source_cell.data_type
    if not source_cell.has_style:
        return clone
    # Border, fill and protection are deliberately not copied yet.
    # TODO save one border and use it
    # TODO copy client row (fill)
    for style_name in ("font", "number_format", "alignment"):
        setattr(clone, style_name, copy(getattr(source_cell, style_name)))
    return clone
def create_write_only_row(source_row, target_sheet):
    """Convert every cell of ``source_row`` into a write-only cell, in order."""
    cells = []
    for source_cell in source_row:
        cells.append(create_write_only_cell(source_cell, target_sheet))
    return cells
def split_workbook(input_file, output_file):
    """Split a workbook so that each client gets its own sheet.

    The active sheet of ``input_file`` is read row by row. Every client row
    starts a new sheet in a write-only output workbook; the header row, the
    column dimensions and the row dimensions (which carry the grouping
    levels) are copied along with the cells.

    Args:
        input_file: Path of the workbook to read.
        output_file: Path the split workbook is written to.
    """
    logger.info(f"Loading workbook {input_file}")
    # Create both workbooks before the ``try`` so the ``finally`` clause never
    # refers to a name that was not bound.
    workbook = load_workbook(input_file)
    output_workbook = Workbook(write_only=True)
    try:
        data_sheet = workbook.active
        client_sheet = None
        client_row_index = 2
        processing_client = 0
        rows = data_sheet.rows
        # An empty sheet has no header; the loop below then does nothing.
        header = next(rows, None)
        for index, row in enumerate(rows, start=2):
            # TODO implement skip row
            row_dimension = data_sheet.row_dimensions[index]
            # Start a new sheet whenever a new client is found.
            if is_client_row(row, row_dimension):
                processing_client += 1
                client_sheet_title = clean_sheet_title(row[0].value)
                logger.info(f"Processing client {processing_client}")
                client_sheet = output_workbook.create_sheet(client_sheet_title)
                client_row_index = index
                # Copy column dimensions; in write-only mode this has to
                # happen before the first row is appended.
                for key, column_dimension in data_sheet.column_dimensions.items():
                    client_sheet.column_dimensions[key] = copy(column_dimension)
                    client_sheet.column_dimensions[key].worksheet = client_sheet
                client_sheet.append(create_write_only_row(header, client_sheet))
            if client_sheet is not None:
                # Row 1 of every client sheet is the header, so the client
                # row itself becomes row 2.
                target_index = index - client_row_index + 2
                # Copy the row dimension and re-home the copy under its new
                # row number on the client sheet.
                target_dimension = copy(row_dimension)
                target_dimension.index = target_index
                target_dimension.worksheet = client_sheet
                client_sheet.row_dimensions[target_index] = target_dimension
                # Finally copy the row.
                client_sheet.append(create_write_only_row(row, client_sheet))
            # Rows above the first client row belong to no client and are
            # not copied.
            if index % 10000 == 0:
                logger.info(f"{index} rows processed")
        logger.info(f"Writing workbook {output_file}")
        output_workbook.save(output_file)
    finally:
        workbook.close()
        output_workbook.close()
if __name__ == "__main__":
    # Time the whole run, including logging setup.
    input_file, output_file = "input_file.xlsx", "output_file.xlsx"
    start = time.time()
    configure_logging()
    logger.info(f"Using lxml mode: {LXML}")
    split_workbook(input_file, output_file)
    elapsed = time.time() - start
    logger.info("Time consumed: % s seconds" % elapsed)

Importing Multiple Excel Files using OpenPyXL

I am trying to read in multiple excel files and append the data from each file into one master file. Each file will have the same headers (So I can skip the import of the first row after the initial file).
I am pretty new to both Python and the OpenPyXL module. I am able to import the first workbook without problem. My problem comes in when I need to open the subsequent file and copy the data to paste into the original worksheet.
Here is my code so far:
# Creating blank workbook
from openpyxl import Workbook
wb = Workbook()
# grab active worksheet
ws = wb.active
# Read in excel data
from openpyxl import load_workbook
# NOTE(review): `wb` is rebound here, so the blank workbook created above is
# discarded without ever being used.
wb = load_workbook('first_file.xlsx') #explicitly loading workbook, will automate later
# grab active worksheet in current workbook
ws = wb.active
#get max columns and rows
# NOTE(review): get_sheet_by_name() is deprecated in newer openpyxl releases;
# wb['Sheet1'] is the current spelling - confirm against the installed version.
sheet = wb.get_sheet_by_name('Sheet1')
print ("Rows: ", sheet.max_row) # for debugging purposes
print ("Columns: ", sheet.max_column) # for debugging purposes
# Coordinate of the bottom-right cell of the first file's data.
last_data_point = ws.cell(row = sheet.max_row, column = sheet.max_column).coordinate
print ("Last data point in current worksheet:", last_data_point) #for debugging purposes
#import next file and add to master
# Coordinate of the first cell below the existing data, in column A.
append_point = ws.cell(row = sheet.max_row + 1, column = 1).coordinate
print ("Start new data at:", append_point)
# NOTE(review): `wb` is rebound again; from here on it refers to
# second_file.xlsx, while `ws` and `sheet` still belong to first_file.xlsx.
wb = load_workbook('second_file.xlsx')
sheet2 = wb.get_sheet_by_name('Sheet1')
# NOTE(review): the `coordinate` keyword of cell() is presumably not accepted
# by current openpyxl releases - verify; ws['A2'] is the usual form.
start = ws.cell(coordinate='A2').coordinate
print("New data start: ", start)
end = ws.cell(row = sheet2.max_row, column = sheet2.max_column).coordinate
print ("New data end: ", end)
# write a value to selected cell
#sheet[append_point] = 311
#print (ws.cell(append_point).value)
#save file
# NOTE(review): no cell values are copied between the two files above, and
# `wb` is the second file's workbook at this point, so this writes the
# contents of second_file.xlsx to master_file.xlsx.
wb.save('master_file.xlsx')
Thanks!
I don't really understand your code. It looks too complicated. When copying between worksheets you probably want to use ws.rows.
# Master workbook that receives the rows of all other files.
wb1 = load_workbook('master.xlsx')
ws1 = wb1.active
# `files` is expected to be an iterable of workbook paths defined by the caller.
for f in files:
    wb2 = load_workbook(f)
    ws2 = wb2['Sheet1']
    # min_row=2 skips the header row, which the master sheet already has.
    for row in ws2.iter_rows(min_row=2):
        ws1.append([cell.value for cell in row])
# Without saving, the appended rows would be lost.
wb1.save('master.xlsx')

Categories

Resources