Split excel file into multiple by excel groups - python

By groups I mean these expandable things:
When we click on them, some rows expand. In this particular case I need to extract the rows into separate data locations (a list of rows would do, but separate Excel files would be better), grouped by the first (outermost) group level:
So, for example, in this case:
file1.xlsx will include all rows from 6 to 572
file2.xlsx will include rows from 573 to 627
and so on.
How can I do this? It can be a VBA script, but a Python library such as openpyxl or win32com.client would be better.

# -*- coding: utf-8 -*-
import openpyxl
# Load the workbook and work on whichever sheet was active when it was saved.
wb = openpyxl.load_workbook(r'path_to_xlsx_file')
ws = wb.active
range_string = ws.calculate_dimension()
print(range_string)
# Worksheet rows are 1-based, so walk the real row numbers.  Counting from 0
# with enumerate() looks up row_dimensions[0], which is not a row of the
# sheet, and prints every row's settings against the wrong row.
for row_index in range(ws.min_row, ws.max_row + 1):
    row_dimension = ws.row_dimensions[row_index]
    print(row_dimension.index,  # just for the great LULZ
          row_dimension.outline_level,  # THAT what I was looking for!
          row_dimension.hidden,  # couple other helpful parameters
          row_dimension.collapsed,
          row_dimension.height)

I built something similar - this takes a .xlsx and splits all groupings (in this case clients) into separate worksheets. It is optimized using openpyxl's optimised modes (see openpyxl.worksheet._read_only.ReadOnlyWorksheet). From the openpyxl documentation:
Optimised Modes - Read-only mode: Sometimes, you will need to open or
write extremely large XLSX files, and the common routines in openpyxl
won’t be able to handle that load. Fortunately, there are two modes
that enable you to read and write unlimited amounts of data with
(near) constant memory consumption.
Script:
from openpyxl import load_workbook
from openpyxl import LXML
import time
from copy import copy
from openpyxl.cell import WriteOnlyCell
from openpyxl import Workbook
import logging
import sys
# The root logger; every message of the script goes through it.
logger = logging.getLogger()


def configure_logging():
    """Send INFO-and-above records of the root logger to standard output.

    Each record is rendered as "<timestamp> - <message>".  Every call attaches
    one more stdout handler to the root logger.
    """
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(logging.Formatter("%(asctime)s - %(message)s"))
    stdout_handler.setLevel(logging.INFO)
    logger.addHandler(stdout_handler)
    logger.setLevel(logging.INFO)
# Characters that are stripped from a worksheet title.
INVALID_TITLE_CHARS = ["]", "[", "*", ":", "?", "/", "\\", "'"]
# str.translate() table that maps each of those characters to "" (removes it).
INVALID_TITLE_CHAR_MAP = {ord(x): "" for x in INVALID_TITLE_CHARS}


def clean_sheet_title(title):
    """Return *title* cleaned up for use as a worksheet title.

    Surrounding whitespace is trimmed, every character listed in
    INVALID_TITLE_CHARS is removed and the result is cut to 31 characters.

    Args:
        title: The raw title, typically a cell value.  ``None`` yields an
            empty string; any other non-string value (a number, a date) is
            converted with ``str()`` first.

    Returns:
        The cleaned title, at most 31 characters long.
    """
    # Cell values are not always text; calling .strip() on a number would
    # raise AttributeError, so convert to text before cleaning.
    title = "" if title is None else str(title)
    title = title.strip()
    title = title.translate(INVALID_TITLE_CHAR_MAP)
    return title[:31]
def is_client_row(row, row_dimension):
    """Tell whether *row* starts a new client block.

    A row counts as a client row when its first cell has no indentation.
    *row_dimension* is accepted but not used by this check.
    """
    # Alternative criterion (not active): row_dimension.outline_level == 0
    first_cell = row[0]
    return first_cell.alignment.indent == 0.0
def create_write_only_cell(source_cell, target_sheet):
    """Build a WriteOnlyCell for *target_sheet* that mirrors *source_cell*.

    The value and data type are always carried over.  When the source cell is
    styled, its font, number format and alignment are copied as well; border,
    fill and protection are deliberately left out for now (see the TODOs).
    """
    clone = WriteOnlyCell(target_sheet, value=source_cell.value)
    clone.data_type = source_cell.data_type
    if not source_cell.has_style:
        return clone

    clone.font = copy(source_cell.font)
    # TODO save one border and use it (border is not copied yet)
    # TODO copy client row (fill is not copied yet)
    clone.number_format = copy(source_cell.number_format)
    # protection is not copied either
    clone.alignment = copy(source_cell.alignment)
    return clone
def create_write_only_row(source_row, target_sheet):
    """Return a list with one write-only copy of every cell in *source_row*."""
    copied_cells = []
    for source_cell in source_row:
        copied_cells.append(create_write_only_cell(source_cell, target_sheet))
    return copied_cells
def split_workbook(input_file, output_file):
    """
    Split workbook each client into its own sheet.

    The active sheet of *input_file* is read row by row.  Row 1 is taken as
    the header.  Every row for which is_client_row() is true opens a new sheet
    in a write-only output workbook; that sheet receives the column
    dimensions of the source sheet, a copy of the header and then every row
    up to the next client row, each with its row dimension.

    Args:
        input_file: Path of the workbook to split.
        output_file: Path the resulting workbook is saved to.
    """
    # Bind both names before the try block so the cleanup in `finally` works
    # even when load_workbook() or Workbook() raises.
    workbook = None
    output_workbook = None
    try:
        logger.info(f"Loading workbook {input_file}")
        workbook = load_workbook(input_file)
        data_sheet = workbook.active
        output_workbook = Workbook(write_only=True)
        client_sheet = None
        client_row_index = 2
        processing_client = 0
        rows = data_sheet.rows
        # An empty sheet has no header row; the loop below then has nothing
        # to process and an empty workbook is written.
        header = next(rows, None)
        for index, row in enumerate(rows, start=2):
            # TODO implement skip row
            # if skip_row(row) is True:
            # continue
            row_dimension = data_sheet.row_dimensions[index]
            # create a new sheet when a new client is found
            if is_client_row(row, row_dimension):
                processing_client += 1
                client_sheet_title = clean_sheet_title(row[0].value)
                logger.info(f"Processing client {processing_client}")
                client_sheet = output_workbook.create_sheet(client_sheet_title)
                client_row_index = index
                # copy column dimensions
                for key, column_dimension in data_sheet.column_dimensions.items():
                    client_sheet.column_dimensions[key] = copy(column_dimension)
                    client_sheet.column_dimensions[key].worksheet = client_sheet
                client_sheet.append(create_write_only_row(header, client_sheet))
            elif client_sheet is None:
                # Rows ahead of the first client row belong to no client.
                logger.warning(
                    f"Skipping row {index}: it comes before the first client row"
                )
                continue
            # copy row dimensions; row 1 of the client sheet is the header,
            # so the client row itself lands on row 2
            target_row = index - client_row_index + 2
            target_dimension = copy(row_dimension)
            # Re-home the copy: it must be stored under, and point at, the
            # row of the client sheet rather than the row of the source sheet.
            target_dimension.index = target_row
            target_dimension.worksheet = client_sheet
            client_sheet.row_dimensions[target_row] = target_dimension
            # finally copy row
            client_sheet.append(create_write_only_row(row, client_sheet))
            if index % 10000 == 0:
                logger.info(f"{index} rows processed")
        logger.info(f"Writing workbook {output_file}")
        output_workbook.save(output_file)
    finally:
        if workbook is not None:
            workbook.close()
        if output_workbook is not None:
            output_workbook.close()
if __name__ == "__main__":
    # Fixed file names; edit these to point at the real workbooks.
    input_file = "input_file.xlsx"
    output_file = "output_file.xlsx"

    start = time.time()
    configure_logging()
    logger.info(f"Using lxml mode: {LXML}")
    split_workbook(input_file, output_file)
    # Report the wall-clock time of the whole run, logging setup included.
    elapsed = time.time() - start
    logger.info("Time consumed: % s seconds" % elapsed)

Related

Splitting Excel Data by Groupings into Separate Workbook Sheets

Background: I have a large 40MB XLSX file that contains client data which is grouped over multiple levels, like so:
Expanded -
Not Expanded (sorry about the terrible dummy data!) -
Objective: I would like to split Client A, B, C etc. and all their respective underlying data into separate sheets (named 'Client A' etc.) in a workbook.
Question: Am I correct in assuming that there is no Python library that would help with this (e.g., xlsxwriter) and that I will likely have to save the data into multiple pandas DataFrames before splitting and writing to the xlsx file?
Sample Data: Here is a link to some randomized sample data. In this file you will see only 1 client (the total row can be ignored); however, imagine the normal file having 40 clients / groupings and sub-levels.
Sample Code: this function takes the `.xlsx` and writes each grouping to an appropriately named tab (e.g., 'Client A') in a separate worksheet of a new `.xlsx`. The issue with this code is that, because I am basically going through and copying each cell individually, I did not consider more holistically how to ensure the Groupings/Levels would be preserved. I think this code needs a complete re-write, and I welcome feedback.
import openpyxl
from copy import copy
from openpyxl import load_workbook
# Letters of the twelve columns (A..L) that are copied.
columns = list('ABCDEFGHIJKL')


def copy_cell(ws, row, ws_row, ws1):
    """Copy cell styling for columns A..L from one sheet row to another.

    Args:
        ws: Target sheet, indexed by coordinates such as "A2".
        row: Row number in the target sheet.
        ws_row: Row number in the source sheet.
        ws1: Source sheet, indexed the same way.

    For every column the source cell is looked up first, then the target
    cell.  Only source cells whose has_style is true have their font, border,
    fill, number format, protection and alignment copied.
    """
    style_attributes = (
        'font', 'border', 'fill', 'number_format', 'protection', 'alignment',
    )
    for letter in columns:
        source = ws1[f'{letter}{ws_row}']
        target = ws[f'{letter}{row}']
        if not source.has_style:
            continue
        for attribute in style_attributes:
            setattr(target, attribute, copy(getattr(source, attribute)))
# Open the grouped source workbook; it is closed once every block is copied.
wb1 = openpyxl.load_workbook('annonamized_test_data_to_be_split.xlsx')
ws1 = wb1.active

# indexs holds the row number at which each block starts and, as its last
# entry, the first row whose column A is empty (the end of the data).
# clients holds the column A text of each block's first row, so block y spans
# rows indexs[y] .. indexs[y + 1] - 1.
indexs = []
clients = []
index = 1
while True:
    first_cell = ws1['A' + str(index)]
    if first_cell.value is None:
        indexs.append(index)
        break
    # An unindented cell in column A starts a new top-level block.
    # NOTE(review): the scan starts at row 1, so an unindented header row is
    # treated as a block of its own - confirm that this is intended.
    if str(first_cell.alignment.indent) == '0.0':
        indexs.append(index)
        clients.append(first_cell.value)
    index += 1

wb = openpyxl.Workbook()
# Remember the sheet a new workbook starts with so it can be dropped at the
# end without saving and re-loading the file.
default_sheet = wb.active
headers = ['Ownership Structure', 'Fee Schedule', 'Management Style', 'Advisory Firm', 'Inception Date', 'Days in Time Period', 'Adjusted Average Daily Balance (No Div, USD)', 'Assets Billed On (USD)',
           'Effective Billing Rate', 'Billing Fees (USD)', 'Bill To Account', 'Model Type']
for y, client in enumerate(clients):
    # Titles are cut to 31 characters, as before; str() also copes with
    # block names that are not text.
    title = str(client)[:31]
    try:
        # Use the sheet create_sheet() returns: looking it up by name would
        # fetch the wrong sheet when two blocks share a title.
        ws = wb.create_sheet(title)
    except ValueError as error:
        # Report the block that cannot be written instead of dropping it
        # silently.
        print('Skipping block %r: %s' % (client, error))
        continue
    ws.column_dimensions['A'].width = 35
    ws.append(headers)
    row_index = 2  # row 1 of every sheet is the header
    for i in range(indexs[y], indexs[y + 1]):
        ws.append([ws1[col + str(i)].value for col in columns])
        copy_cell(ws, row_index, i, ws1)
        row_index += 1

# Drop the initial empty sheet, but only when at least one block sheet
# exists, so the workbook is never left without any sheet.
if len(wb.sheetnames) > 1:
    wb.remove(default_sheet)
wb.save('split_data.xlsx')
wb.close()
wb1.close()
Please can someone point me in the right direction of a resource that might teach me how to achieve this?
from openpyxl import load_workbook
def get_client_rows(sheet):
    """Get client rows.

    Skip the header (row 1) and return the row numbers of all rows whose
    first cell is not indented.

    Args:
        sheet: Worksheet to scan.

    Returns:
        A list of 1-based row numbers, in sheet order.
    """
    # The former second `return` (selecting row dimensions whose
    # outline_level is 0) could never run and has been removed; the
    # indentation of the first cell is the criterion actually in use.
    return [
        row[0].row
        for row in sheet.iter_rows(min_row=2)
        if row[0].alignment.indent == 0.0
    ]
def delete_client_block(sheet, start, end):
    """
    Remove rows *start* through *end* (both inclusive) from *sheet*.

    The row dimensions stored for those rows are discarded first, then the
    rows themselves are deleted in one call.
    """
    row_count = end - start + 1
    dimensions = sheet.row_dimensions
    for offset in range(row_count):
        # pop() with a default: rows without a stored dimension are fine
        dimensions.pop(start + offset, None)
    sheet.delete_rows(start, row_count)
def split_workbook(input_file, output_file):
    """
    Split workbook each main group into its own sheet.

    So as not to lose any formatting, the data sheet is copied once per
    client and all rows which do not belong to the extracted group are
    removed from the copy.  The original data sheet is dropped before the
    result is saved.

    Args:
        input_file: Path of the workbook to split.
        output_file: Path the resulting workbook is saved to.
    """
    # Bound before the try block so `finally` works when loading fails.
    workbook = None
    try:
        workbook = load_workbook(input_file)
        data_sheet = workbook.active
        client_rows = get_client_rows(data_sheet)
        for index, client_row in enumerate(client_rows):
            # create new sheet for given client, shorten client as it might
            # be too long (31 characters is the longest title Excel accepts)
            client_sheet = workbook.copy_worksheet(data_sheet)
            client_sheet.title = data_sheet.cell(client_row, 1).value[:31]
            # delete rows after current client if available
            if index < len(client_rows) - 1:
                row_after_client = client_rows[index + 1]
                delete_client_block(
                    client_sheet, row_after_client, client_sheet.max_row
                )
            # delete rows before current client if available
            if index > 0:
                first_client_row = client_rows[0]
                # delete_client_block() takes an inclusive end ROW, so the
                # block of earlier clients ends on the row just above this
                # client.
                delete_client_block(client_sheet, first_client_row, client_row - 1)
                # move left over dimensions to top of the sheet; ascending
                # order guarantees a re-keyed entry never overwrites one
                # that still has to be moved
                shift = client_row - first_client_row
                for row_index in sorted(client_sheet.row_dimensions):
                    # skip header row dimension
                    if row_index >= first_client_row:
                        row_dimension = client_sheet.row_dimensions.pop(row_index)
                        new_index = row_index - shift
                        row_dimension.index = new_index
                        client_sheet.row_dimensions[new_index] = row_dimension
        # Keep the source sheet when nothing was split off, so the workbook
        # is not left without any sheet.
        if client_rows:
            del workbook[data_sheet.title]
        workbook.save(output_file)
    finally:
        if workbook is not None:
            workbook.close()
if __name__ == "__main__":
    # Alternative data set:
    #   "annonamized_test_data_to_be_split.xlsx" -> "split_data.xlsx"
    input_file = "partial_Q1_Client_Billing_Data.xlsx"
    output_file = "splitting_full_data.xlsx"
    split_workbook(input_file, output_file)

Openpyxl can't write an excel file more than once

I am trying to write to excel files using openpyxl module. For some reason it only lets me write once. If I try to write again it raises:
PermissionError: [Errno 13] Permission denied: 'expenses.xlsx'
The excel file and python program are in the same folder on D drive. What's the problem?
from openpyxl import Workbook
from openpyxl import load_workbook
from datetime import datetime
import os
class ExpenseTracker:
    """Append dated amounts to the "Expenses" sheet of an Excel workbook.

    Attributes are set by load_wb(): ``wb`` is the workbook, ``ws`` its
    "Expenses" worksheet and ``fname`` the file everything is saved to.
    """

    def __init__(self, file_name = "expenses.xlsx"):
        """Remember *file_name* and open, or create, that workbook."""
        self.fname = file_name
        self.load_wb()

    def load_wb(self):
        """
        Load the workbook, creating it first when the file does not exist.

        A new workbook gets an "Expenses" sheet at position 0, the two
        column titles from col_values() in row 1, and is saved at once.

        Raises:
            KeyError: An existing workbook has no "Expenses" sheet.
        """
        try:
            self.wb = load_workbook(self.fname)
        except FileNotFoundError:
            # Only a missing file means "start fresh".  Any other failure
            # (no permission, damaged file) is left to propagate so that an
            # existing file is never replaced by an empty one.
            self.wb = Workbook()
            # self.ws has to exist before col_values() writes to it.
            self.ws = self.wb.create_sheet("Expenses", 0)
            self.col_values()
            self.wb.save(self.fname)
        else:
            self.ws = self.wb["Expenses"]

    def col_values(self):
        """Write the titles of the first two columns into row 1."""
        self.ws.cell(row = 1, column = 1).value = "Date"
        self.ws.cell(row = 1, column = 2).value = "Spent"

    def spend_income(self, amount):
        """Append today's date and *amount* below the last row, then save.

        NOTE(review): the PermissionError from the question is raised by the
        save below; presumably the file is open in another program (for
        example Excel) at that moment - confirm.
        """
        date = datetime.now()
        date_formatted = date.strftime("%d.%b %Y")
        last_row = self.ws.max_row + 1
        # writes under the last input in cols 1 and 2
        self.ws.cell(row = last_row, column = 1).value = date_formatted
        self.ws.cell(row = last_row, column = 2).value = amount
        self.wb.save(self.fname)
# Example usage: open (or create) the default expenses.xlsx and record 5.
# These statements are at module top level, so they run whenever the file is
# executed or imported.
wbook = ExpenseTracker()
wbook.spend_income(5)

Read and Write multiple excel data into one excel file using openpyxl

I am trying to copy the data from multiple Excel files into one Excel file. I am a novice at Python and openpyxl, so I have opened one file, gone through it row by row and copied the cells. I want to do this with multiple files. How do I loop through the rows and columns and copy the data, given that the columns are in the same order in all the files?
import openpyxl as xl
from openpyxl import workbook
# Source: the 'Page 1' sheet of yesterday's incident export.
incident_wb = xl.load_workbook('incident resolved yesterday.xlsx')
incident_sheet = incident_wb['Page 1']

# Target: a fresh workbook whose only sheet is renamed and saved once empty.
combined_wb = xl.Workbook()
combined_sheet = combined_wb.active
combined_sheet.title = "combined_sheet"
combined_wb.save('combined_sheet.xlsx')

# Columns 1..11 hold, in order: incident number, opened date, short
# description, requester, incident type, priority, assignment group,
# assigned to, updated, status and sub status.
for row in range(1, incident_sheet.max_row + 1):
    for column in range(1, 12):
        ## copy the data into the new sheet, cell by cell
        source_cell = incident_sheet.cell(row, column)
        combined_sheet.cell(row, column).value = source_cell.value
combined_wb.save('combined_sheet.xlsx')
An alternative approach would be to build a list of data from multiple Excel files and then write it to another file.
As a proof of concept:
import openpyxl as xl
from openpyxl import workbook
def provide_data(workbookName, sheetName):
    """Return all cell values of one worksheet as a list of rows.

    The workbook *workbookName* is loaded and its sheet *sheetName* is read;
    each row becomes a list of that row's cell values.
    """
    worksheet = xl.load_workbook(workbookName)[sheetName]
    table = []
    for cells in worksheet.iter_rows():
        table.append([cell.value for cell in cells])
    # It would be much better to map the cells to a business object instead
    # of returning bare lists of values.
    return table
def save_data(list_of_sheets):
    """Write the rows of every sheet, one after another, to combined_sheet.xlsx.

    *list_of_sheets* is an iterable of sheets, each an iterable of rows.  All
    rows are appended to the single sheet "combined_sheet" in the given order.
    """
    combined_wb = xl.Workbook()
    target = combined_wb.active
    target.title = "combined_sheet"
    # Flatten lazily: every row of the first sheet, then the second, ...
    every_row = (row for sheet in list_of_sheets for row in sheet)
    for row in every_row:
        target.append(row)
    combined_wb.save('combined_sheet.xlsx')
# (workbook file, sheet name) pairs to combine.
workSheetsToCopy = [['incident resolved yesterday.xlsx', 'Page 1'], ['other.xlsx', 'Page 1']]
# Replace each pair by the rows read from it, then write them all out.
workSheetsToCopy = [provide_data(book, page) for book, page in workSheetsToCopy]
save_data(workSheetsToCopy)

openpyxl - error importing in Python script

I have a python script at work (that I didn't write) which cycles through a folder of SQL scripts, condenses the SQL queries into one line each and adds them to an Excel document (along with other columns). This script always worked fine until this week when my work computer died. I got a new one, installed miniconda and then installed openpyxl by opening Anaconda prompt and:
pip install openpyxl
(pip3 install didn't work).
Most of my Python scripts work fine but this one does not. It is throwing an error:
ImportError: cannot import name 'range' from 'openpyxl.compat' (C:\Users\xxx\AppData\Local\Continuum\miniconda3\lib\site-packages\openpyxl\compat\__init__.py)
I tried to drill down into the libraries/site-packages to see the details, and I don't see 'range' listed anymore in the init file for that path. Maybe they got rid of it? Does anyone know what the suitable alternative is? To be honest, I can't even tell where it is being used in the script, but when I take out that import statement the script runs and produces a blank Excel output file. So clearly it is used somewhere.
import sqlparse
import glob
import sys
import regex as re
import os
import openpyxl
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl.compat import range
def main():
    """Condense the statements of every matching .sql file into a workbook.

    Each file found by the glob pattern whose name ends in ".sql" is split
    into statements with sqlparse.  Every statement is flattened to one line
    and written as one row (table id, table name, test table name, step id,
    step type, SQL text) to the sheet "auto_tests", which is saved as
    test_data_TR.xlsx.
    """
    # The backslash is escaped so that "\s" is no longer an invalid escape
    # sequence; the text of the pattern itself is unchanged.
    path = "C:/my_path\\sql_files*"
    fileList = glob.glob(path)
    TABLE_ID = 0
    STEP_TYPE = ''
    wb = Workbook()
    ws1 = wb.active
    ws1.title = "auto_tests"
    # Adding the headers for the Excel sheet.
    ws1['A1'] = 'TABLE_ID'
    ws1['B1'] = 'TABLE_NM'
    ws1['C1'] = 'TEST_TABLE_NM'
    ws1['D1'] = 'STEP_ID'
    ws1['E1'] = 'STEP_TYPE'
    ws1['F1'] = 'SQL_SCRIPT'

    def createSQL(pTABLE_ID, pTABLE_NM, pTEST_TABLE_NM, pSTEP_ID, pSTEP_TYPE, pSqlStr):
        """Write one statement as the next free row of the sheet.

        The step type passed in is ignored: it is 'T' when the statement
        starts with SELECT and 'P' otherwise.
        """
        pSqlStr = pSqlStr.replace('\'', '"')
        pSTEP_TYPE = 'T' if pSqlStr[:6] == 'SELECT' else 'P'
        nextRow = str(ws1.max_row + 1)
        ws1['A' + nextRow] = str(pTABLE_ID)
        ws1['B' + nextRow] = pTABLE_NM
        ws1['C' + nextRow] = pTEST_TABLE_NM
        ws1['D' + nextRow] = str(pSTEP_ID)
        ws1['E' + nextRow] = pSTEP_TYPE
        ws1['F' + nextRow] = pSqlStr

    def createTableTestSQL(mFile, TABLE_ID, TABLE_NM, STEP_ID, STEP_TYPE):
        """Split the open file into statements and record each as a step."""
        mText = mFile.read()
        mSqls = sqlparse.split(mText)
        for mSql in mSqls:
            STEP_ID += 1
            sqlStr = str(mSql.replace('\n', ' '))
            # Raw string: same pattern as before, without the invalid "\s"
            # escape in an ordinary string literal.
            sqlStr = re.sub(r'--([^\s]+)', ' ', sqlStr)
            sqlStr = sqlparse.format(sqlStr, strip_comments=True)
            if STEP_ID == 1:
                # The first statement names the test table for all steps
                # of this file.
                TEST_TABLE_NM = sqlStr.replace("DROP TABLE ", "").replace(";", "")
            createSQL(TABLE_ID, TABLE_NM, TEST_TABLE_NM, STEP_ID, STEP_TYPE, sqlStr)

    for filename in fileList:
        if filename.endswith('.sql'):
            TABLE_ID += 1
            TABLE_NM = os.path.split(filename)[1].replace('.sql', '')
            STEP_ID = 0
            # `with` closes the file even when parsing raises.
            with open(filename, 'r') as mFile:
                createTableTestSQL(mFile, TABLE_ID, TABLE_NM, STEP_ID, STEP_TYPE)
    wb.save(filename='test_data_TR.xlsx')


if __name__ == "__main__":
    main()

getting formatting data in openpyxl

I'm having trouble extracting the styles from an Excel worksheet using openpyxl. In the case below I'm creating a spreadsheet, and I can see that the formatting is correct, but I don't know how to get that data back.
In my real use case I'm just reading a file - I'm not the one creating it, so I'd like to be able to programmatically retrieve the formatting.
from openpyxl import Workbook
from openpyxl.reader.excel import load_workbook
from openpyxl.style import Color, Fill
# Setup: build a one-sheet workbook with a bold A1 and a solid dark-yellow B1,
# then save it so it can be read back below.
# NOTE(review): the print statements further down use Python 2 syntax, and
# cell('A1') / get_sheet_by_name() look like an old openpyxl API - confirm the
# interpreter and library version before running this.
wb = Workbook()
dest_filename = 'c:\\temp\\test.xlsx'
ws = wb.worksheets[0]
ws.title = 'test'
ws.cell('A1').value = 'foo'
ws.cell('A1').style.font.bold = True
ws.cell('B1').value = 'bar'
ws.cell('B1').style.fill.fill_type = Fill.FILL_SOLID
ws.cell('B1').style.fill.start_color.index = Color.DARKYELLOW
wb.save(filename = dest_filename )
# Setup complete: from here on the saved file is loaded and inspected.
book = load_workbook( filename = dest_filename )
sheet = book.get_sheet_by_name('test')
# Values are read back correctly (trailing comments are the observed output).
print sheet.cell('A1').value #returns foo
print sheet.cell('B1').value #return bar
# Formatting is not read back - THIS IS THE PROBLEM CODE
print sheet.cell('A1').style.font.bold #returns False
print sheet.cell('B1').style.fill.fill_type #returns none
print sheet.cell('B1').style.fill.start_color.index #returns FFFFFFFF
print sheet.cell('B1').has_style #returns true
# These two print the same values, even though C1 was never styled and
# should therefore differ from A1.
print sheet.get_style('A1')
print sheet.get_style('C1')

Categories

Resources