I am attempting to write some code where for every time I run a python script a data frame (that has been made) automatically becomes a excel table in a defined folder path. However I want it to work in such a way that by re running the code the data frame would append to the end of the existing excel table, creating a new excel table. Currently I am using this code to do the data overlap:
def append_df_to_excel(filename, df, sheet_name='Sheet2', startrow=None, startcol=None,
truncate_sheet=False, resizeColumns=True, na_rep = 'NA', **to_excel_kwargs):
"""
Append a DataFrame [df] to existing Excel file [filename]
into [sheet_name] Sheet.
If [filename] doesn't exist, then this function will create it.
Returns: None
"""
from openpyxl import load_workbook
from string import ascii_uppercase
from openpyxl.utils import get_column_letter
from openpyxl import Workbook
# ignore [engine] parameter if it was passed
if 'engine' in to_excel_kwargs:
to_excel_kwargs.pop('engine')
try:
f = open(filename)
# Do something with the file
except IOError:
# print("File not accessible")
wb = Workbook()
ws = wb.active
ws.title = sheet_name
wb.save(filename)
writer = pd.ExcelWriter(filename, engine='openpyxl', mode='a', if_sheet_exists = 'overlay')
# Python 2.x: define [FileNotFoundError] exception if it doesn't exist
try:
FileNotFoundError
except NameError:
FileNotFoundError = IOError
try:
# try to open an existing workbook
writer.book = load_workbook(filename)
# get the last row in the existing Excel sheet
# if it was not specified explicitly
if startrow is None and sheet_name in writer.book.sheetnames:
startrow = writer.book[sheet_name].max_row
# truncate sheet
if truncate_sheet and sheet_name in writer.book.sheetnames:
# index of [sheet_name] sheet
idx = writer.book.sheetnames.index(sheet_name)
# remove [sheet_name]
writer.book.remove(writer.book.worksheets[idx])
# create an empty sheet [sheet_name] using old index
writer.book.create_sheet(sheet_name, idx)
# copy existing sheets
writer.sheets = {ws.title:ws for ws in writer.book.worksheets}
except FileNotFoundError:
# file does not exist yet, we will create it
pass
if startrow is None:
# startrow = -1
startrow = 0
if startcol is None:
startcol = 0
# write out the new sheet
df.to_excel(writer, sheet_name, startrow=startrow, startcol=startcol, na_rep=na_rep, **to_excel_kwargs,header = False, index = False)
ws = writer.book[sheet_name]
if resizeColumns:
def auto_format_cell_width(ws):
for letter in range(1,ws.max_column):
maximum_value = 0
for cell in ws[get_column_letter(letter)]:
val_to_check = len(str(cell.value))
if val_to_check > maximum_value:
maximum_value = val_to_check
ws.column_dimensions[get_column_letter(letter)].width = maximum_value + 2
auto_format_cell_width(ws)
writer.save()
This code successfully allows me to run the code as many times as i want and append the data onto the end of the previously ran python script. However those outputted excel sheets are not in table format.
Currently my attempt to make a table is as follows:
ws = writer.book[sheet_name]
def make_table(worksheet, df):
column_settings = []
for header in df.columns:
column_settings.append( header)
table = Table(displayName="Contacts", ref="A1:" + get_column_letter(worksheet.max_column) + str(worksheet.max_row))
table._initialise_columns()
for column, value in zip(table.tableColumns, column_settings):
column.name = value
worksheet = worksheet.add_table(table)
However the column names do not update accordingly in the excel sheet, excel cites an error for this along the lines of 'had to recover/delete unworkable parts'
But also upon trying to run the script a second time the following python error:
'Table with name Contacts already exists'
Any help would be greatly appreciated!
Here is a toy data frame for testing:
df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
columns=['a', 'b', 'c'])
Related
The following is a modified append script I obtained from stack overflow,
import pandas as pd
import openpyxl
import glob
import os
import xlsxwriter
from openpyxl import load_workbook
from tkinter import Tk, filedialog
from pathlib import Path
def append_df_to_excel(filename, df, sheet_name='Sheet1', startrow=None,
truncate_sheet=False,
**to_excel_kwargs):
"""
Append a DataFrame [df] to existing Excel file [filename]
into [sheet_name] Sheet.
If [filename] doesn't exist, then this function will create it.
#param filename: File path or existing ExcelWriter
(Example: '/path/to/file.xlsx')
#param df: DataFrame to save to workbook
#param sheet_name: Name of sheet which will contain DataFrame.
(default: 'Sheet1')
#param startrow: upper left cell row to dump data frame.
Per default (startrow=None) calculate the last row
in the existing DF and write to the next row...
#param truncate_sheet: truncate (remove and recreate) [sheet_name]
before writing DataFrame to Excel file
#param to_excel_kwargs: arguments which will be passed to `DataFrame.to_excel()`
[can be a dictionary]
#return: None
Usage examples:
# >>> append_df_to_excel('d:/temp/test.xlsx', df)
#
# >>> append_df_to_excel('d:/temp/test.xlsx', df, header=None, index=False)
#
# >>> append_df_to_excel('d:/temp/test.xlsx', df, sheet_name='Sheet2',
# index=False)
#
# >>> append_df_to_excel('d:/temp/test.xlsx', df, sheet_name='Sheet2',
# index=False, startrow=25)
(c) [MaxU](https://stackoverflow.com/users/5741205/maxu?tab=profile)
"""
# Excel file doesn't exist - saving and exiting
if not os.path.isfile(filename):
df.to_excel(
filename,
sheet_name=sheet_name,
startrow=startrow if startrow is not None else 0,
**to_excel_kwargs)
return
# ignore [engine] parameter if it was passed
if 'engine' in to_excel_kwargs:
to_excel_kwargs.pop('engine')
writer = pd.ExcelWriter(filename, engine='openpyxl', mode='a')
# try to open an existing workbook
writer.book = load_workbook(filename)
# get the last row in the existing Excel sheet
# if it was not specified explicitly
if startrow is None and sheet_name in writer.book.sheetnames:
startrow = writer.book[sheet_name].max_row
# truncate sheet
if truncate_sheet and sheet_name in writer.book.sheetnames:
# index of [sheet_name] sheet
idx = writer.book.sheetnames.index(sheet_name)
# remove [sheet_name]
writer.book.remove(writer.book.worksheets[idx])
# create an empty sheet [sheet_name] using old index
writer.book.create_sheet(sheet_name, idx)
# copy existing sheets
writer.sheets = {ws.title: ws for ws in writer.book.worksheets}
if startrow is None:
startrow = 0
# write out the new sheet
df.to_excel(writer, sheet_name, startrow=startrow, **to_excel_kwargs)
# save the workbook
writer.save()
root = Tk()
root.withdraw()
root.attributes('-topmost', True)
source_dir = filedialog.askdirectory()
file_names = glob.glob(os.path.join(source_dir, '*.xlsx'))
service_name = os.path.basename(os.path.dirname(source_dir))
year = os.path.basename(source_dir)
target_file = service_name + " " + year + ".xlsx"
workbook = xlsxwriter.Workbook(target_file)
worksheet = workbook.add_worksheet()
workbook.close()
srcfile = openpyxl.load_workbook(target_file, read_only=False,
keep_vba=False)
sheetname = srcfile['Sheet1']
r = 2
srcfile.save(target_file)
for file in file_names:
df=pd.read_excel(file)
append_df_to_excel(target_file, df, header=False, index=False, startrow=r, startcol=0)
index = df.index
Num_of_Rigs = len(index)
r += Num_of_Rigs
print (r)
I added some additional code to use the append function.
However, this code works in Python 3.9, but I get the following error in Python 3.10 :
/usr/local/bin/python3 "/Users/ahmedhamadto/PycharmProjects/DataFrame Concatenator/main.py"
Traceback (most recent call last):
File "/Users/ahmedhamadto/PycharmProjects/DataFrame Concatenator/main.py", line 136, in <module>
append_df_to_excel(target_file, df, header=False, index=False, startrow=r, startcol=0)
File "/Users/ahmedhamadto/PycharmProjects/DataFrame Concatenator/main.py", line 87, in append_df_to_excel
df.to_excel(writer, sheet_name, startrow=startrow, **to_excel_kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/core/generic.py", line 2284, in to_excel
formatter.write(
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/io/formats/excel.py", line 840, in write
writer.write_cells(
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pandas/io/excel/_openpyxl.py", line 436, in write_cells
raise ValueError(
ValueError: Sheet 'Sheet1' already exists and if_sheet_exists is set to 'error'.
Process finished with exit code 1
I can't seem to understand what changed to make it not work in Python 3.10.
This is the only difference in both environments.
I am trying to create 3 different dataframes to output in my excel file in 3 separate worksheet called df, df_OK, df_KO. However the code below only outputs df and is not creating the other 2 dataframes df_OK, df_KO to have in the same Excel file but in 2 separate worksheets.
Any suggestions? Thanks
class blah:
def __init__(self, path, file_in, file_out):
self.path = path
self.file_in = file_in
self.file_out = file_out
def process_file(self):
df = pd.read_excel(self.path + self.file_in)
df_OK = df.loc[df['Status'] == 'OK']
df_KO = df.loc[df['Status'] == 'KO']
df_OK.loc['Total'] = df_OK[['Price']].sum(axis=0)
writer = pd.ExcelWriter(self.path + self.file_out, engine='xlsxwriter')
dfs = {
'All': df,
'OK': df_OK,
'KO': df_KO
}
for sheet_name in dfs.keys():
dfs[sheet_name].to_excel(writer, sheet_name=sheet_name, index=False)
writer.save()
b = blah('C:/Users/......./',
'path...',
'file_in....',
'file_out...')
b.process_file()
It is because you overwrite the same Excel file in every iteration of your for sheet_name in dfs.keys() loop. So every time you write an Excel file with only a single sheet to the same filename, thus overwriting the previous document.
You should move the writer.save() outside your loop like so:
for sheet_name in dfs.keys():
dfs[sheet_name].to_excel(writer, sheet_name=sheet_name, index=False)
writer.save()
I'm having 10 CSV files each of the CSV file is having same number of column from which I'm reading data one by one in the form of pandas data frame. I want those data to be displayed on console/Terminal in some table form. And it should be like if every time the data gets into new row. Any suggestions on this ?
Below is my sample CSV file :
Like this, there are 10 or more CSV file and I will be reading data from those file one by one and want to display on Console/Terminal.
Brief Introduction to my Application
I have a machine that is generating CSV files after a certain interval of time into a folder. I am using Watchdog library to put a watch on the folder where the CSV files are being generated. When I receive a CSV file I Read it into a pandas data frame. Sample CSV file is given above.
As far as the machine is running it will keep generating the CSV files. So if I want to see the data I need to open each and every CSV files, Instead, I want a View in which the Data gets updated when there is a new CSV file generated.
So Technically One CSV file is getting read gets converted into a data frame and then printed on Console/Terminal. And this process happens again when a new CSV file is generated, But when new data frame arrives, It should not overwrite the whole console, Instead it appends to the existing data on console.
Here is my main file :
import time
from watchdog.observers import Observer
from watchdog.events import PatternMatchingEventHandler
import pandas as pd
from Append_Function import append_df_to_excel
import os.path
import sys
class Watcher:
def __init__(self, args):
self.watch_dir = os.getcwd()
print(args[0])
self.directory_to_watch = os.path.join(self.watch_dir, args[1])
self.observer = Observer()
self.event_handler = Handler(patterns=["*.CSV"], ignore_patterns=["*.tmp"], ignore_directories=True)
def run(self):
self.observer.schedule(self.event_handler, self.directory_to_watch, recursive=False)
self.observer.start()
try:
while True:
time.sleep(1)
except:
self.observer.stop()
print("Error")
self.observer.join()
class Handler(PatternMatchingEventHandler):
#staticmethod
def on_any_event(event):
if event.is_directory:
return None
elif event.event_type == 'created':
# Take any action here when a file is first created.
print("Received created event - %s." % event.src_path)
df = pd.read_csv(event.src_path, header=1, index_col=0)
append_df_to_excel(os.path.join(os.getcwd(), "myfile.xlsx"), df)
elif event.event_type == 'modified':
# Taken any actionc here when a file is modified.
df = pd.read_csv(event.src_path, header=0, index_col=0)
append_df_to_excel(os.path.join(os.getcwd(), "myfile.xlsx"), df)
print("Received modified event - %s." % event.src_path)
if __name__ == '__main__':
print(sys.argv)
w = Watcher(sys.argv)
w.run()
Here is my Append Function:
import pandas as pd
import openpyxl as ox
def append_df_to_excel(filename, df, sheet_name='Sheet1', startrow=None,
truncate_sheet=False,
**to_excel_kwargs):
# ignore [engine] parameter if it was passed
if 'engine' in to_excel_kwargs:
to_excel_kwargs.pop('engine')
writer = pd.ExcelWriter(filename, engine='openpyxl')
# Python 2.x: define [FileNotFoundError] exception if it doesn't exist
try:
FileNotFoundError
except NameError:
FileNotFoundError = IOError
try:
# try to open an existing workbook
writer.book = ox.load_workbook(filename,keep_vba=True)
# get the last row in the existing Excel sheet
# if it was not specified explicitly
if startrow is None and sheet_name in writer.book.sheetnames:
startrow = writer.book[sheet_name].max_row
# truncate sheet
if truncate_sheet and sheet_name in writer.book.sheetnames:
# index of [sheet_name] sheet
idx = writer.book.sheetnames.index(sheet_name)
# remove [sheet_name]
writer.book.remove(writer.book.worksheets[idx])
# create an empty sheet [sheet_name] using old index
writer.book.create_sheet(sheet_name, idx)
# copy existing sheets
writer.sheets = {ws.title: ws for ws in writer.book.worksheets}
except FileNotFoundError:
# file does not exist yet, we will create it
pass
if startrow is None:
startrow = 0
# write out the new sheet
df.to_excel(writer, sheet_name, startrow=startrow, **to_excel_kwargs, header=True)
# save the workbook
writer.save()
Below creates a file and then fills in the excel. I would like to create an excel file with a condition based on a single column and redeposit the excel with a prefix of the column name.
So return only where columnX = i and create and save excel file i1_CCBHC_MONTHLY_CLAIMS.XLSX
i2_CCBHC_MONTHLY_CLAIMS.XLSX
I have the build of the large "parent" excel file.
filename = 'CCBHC_Monthly_Claims.xlsx'
if os.path.isfile(filename):
wb = xw.Book(filename)
ws = wb.sheets['CCBHC_DATA']
ws.range('A1').options(index=False).value = df_ora
wb = xw.Book(filename)
xw.apps[0].quit()
else:
writer = pd.ExcelWriter(filename, engine='xlsxwriter')
*df_ora.to_excel(writer, sheet_name='CCBHC_DATA',index=False)
wb = xw.Book(filename)
ws = wb.sheets['CCBHC_DATA']
ws.range('A1').options(in*dex=False).value = df_ora
wb = xw.Book(filename)
xw.apps[0].quit()
I am trying to read in multiple excel files and append the data from each file into one master file. Each file will have the same headers (So I can skip the import of the first row after the initial file).
I am pretty new to both Python and the OpenPyXL module. I am able to import the first workbook without problem. My problem comes in when I need to open the subsequent file and copy the data to paste into the original worksheet.
Here is my code so far:
# Creating blank workbook
from openpyxl import Workbook
wb = Workbook()
# grab active worksheet
ws = wb.active
# Read in excel data
from openpyxl import load_workbook
wb = load_workbook('first_file.xlsx') #explicitly loading workbook, will automate later
# grab active worksheet in current workbook
ws = wb.active
#get max columns and rows
sheet = wb.get_sheet_by_name('Sheet1')
print ("Rows: ", sheet.max_row) # for debugging purposes
print ("Columns: ", sheet.max_column) # for debugging purposes
last_data_point = ws.cell(row = sheet.max_row, column = sheet.max_column).coordinate
print ("Last data point in current worksheet:", last_data_point) #for debugging purposes
#import next file and add to master
append_point = ws.cell(row = sheet.max_row + 1, column = 1).coordinate
print ("Start new data at:", append_point)
wb = load_workbook('second_file.xlsx')
sheet2 = wb.get_sheet_by_name('Sheet1')
start = ws.cell(coordinate='A2').coordinate
print("New data start: ", start)
end = ws.cell(row = sheet2.max_row, column = sheet2.max_column).coordinate
print ("New data end: ", end)
# write a value to selected cell
#sheet[append_point] = 311
#print (ws.cell(append_point).value)
#save file
wb.save('master_file.xlsx')
Thanks!
I don't really understand your code. It looks too complicated. When copying between worksheets you probably want to use ws.rows.
wb1 = load_workbook('master.xlsx')
ws2 = wb1.active
for f in files:
wb2 = load_workbook(f)
ws2 = wb2['Sheet1']
for row in ws2.rows[1:]:
ws1.append((cell.value for cell in row))