So I've been trying to code a script which loads all Excel files from a specific location and moves the worksheets inside these files into one workbook. I'm ending up with an error:
AttributeError: 'DataFrame' object has no attribute 'DataFrame'.
I'm pretty new to this so I would really appreciate any tip on how to make that work. I can stick only
with openpyxl because at the moment I cannot install xlrd module on my workstation.
from pandas import ExcelWriter
import glob
import pandas as pd
import openpyxl
# Single output workbook that every worksheet is meant to be copied into.
writer = ExcelWriter("output.xlsx")
for filename in glob.glob (r"C:\path\*.xlsx"):
wb = openpyxl.load_workbook(filename)
for ws in wb.sheetnames:
# `ws` starts as the sheet's name (a str) and is rebound here to the worksheet object.
ws = wb[ws]
print (ws)
data = ws.values
# The first row yielded by `data` is consumed as the column headers; the rest become the rows.
columns = next(data)[0:]
df= pd.DataFrame(data, columns=columns)
print(df)
# NOTE(review): `df` is a DataFrame instance, so `df.DataFrame` does not exist —
# presumably this is the line behind the reported AttributeError.
for df in df.DataFrame:
# NOTE(review): the `[` is never closed, and `ws` is a worksheet object at this point,
# not a sheet-name string.
df.to_excel([writer,sheet_name= ws)
writer.save()
First, sheet_name must be a string (the sheet's name), not a worksheet object. Second, the last for loop is not needed, because we already loop through the sheet names.
from pandas import ExcelWriter
import glob
import pandas as pd
import openpyxl
# Copy every worksheet of every workbook matching the pattern into output.xlsx.
# The context manager saves and closes the output file on exit, so no explicit
# save call is needed (ExcelWriter.save() was removed in pandas 2.0).
# NOTE: sheets that share a name across input workbooks are written to the
# same output sheet name; rename them here if every sheet must be kept.
with ExcelWriter("output.xlsx") as writer:
    for filename in glob.glob(r"C:\path\*.xlsx"):
        wb = openpyxl.load_workbook(filename)
        for sheet_name in wb.sheetnames:
            rows = wb[sheet_name].values
            # The first row holds the column headers; a completely empty
            # sheet yields no rows at all, so skip it instead of crashing.
            columns = next(rows, None)
            if columns is None:
                continue
            df = pd.DataFrame(rows, columns=columns)
            # sheet_name must be the name (a string), not the worksheet object.
            df.to_excel(writer, sheet_name=sheet_name, index=False)
Related
I have an excel data for three variables (Acct, Order, Date) in a Sheet name called Orders
I have created a data frame by reading this Sheet
import pandas as pd
# NOTE(review): `pd_ExcelFile` is not defined by the import above —
# presumably `pd.ExcelFile` was intended.
sheet_file=pd_ExcelFile("Orders.xlsx", engine="openpyxl")
# NOTE(review): `worksheets` and `append_data` are not defined in this snippet;
# presumably the sheet names to read and an initially empty list.
for sheet_name in worksheets:
df=pd.read_excel(sheet_file,sheet_name,header=1)
append_data.append(df)
# Merge the collected frames; this rebinds `append_data` to the result of pd.concat.
append_data=pd.concat(append_data)
I have another Excel file called "Total_Orders.xlsx" with ~100k rows and I need to append the above dataframe to this excel file (Sheet Name="Orders")
# NOTE(review): no `mode` is passed to the writer; the question reports that the
# existing data is overwritten — confirm the writer's default mode.
with pd.ExcelWriter('Total_Orders.xlsx',sheet_name='Orders',engine="openpyxl") as writer:
append_data.to_excel(writer,startrow=2,header=False,index=False)
# NOTE(review): explicit save inside the `with` block — presumably redundant,
# since the context manager is expected to save on exit.
writer.save()
The above is overwriting the data instead of appending it. I know startrow is the key here but I am not sure how to fix this. Any help is much appreciated
Have you tried in mode="a", along these lines:
# Append the new rows directly below the existing content of the "Orders" sheet.
# mode="a" keeps the existing workbook; if_sheet_exists="overlay" (pandas >= 1.4)
# writes into the existing sheet instead of raising or creating "Orders1".
with pd.ExcelWriter(
    "Total_Orders.xlsx",
    mode="a",
    engine="openpyxl",
    if_sheet_exists="overlay",
) as writer:
    append_data.to_excel(
        writer,
        sheet_name="Orders",
        # startrow is 0-based and max_row is the 1-based last used row, so
        # this targets the first empty row after the existing data.
        startrow=writer.sheets["Orders"].max_row,
        header=False,  # the sheet already has its header row
        index=False,
    )
EDIT - in response to comment
import pandas as pd
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl import load_workbook
# Two sample records to add below the rows already present in the sheet.
new_records = [
    {'Acct': 3, 'Order': 333, 'Note': 'third'},
    {'Acct': 4, 'Order': 444, 'Note': 'fourth'},
]
append_data = pd.DataFrame(new_records)

workbook_path = "stackoverflow.xlsx"
wb = load_workbook(filename=workbook_path)
ws = wb["Orders"]

# Emit plain data rows only: no index column and no repeated header row.
data_rows = dataframe_to_rows(append_data, index=False, header=False)
for row_values in data_rows:
    ws.append(row_values)

# Write back to the same file; other sheets in the workbook are left untouched.
wb.save(workbook_path)
The stackoverflow.xlsx before:
The stackoverflow.xlsx after (the 'Other' sheet was not affected):
I am trying to use this code to append a dataframe to an existing sheet in Excel, but instead of appending the new data to it, it creates a new sheet. Here is the code:
import pandas as pd
import openpyxl as op
# Placeholder: stands in for an ordinary DataFrame (as written it is a list holding one string).
df = ['normal_dataframe']
# Open the workbook with mode='a' and write the frame under the name 'Sheet1'.
with pd.ExcelWriter('test.xlsx', engine='openpyxl', mode='a') as writer:
df.to_excel(writer, sheet_name='Sheet1', header=False, index=False)
'test.xlsx' already has a 'Sheet1', but after the file is appended to, there are two sheets: 'Sheet1' and 'Sheet11'.
One approach with COM:
import win32com.client
# Write a 2x2 block of values into Sheet1 of an existing workbook through
# Excel's COM automation interface.
xl = win32com.client.Dispatch("Excel.Application")
path = r'c:\Users\Alex20\Documents\test.xlsx'
try:
    wb = xl.Workbooks.Open(path)
    try:
        ws = wb.Worksheets("Sheet1")
        # A 2x2 range takes a list of rows, one inner list per row.
        ws.Range("E9:F10").Value = [[9, 9], [10, 10]]
    except Exception:
        # Discard the partial edit so closing does not wait on a save prompt.
        wb.Close(False)
        raise
    else:
        wb.Close(True)  # True = save changes
finally:
    # Always shut Excel down; otherwise a failure leaves the process running.
    xl.Quit()
No matter what I do, I can't get all the data from the XHTML files written into one Excel sheet. It looks like Python loops through all the files in the folder, but the output only contains the data from the last file.
Help would be great!
#!/usr/bin/python3
# Import libaries
import pandas as pd
import openpyxl
from openpyxl import load_workbook
import glob
import time
#Path to folder
path_dir: str = r"C:\Users\Moench\Desktop\r2d2\EPUB\content1\*.xhtml"
#Read files
for filename in glob.glob(path_dir):
#Assign the table data to a Pandas dataframe
# NOTE(review): the file handle opened here is never closed in this snippet.
dfs = open(filename, 'r')
dfs1 = pd.read_html(dfs)
#Read data
# Keep seven columns of the first table; `df2` is reassigned for every file.
df2 = dfs1[0][['Unnamed: 0_level_0','Unnamed: 1_level_0','Unnamed: 2_level_0','Unnamed: 3_level_0','Unnamed: 4_level_0','Unnamed: 12_level_0','Unnamed: 13_level_0']]
#Print result (Looks like that it goes through all files in the folder)
# print (df2)
# Write to existing Excel-Sheet
book = load_workbook('output.xlsx')
writer = pd.ExcelWriter('output.xlsx', engine='openpyxl')
writer.book = book
# Timestamp used (as a string) for the name of the sheet written below.
ts = time.time()
# NOTE(review): this appends df2 to itself, so only the most recently read
# table ends up in df3 — and it is in there twice.
df3 = df2.append(df2)
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
df3.to_excel(writer, str(ts))
writer.save()
You are storing your data in the same dataframe on each iteration, overwriting it every time, so at the end you only have the data from the last file (twice, actually, because of df2.append(df2)).
Here is a slightly modified version, storing each dataframe in df_list, and using pd.concat on this list to create df3:
#!/usr/bin/python3
# Import libaries
import pandas as pd
import openpyxl
from openpyxl import load_workbook
import glob
import time
# Path to folder
path_dir: str = r"C:\Users\Moench\Desktop\r2d2\EPUB\content1\*.xhtml"

# Columns to keep from the first table of every file
wanted_columns = [
    'Unnamed: 0_level_0',
    'Unnamed: 1_level_0',
    'Unnamed: 2_level_0',
    'Unnamed: 3_level_0',
    'Unnamed: 4_level_0',
    'Unnamed: 12_level_0',
    'Unnamed: 13_level_0',
]

# Collect one dataframe per file instead of overwriting a single variable
df_list = []
for filename in glob.glob(path_dir):
    # Close each file as soon as its tables have been parsed
    with open(filename, 'r') as fh:
        tables = pd.read_html(fh)
    df_list.append(tables[0][wanted_columns])

# Concatenate all dataframes into one
df3 = pd.concat(df_list, ignore_index=True)

# Add the result to the existing workbook as a new sheet named after the
# current timestamp. mode='a' keeps the sheets already in output.xlsx and the
# context manager saves on exit; assigning writer.book / writer.sheets and
# calling writer.save() no longer work in pandas 2.x.
ts = time.time()
with pd.ExcelWriter('output.xlsx', engine='openpyxl', mode='a') as writer:
    df3.to_excel(writer, sheet_name=str(ts))
I'm using to_excel to write multiple DataFrames to multiple Excel documents. This works fine except that the index of the Dataframes is appended in bold with a border around each cell (see image).
The following code is a simplification of the code I use but has the same problem:
import pandas as pd
from openpyxl import load_workbook
# NOTE(review): `np` is not imported in this snippet — presumably `import numpy as np` is missing.
df = pd.DataFrame(np.random.randint(50,60, size=(20, 3)))
xls_loc = r'test_doc.xlsx'
wb = load_workbook(xls_loc)
writer = pd.ExcelWriter(xls_loc, engine='openpyxl')
# Hand the already-loaded workbook to the writer.
writer.book = wb
# index=True writes the index as well; per the question, that is what shows up bold with borders.
df.to_excel(writer, sheet_name='test sheet',index=True,startrow=1,startcol=1, header=False)
writer.save()
writer.close()
Is there a way to append the index without making the index bold and add borders?
Make the index a new column and then set index=False in to_excel()
# Copy the index into an ordinary first column named 'index'.
df.insert(0, 'index', df.index)
You could insert the dataframe using xlwings to avoid formatting:
import pandas as pd
import xlwings as xw
# Small sample frame. pd._testing.makeDataFrame() is a private testing helper
# that is not part of the public API and is missing from newer pandas
# releases, so the data is built explicitly here.
df = pd.DataFrame({"A": [1.0, 2.0, 3.0], "B": [4.0, 5.0, 6.0]})

# Run a hidden Excel instance for the duration of the block.
with xw.App(visible=False) as app:
    wb = xw.Book()
    # Assigning a DataFrame to a cell writes it starting at that cell.
    wb.sheets[0]["A1"].value = df
    wb.save("test.xlsx")
    wb.close()
import pandas as pd
# Write a single centred column to FileName.xlsx without index or header row.
data = [11, 12, 13, 14, 15]
df = pd.DataFrame(data)

# The context manager saves and closes the file on exit; ExcelWriter.save()
# was removed in pandas 2.0.
with pd.ExcelWriter('FileName.xlsx', engine='xlsxwriter') as writer:
    centred = df.style.set_properties(**{'text-align': 'center'})
    # index=False and header=None keep the index column and header row out of the sheet.
    centred.to_excel(writer, sheet_name='sheet_01', index=False, header=None)
In the to_excel() method, index=False and header=None are the main trick.
I have two excel workbooks.
One with 3 sheets and the other with only one sheet. I am trying to combine these two into one workbook. This workbook should have 4 sheets.
from pandas import ExcelWriter
# NOTE(review): `glob`, `os` and `pd` are used below but not imported in this snippet.
writer = ExcelWriter("Sample.xlsx")
for filename in glob.glob("*.xlsx"):
# NOTE(review): no sheet_name is passed, so presumably only one sheet per workbook
# is read — consistent with the two-sheet result described in the question.
df_excel = pd.read_excel(filename)
# Drop the directory and the extension; the bare file name is used as the sheet name.
(_, f_name) = os.path.split(filename)
(f_short_name, _) = os.path.splitext(f_name)
df_excel.to_excel(writer, f_short_name, index=False)
writer.save()
Doing this gives me a workbook, but with only 2 sheets. First sheet of the first workbook and second sheet of second workbook.
How to get all the 4 sheets in one workbook?
You have to loop through the sheet names. See the below code:
from pandas import ExcelWriter
import glob
import os
import pandas as pd
# Copy every sheet of every workbook in the current directory into output.xlsx,
# naming each output sheet "<file name>_<sheet name>".
output_name = "output.xlsx"

# Resolve the inputs before the writer is opened. "*.xlsx" also matches the
# output file itself and Excel's "~$..." lock files, so both are excluded.
source_files = [
    name
    for name in glob.glob("*.xlsx")
    if os.path.basename(name) != output_name
    and not os.path.basename(name).startswith("~$")
]

# The context manager saves and closes the output on exit; ExcelWriter.save()
# was removed in pandas 2.0.
with ExcelWriter(output_name) as writer:
    for filename in source_files:
        (_, f_name) = os.path.split(filename)
        (f_short_name, _) = os.path.splitext(f_name)
        # Open each workbook once and read all of its sheets from that handle.
        with pd.ExcelFile(filename) as excel_file:
            for sheet_name in excel_file.sheet_names:
                df_excel = excel_file.parse(sheet_name)
                # NOTE: Excel limits sheet names to 31 characters; shorten the
                # combined name here if file or sheet names are long.
                df_excel.to_excel(
                    writer,
                    sheet_name=f_short_name + '_' + sheet_name,
                    index=False,
                )