I have a function like this:
def DuplicateEachRow():
    """Read 'Sheet3' of the ``new_loc`` workbook and write it out with its rows doubled.

    The sheet is loaded without a header row, a second copy of all of its
    rows is appended below the first, and the result is written to
    ``new_loc`` through a fresh ``pd.ExcelWriter`` without naming a sheet.
    ``new_loc`` is expected to be defined by the surrounding module.
    """
    import pathlib

    import pandas as pd

    # Source path: the current working directory and new_loc joined by a backslash.
    workbook_path = str(pathlib.Path().absolute()) + '\\' + new_loc
    sheet_rows = pd.read_excel(workbook_path, sheet_name='Sheet3', header=None)
    print(sheet_rows)

    # Stack the frame on top of itself and renumber the combined index.
    doubled = pd.concat([sheet_rows] * 2, ignore_index=True)

    with pd.ExcelWriter(new_loc) as writer:
        doubled.to_excel(writer)
I need to keep this same functionality, but instead of writing that one sheet to a new file, I need to edit that one particular sheet and save it back to my workbook, which has other sheets.
EDIT (more explanation): I have 4 sheets in a workbook and in just one sheet (Sheet3) I need to use the functionality above and then save it back to a workbook.
This doesn't work either, specifying the sheet name when I save:
def DuplicateEachRow():
    """Double the rows of the 'GTL | GWL Disclosures' sheet and write them to ``new_loc``.

    The sheet is read header-less from the workbook located under the current
    working directory, concatenated with itself, and written through a new
    ``pd.ExcelWriter`` under the same sheet name. ``new_loc`` is expected to be
    defined by the surrounding module.
    """
    import pathlib

    import pandas as pd

    target_sheet = 'GTL | GWL Disclosures'

    # Working directory + backslash + workbook name, assembled as a plain string.
    here = str(pathlib.Path().absolute())
    source_path = here + '\\' + new_loc

    original = pd.read_excel(source_path, sheet_name=target_sheet, header=None)
    print(original)

    # Second copy goes below the first; the index is renumbered afterwards.
    twice = pd.concat([original, original], ignore_index=True)

    with pd.ExcelWriter(new_loc) as writer:
        twice.to_excel(writer, sheet_name=target_sheet)
To add a new sheet to the same Excel file, you have to open the file in append mode.
Have a look at the code below:
def DuplicateEachRow():
    """Duplicate every row of one sheet and store the result in the same workbook.

    Reads the 'GTL | GWL Disclosures' sheet (without a header row) from the
    workbook ``new_loc`` in the current working directory, places a copy of
    each row directly after the original row, and writes the result to the
    sheet 'Sheet3' of that same workbook. All other sheets are left in place
    because the workbook is opened in append mode.

    ``new_loc`` is expected to be defined by the surrounding module.

    Raises:
        FileNotFoundError: if the workbook does not exist (append mode needs
            an existing file).

    Note:
        ``if_sheet_exists`` requires pandas >= 1.3.
    """
    import pathlib

    import pandas as pd

    # Build the path with pathlib instead of a hard-coded '\\' separator, and
    # use the same path for reading and writing.
    workbook_path = pathlib.Path().absolute() / new_loc

    df = pd.read_excel(workbook_path, header=None, sheet_name='GTL | GWL Disclosures')
    print(df)

    # Keep the original index on both copies so that sorting by it places
    # each duplicate directly after the row it was copied from.
    dup_df = pd.concat([df, df]).sort_index()

    # Append mode is only implemented by the openpyxl engine, so request it
    # explicitly rather than relying on the default writer. 'replace' lets the
    # call succeed when 'Sheet3' is already present in the workbook.
    with pd.ExcelWriter(
        workbook_path,
        engine='openpyxl',
        mode='a',
        if_sheet_exists='replace',
    ) as writer:
        # Write the cell values only: no column-name row and no index column.
        dup_df.to_excel(writer, sheet_name='Sheet3', header=False, index=False)
Related
Need help please.
Using python 3.
I need to loop through a folder that contains excel files and each file has multiple sheets.
How do I loop through all the files and all the sheets and extract to a dataframe?
What I was able to accomplish only returns one excel file and all the worksheets for that file but I need for all files. Please help.
This is what I have so far:
from xlsxwriter import Workbook
import pandas as pd
import openpyxl
import glob
import os
# Glob pattern matching every workbook that should be imported.
path = 'filestoimport/*.xlsx'

# A sheet is kept only when it contains every one of these columns.
column_names = ['Status', 'ProjectID']

# Created once, before the file loop, so that sheets from *every* workbook
# accumulate; re-creating the list per file would discard the earlier files.
list_of_dfs = []
list_of_dferror = []

for filepath in glob.glob(path):
    # Base name of the workbook, stored on each row to record its origin.
    file_name = os.path.basename(filepath)

    # The context manager closes the workbook once all sheets are parsed.
    with pd.ExcelFile(filepath) as xl:
        for sheet_name in xl.sheet_names:
            # 'A:N' selects the same columns as listing A through N one by one.
            df = xl.parse(sheet_name, usecols='A:N', header=0)
            df.columns = df.columns.str.replace(' ', '')
            df['sheetname'] = sheet_name
            df['sourcefilename'] = file_name

            if not set(column_names).issubset(df.columns):
                continue

            # Assign the result back instead of calling fillna(inplace=True)
            # on a column selection, which may operate on a temporary copy.
            df['Status'] = df['Status'].fillna('')
            # 'Addedby' is not one of the required columns, so it may be absent.
            if 'Addedby' in df.columns:
                df['Addedby'] = df['Addedby'].fillna('')
            list_of_dfs.append(df)

# Combine all collected sheets; pd.concat raises on an empty list, so fall
# back to an empty frame when nothing matched.
if list_of_dfs:
    data = pd.concat(list_of_dfs, ignore_index=True)
else:
    data = pd.DataFrame()
I am trying to understand how I can extend my current script so that I can make changes at sheet level. I want to be able to delete columns from the worksheets in my flat file. For example, if a column is called 'company', I want to delete it so that the file written by my final wb.save no longer contains that column. I have multiple column names I want to drop from all sheets in the workbook:
# Column names that should be dropped from every sheet in the workbook.
cols_to_drop = ['Company','Type','Firstname','lastname']
My code so far, which deletes specific sheets from a file and updates the column names, is below:
from openpyxl import load_workbook
import os
# Maps an existing header cell value to the header text that replaces it.
column_name_update_map = {
    'LocationName': 'Company Name',
    'StreetAddress': 'Address',
    'City': 'City',
    'State': 'State',
    'Zip': 'Zip',
    'GeneralPhone': 'Phone Number',
    'GeneralEmail': 'Email',
    'DateJoined': 'Status Date',
    'Date Removed': 'Status Date',
}

# Worksheets that must not appear in the updated workbook.
sheets_to_remove = ('Opt-Ins', 'New Voting Members', 'Temporary Members')

source_dir = "C:/Users/hhh/Desktop/aaa/python/Matching"

for file in os.listdir(source_dir):
    if not file.startswith("TVC"):
        continue

    # os.listdir returns bare file names; join them with the directory so the
    # script does not depend on the current working directory. The updated
    # copy is therefore saved next to its source workbook.
    file_path = os.path.join(source_dir, file)
    wb = load_workbook(file_path)

    # Check each sheet individually: a workbook may contain only some of them,
    # and indexing a missing sheet name raises KeyError.
    for sheet_name in sheets_to_remove:
        if sheet_name in wb.sheetnames:
            wb.remove(wb[sheet_name])

    for ws in wb:
        # The first row holds the headers; the () default skips a worksheet
        # that has no rows at all instead of raising StopIteration.
        for header in next(ws.rows, ()):
            new_name = column_name_update_map.get(header.value)
            if new_name is not None:
                header.value = new_name

    wb.save(file_path + " (updated headers).xlsx")
This part of the code works perfectly and gives me the desired result. However, I'm unable to apply DataFrame logic such as df.drop(['Company', 'Type', 'Firstname'], axis=1), since it is a workbook and not a DataFrame.
Since you've tagged the question as pandas, you could just use pandas to read and drop:
source_dir = "C:/Users/hhh/Desktop/aaa/python/Matching"

# Worksheets that are left out of the rewritten workbook.
skipped_sheets = {"Opt-Ins", "New Voting Members", "Temporary Members"}

for file in os.listdir(source_dir):
    if not file.startswith("TVC"):
        continue

    # os.listdir returns bare names; join with the directory so reading and
    # writing do not depend on the current working directory.
    file_path = os.path.join(source_dir, file)

    # sheet_name=None loads every worksheet into a {sheet name: DataFrame} dict.
    dfs = pd.read_excel(file_path, sheet_name=None)

    output = {}
    for ws, df in dfs.items():
        if ws in skipped_sheets:
            continue
        # Drop unneeded columns; errors="ignore" tolerates sheets lacking some.
        temp = df.drop(cols_to_drop, errors="ignore", axis=1)
        # Rename the remaining columns.
        temp = temp.rename(columns=column_name_update_map)
        # Drop columns that contain no values at all.
        temp = temp.dropna(how="all", axis=1)
        output[ws] = temp

    # A workbook needs at least one sheet; skip files where all were excluded.
    if not output:
        continue

    out_path = os.path.join(
        source_dir, f'{file.replace(".xlsx", "")} (updated headers).xlsx'
    )
    # The context manager saves and closes the workbook; ExcelWriter.save()
    # is no longer available in pandas 2.x.
    with pd.ExcelWriter(out_path) as writer:
        for ws, df in output.items():
            df.to_excel(writer, index=False, sheet_name=ws)
I have two Excel files, and both of them have 10 worksheets. I want to read each worksheet, compare them, and write the result to a third Excel file, which would also be split across multiple worksheets.
The below program works for single worksheet
import pandas as pd
# Load the default worksheet of each workbook and tag every column name with
# the file it came from.
df1 = pd.read_excel('zyx_5661.xlsx').rename(columns=lambda col: col + '_file1')
df2 = pd.read_excel('zyx_5662.xlsx').rename(columns=lambda col: col + '_file2')

# Outer-join on all columns of both frames, pairing them by position.
df_join = pd.merge(
    df1,
    df2,
    how='outer',
    left_on=list(df1.columns),
    right_on=list(df2.columns),
)

# Store the joined frame in a single worksheet named 'testing'.
with pd.ExcelWriter('xl_join_diff.xlsx') as writer:
    df_join.to_excel(writer, index=False, sheet_name='testing')
How can I optimize it to work with multiple worksheets?
I think this should achieve what you need. Loop through each sheet name (assuming they're named the same across both Excel documents; if not, you can use numbers instead). Write the new output to a new sheet, and save the Excel document.
import pandas as pd
# Worksheets to compare; each must exist under the same name in both workbooks.
sheet_names = ['sheet1', 'sheet2', 'sheet3']

# The context manager saves and closes the output workbook once every sheet
# has been written (ExcelWriter.save() is no longer available in pandas 2.x).
with pd.ExcelWriter('xl_join_diff.xlsx') as writer:
    for sheet in sheet_names:
        # Pull in the data for this sheet from both files.
        df1 = pd.read_excel('zyx_5661.xlsx', sheet_name=sheet)
        df2 = pd.read_excel('zyx_5662.xlsx', sheet_name=sheet)

        # Tag each column with its file of origin so the two sides stay
        # distinguishable after the merge.
        df1 = df1.rename(columns=lambda x: x + '_file1')
        df2 = df2.rename(columns=lambda x: x + '_file2')

        df_join = df1.merge(
            right=df2,
            left_on=df1.columns.to_list(),
            right_on=df2.columns.to_list(),
            how='outer',
        )

        # Write the comparison to a sheet of the same name in the output file.
        df_join.to_excel(writer, sheet_name=sheet, index=False)
You can use the loop to read files and sheets
list_files = ['zyx_5661.xlsx', 'zyx_5662.xlsx']

# Running number used to give every copied sheet a unique name.
count_sheets = 0

# One writer collects the sheets of all input files in a single workbook; the
# context manager saves and closes it at the end.
with pd.ExcelWriter('multiple.xlsx', engine='xlsxwriter') as writer:
    for file_name in list_files:
        # Open each workbook once and close it after its sheets are read.
        with pd.ExcelFile(file_name) as file:
            for sheet_name in file.sheet_names:
                df = pd.read_excel(file, sheet_name=sheet_name)
                # ... you can do your process
                count_sheets += 1
                # Format the counter into the name; 'Sheet-' + count_sheets
                # would raise TypeError (str + int).
                df.to_excel(writer, sheet_name=f'Sheet-{count_sheets}')
I want to copy the columns from an Excel file in a certain order, and then paste the columns I concatenated to the data frame df1 into another excel file at a certain interval.
In other words, paste the first column of the dataframe at A1 in the Excel file, the second column at A3, the third column at A5, and so on (assuming we paste 50 such columns). This is the code I have used so far, but I'm blocked at this point. Thanks in advance for your help.
import os
import pandas as pd
from os.path import expanduser
# Raw strings keep the backslashes literal; '\T' and '\D' are not valid
# escape sequences and trigger warnings in current Python versions.
table_dir = r'C:\Table'
os.chdir(table_dir)
files = os.listdir(table_dir)
print('List of files at *.xls ve *.xlsx format:\n', files)

# str.endswith accepts a tuple, covering both extensions in one call.
all_files = [f for f in files if f.endswith(('xls', 'xlsx'))]

# Collect the first row of column C from every workbook, then join the pieces
# side by side in a single concat instead of growing a frame inside the loop.
first_rows = []
for f in all_files:
    # usecols / sheet_name replace the removed parse_cols / sheetname keywords.
    names = pd.read_excel(f, skiprows=1, usecols="C:C", sheet_name='Sheet1', header=None)
    first_rows.append(names[:1])
df1 = pd.concat(first_rows, axis=1) if first_rows else pd.DataFrame()
print(df1)

home = expanduser(r"~\Desktop")  # For saving to the desktop
Saving = input("Please Insert Name Of File:")

# The context manager saves and closes the workbook (ExcelWriter.save() is no
# longer available in pandas 2.x).
with pd.ExcelWriter(os.path.join(home, Saving + '.xlsx'), engine='xlsxwriter') as writer:
    df1.to_excel(writer, startcol=1, startrow=5, sheet_name='Sheet1', header=False, index=False)
    # Handles for further xlsxwriter formatting; only usable inside this block.
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
I have a lot of Excel files, and I want to append them using the following code:
import pandas as pd
import glob
import os
import openpyxl
# NOTE: 'output.xlsx' is written to the same directory and matches the same
# "*.xlsx" pattern, so a second run would read the previous output as well.
frames = []
for f in glob.glob("*.xlsx"):
    data = pd.read_excel(f, sheet_name='Sheet1')
    # Replace the row index with the source file name, repeated once per row.
    data.index = [os.path.basename(f)] * len(data)
    frames.append(data)

# Stack the per-file frames into one; a separate name is used for the list so
# that df always refers to the combined DataFrame.
df = pd.concat(frames)

# The context manager saves and closes the workbook (ExcelWriter.save() is no
# longer available in pandas 2.x).
with pd.ExcelWriter('output.xlsx') as writer:
    df.to_excel(writer, sheet_name='Sheet1')
Excel files have this structure:
the output is the following:
Why does python alter the first column when concatenating excel files?
I think you need:
frames = []
for workbook_path in glob.glob("*.xlsx"):
    sheet = pd.read_excel(workbook_path, 'Sheet1')
    source = os.path.basename(workbook_path)
    # Pair the file name with every original row label so that the original
    # index is kept alongside the file name instead of being overwritten.
    sheet.index = pd.MultiIndex.from_product(
        [[source], sheet.index],
        names=('files','orig'),
    )
    frames.append(sheet)

# Stack all frames, then turn both index levels back into ordinary columns.
stacked = pd.concat(frames)
df = stacked.reset_index()
Another solution is to use the keys parameter of concat:
files = glob.glob("*.xlsx")

# One key per workbook: its base file name.
names = list(map(os.path.basename, files))

# Read 'Sheet1' of every workbook, in the same order as the keys.
dfs = []
for workbook in files:
    dfs.append(pd.read_excel(workbook, 'Sheet1'))

# keys= adds the file name as an outer index level; name both levels and then
# move them into regular columns.
stacked = pd.concat(dfs, keys=names)
stacked = stacked.rename_axis(('files','orig'))
df = stacked.reset_index()
Which is the same as:
dfs = []
names = []
# The pattern needs the '*' wildcard: ".xlsx" alone only matches a file that
# is literally named ".xlsx".
for f in glob.glob("*.xlsx"):
    dfs.append(pd.read_excel(f, 'Sheet1'))
    names.append(os.path.basename(f))

# keys= adds the file name as an outer index level; name both levels and then
# move them into regular columns.
df = pd.concat(dfs, keys=names).rename_axis(('files', 'orig')).reset_index()
Finally, write to Excel without the index and without the column names:
# The context manager saves and closes the workbook (ExcelWriter.save() is no
# longer available in pandas 2.x).
with pd.ExcelWriter('output.xlsx') as writer:
    # Write the values only: no index column and no column-name row.
    df.to_excel(writer, sheet_name='Sheet1', index=False, header=False)