I want to copy the columns from an Excel file in a certain order, and then paste the columns I concatenated to the data frame df1 into another excel file at a certain interval.
In other words, paste the first column of the dataframe at A1 in the Excel file, the second column at A3, the third column at A5, and so on (assume there are 50 such columns to paste). This is the code I have so far, but I'm stuck at this point. Thanks in advance for your help.
import os
import pandas as pd
from os.path import expanduser
# Work from the folder that holds the source workbooks. Raw strings keep the
# backslashes literal instead of relying on unrecognised escape sequences.
os.chdir(r'C:\Table')
files = os.listdir(r'C:\Table')
print('List of files at *.xls ve *.xlsx format:\n', files)
all_files = [f for f in files if f.endswith(('xls', 'xlsx'))]

df1 = pd.DataFrame()  # Creating empty dataframe
for f in all_files:
    # Take values on C column. `usecols` and `sheet_name` are the current
    # spellings of the removed `parse_cols` and `sheetname` keywords.
    names = pd.read_excel(f, skiprows=1, usecols="C:C", sheet_name='Sheet1', header=None)
    # Keep only the first row that was read and place it beside the columns
    # gathered from the previous files.
    df1 = pd.concat([df1, names[:1]], axis=1)
print(df1)

home = expanduser(r"~\Desktop")  # For saving desktop
Saving = input("Please Insert Name Of File:")
# Leaving the `with` block saves and closes the workbook; the explicit
# ExcelWriter.save() call was removed in pandas 2.0.
with pd.ExcelWriter(os.path.join(home, Saving + '.xlsx'), engine='xlsxwriter') as writer:
    df1.to_excel(writer, startcol=1, startrow=5, sheet_name='Sheet1', header=False, index=False)
    # Handles to the underlying xlsxwriter objects, kept for further formatting.
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
Related
I have 10 csv files and want to save all the files as 10 worksheets of 1 xlsx file.
data1.csv,data2.csv,.......,data10.csv.
Attempt
import glob
import numpy as np
import pandas as pd
# Attempt: read every CSV in the working directory and write the frames as
# separate worksheets of one workbook.
# NOTE(review): `all_datasets` is not used again in this snippet.
all_datasets = pd.DataFrame()
for x in glob.glob("*.csv"):
    # NOTE(review): `df` is rebound on every pass, so only the last CSV read
    # is still referenced after the loop.
    df = pd.read_csv(x)
# I want to export the corresponding csv files as 10 worksheets of 1 xlsx
# initialize the excel writer
writer = pd.ExcelWriter('all_datasets_combinedworksheets.xlsx', engine='xlsxwriter')
# Maps worksheet name -> frame to write.
# NOTE(review): df1..df4 are not assigned anywhere in this snippet.
frames = {'sheetName_1': df1, 'sheetName_2': df2,
'sheetName_3': df3,'sheetName_4': df4}
for sheet, frame in frames.iteritems(): # .use .items for python 3.X
    frame.to_excel(writer, sheet_name = sheet)
#critical last step
writer.save()
I'm open to other approaches; please share your code. Thanks in advance.
You're overwriting the variable df in every iteration of your loop.
You don't have any variables called df1, df2 etc. but you're trying to use them to create your frames dictionary.
You never use all_datasets.
Try:
import os
# Map each CSV file name found in `path` to its contents. os.listdir() returns
# bare file names, so each one is joined back onto `path` before reading;
# reading the bare name only works when `path` is the current directory.
frames = {
    f: pd.read_csv(os.path.join(path, f))
    for f in os.listdir(path)
    if f.endswith(".csv")
}
# One worksheet per CSV, named after the file. Leaving the `with` block saves
# and closes the workbook (ExcelWriter.save() was removed in pandas 2.0).
with pd.ExcelWriter('all_datasets_combinedworksheets.xlsx', engine='xlsxwriter') as writer:
    for sheet, frame in frames.items():
        frame.to_excel(writer, sheet_name=sheet)
I have two Excel files, each with 10 worksheets. I want to read each worksheet, compare the matching worksheets, and write the result to a third Excel file, which would also contain multiple worksheets.
The below program works for single worksheet
import pandas as pd
# Load both workbooks and tag every column label with the file it came from.
df1 = pd.read_excel('zyx_5661.xlsx')
df1.rename(columns=lambda label: label + '_file1', inplace=True)
df2 = pd.read_excel('zyx_5662.xlsx')
df2.rename(columns=lambda label: label + '_file2', inplace=True)

# Outer-join on all columns of each side, pairing them by position.
left_keys = df1.columns.to_list()
right_keys = df2.columns.to_list()
df_join = df1.merge(right=df2, how='outer', left_on=left_keys, right_on=right_keys)

# Write the joined frame to a single worksheet without the index column.
with pd.ExcelWriter('xl_join_diff.xlsx') as writer:
    df_join.to_excel(writer, sheet_name='testing', index=False)
How can I optimize it to work with multiple worksheets?
I think this should achieve what you need. Loop through each sheet name (assuming the sheets are named the same across both Excel documents; if not, you can use sheet numbers instead), write the merged output to a new sheet, and save the Excel document once all sheets are done.
import pandas as pd
# Leaving the `with` block saves and closes the workbook once every sheet has
# been written (the explicit ExcelWriter.save() was removed in pandas 2.0).
with pd.ExcelWriter('xl_join_diff.xlsx') as writer:
    for sheet in ['sheet1', 'sheet2', 'sheet3']:  # list of sheet names
        # Pull in data for each sheet, and merge together.
        df1 = pd.read_excel('zyx_5661.xlsx', sheet_name=sheet)
        df2 = pd.read_excel('zyx_5662.xlsx', sheet_name=sheet)
        # Suffix the column labels so both sides stay distinguishable after the merge.
        df1.rename(columns=lambda x: x + '_file1', inplace=True)
        df2.rename(columns=lambda x: x + '_file2', inplace=True)
        df_join = df1.merge(right=df2, left_on=df1.columns.to_list(),
                            right_on=df2.columns.to_list(), how='outer')
        # Write to excel as a new sheet; `sheet_name` is passed by keyword
        # because newer pandas no longer accepts it positionally.
        df_join.to_excel(writer, sheet_name=sheet, index=False)
You can use the loop to read files and sheets
list_files = ['zyx_5661.xlsx', 'zyx_5662.xlsx']
count_sheets = 0
# create writer for writing all sheets in 1 file; leaving the `with` block
# saves and closes it (ExcelWriter.save() was removed in pandas 2.0).
with pd.ExcelWriter('multiple.xlsx', engine='xlsxwriter') as writer:
    for file_name in list_files:
        # Open each source workbook once and close it when its sheets are done.
        with pd.ExcelFile(file_name) as file:
            for sheet_name in file.sheet_names:
                df = pd.read_excel(file, sheet_name)
                # ... you can do your process
                count_sheets = count_sheets + 1
                # The counter is an int, so convert it before concatenating;
                # 'Sheet-' + count_sheets raised TypeError.
                df.to_excel(writer, sheet_name='Sheet-' + str(count_sheets))
I have a function like this:
def DuplicateEachRow():
    """Stack 'Sheet3' of the workbook named by ``new_loc`` on top of itself and write it out.

    The sheet is read without a header row from the current working
    directory, printed, concatenated with itself under a fresh 0..n-1 index,
    and written to ``new_loc`` through an ``ExcelWriter`` opened with its
    default arguments. ``new_loc`` is read from the enclosing module scope.
    """
    import pandas as pd
    import pathlib

    # Absolute location of the workbook: <cwd>\<new_loc>.
    workbook_path = '\\'.join([str(pathlib.Path().absolute()), new_loc])
    sheet = pd.read_excel(workbook_path, header=None, sheet_name='Sheet3')
    print(sheet)

    # Two copies of the sheet, one after the other, renumbered from zero.
    doubled = pd.concat([sheet] * 2, ignore_index=True)

    with pd.ExcelWriter(new_loc) as out:
        doubled.to_excel(out)
and I need to keep this same functionality, but instead of writing that one sheet to a new file, I need to edit that one particular sheet and save it back to my workbook, which has other sheets.
EDIT (more explanation): I have 4 sheets in a workbook and in just one sheet (Sheet3) I need to use the functionality above and then save it back to a workbook.
This doesn't work either, specifying the sheet name when I save:
def DuplicateEachRow():
    """Duplicate the rows of the 'GTL | GWL Disclosures' sheet and write it under the same sheet name.

    Reads the sheet (no header row) from the workbook ``new_loc`` in the
    current working directory, prints it, appends a second copy of its rows
    with a renumbered index, and writes the result to ``new_loc`` using an
    ``ExcelWriter`` opened with its default arguments. ``new_loc`` comes from
    the enclosing module scope.
    """
    import pandas as pd
    import pathlib

    target_sheet = 'GTL | GWL Disclosures'
    here = str(pathlib.Path().absolute())
    source = here + '\\' + new_loc

    original = pd.read_excel(source, header=None, sheet_name=target_sheet)
    print(original)

    # Rows of the sheet followed by the same rows again, index reset.
    twice = pd.concat([original, original], ignore_index=True)

    with pd.ExcelWriter(new_loc) as book:
        twice.to_excel(book, sheet_name=target_sheet)
To add a new sheet to the same Excel file, you have to open the file in append mode.
Have a look at the code below:
def DuplicateEachRow():
    """Write a row-duplicated copy of 'GTL | GWL Disclosures' to a new sheet of the same workbook.

    The sheet is read (without a header row) from the workbook ``new_loc`` in
    the current working directory. Every row is duplicated so that each copy
    sits directly below its original, and the result is added to the workbook
    as sheet 'Sheet3' while the existing sheets are kept. ``new_loc`` is read
    from the enclosing module scope.

    Raises whatever pandas raises if the workbook or the source sheet is
    missing. With pandas >= 1.3, appending also raises ``ValueError`` when a
    sheet named 'Sheet3' already exists, unless ``if_sheet_exists`` is passed
    to ``ExcelWriter``.
    """
    import pandas as pd
    import pathlib

    # Build the path with pathlib instead of a hard-coded '\\' separator so it
    # is correct on every platform.
    full_path = pathlib.Path().absolute() / new_loc
    df = pd.read_excel(full_path, header=None, sheet_name='GTL | GWL Disclosures')
    print(df)
    # duplicate the rows:
    # keep the index, so you can sort the rows after
    dup_df = pd.concat([df, df])
    # sort the rows by the index so you have the duplicate one just after the initial one
    dup_df.sort_index(inplace=True)
    # open the file in append mode; appending is only supported by the
    # openpyxl engine, so request it explicitly
    with pd.ExcelWriter(new_loc, engine='openpyxl', mode='a') as writer:
        # use a new name for the new sheet
        # don't save the header (dataframe columns names) and index (dataframe row names) in the new sheet
        dup_df.to_excel(writer, sheet_name='Sheet3', header=False, index=False)
Say I have a folder which contains multiple Excel files with the extension xlsx or xls. They share the same header columns a, b, c, d, e, except for some empty sheets in several files.
I want to iterate all the files and sheets (except for empty sheets) and concatenate them into one sheet of one file output.xlsx.
I have iterated through all the Excel files and appended them to one file, but how can I iterate through all the sheets of each file if a file has more than one sheet?
I need to integrate the two blocks of code below into one. Thanks for your help.
import pandas as pd
import numpy as np
import glob
path = os.getcwd()
files = os.listdir(path)
files
df = pd.DataFrame()
# NOTE(review): methods 1 and 2 look like alternatives; running both, as
# written here, adds every matching workbook to `df` twice.
# method 1
excel_files = [f for f in files if f.endswith(('xlsx', 'xls'))]
excel_files
for f in excel_files:
    data = pd.read_excel(f)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    df = pd.concat([df, data])
# method 2
# `"*.xlsx" or "*.xls"` evaluates to just "*.xlsx", so .xls files were never
# matched; glob each pattern explicitly instead.
for f in glob.glob("*.xlsx") + glob.glob("*.xls"):
    data = pd.read_excel(f)
    df = pd.concat([df, data], ignore_index=True)
# save the data frame; leaving the `with` block saves and closes the workbook
# (ExcelWriter.save() was removed in pandas 2.0)
with pd.ExcelWriter('output.xlsx') as writer:
    df.to_excel(writer, sheet_name='sheet1')
For one file to concatenate multiple sheets:
# Open the workbook once, parse each of its sheets, then stack them into one frame.
file = pd.ExcelFile('file.xlsx')
names = file.sheet_names  # every sheet name in the workbook
parsed_sheets = []
for sheet_name in names:
    parsed_sheets.append(file.parse(sheet_name))
df = pd.concat(parsed_sheets)
import pandas as pd
# List the working directory and keep only the Excel workbooks.
path = os.getcwd()
files = os.listdir(path)
files
# Test the actual extension: the substring check `'.xls' in file` also
# accepted names such as 'report.xls.bak'.
excel_files = [file for file in files if file.endswith(('.xls', '.xlsx'))]
excel_files
def create_df_from_excel(file_name):
    """Return the sheets of the workbook *file_name* stacked into one DataFrame.

    Every sheet is parsed with pandas' defaults. Sheets that contain no rows
    are left out of the concatenation, so they cannot influence the result;
    if every sheet is empty, the sheets are concatenated as they are.

    Parameters
    ----------
    file_name
        Path of the Excel workbook, passed to ``pd.ExcelFile``.

    Returns
    -------
    pandas.DataFrame
        The rows of all sheets in workbook order, each keeping its own index.
    """
    # The context manager closes the workbook handle once all sheets are parsed.
    with pd.ExcelFile(file_name) as file:
        sheets = [file.parse(name) for name in file.sheet_names]
    non_empty = [sheet for sheet in sheets if not sheet.empty]
    # Fall back to the unfiltered list so pd.concat never receives an empty list.
    return pd.concat(non_empty or sheets)
# Stack the frames built from every workbook into a single frame.
df = pd.concat(
    [create_df_from_excel(xl) for xl in excel_files]
)
# save the data frame; leaving the `with` block saves and closes the workbook
# (ExcelWriter.save() was removed in pandas 2.0)
# NOTE(review): 'output.xlsx' is written with a relative path; if that is the
# folder the inputs are listed from, a re-run would read it back in -- confirm.
with pd.ExcelWriter('output.xlsx') as writer:
    df.to_excel(writer, sheet_name='sheet1')
I have a for loop that imports all of the Excel files in the directory and merges them into a single dataframe. However, I want to create a new column in which each row holds the filename of the Excel file it came from.
Here is my import and merge code:
path = os.getcwd()
files = os.listdir(path)
# Read every file first and concatenate once at the end. DataFrame.append was
# removed in pandas 2.0, and appending inside the loop re-copied all rows on
# every pass.
parts = []
for f in files:
    data = pd.read_excel(f, 'Sheet1', header = None, names = ['col1','col2'])
    parts.append(data)
# pd.concat raises on an empty list, so keep the empty frame for that case.
df = pd.concat(parts) if parts else pd.DataFrame()
For example, if the first Excel file is named "file1.xlsx", I want all rows from that file to have the value file1.xlsx in col3 (a new column). If the second Excel file is named "file2.xlsx", I want all rows from that file to have the value file2.xlsx. Note that the Excel file names follow no real pattern; I just use those names as an example.
Many thanks
Create new column in loop:
# Read every file, tag its rows with the source file name, and concatenate
# once at the end (DataFrame.append was removed in pandas 2.0).
parts = []
for f in files:
    data = pd.read_excel(f, 'Sheet1', header = None, names = ['col1','col2'])
    # A scalar assignment fills the new column for every row of this file.
    data['col3'] = f
    parts.append(data)
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(parts) if parts else pd.DataFrame()
Another possible solution with list comprehension:
# Read each workbook and tag its rows with the source file name in a single
# list comprehension, then stack the frames.
read_options = {'header': None, 'names': ['col1', 'col2']}
dfs = [
    pd.read_excel(source, 'Sheet1', **read_options).assign(col3=source)
    for source in files
]
df = pd.concat(dfs)