Merge Excel files from a folder with more than one tab - Python

I have to build a solution that merges all the Excel files in a folder and generates a new consolidated Excel file with all the information. Every file has the same number of tabs (3), with the same tab names.
I tried this way:
import pandas as pd
import glob

path = r"C:\Users\Alan\Desktop"
filenames = glob.glob(path + r"\*.xlsx")

outputxlsx = pd.DataFrame()
for file in filenames:
    df = pd.concat(pd.read_excel(file, sheet_name=None), ignore_index=True, sort=False)
    outputxlsx = outputxlsx.append(df, ignore_index=True)
outputxlsx.to_excel(r"C:\Users\Alan\Desktop\Output.xlsx", index=False)
Unfortunately, the header row is replicated within the first tab of the output, and the other two tabs are not generated at all.
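Since every workbook has the same three tabs, one way to keep those tabs separate in the consolidated file is to concatenate per sheet name and write each result to its own tab with pd.ExcelWriter. A minimal sketch, assuming the sheet names really do match across all files:

import glob
import pandas as pd

path = r"C:\Users\Alan\Desktop"
filenames = glob.glob(path + r"\*.xlsx")

# sheet name -> list of DataFrames gathered from every workbook
sheets = {}
for file in filenames:
    for sheet_name, df in pd.read_excel(file, sheet_name=None).items():
        sheets.setdefault(sheet_name, []).append(df)

# one concat per sheet name, each written to its own tab of the output
with pd.ExcelWriter(path + r"\Output.xlsx") as writer:
    for sheet_name, frames in sheets.items():
        pd.concat(frames, ignore_index=True).to_excel(writer, sheet_name=sheet_name, index=False)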

from pathlib import Path
import pandas as pd

def get_data_by_sheet(file_path: str) -> dict:
    return {x: df for x, df in pd.read_excel(file_path, sheet_name=None).items() if not df.empty}

path = "C:/Users/Alan/Desktop/"
all_files = [x for x in Path(path).rglob("*.xlsx")]

(pd
 .concat([pd.concat([df for sheet, df in get_data_by_sheet(file_path=file).items()]) for file in all_files])
 .reset_index(drop=True)
).to_excel(f"{path}final_df.xlsx", index=False)
Or if you also want to know which workbook and sheet each row came from:

(pd
 .concat(
     [pd.concat([df.assign(file_name=Path(file).stem, sheet_name=sheet)
                 for sheet, df in get_data_by_sheet(file_path=file).items()])
      for file in all_files]
 ).reset_index(drop=True)
).to_excel(f"{path}final_df.xlsx", index=False)

pandas loop through excel files and sheets

Using Python 3, I need to loop through a folder that contains Excel files, where each file has multiple sheets.
How do I loop through all the files and all the sheets and extract everything into one dataframe?
What I was able to accomplish only returns one Excel file and all the worksheets for that file, but I need this for all files.
This is what I have so far:
from xlsxwriter import Workbook
import pandas as pd
import openpyxl
import glob
import os

path = 'filestoimport/*.xlsx'
for filepath in glob.glob(path):
    xl = pd.ExcelFile(filepath)
    # Define an empty list to store individual DataFrames
    list_of_dfs = []
    list_of_dferror = []
    for sheet_name in xl.sheet_names:
        df = xl.parse(sheet_name, usecols='A,D,N,B,C,E,F,G,H,I,J,K,L,M', header=0)
        df.columns = df.columns.str.replace(' ', '')
        df['sheetname'] = sheet_name  # this adds `sheet_name` as a column
        # using the basename function from the os module to get the file name
        file_name = os.path.basename(filepath)
        df['sourcefilename'] = file_name
        # only add sheets containing columns ['Status', 'ProjectID']
        column_names = ['Status', 'ProjectID']
        if set(column_names).issubset(df.columns):
            df['Status'].fillna('', inplace=True)
            df['Addedby'].fillna('', inplace=True)
            # And append it to the list
            list_of_dfs.append(df)

# Combine all DataFrames into one
data = pd.concat(list_of_dfs, ignore_index=True)
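The likely culprit in the snippet above is that list_of_dfs is re-created on every pass through the file loop, so by the time pd.concat runs it only holds the sheets of the last file processed. A minimal sketch of the fix, keeping the imports and parsing options from the question: create the list once, before the loop, and concatenate after it.

list_of_dfs = []  # create the list ONCE, before iterating over the files
for filepath in glob.glob('filestoimport/*.xlsx'):
    xl = pd.ExcelFile(filepath)
    for sheet_name in xl.sheet_names:
        df = xl.parse(sheet_name, header=0)
        df['sheetname'] = sheet_name
        df['sourcefilename'] = os.path.basename(filepath)
        list_of_dfs.append(df)

# now every sheet of every file is in the list
data = pd.concat(list_of_dfs, ignore_index=True)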

How do I use Python to iterate through a directory and delete specific columns from all csvs?

I have a directory with several csvs.
files = glob('C:/Users/jj/Desktop/Bulk_Wav/*.csv')
Each csv has the same columns, as in the reprex below:

yes  no  maybe  ofcourse
1    2   3      4
I want my script to iterate through all csvs in the folder and delete the columns maybe and ofcourse.
If glob provides you with file paths, you can do the following with pandas:
import pandas as pd
from glob import glob

files = glob('C:/Users/jj/Desktop/Bulk_Wav/*.csv')
drop = ['maybe ', 'ofcourse']
for file in files:
    df = pd.read_csv(file)
    for col in drop:
        if col in df:
            df = df.drop(col, axis=1)
    df.to_csv(file, index=False)  # index=False avoids writing the row index back as an extra column
Alternatively, if you want a cleaner way to avoid KeyErrors from drop, you can do this:

import pandas as pd
from glob import glob

files = glob('C:/Users/jj/Desktop/Bulk_Wav/*.csv')
drop = ['maybe ', 'ofcourse']
for file in files:
    df = pd.read_csv(file)
    df = df.drop([c for c in drop if c in df], axis=1)
    df.to_csv(file, index=False)
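pandas also has this built in: drop accepts errors='ignore', which silently skips any listed column that is absent instead of raising a KeyError, so the filtering comprehension can be dropped:

df = df.drop(columns=drop, errors='ignore')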
Do you mean something like this?

files = glob('C:/Users/jj/Desktop/Bulk_Wav/*.csv')
for filename in files:
    df = pd.read_csv(filename)
    df = df.drop(['maybe ', 'ofcourse'], axis=1)
    df.to_csv(filename, index=False)

This code will remove the maybe and ofcourse columns and save the result back to the csv; index=False keeps pandas from writing the row index as an extra column.
You can use pandas to read the csv file into a dataframe and then use drop() to remove specific columns, something like below (note that drop returns a new dataframe, so the result has to be assigned back):

df = pd.read_csv(csv_filename)
df = df.drop(['maybe', 'ofcourse'], axis=1)
If the files look exactly like what you have there, then maybe something like this (sep='\t' assumes they are tab-separated):

import pandas as pd
from glob import glob

files = glob(r'C:/Users/jj/Desktop/Bulk_Wav/*.csv')
for filename in files:
    df = pd.read_csv(filename, sep='\t')
    df.drop(['maybe', 'ofcourse'], axis=1, inplace=True)
    df.to_csv(filename, sep='\t', index=False)

Multiple csv not being added to pandas

I hope you can help me with this problem: I am having issues combining multiple CSV files in pandas.
I have 12 files of sales data that have the same columns (one for each month: Sales_January_2019, Sales_February_2019, and so on until December).
I've tried the following code but it doesn't seem to work; also, the index number should be continuous across files, not reset after each file. I tried reset_index() but that didn't work either.
import pandas as pd
import glob

path = r'C:\Users\ricar\.spyder-py3\data'  # my path
all_files = glob.glob(path + "/*.csv")

li = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=0, header=0)
    li.append(df)
    df.reset_index(inplace=True)

frame = pd.concat(li, axis=0, ignore_index=True)
df.drop(columns=['x_t', 'perf'], inplace=True)
print(df)
Try correcting your code like this:
import pandas as pd
import glob
path = r'C:\Users\ricar\.spyder-py3\data' # my path
files = glob.glob(path + "/*.csv")
# Make a list of dataframes
li = [pd.read_csv(file, index_col=0, header=0) for file in files]
# Concatenate dataframes and remove useless columns
df = pd.concat(li, axis=0, ignore_index=True)
df.drop(columns=["x_t", "perf"], inplace=True)
print(df)
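This also answers the continuous-index part of the question: ignore_index=True makes pd.concat renumber the combined rows 0..n-1 across all twelve files instead of keeping each file's own index.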

Adding dataframe column names based on filename after merging using Glob

I have Excel files in a folder, all in the same format with data for all countries in the world in the sheet 'Dataset2' in each file.
I have merged all files together into one using glob, but I need to know which file (i.e. which country) each column comes from.
Is there a way to do this?
import glob
import os
import pandas as pd

os.chdir("Countries/")
extension = 'xlsx'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
combined = pd.concat([pd.read_excel(f, sheet_name='Dataset2') for f in all_filenames], axis=1, ignore_index=True)
combined.to_excel("New/combined.xlsx", index=False, encoding='utf-8-sig')
You could unpack the list comprehension into a for-loop and add an additional column to each data file, something like this:
import glob
import os
import pandas as pd

os.chdir("Countries/")
extension = 'xlsx'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

file_list = []
for f in all_filenames:
    data = pd.read_excel(f, sheet_name='Dataset2')
    data['source_file'] = f  # create a column with the name of the file
    file_list.append(data)

# ignore_index=True is deliberately not used here: with axis=1 it would replace
# the column labels (including source_file) with plain integers
combined = pd.concat(file_list, axis=1)
combined.to_excel("New/combined.xlsx", index=False)
If you're using the os module, try os.path.basename and pass the result to the keys argument of concat:
import glob
import os
import pandas as pd

os.chdir(r"C:\Users\Umar.Hussain\OneDrive - Ricoh Europe PLC\Documents\Excels")
extension = 'xlsx'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
names = [os.path.basename(f) for f in all_filenames]
combined = pd.concat([pd.read_excel(f, sheet_name='Sheet1') for f in all_filenames], keys=names, axis=1)
Since you're using axis=1, this adds the keys as an extra level of the column header, so you may instead want to read the Excel files first and record the source as a column, like this:
dfs = []
for file in all_filenames:
    df = pd.read_excel(file)
    df['source'] = os.path.basename(file)
    dfs.append(df)

Grabbing a single Excel worksheet from multiple workbooks into a pandas dataframe and saving this

I need to extract a worksheet from multiple Excel workbooks, combine the results into a dataframe, and then save that dataframe.
I have a spreadsheet that is generated at the end of each month (e.g. June 2019.xlsx, May 2019.xlsx, April 2019.xlsx).
I need to grab the worksheet 'Sheet1' from each of these workbooks and convert them to a dataframe (df1).
I would like to have this dataframe saved.
As a nice to have, I would also like some way just to append the next month's data after the initial 'data grab'.
I'm relatively new to this, so I haven't made much progress.
import os
import glob
import pandas as pd
import xlrd
import json
import io
import flatten_json

files = glob.glob('/Users/ngove/Documents/Python Scripts/2019/*.xlsx')
dfs = {}
for f in files:
    dfs[os.path.splitext(os.path.basename(f))[0]] = pd.read_excel(f)
You can drop all of your files in a directory (e.g. the current directory), collect all of the Excel file names in a list (e.g. files_xls), then iterate over them and use pandas.read_excel to get the respective dataframes (e.g. list_frames).
Below, you can find an example:
import os
import pandas as pd

path = os.getcwd()  # get current dir
files = os.listdir(path)  # get all the files in your current dir

# keep only the xls or xlsm (this depends on you)
files_xls = [f for f in files if (f[-3:] == 'xls' or f[-4:] == 'xlsm')]

list_frames = []
for f in files_xls:
    print("Processing file: %s" % f)
    try:
        # the following gives you the dataframe;
        # the read params depend on your data format
        data = pd.read_excel(f, 'Sheet1', header=0, index_col=None)
    except Exception as err:
        print("Could not read %s: %s" % (f, err))
        continue
    list_frames.append(data)

# at the end you can concat your data if you want and remove any duplicates
df = pd.concat(list_frames, sort=False).fillna(0)
df = df.drop_duplicates()

# and save the result
with pd.ExcelWriter("your_title" + ".xlsx", engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name="Sheet1", index=False)
I hope this helps.
I interpreted "saving the dataframe" as meaning that you want to save it as a combined Excel file. This will combine all files in the specified folder that end in xlsx.
import os
import pandas as pd
from pandas import ExcelWriter

os.chdir("H:/Python/Reports/")  # edit this to be your path
path = os.getcwd()
files = os.listdir(path)
files_xlsx = [f for f in files if f[-4:] == 'xlsx']

# collect each workbook's 'Sheet1' in a list and concat once
# (avoids the DataFrame.append method, removed in pandas 2.0)
df_list = []
for f in files_xlsx:
    data = pd.read_excel(f, 'Sheet1')
    df_list.append(data)
df = pd.concat(df_list, ignore_index=True)

with ExcelWriter('Combined_Data.xlsx') as writer:
    df.to_excel(writer, 'Sheet1', index=False)
You could update the code to grab all 2019 files by changing the one line to this:
files_xlsx = [f for f in files if f[-9:] == '2019.xlsx']
I referenced this question for most of the code, updated it for xlsx, and added the file-save portion.
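For the "nice to have" of appending the next month's data after the initial grab, a minimal sketch is to read the combined file back in, concat the new workbook, and rewrite. The file names here ('Combined_Data.xlsx', 'July 2019.xlsx') are just illustrative, and the new month is assumed to have the same columns:

import pandas as pd

existing = pd.read_excel('Combined_Data.xlsx', 'Sheet1')
new_month = pd.read_excel('July 2019.xlsx', 'Sheet1')  # hypothetical next-month workbook

combined = pd.concat([existing, new_month], ignore_index=True)
combined.to_excel('Combined_Data.xlsx', sheet_name='Sheet1', index=False)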
