How to concat multiple spreadsheets in Excel workbooks into pandas dataframe? - python

I have multiple folders and subfolders, containing Excel workbooks with multiple tabs. How do I concat all the information into 1 pandas dataframe?
Here is my code so far:
from pathlib import Path
import os
import pandas as pd
import glob

p = Path(r'C:\Users\user1\Downloads\key_folder')
globbed_files = p.glob('**/**/*.xlsx')
df = []
for file in globbed_files:
    frame = pd.read_excel(file, sheet_name=None, ignore_index=True)
    frame['File Path'] = os.path.basename(file)
    df.append(frame)
# df = pd.concat([d.values() for d in df], axis=0, ignore_index=True)
df = pd.concat(df, axis=0, ignore_index=True)
This is generating the following error:
cannot concatenate object of type "<class 'collections.OrderedDict'>"; only pd.Series, pd.DataFrame, and pd.Panel (deprecated) objs are valid
When I ran pd.DataFrame(df), I saw that each Excel spreadsheet tab is a separate column. The cells contain the data and headers in text form, forming a really long string.
Any help is appreciated! Thank you!
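The error comes from sheet_name=None: pd.read_excel then returns a dict (an OrderedDict in older pandas) mapping sheet names to DataFrames, so df ends up as a list of dicts and pd.concat refuses it (and frame['File Path'] = ... only adds a dict key, not a column, which is why pd.DataFrame(df) showed one column per tab). A minimal sketch of one fix, flattening each workbook's dict before concatenating:
from pathlib import Path
import pandas as pd

p = Path(r'C:\Users\user1\Downloads\key_folder')
frames = []
for file in p.glob('**/*.xlsx'):
    sheets = pd.read_excel(file, sheet_name=None)   # dict: {sheet name: DataFrame}
    for sheet_name, frame in sheets.items():
        frame['File Path'] = file.name              # tag rows with the source workbook
        frame['Sheet Name'] = sheet_name
        frames.append(frame)                        # append DataFrames, not dicts

df = pd.concat(frames, axis=0, ignore_index=True)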

Here is the final code:
from pathlib import Path
import os
import pandas as pd
import glob
import xlrd

p = Path('path here')
globbed_files = p.glob('**/**/*.xlsx')
list_dfs = []
dfs = []
for file in globbed_files:
    xls = xlrd.open_workbook(file, on_demand=True)
    for sheet_name in xls.sheet_names():
        df = pd.read_excel(file, sheet_name)
        df['Sheet Name'] = sheet_name
        list_dfs.append(df)
dfs = pd.concat(list_dfs, axis=0)
dfs.to_excel('merged spreadsheet.xlsx')
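Note that xlrd 2.0 and later only reads legacy .xls files, so the xlrd call above needs an older xlrd release to open .xlsx workbooks. A variant of the same loop that stays within pandas (assuming openpyxl is installed, which pandas uses as its .xlsx engine):
from pathlib import Path
import pandas as pd

p = Path('path here')
list_dfs = []
for file in p.glob('**/*.xlsx'):
    xls = pd.ExcelFile(file)              # openpyxl handles the .xlsx parsing
    for sheet_name in xls.sheet_names:    # an attribute here, not a method as in xlrd
        df = pd.read_excel(xls, sheet_name)
        df['Sheet Name'] = sheet_name
        list_dfs.append(df)

dfs = pd.concat(list_dfs, axis=0)
dfs.to_excel('merged spreadsheet.xlsx')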

Related

Merge excel files from a folder with more than one tab

I have to build a solution that unifies all the Excel files in a folder and generates a new consolidated Excel file with all the information. The files all have the same number of tabs (3), with the same names.
I tried this way:
import pandas as pd
import glob

path = r"C:\Users\Alan\Desktop"
filenames = glob.glob(path + r"\*.xlsx")
outputxlsx = pd.DataFrame()
for file in filenames:
    df = pd.concat(pd.read_excel(file, sheet_name=None), ignore_index=True, sort=False)
    outputxlsx = outputxlsx.append(df, ignore_index=True)
outputxlsx.to_excel(r"C:\Users\Alan\Desktop\Output.xlsx", index=False)
Unfortunately on the first tab the header is replicated and the other two tabs are not generated.
from pathlib import Path
import pandas as pd

def get_data_by_sheet(file_path: str) -> dict:
    return {x: df for x, df in pd.read_excel(file_path, sheet_name=None).items() if not df.empty}

path = "C:/Users/Alan/Desktop/"
all_files = [x for x in Path(path).rglob("*.xlsx")]

(pd
 .concat([pd.concat([df for sheet, df in list(get_data_by_sheet(file_path=file).items())]) for file in all_files])
 .reset_index(drop=True)
 ).to_excel(f"{path}final_df.xlsx", index=False)
Or if you also want to know what workbook and sheet each row came from:
(pd
 .concat(
     [pd.concat([df.assign(file_name=Path(file).stem).assign(sheet_name=sheet) for
                 sheet, df in list(get_data_by_sheet(file_path=file).items())]) for file in all_files]
 ).reset_index(drop=True)).to_excel(f"{path}final_df.xlsx", index=False)
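For readers who find the nested comprehensions dense, the same idea written as a plain loop (a sketch reusing the helper's logic and the paths above):
from pathlib import Path
import pandas as pd

path = "C:/Users/Alan/Desktop/"
frames = []
for file in Path(path).rglob("*.xlsx"):
    for sheet, df in pd.read_excel(file, sheet_name=None).items():
        if df.empty:
            continue                                  # skip empty tabs, as get_data_by_sheet does
        frames.append(df.assign(file_name=file.stem, sheet_name=sheet))

pd.concat(frames).reset_index(drop=True).to_excel(f"{path}final_df.xlsx", index=False)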

pandas loop through excel files and sheets

Need help please.
Using python 3.
I need to loop through a folder that contains excel files and each file has multiple sheets.
How do I loop through all the files and all the sheets and extract everything into one dataframe?
What I was able to accomplish only returns one Excel file and all the worksheets for that file, but I need this for all the files. Please help.
This is what I have so far:
from xlsxwriter import Workbook
import pandas as pd
import openpyxl
import glob
import os

path = 'filestoimport/*.xlsx'
for filepath in glob.glob(path):
    xl = pd.ExcelFile(filepath)
    # Define an empty list to store individual DataFrames
    list_of_dfs = []
    list_of_dferror = []
    for sheet_name in xl.sheet_names:
        df = xl.parse(sheet_name, usecols='A,D,N,B,C,E,F,G,H,I,J,K,L,M', header=0)
        df.columns = df.columns.str.replace(' ', '')
        df['sheetname'] = sheet_name  # this adds `sheet_name` into the column
        # using basename function from os
        # module to print file name
        file_name = os.path.basename(filepath)
        df['sourcefilename'] = file_name
        # only add sheets containing columns ['Status', 'ProjectID']
        column_names = ['Status', 'ProjectID']
        if set(column_names).issubset(df.columns):
            df['Status'].fillna('', inplace=True)
            df['Addedby'].fillna('', inplace=True)
            # And append it to the list
            list_of_dfs.append(df)
# Combine all DataFrames into one
data = pd.concat(list_of_dfs, ignore_index=True)
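The reason only one workbook survives is that list_of_dfs is re-created inside the file loop, so each new file discards the previous file's frames and the final concat only sees the last workbook. A minimal rearrangement of the same code, creating the list once before the loop:
import glob
import os
import pandas as pd

list_of_dfs = []                          # created once, outside the file loop
for filepath in glob.glob('filestoimport/*.xlsx'):
    xl = pd.ExcelFile(filepath)
    for sheet_name in xl.sheet_names:
        df = xl.parse(sheet_name, usecols='A,D,N,B,C,E,F,G,H,I,J,K,L,M', header=0)
        df.columns = df.columns.str.replace(' ', '')
        df['sheetname'] = sheet_name
        df['sourcefilename'] = os.path.basename(filepath)
        # only keep sheets containing the expected columns
        if set(['Status', 'ProjectID']).issubset(df.columns):
            df['Status'].fillna('', inplace=True)
            df['Addedby'].fillna('', inplace=True)
            list_of_dfs.append(df)

data = pd.concat(list_of_dfs, ignore_index=True)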

How to run function on multiple dataframes of variable row sizes, then generate a new dataframe with just the function results

I have a folder full of CSVs with the same columns but variable numbers of rows. I want to convert each to a dataframe, run a simple function on it, and create one new dataframe with just the function results and the file names as the index.
So far I have:
import os.path
import tkinter.filedialog as filedialog
import glob
import pandas as pd

file_path = filedialog.askdirectory()
pattern = os.path.join(file_path, '*.csv')
files = glob.glob(pattern)
for index, file in enumerate(files):
    df = pd.read_csv(file, sep=',', index_col=[0])
    df.loc['total'] = df.sum(numeric_only=True, axis=0)  # or any function
    pd.concat(df2[df.index == 'total'])
    df.to_csv('file_path')
I'm sure there are several ways in which this is messed up, but any advice is appreciated
import os.path
import tkinter.filedialog as filedialog
import glob
import pandas as pd

file_path = filedialog.askdirectory()
pattern = os.path.join(file_path, '*.csv')
files = glob.glob(pattern)
dfs = []
for index, file in enumerate(files):
    df = pd.read_csv(file, sep=',', index_col=[0])
    # Would remove the .loc, but it does no harm
    df.loc['total'] = df.sum(numeric_only=True, axis=0)  # or any function
    dfs.append(df.loc[['total']])  # keep just the 'total' row of each file
df_total = pd.concat(dfs).reset_index(drop=True)
df_total.to_csv('file_path')
OK I figured it out:
import os.path
import tkinter.filedialog as filedialog
import glob
import pandas as pd

file_path = filedialog.askdirectory()
pattern = os.path.join(file_path, '*.csv')
files = glob.glob(pattern)
filename = pd.DataFrame(columns=['Filename'])
filename['Filename'] = pd.Series([file for file in files]).reset_index(drop=True)
dfs = []
for index, file in enumerate(files):
    df = pd.read_csv(file, sep=',', index_col=[0])
    # Would remove the .loc, but it does no harm
    df.loc['total'] = df.sum(numeric_only=True, axis=0)  # or any function
    dfs.append(df)
dfs = pd.concat(dfs)
total = dfs[dfs.index == 'total'][['dfcolumn1', 'dfcolumn2', etc]]  # write column names exactly as they appear in the csv
total_named = filename.join(total.set_index(filename.index))
total_named.to_csv('file_path')
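A more compact route to the same result (a sketch with the same directory picker; the output name totals.csv is just an assumption): compute each file's column sums straight into a dict keyed by file name, so the file names become the index without the join step:
import os
import glob
import tkinter.filedialog as filedialog
import pandas as pd

file_path = filedialog.askdirectory()
files = glob.glob(os.path.join(file_path, '*.csv'))

# one Series of column sums per file, keyed by the file name
totals = {os.path.basename(f): pd.read_csv(f, index_col=[0]).sum(numeric_only=True)
          for f in files}
total_named = pd.DataFrame(totals).T      # file names as rows, CSV columns as columns
total_named.to_csv(os.path.join(file_path, 'totals.csv'))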

combining multiple sheets on excel

Hi, I am trying to combine all the Excel sheets from one folder and I always get the error below. Please help.
TypeError: listdir: path should be string, bytes, os.PathLike or None, not list
import pandas as pd
import os

path = ['pythonProject']
combine = pd.DataFrame()
#2nd
for j in os.listdir(path):
    df = pd.read_excel(path, skiprows=3)
    combine = combine.append(df, ignore_index=True)
print(combine)
You accidentally made path a list because of the [ and ]. Try the following code:
import pandas as pd
import os

path = 'pythonProject'
combine = pd.DataFrame()
#2nd
for j in os.listdir(path):
    df = pd.read_excel(os.path.join(path, j), skiprows=3)  # read each file, not the folder itself
    combine = combine.append(df, ignore_index=True)
print(combine)
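On pandas 2.0 and later DataFrame.append no longer exists, so a version of the same loop that collects the frames in a list and concatenates once (a sketch, keeping the folder name and skiprows from above):
import os
import pandas as pd

path = 'pythonProject'
frames = []
for j in os.listdir(path):
    if j.endswith('.xlsx'):                                   # skip anything that isn't a workbook
        frames.append(pd.read_excel(os.path.join(path, j), skiprows=3))

combine = pd.concat(frames, ignore_index=True)
print(combine)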

Adding dataframe column names based on filename after merging using Glob

I have Excel files in a folder, all in the same format with data for all countries in the world in the sheet 'Dataset2' in each file.
I have merged all files together into one using glob, but I need to know which file (i.e. which country) each column comes from.
Is there a way to do this?
import glob
import os
import pandas as pd

os.chdir("Countries/")
extension = 'xlsx'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
combined = pd.concat([pd.read_excel(f, sheet_name='Dataset2') for f in all_filenames], axis=1, ignore_index=True)
combined.to_excel("New/combined.xlsx", index=False, encoding='utf-8-sig')
You could unpack the list comprehension into a for-loop and add an additional column to each data file, something like this:
import glob
import os
import pandas as pd

os.chdir("Countries/")
extension = 'xlsx'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
file_list = []
for f in all_filenames:
    data = pd.read_excel(f, sheet_name='Dataset2')
    data['source_file'] = f  # create a column with the name of the file
    file_list.append(data)
combined = pd.concat(file_list, axis=1, ignore_index=True)
combined.to_excel("New/combined.xlsx", index=False, encoding='utf-8-sig')
If you're using the os module, try os.path.basename and add the result to the keys argument of concat:
import glob
import os
import pandas as pd

os.chdir(r"C:\Users\Umar.Hussain\OneDrive - Ricoh Europe PLC\Documents\Excels")
extension = 'xlsx'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
names = [os.path.basename(f) for f in all_filenames]
combined = pd.concat([pd.read_excel(f, sheet_name='Sheet1') for f in all_filenames], keys=names, axis=1)
As you're using axis=1, this will add the keys to the header (as an extra column level), so you may instead want to read the Excel files first, tag each one with its file name, and collect them in a list like:
dfs = []
for file in all_filenames:
    df = pd.read_excel(file)
    df['source'] = os.path.basename(file)
    dfs.append(df)
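The snippet stops before combining the frames; a minimal completion (assuming the frames are now meant to be stacked row-wise, since each row carries its source file, and reusing the asker's output path):
combined = pd.concat(dfs, ignore_index=True)   # stack row-wise; the 'source' column identifies each file
combined.to_excel("New/combined.xlsx", index=False)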
