Iterate through excel files and sheets and concatenate in Python - python

Say I have a folder which have multiple excel files with extension xlsx or xls, they share same header column a, b, c, d, e except some empty sheet in several files.
I want to iterate all the files and sheets (except for empty sheets) and concatenate them into one sheet of one file output.xlsx.
I have iterated through all excel files and append them to one file, but how could I iterate through all the sheets of each files if they have more than one sheets?
I need to integrate two block of code below into one. Thanks for your help.
import pandas as pd
import numpy as np
import glob
path = os.getcwd()
files = os.listdir(path)
files
df = pd.DataFrame()
# method 1
excel_files = [f for f in files if f[-4:] == 'xlsx' or f[-3:] == 'xls']
excel_files
for f in excel_files:
data = pd.read_excel(f)
df = df.append(data)
# method 2
for f in glob.glob("*.xlsx" or "*.xls"):
data = pd.read_excel(f)
df = df.append(data, ignore_index=True)
# save the data frame
writer = pd.ExcelWriter('output.xlsx')
df.to_excel(writer, 'sheet1')
writer.save()
For one file to concatenate multiple sheets:
file = pd.ExcelFile('file.xlsx')
names = file.sheet_names # read all sheet names
df = pd.concat([file.parse(name) for name in names])

import pandas as pd
path = os.getcwd()
files = os.listdir(path)
files
excel_files = [file for file in files if '.xls' in file]
excel_files
def create_df_from_excel(file_name):
file = pd.ExcelFile(file_name)
names = file.sheet_names
return pd.concat([file.parse(name) for name in names])
df = pd.concat(
[create_df_from_excel(xl) for xl in excel_files]
)
# save the data frame
writer = pd.ExcelWriter('output.xlsx')
df.to_excel(writer, 'sheet1')
writer.save()

Related

pandas loop through excel files and sheets

Need help please.
Using python 3.
I need to loop through a folder that contains excel files and each file has multiple sheets.
How do I loop through all the files and all the sheets and extract to a dataframe?
What I was able to accomplish only returns one excel file and all the worksheets for that file but I need for all files. Please help.
This is what I have so far:
from xlsxwriter import Workbook
import pandas as pd
import openpyxl
import glob
import os
path = 'filestoimport/*.xlsx'
for filepath in glob.glob(path):
xl = pd.ExcelFile(filepath)
# Define an empty list to store individual DataFrames
list_of_dfs = []
list_of_dferror= []
for sheet_name in xl.sheet_names:
df = xl.parse(sheet_name, usecols='A,D,N,B,C,E,F,G,H,I,J,K,L,M', header=0)
df.columns = df.columns.str.replace(' ', '')
df['sheetname'] = sheet_name # this adds `sheet_name` into the column
# using basename function from os
# module to print file name
file_name = os.path.basename(filepath)
df['sourcefilename'] = file_name
# only add sheets containing columns ['Status', 'ProjectID']
column_names = ['Status', 'ProjectID']
if set(column_names).issubset(df.columns):
df['Status'].fillna('', inplace=True)
df['Addedby'].fillna('', inplace=True)
# And append it to the list
list_of_dfs.append(df)
# Combine all DataFrames into one
data = pd.concat(list_of_dfs, ignore_index=True)

Grabbing a single Excel worksheet from multiple workbooks into a pandas dataframe and saving this

I need to extract an Excel worksheet from multiple workbooks and saving it to a dataframe and in turn saving that dataframe.
I have a spreadsheet that is generated at the end of each month (e.g.
June 2019.xlsx, May 2019.xlsx, April 2019.xlsx).
I need to grab a worksheet 'Sheet1'from each of these workbooks and convert these to a dataframe (df1).
I would like to have this dataframe saved.
As a nice to have, I would also like some way just to append the next month's data after the initial 'data grab'.
I'm relatively new to this, so I haven't made much progress.
import os
import glob
import pandas as pd
import xlrd
import json
import io
import flatten_json
files = glob.glob('/Users/ngove/Documents/Python Scripts/2019/*.xlsx')
dfs={}
for f in files:
dfs[os.path.splitext(os.path.basename(f))[0]] = pd.read_excel(f)
You can drop all of your files in a directory (e.g. current directory). Then append all of your excel files in a list (e.g. files_xls). Iterate over all your files and use pandas.read_excel to get the respective dataframes (e.g. list_frames).
Below, you can find an example:
import os
import pandas as pd
path = os.getcwd() # get cur dir
files = os.listdir(path) # get all the files in your cur dir
# get only the xls or xlsm (this depends on you)
files_xls = [f for f in files if (f[-3:] == 'xls' or f[-4:] == 'xlsm')]
df = pd.DataFrame()
list_frames = []
for f in files_xls:
print("Processing file: %s" %f)
try:
# the following will give you the dataframe
# the fun params depends on your data format
data = pd.read_excel(f, 'Sheet1', header=0, index_col=None,
sep='delimiter', error_bad_lines=False,
skip_blank_lines=True, comment=',,')
except:
pass
list_frames.append(data)
# at the end you can concat your data if you want and remove any dublicate
df = pd.concat(list_frames, sort=False).fillna(0)
df = df.drop_duplicates()
# at the end you can save it
writer = pd.ExcelWriter("your_title" + ".xlsx", engine='xlsxwriter')
df.to_excel(writer, sheet_name="Sheets1", index=False)
writer.save()
I hope this helps.
I interpreted your statement that you want to save the dataframe as that you want to save it as a combined Excel file. This will combine all files in the folder specified that end in xlsx.
import os
import pandas as pd
from pandas import ExcelWriter
os.chdir("H:/Python/Reports/") #edit this to be your path
path = os.getcwd()
files = os.listdir(path)
files_xlsx = [f for f in files if f[-4:] == 'xlsx']
df = pd.DataFrame()
for f in files_xlsx:
data = pd.read_excel(f, 'Sheet1')
df = df.append(data)
writer=ExcelWriter('Combined_Data.xlsx')
df.to_excel(writer,'Sheet1',index=False)
writer.save()
You could update the code to grab all 2019 files by changing the one line to this:
files_xlsx = [f for f in files if f[-9:] == '2019.xlsx']
I referenced this question for most of the code and updated for xlsx and added the file save portion of the code

Importing multiple excel files into Python, merge and apply filename to a new column

I have a for loop that imports all of the Excel files in the directory and merge them together in a single dataframe. However, I want to create a new column where each row takes the string of the filename of the Excel-file.
Here is my import and merge code:
path = os.getcwd()
files = os.listdir(path)
df = pd.DataFrame()
for f in files:
data = pd.read_excel(f, 'Sheet1', header = None, names = ['col1','col2'])
df = df.append(data)
For example if first Excel file is named "file1.xlsx", I want all rows from that file to have value file1.xlsx in col3 (a new column). If the second Excel file is named "file2.xlsx" I want all rows from that file to have value file2.xlsx. Notice that there is no real pattern of the Excel files, and I just use those names as an example.
Many thanks
Create new column in loop:
df = pd.DataFrame()
for f in files:
data = pd.read_excel(f, 'Sheet1', header = None, names = ['col1','col2'])
data['col3'] = f
df = df.append(data)
Another possible solution with list comprehension:
dfs = [pd.read_excel(f, 'Sheet1', header = None, names = ['col1','col2']).assign(col3 = f)
for f in files]
df = pd.concat(dfs)

How to split a large excel file into multiple worksheets based on their given ip address using pandas python

I'm new to pandas and python so ran into some trouble. I have a one large excel file which i need to divide into multiple worksheets using a python script. That i have to divide base on the ip addresses given in the data. I can't figure out how to do that and would appreciate some help and guidance.
I have no knowledge of working with python or any libraries before. This is what i did but created workbooks for each row.
import pandas as pd
df = pd.read_excel("D:/Users/Zakir/Desktop/MyNotebooks/Legacy.xls", sheet_name="Total", header=0, names=None, index_col=None, parse_cols=None, usecols=None, squeeze=False, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skiprows=None, nrows=None, na_values=None, keep_default_na=True, verbose=False, parse_dates=False, date_parser=None, thousands=None, comment=None, skipfooter=0, convert_float=True, mangle_dupe_cols=True)
writer = pd.ExcelWriter('D:/Users/Zakir/Desktop/MyNotebooks/pandas_simple.xlsx', engine='xlsxwriter')
for index, row in df.iterrows():
df1 = df.iloc[[index]]
df1.set_index('Number',inplace=True)
df1.to_excel(writer, sheet_name=row['IPAddress'])
writer.save()
This is the kind of excel file i have. over 5000 rows. There are 60 groups of ip addresses and have to divide each group into its own worksheet
one solution if you have enough memory:
from pandas import ExcelWriter
df = pd.read_excel('file',sheet_name="Total", header=0, #other settings.....#)
writer = ExcelWriter('E:/output.xlsx',engine='xlsxwriter')
print(df)
def writesheet(g):
a = g['IPAddress'].tolist()[0]
g.to_excel(writer, sheet_name=str(a), index=False)# index = True if you want to keep index
df.groupby('IPAddress').apply(writesheet)
writer.save()
This is how I implemented the code to check a folder, loop through all excel files and split each file by the values of a column name, which can be passed as input(vColName), assuming one sheet in the file:
import sys
import os, shutil
from os import listdir
from os.path import isfile, join
import pandas as pd
import urllib as ul
import datetime
import xlrd
#this method retrieves all the xlsx filenames from a folder
def find_excel_filenames( path_to_dir, suffix=".xlsx" ):
filenames = listdir(path_to_dir)
return [ filename for filename in filenames if filename.endswith( suffix ) ]
#this folder contains .xlsx files
filePath = "D:\files\sample\"
#there is a subfolder in my solution to move the processed files to
#and another subfolder to move the splitted output files
archivePath = os.path.join(filePath, "archive")
outPath = os.path.join(filePath, "output")
#get a list of filenames
fnames = find_excel_filenames(filePath)
#loop through each file
for fl in fnames:
vFile = os.path.join(filePath, fl)
#load the content of the file to a data frame,
#I open the file twice, first to get the number of columns and
#create the converter, then to open the file with string converter
#it helps with trimming of leading zeros
df = pd.read_excel(vFile, header=None)
column_list = []
for i in df:
column_list.append(i)
converter = {col: str for col in column_list}
df1 = pd.read_excel(vFile, converters=converter)
colValues=df1[vColName].unique().tolist()
for v in colValues:
filteredDF = df1.loc[df1[vColName]==v]
vOutFile = os.path.join(outPath, fl+''_''+v.replace("/"," ")+''.xlsx'')
writer = pd.ExcelWriter(vOutFile, engine=''xlsxwriter'')
# Convert the dataframe to an XlsxWriter Excel object.
filteredDF.to_excel(writer, sheet_name=''Sheet1'')
# Close the Pandas Excel writer and output the Excel file.
writer.save()
#move the processed file to an archive folder
dst_file = os.path.join(archivePath, fl)
if os.path.exists(dst_file):
os.remove(dst_file)
shutil.move(vFile, archivePath)

Pasting from dataframe to Excel columns with a loop

I want to copy the columns from an Excel file in a certain order, and then paste the columns I concatenated to the data frame df1 into another excel file at a certain interval.
In other words, in the dataframe, paste the first column from A1 to A1 in the excel file, the second column to A3 to the third column A5 ... (assuming we have pasted 50 such columns). I've used these codes so far. But I'm blocked at this point. Thanks in advance for your help.
import os
import pandas as pd
from os.path import expanduser
os.chdir('C:\Table')
files = os.listdir('C:\Table')
print('List of files at *.xls ve *.xlsx format:\n', files)
all_files = [f for f in files if (f[-3:] == 'xls' or f[-4:] == 'xlsx')]
df1 = pd.DataFrame() # Creating empty dataframe
for f in all_files:
# Take values on C column
names= pd.read_excel(f, skiprows=1, parse_cols="C:C",sheetname='Sheet1', header=None)
df1 = pd.concat([df1, names[:1]], axis=1)
print(df1)
home = expanduser("~\Desktop") #For saving desktop
Saving = input("Please Insert Name Of File:")
writer = pd.ExcelWriter(os.path.join(home,Saving+'.xlsx'), engine='xlsxwriter')
df1.to_excel(writer,startcol=1,startrow=5, sheet_name='Sheet1', header=None, index=False)
workbook = writer.book
worksheet = writer.sheets['Sheet1']
writer.save()

Categories

Resources