Extracting multiple excel files as Pandas data frame - python

I'm trying to create a data ingestion routine that loads data from multiple Excel files, each with multiple tabs and columns, into pandas DataFrames. The structure of the tabs is the same in every Excel file. Any help would be appreciated!
import os

import pandas as pd

# Folder that holds the workbooks to ingest.
folder = "specified_path"
files = os.listdir(folder)

# Maps workbook name (without extension) -> {sheet name -> DataFrame}.
sheet_contents = {}
for file in files:
    # os.listdir returns every directory entry; skip anything that is not a
    # workbook, including the "~$..." lock files Excel creates while a
    # workbook is open.
    if file.startswith("~$") or not file.lower().endswith((".xlsx", ".xlsm", ".xls")):
        continue
    # os.path.join supplies the path separator that `folder + file` omitted.
    workbook_path = os.path.join(folder, file)
    # sheet_name=None loads every tab at once as {sheet name: DataFrame} and
    # closes the file afterwards.
    file_data = pd.read_excel(workbook_path, sheet_name=None)
    # splitext removes the actual extension, whatever its length
    # (file[:-5] was only right for ".xlsx"/".xlsm").
    sheet_contents[os.path.splitext(file)[0]] = file_data

One way to create a DataFrame for each Excel file (stored in a specific folder and holding multiple sheets) is to combine pandas.read_excel with pandas.concat. Passing sheet_name=None to pandas.read_excel reads all the sheets of the Excel file at once.
Try this :
import os
import pandas as pd
folder = 'specified_path'
# Keep only real workbooks: os.listdir also returns sub-folders, other file
# types and Excel's "~$" lock files, none of which read_excel can open.
excel_files = [
    file for file in os.listdir(folder)
    if file.lower().endswith(('.xlsx', '.xlsm', '.xls'))
    and not file.startswith('~$')
]
# One DataFrame per workbook, each holding the rows of all its sheets.
list_of_dfs = []
for file in excel_files:
    # os.path.join picks the right separator on every platform.
    # sheet_name=None returns {sheet name: DataFrame} for all sheets.
    sheets = pd.read_excel(os.path.join(folder, file), sheet_name=None)
    df = pd.concat(sheets, ignore_index=True)
    # splitext keeps dots that are part of the name (e.g. "report.v2.xlsx").
    df['excelfile_name'] = os.path.splitext(file)[0]
    list_of_dfs.append(df)
To access one of the DataFrames created, use its index (e.g., list_of_dfs[0]):
print(type(list_of_dfs[0]))
<class 'pandas.core.frame.DataFrame'>

Related

Merging csv files into one (columnwise) in Python

I have many .csv files like this (with one column):
picture
I'd like to merge them into one .csv file, so that each column contains the data of one of the csv files. The headings should look like this (when converted to a spreadsheet):
picture (the first number is the number of minutes extracted from the file name, the second is the first word in the file name behind "export_" in the name, and third is the whole name of the file).
Id like to work in Python.
Can someone please help me with this? I am new to Python.
Thank you very much.
I tried to join only 2 files, but I have no idea how to do it with more files without writing them all down manually. Also, I don't know how to extract the headings from the file names:
import pandas as pd
# Single-column csv files to place side by side.
file_list = [
    'export_Control 37C 4h_Single Cells_Single Cells_Single Cells.csv',
    'export_Control 37C 0 min_Single Cells_Single Cells_Single Cells.csv',
]
# Read every file first and concatenate once: calling pd.concat inside the
# loop re-copies the accumulated frame on each pass and needs an empty seed
# frame.
frames = [pd.read_csv(file) for file in file_list]
# axis=1 puts each file's column next to the previous one.
df = pd.concat(frames, axis=1)
print(df)
df.to_csv('output2.csv', index=False)
Assuming that all your .csv files have a header and the same number of rows, you can use the code below to put all the (single-column) .csv files side by side in a single output file (output.csv) that opens as one Excel worksheet.
import os
import pandas as pd
csv_path = r'path_to_the_folder_containing_the_csvs'
output_name = 'output.csv'
# Only the .csv inputs. The merged result is written into the same folder,
# so it is excluded here; otherwise a second run would try to parse it as an
# input and fail on the file-name split below.
csv_files = [
    file for file in os.listdir(csv_path)
    if file.lower().endswith('.csv') and file != output_name
]
list_of_dfs = []
for file in csv_files:
    # header=0 + names replaces each file's own header with a common one.
    temp = pd.read_csv(os.path.join(csv_path, file), header=0, names=['Header'])
    # For a name such as "export_Control 37C 4h_Single Cells_....csv" the
    # text between the first two underscores is "Control 37C 4h":
    # word 2 is the time, word 0 is the title.
    name_words = file.split('_')[1].split()
    heading_rows = pd.DataFrame(
        [[name_words[2]], [name_words[0]], [file]], columns=['Header']
    )
    # The three heading rows go on top of the file's data.
    out = pd.concat([heading_rows, temp]).reset_index(drop=True)
    list_of_dfs.append(out)
# One column per input file.
final = pd.concat(list_of_dfs, axis=1, ignore_index=True)
final.columns = ['Column' + str(col + 1) for col in final.columns]
# os.path.join replaces the '\output.csv' literal, whose "\o" is an invalid
# escape sequence.
final.to_csv(os.path.join(csv_path, output_name), index=False)
final  # bare expression: shows the merged frame when run in Jupyter
For example, with three .csv files, running the code above yields:
>>> Output (in Jupyter)
>>> Output (in Excel)

Merge Multiple Excel files having multiple sheets to One Excel file,

I am trying to do this
Multiple Excel files having multiple sheets to One Excel file, having merged data in multiple sheets.
If all files have sheets "A", "B" and "C", the data from all files should be concatenated into a single file under the same sheet names. I wrote the code below, but it fails with the error 'NoneType' object has no attribute 'to_excel'.
import pandas as pd
#return all file paths that match a specific pattern in our case we want all *.xlsx
import glob
import os
import openpyxl
import xlrd
#reading excel files folder
# Glob pattern matching every .xlsx workbook in the input folder.
location = r"C:\-----Desktop\python\Input/*.xlsx"
excel_files = glob.glob(location)
# NOTE(review): indentation was lost in this listing, so the nesting of the
# statements below cannot be determined from the text alone.
# NOTE(review): `writer` is never referenced again in the lines shown here.
with pd.ExcelWriter(r"C:\---on\Output\filemergetest.xlsx") as writer: # Excel writer intended for multiple sheets
for files in excel_files:
sheet = os.path.basename(files) # file name without its directory
sheet = sheet.split(".")[0] # text before the first dot of the file name
list_of_dfs = []
# Iterate over the workbook paths again.
# NOTE(review): this loop reuses the name `sheet`, overwriting the value
# computed above.
for sheet in excel_files:
fi = pd.ExcelFile(sheet)
for sh in fi.sheet_names:
# Parse a worksheet as a pandas DataFrame.
# NOTE(review): sheet_name=0 selects the first worksheet on every pass; the
# loop variable `sh` is not used.
dfC = fi.parse(sheet_name=0)
# Append it to the list.
# NOTE(review): list.append returns None, so data2 is None after this line;
# this is the source of the reported
# "'NoneType' object has no attribute 'to_excel'" error.
data2 = list_of_dfs.append(dfC)
# Combine all DataFrames into one (disabled; this call would return a DataFrame)
#data2 = pd.concat(list_of_dfs,ignore_index=True)
# Bare expression; its value is not used by the script.
data2
data2.to_excel(r"C:----python\Output\jointsheetsTAA.xlsx",index=False)

Use Python to combine excel files from folder and keep the original formatting

I have already searched for the problem on the Internet, but no satisfying solution.
Lots of excel files that have different formatting in one folder.
The requirement is to combine all the Excel files into separate sheets of one Excel workbook. Each sheet name should be the name of the original Excel file, and each sheet should keep the original formatting of its source file.
I now can use Pandas to combine all the excels, but the formatting is changed after writing to excel.
How to keep the formatting including the font, alignment, background and etc? Any suggestions? Thanks.
import pandas as pd
import os
# 1, List excel files:
# 1, List the files in the source folder
path = r'C:\Users\h290602\Desktop\SAP'
files = os.listdir(path)
result_name = 'results.xlsx'
# 2, Pick the Excel files. Excel's "~$" lock files are skipped (the original
#    mapped them back to the real workbook, which was then read twice), and
#    so is the combined workbook itself, which lives in the same folder and
#    would otherwise be re-ingested on the next run.
files_xls = [
    f for f in files
    if f.endswith(('.xlsx', '.xls'))
    and not f.startswith('~$')
    and f != result_name
]
# 3, Write every workbook to its own sheet of the combined workbook.
#    os.path.join replaces the '{0}\{1}' templates, whose "\{" is an invalid
#    escape sequence.
save_path = os.path.join(path, result_name)
# The context manager saves and closes the workbook; ExcelWriter.save() no
# longer exists in pandas 2.x.
with pd.ExcelWriter(save_path) as result:
    for f in files_xls:
        # Sheet name = workbook name without its extension.
        excel_file_name = os.path.splitext(f)[0]
        # read_excel returns the cell values of the first sheet only; as the
        # question notes, the formatting is not preserved by this round trip.
        df = pd.read_excel(os.path.join(path, f))
        df.to_excel(result, sheet_name=excel_file_name, index=False)

Grabbing a single Excel worksheet from multiple workbooks into a pandas dataframe and saving this

I need to extract an Excel worksheet from multiple workbooks and saving it to a dataframe and in turn saving that dataframe.
I have a spreadsheet that is generated at the end of each month (e.g.
June 2019.xlsx, May 2019.xlsx, April 2019.xlsx).
I need to grab a worksheet 'Sheet1'from each of these workbooks and convert these to a dataframe (df1).
I would like to have this dataframe saved.
As a nice to have, I would also like some way just to append the next month's data after the initial 'data grab'.
I'm relatively new to this, so I haven't made much progress.
import os
import glob
import pandas as pd
import xlrd
import json
import io
import flatten_json
# Every .xlsx workbook in the 2019 folder.
files = glob.glob('/Users/ngove/Documents/Python Scripts/2019/*.xlsx')
# Workbook name without directory or extension -> DataFrame returned by
# read_excel for that workbook.
dfs = {
    os.path.splitext(os.path.basename(workbook))[0]: pd.read_excel(workbook)
    for workbook in files
}
You can drop all of your files in a directory (e.g. current directory). Then append all of your excel files in a list (e.g. files_xls). Iterate over all your files and use pandas.read_excel to get the respective dataframes (e.g. list_frames).
Below, you can find an example:
import os
import pandas as pd
path = os.getcwd()  # get cur dir
files = os.listdir(path)  # get all the files in your cur dir
output_name = "your_title" + ".xlsx"
# Get only the Excel workbooks (adjust the extensions to your data).
# '.xlsx' is included because a 3-character 'xls' suffix test never matches
# it; the combined output and Excel's "~$" lock files are left out so they
# are not read back in.
files_xls = [
    f for f in files
    if f.endswith(('.xls', '.xlsx', '.xlsm'))
    and not f.startswith('~$')
    and f != output_name
]
list_frames = []
for f in files_xls:
    print("Processing file: %s" % f)
    try:
        # Only read_excel options are passed here: sep, error_bad_lines and
        # skip_blank_lines belong to read_csv. Further parser options depend
        # on your data format.
        data = pd.read_excel(f, sheet_name='Sheet1', header=0, index_col=None)
    except (OSError, ValueError) as err:
        # Report and skip unreadable workbooks (e.g. no 'Sheet1') instead of
        # silently appending the previous file's data a second time.
        print("Skipping %s: %s" % (f, err))
        continue
    list_frames.append(data)
# pd.concat raises on an empty list, so stop with a clear message instead.
if not list_frames:
    raise SystemExit("No workbook with a 'Sheet1' worksheet could be read")
# at the end you can concat your data if you want and remove any duplicates
df = pd.concat(list_frames, sort=False).fillna(0)
df = df.drop_duplicates()
# at the end you can save it; the context manager saves and closes the file
# (ExcelWriter.save() no longer exists in pandas 2.x)
with pd.ExcelWriter(output_name, engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name="Sheets1", index=False)
I hope this helps.
I interpreted your statement that you want to save the dataframe as that you want to save it as a combined Excel file. This will combine all files in the folder specified that end in xlsx.
import os
import pandas as pd
from pandas import ExcelWriter
os.chdir("H:/Python/Reports/") #edit this to be your path
path = os.getcwd()
files = os.listdir(path)
output_name = 'Combined_Data.xlsx'
# The combined workbook is written to this same folder, so exclude it from
# the inputs or a second run would merge the previous result back in.
files_xlsx = [f for f in files if f.endswith('.xlsx') and f != output_name]
# Read 'Sheet1' of every workbook, then concatenate once; DataFrame.append
# was removed in pandas 2.0.
frames = [pd.read_excel(os.path.join(path, f), sheet_name='Sheet1') for f in files_xlsx]
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
# The context manager saves and closes the workbook (writer.save() no longer
# exists in pandas 2.x).
with ExcelWriter(os.path.join(path, output_name)) as writer:
    df.to_excel(writer, sheet_name='Sheet1', index=False)
You could update the code to grab all 2019 files by changing the one line to this:
# Keep only the workbooks whose names end in "2019.xlsx".
files_xlsx = [f for f in files if f.endswith('2019.xlsx')]
I referenced this question for most of the code and updated for xlsx and added the file save portion of the code

Convert multiple xlsm files automatically to multiple csv files by using pandas

I have 300 raw data files (.xlsm) and want to extract the useful data and turn it into csv files as input for a subsequent neural network. For now I am trying this with 10 files as an example. I have successfully extracted the information I need, but I don't know how to convert the results to csv files with the same names. For a single file we can use df.to_csv, but how do I do it for all the files? With a for loop?
import glob
import pandas as pd
import numpy as np
import csv
import os
excel_files = glob.glob('../../Versuch/Versuche/RohBeispiel/*.xlsm')
directory = '/Beispiel'
# Columns that are converted to numeric and forward-filled; defined once,
# outside the loop, because the list is the same for every workbook.
cols = ['KonzA', 'KonzB', 'KonzC', 'TempA',
        'TempB', 'TempC', 'Modul1', 'Modul2',
        'Modul3', 'Modul4', 'Modul5', 'Modul6']
for files in excel_files:
    data = pd.read_excel(files)
    # getting the list of rows and columns you need
    list_of_dfs = pd.DataFrame(data.values[0:600:, 12:26],
                               columns=data.columns[12:26]).drop(['Sauberkeit', 'Temparatur'], axis=1)
    # converting pandas dataframe columns to numeric: string into float
    list_of_dfs[cols] = list_of_dfs[cols].apply(pd.to_numeric, errors='coerce', axis=1)
    # Filling down each column through missing data. Assigning the result
    # back replaces the per-column fillna(..., inplace=True) calls, which act
    # on a temporary selection and are deprecated in recent pandas.
    list_of_dfs[cols] = list_of_dfs[cols].ffill()
    # basename/splitext handle both "/" and "\" separators and keep dots that
    # are part of the file name.
    csvfilename = os.path.splitext(os.path.basename(files))[0] + '.csv'
    newtempfile = os.path.join(directory, csvfilename)
    print(newtempfile)
    print(list_of_dfs.head(2))
problem is solved.
# Fragment for the body of the per-workbook loop: `files` is the current
# workbook path and `list_of_dfs` the frame extracted from it.
folder_name = 'Beispiel'
# Same base name as the workbook, with a .csv extension; basename/splitext
# handle both path separators and dots inside the name.
csvfilename = os.path.splitext(os.path.basename(files))[0] + '.csv'
newtempfile = os.path.join(folder_name, csvfilename)
# Create the output directory if needed; exist_ok=True avoids the
# check-then-create race of a separate os.path.exists test.
os.makedirs(folder_name, exist_ok=True)
print(newtempfile)
list_of_dfs.to_csv(newtempfile, index=False)
The easiest way of doing this is to get the filename from the excel and then use the os.path.join() method to save it to the directory you want.
directory = "C:/Test"
for files in excel_files:
    # Use the loop variable `files` (the original body referred to an
    # undefined `file`) and swap the extension with splitext; indexing the
    # base name with [-1] kept only its last character.
    csvfilename = os.path.splitext(os.path.basename(files))[0] + '.csv'
    newtempfile = os.path.join(directory, csvfilename)
Since you already have the DataFrame you want to push into the csv file, just add the above code to the loop and change the output csv file to 'newtempfile'; that should do it.
# The target path is the only positional argument; a second positional value
# would be taken as the field separator, not as a file name.
df.to_csv(newtempfile)
Hope this helps. :)
Updated Code:
cols = ['KonzA', 'KonzB', 'KonzC', 'TempA',
        'TempB', 'TempC', 'Modul1', 'Modul2',
        'Modul3', 'Modul4', 'Modul5', 'Modul6']
excel_files = glob.glob('../../Versuch/Versuche/RohBeispiel/*.xlsm')
# NOTE: `directory` (the output folder) is not defined in this snippet; it
# must already be set before this loop runs.
for file in excel_files:
    # import only the columns you need to the dataframe; the read_excel
    # keyword for that is `usecols` (there is no `columns` argument)
    data = pd.read_excel(file, usecols=cols)
    # Same base name as the workbook with a .csv extension, built from the
    # loop variable `file`.
    csvfilename = os.path.splitext(os.path.basename(file))[0] + '.csv'
    newtempfile = os.path.join(directory, csvfilename)
    # converting pandas dataframe columns to numeric: string into float
    data[cols] = data[cols].apply(pd.to_numeric, errors='coerce', axis=1)
    # Forward-fill and assign back; an inplace fillna on data[cols] only
    # changes a temporary copy.
    data[cols] = data[cols].ffill()
    # to_csv returns None when given a path, so nothing can be chained to it.
    data.to_csv(newtempfile)

Categories

Resources