How to read multiple Excel files into different pandas DataFrames - python

I have a collection of Excel files containing similar datasets. I want each file read into its own pandas DataFrame.
import glob
import os
import pandas as pd

path = r"C:/users/me/desktop/ExcelData"
files = glob.glob(os.path.join(path, '*.xls'))  # join path and pattern, or glob searches the working directory
df = {}  # one DataFrame per file, keyed by filename
for f in files:
    df[f] = pd.read_excel(f)

import glob
import pandas as pd
import os
path=r"C:\\users\\me\\desktop\\ExcelData\\"
csv_files = glob.glob(os.path.join(path, "*.xls"))
dfl=[]
for f in csv_files:
x= pd.read_excel(f)
dfl.append(x)
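After the loop, dfl[0] holds the first file's data, dfl[1] the second, and so on.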

import pandas as pd
import os
import glob
path = r"C:users/me/desktop/ExcelData"
files = glob.glob(path + "\*.xls")
finalexcelsheet = pd.DataFrame()
for file in files:
df = pd.concat(pd.read_excel(file, sheet_name = None), ignore_index=True,
sort=False)
finalexcelsheet = finalexcelsheet.append(df, ignore_index = True)
print(finalexcelsheet)
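If the files should remain distinguishable after combining, a dict passed to pd.concat keeps each filename as an outer index level. A minimal sketch under the same folder assumption as above ("first.xls" is a hypothetical filename):
import glob
import os
import pandas as pd

path = r"C:/users/me/desktop/ExcelData"
files = glob.glob(os.path.join(path, "*.xls"))

# the dict keys become an outer index level naming each file's rows
combined = pd.concat(
    {os.path.basename(f): pd.read_excel(f) for f in files},
    names=["source_file", "row"],
)
print(combined.loc["first.xls"])  # hypothetical filename; selects that file's rows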

How to run a function on multiple dataframes of variable row sizes, then generate a new dataframe with just the function results

I have a folder full of CSVs with identical columns but variable row counts. I want to convert each to a DataFrame, run a simple function on it, and create one new DataFrame with just the function values and the file names as the index.
So far I have:
import os.path
import tkinter.filedialog as filedialog
import glob
import pandas as pd
file_path = filedialog.askdirectory()
pattern = os.path.join(file_path, '*.csv')
files = glob.glob(pattern)
for index, file in enumerate(files):
    df = pd.read_csv(file, sep=',', index_col=[0])
    df.loc['total'] = df.sum(numeric_only=True, axis=0)  # or any function
    pd.concat(df2[df.index == 'total'])
    df.to_csv('file_path')
I'm sure there are several ways in which this is messed up, but any advice is appreciated.
import os.path
import tkinter.filedialog as filedialog
import glob
import pandas as pd
file_path = filedialog.askdirectory()
pattern = os.path.join(file_path, '*.csv')
files = glob.glob(pattern)
dfs = []
for index, file in enumerate(files):
    df = pd.read_csv(file, sep=',', index_col=[0])
    # Would remove the .loc, but it does no harm
    df.loc['total'] = df.sum(numeric_only=True, axis=0)  # or any function
    dfs.append(df.loc[['total']])  # .loc selects the 'total' row; df[['total']] would look for a column
df_total = pd.concat(dfs).reset_index(drop=True)
df_total.to_csv(os.path.join(file_path, 'totals.csv'))  # output filename is your choice
OK I figured it out:
import os.path
import tkinter.filedialog as filedialog
import glob
import pandas as pd
file_path = filedialog.askdirectory()
pattern = os.path.join(file_path, '*.csv')
files = glob.glob(pattern)
filename = pd.DataFrame(columns=['Filename'])
filename['Filename'] = pd.Series([file for file in files]).reset_index(drop=True)
dfs = []
for index, file in enumerate(files):
    df = pd.read_csv(file, sep=',', index_col=[0])
    # Would remove the .loc, but it does no harm
    df.loc['total'] = df.sum(numeric_only=True, axis=0)  # or any function
    dfs.append(df)
dfs = pd.concat(dfs)
total = dfs[dfs.index == 'total'][['dfcolumn1', 'dfcolumn2']]  # list the column names exactly as they appear in the CSV
total_named = filename.join(total.set_index(filename.index))
total_named.to_csv(os.path.join(file_path, 'totals_named.csv'))  # output filename is your choice
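For reference, the same result can be reached more directly by computing one summary Series per file and keying the rows by filename. A minimal sketch, where the column total stands in for any function and 'totals.csv' is a placeholder output name:
import glob
import os
import pandas as pd
import tkinter.filedialog as filedialog

file_path = filedialog.askdirectory()
files = glob.glob(os.path.join(file_path, '*.csv'))

# one summary Series per file, assembled into rows keyed by filename
totals = pd.DataFrame(
    {os.path.basename(f): pd.read_csv(f, index_col=[0]).sum(numeric_only=True)
     for f in files}
).T  # transpose: one row per file, one column per CSV column
totals.index.name = 'Filename'
totals.to_csv(os.path.join(file_path, 'totals.csv'))  # placeholder output name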

Append new excel files to dataframe

I have code that takes all files from a directory, creates a DataFrame, and saves it to an Excel file. How do I make it append just the new files instead of building a new DataFrame from all the files each time?
import os
import glob
import pandas as pd

path = r'C:\Users\user\Desktop\test'  # path
all_files = glob.glob(os.path.join(path, "*.xlsm"))  # os.path.join makes the pattern OS independent
df_from_each_file = (pd.read_excel(f, sheet_name='Rank', header=2, usecols=['Model', 'HK']) for f in all_files)
concatenated_df = pd.concat(df_from_each_file, ignore_index=True)
concatenated_df = concatenated_df.set_index('Model')
#Save dataframe to excel file
concatenated_df.to_excel(r'C:\Users\user\Desktop\test\output.xlsx')
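One way to append only new files is to keep a record of the filenames already read and skip them on later runs. A minimal sketch of that idea; the processed.txt log and its location are assumptions, not part of the original setup:
import glob
import os
import pandas as pd

path = r'C:\Users\user\Desktop\test'
output = os.path.join(path, 'output.xlsx')
log = os.path.join(path, 'processed.txt')  # assumed log of already-read files

# load the set of files appended on previous runs
done = set()
if os.path.exists(log):
    with open(log) as fh:
        done = {line.strip() for line in fh}

all_files = glob.glob(os.path.join(path, '*.xlsm'))
new_files = [f for f in all_files if f not in done]

if new_files:
    new_df = pd.concat(
        (pd.read_excel(f, sheet_name='Rank', header=2, usecols=['Model', 'HK'])
         for f in new_files),
        ignore_index=True,
    ).set_index('Model')
    if os.path.exists(output):
        # extend the existing output instead of rebuilding it from scratch
        old_df = pd.read_excel(output).set_index('Model')
        new_df = pd.concat([old_df, new_df])
    new_df.to_excel(output)
    # remember the files handled in this run
    with open(log, 'a') as fh:
        fh.writelines(f + '\n' for f in new_files)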

merge excel files with dynamic names

I have an Excel file that needs to be refreshed automatically every week. It must be extended by other Excel files. The problem is that these files have different names each time.
So, in my opinion, I cannot use code like:
import pandas as pd
NG = 'NG.xlsx'
df = pd.read_excel(NG)
because the filename is not always "NG" as it is in this case.
Do you have any ideas?
Best Greetz
You could read all the files in your folder by doing this, because it allows you to ignore name changes:
import glob
import pandas as pd

# get data file names
path = r"C:\.......\folder_with_excel"
filenames = glob.glob(path + "/*.xlsx")

dfs = []
for filename in filenames:
    xl_file = pd.ExcelFile(filename)  # open one workbook at a time
    dfs.append(xl_file.parse('Sheet1'))
# stack every file's Sheet1 into a single DataFrame
DF = pd.concat(dfs, ignore_index=True)
Alternatively:
import os
import pandas as pd
path = os.getcwd()
files = os.listdir(path)  # list all the files in your directory
files_xls = [f for f in files if f.endswith('.xlsx')]  # f[-3:] compares only three characters and never matches 'xlsx'
dfs = []
for f in files_xls:
    info = pd.read_excel(f, '<sheet name>')  # drop the sheet argument if you don't need it
    dfs.append(info)
df = pd.concat(dfs, ignore_index=True)  # DataFrame.append was removed in pandas 2.0
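If only the most recent weekly file needs to be read, picking the newest .xlsx by modification time removes the dependency on filenames entirely. A small sketch, with the folder path left as a placeholder:
import glob
import os
import pandas as pd

folder = r"C:\.......\folder_with_excel"  # placeholder path
candidates = glob.glob(os.path.join(folder, "*.xlsx"))
newest = max(candidates, key=os.path.getmtime)  # most recently modified file
df = pd.read_excel(newest)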

How to create a pandas dataframe from one file (with any file name) located in a specified folder?

What's the best way to create a pandas dataframe from one file with any file name located in a specified folder?
I have used pathlib, and it's not quite working: the output DataFrame doesn't contain anything.
from pathlib import Path
import pandas as pd
pth = r'C:\Users\HP\Desktop\IBM\New folder'
fle = Path(pth).glob('*.tsv')
someDf = pd.DataFrame(fle)
someDf
Edit:
I also tried doing the below, but the output dataframe combines all columns into one column separated by a backward slash. How do I fix this?
from pathlib import Path
import pandas as pd
pth = r'C:\Users\HP\Desktop\IBM\New folder'
fle = Path(pth).glob('*.tsv')
dfs = []
for filename in fle:
    dfs.append(pd.read_csv(filename))
dfs1 = pd.concat(dfs)
dfs1.head()
The way I did this seems complicated. Is there an easier way to do this?
Please try:
from pathlib import Path
import pandas as pd
import os
pth = r'C:\Users\HP\Desktop\IBM\New folder'
for file_ in os.listdir(pth):
    h = os.path.join(pth, file_)
    #print (h)
    someDf = pd.read_csv(h, sep='\t')  # sep='\t' since the files are TSVs
someDf
Try
from glob import glob
import pandas as pd

files = glob(r'C:\Users\HP\Desktop\IBM\New folder\*.tsv')  # raw string, or \U raises a syntax error
if len(files) == 1:
    dfs = pd.read_csv(files[0], sep='\t')
else:
    dfs = pd.concat([pd.read_csv(file, sep='\t') for file in files])
The solution I found is below; I had missed the sep parameter in pd.read_csv().
from pathlib import Path
import pandas as pd
pth = r'C:\Users\HP\Desktop\IBM\New folder'
fle = Path(pth).glob('*.tsv')
dfs = []
for filename in fle:
    dfs.append(pd.read_csv(filename, sep='\t'))
dfs1 = pd.concat(dfs)
dfs1.head()
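Since the folder is expected to hold just one file, next() on the glob iterator is an easier way to grab it. A short sketch (it raises StopIteration if the folder has no .tsv file):
from pathlib import Path
import pandas as pd

pth = Path(r'C:\Users\HP\Desktop\IBM\New folder')
fle = next(pth.glob('*.tsv'))  # the single matching file
someDf = pd.read_csv(fle, sep='\t')
someDf.head()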

Reading multiple CSV files and writing them to another CSV file

I have this code
import pandas as p
import csv
df = p.read_csv('interview1.csv')
df2 = df[['Participant', 'Translation']] # selects two of the columns in your file
df2.to_csv('out.csv')
How do I read multiple files and then write to 'out.csv'? So basically, instead of reading only interview1, I read interview2 and interview3 through interview7 as well into out.csv.
Simply open the output file in append mode:
import pandas as p
import os

csv_list = ['interview1.csv', 'interview2.csv', ...]  # list the remaining files here
for itw in csv_list:
    df = p.read_csv(itw)
    # write the header only on the first write so it isn't repeated in out.csv
    df.to_csv('out.csv', mode='a', header=not os.path.exists('out.csv'))
Use this to read all the CSV data from a folder and combine it together:
import pandas as pd
import glob
import os
path = r'file path'
all_files = glob.glob(os.path.join(path, "*.csv"))
df_from_each_file = (pd.read_csv(f) for f in all_files)
concatenated_df = pd.concat(df_from_each_file, ignore_index=True)
concatenated_df.to_csv("combined-data_new.csv")
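A variant sketch that keeps only the two columns from the question while reading, so nothing extra is ever loaded (the 'interview*.csv' pattern assumes the files sit in the working directory):
import glob
import pandas as pd

# usecols restricts each read to the two columns of interest
frames = [pd.read_csv(f, usecols=['Participant', 'Translation'])
          for f in sorted(glob.glob('interview*.csv'))]
pd.concat(frames, ignore_index=True).to_csv('out.csv', index=False)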
