Iterate through folders and find a file to put into a dataframe - python

I have a directory ../customer_data/* with 15 folders. Each folder is a unique customer.
Example: ../customer_data/customer_1
Within each customer folder there is a csv called surveys.csv.
GOAL: I want to iterate through all the folders in ../customer_data/* and find the surveys.csv for each unique customer and create a concatenated dataframe. I also want to add a column in the dataframe where it has the customer id which is the name of the folder.
# Question script: tries to load every customer's surveys.csv into one frame.
# NOTE(review): `pd` is used below but pandas is not imported in this snippet.
import glob
import os
# os.walk does not expand wildcards, so this value is treated as a literal
# directory name by the walk below.
rootdir = '../customer_data/*'
dataframes = []
for subdir, dirs, files in os.walk(rootdir):
for file in files:
# glob.glob returns a list of matching paths; with rootdir ending in '*',
# this pattern is '../customer_data/*/surveys.csv'.
csvfiles = glob.glob(os.path.join(rootdir, 'surveys.csv'))
# loop through the files and read them in with pandas
# a list to hold all the individual pandas DataFrames
# NOTE(review): csvfiles is a list here, not a single path.
df = pd.read_csv(csvfiles)
# NOTE(review): this stores the os.path.dirname function object itself; it is
# never called, so no folder name ends up in the column.
df['customer_id'] = os.path.dirname
dataframes.append(df)
# concatenate them all together
result = pd.concat(dataframes, ignore_index=True)
result.head()
This code is not giving me all 15 files. Please help

You can use the pathlib module for this.
from pathlib import Path

import pandas as pd

# Every customer folder holds one surveys.csv; the folder name is the customer ID.
survey_paths = Path("customer_data").glob("customer_*/surveys.csv")

# Read each survey file and tag its rows with the name of the parent directory.
dfs = [
    pd.read_csv(csv_path).assign(customer_id=csv_path.parent.name)
    for csv_path in survey_paths
]
df = pd.concat(dfs)

Let's try pathlib with rglob, which will recursively search your directory structure for all files that match a glob pattern — in this instance, the survey CSV files.
import pandas as pd
from pathlib import Path

root_dir = Path('/top_level_dir/')

# rglob is an instance method: it must be called on the directory to search,
# not on the Path class. The pattern is the exact file name used in every
# customer folder.
# Keys are the parent folder names (the customer IDs); two folders with the
# same name at different depths would overwrite each other in this dict.
files = {file.parent.name: file for file in root_dir.rglob('surveys.csv')}

# Read each file and record which customer folder it came from.
df = pd.concat([pd.read_csv(file).assign(customer=name) for name, file in files.items()])
Note you'll need Python 3.4+ for pathlib.

Related

Read all the Excel files in a folder, split each file name, and add the split name parts to the dataframe

All files have a name convention such as NPS_Platform_FirstLabel_Session_Language_Version.xlsx
I want to have additional columns like Platform, FirstLabel, Session, Language, Version; these will be the column names, and their values are determined by the file names. I coded the following. It works, but the values of the added columns come only from the last file. For example, assume that the last filename is
NPS_MEM_GAIT_Science_EN_10.xlsx. Therefore, all of the added columns values are MEM, GAIT_Science, etc. Not the corresponding file names.
# Question script: appends every workbook to one frame and tries to tag the
# rows with the parts of the file name.
import glob
import os
import pandas as pd
path = "C:/Users/User/blabla"
all_files = glob.glob(os.path.join(path, "*.xlsx")) #make list of paths
df = pd.DataFrame()
for f in all_files:
data = pd.read_excel(f)
# NOTE(review): DataFrame.append was removed in pandas 2.0 — confirm the
# pandas version this is run with.
df = df.append(data)
file_name = os.path.splitext(os.path.basename(f))[0]
nameList = []
nameList = file_name.rsplit('_')
# The assignments below target `df`, the frame accumulated so far, not `data`.
# Assigning a scalar to a column sets it on every row, so each iteration
# overwrites the values written for earlier files; after the loop all rows
# carry the parts of the last file name.
df['Platform'] = nameList[1]
df['First label']= nameList[2]
df['Session'] = nameList[3]
df['Language'] = nameList[4]
df['Version'] = nameList[5]
df
I started with nameList[1] since I don't want NPS.
Any suggestions or feedback?
I have found a solution, I leave it here since there are more views than I expected.
import glob
import os
import pandas as pd

path = "C:/Users/User/....."
all_files = glob.glob(os.path.join(path, "*.xlsx"))  # make list of paths

# One DataFrame per workbook, in the same order as all_files.
df_files = [pd.read_excel(filename) for filename in all_files]

for dataframe, filename in zip(df_files, all_files):
    # File names look like NPS_Platform_FirstLabel_Session_Language_Version;
    # index 0 ("NPS") is deliberately skipped.
    parts = os.path.splitext(os.path.basename(filename))[0].split('_')
    # Each frame is tagged on its own, before concatenation, so the values
    # stay tied to the file they came from.
    dataframe['Platform'] = parts[1]
    dataframe['First label'] = parts[2]
    dataframe['Session'] = parts[3]
    dataframe['Language'] = parts[4]
    dataframe['Version'] = parts[5]

# Concatenate the list built above (df_files).
df = pd.concat(df_files, ignore_index=True)
I think the reason is I was just iterating over the files, not the dataframe that I was trying to build. With this, I can iterate over the dataframe and file names at the same time. I have found this solution on https://jonathansoma.com/lede/foundations-2017/classes/working-with-many-files/class/
Still, if you can give an explicit answer about why the first code does not work as I want, that would be great.

merge excel files with dynamic names

I have an Excel file that needs to be refreshed automatically every week. It must be extended by other Excel files. The problem is that these files have different names each time.
So in my opinion i can not use code like:
import pandas as pd
# The workbook name is hard-coded here, so this only works while the file is
# literally called NG.xlsx.
NG = 'NG.xlsx'
df = pd.read_excel(NG)
because the filename is not always "NG" like in this case.
Do you have any ideas?
Best Greetz
You could read all the files in your folder by doing this, because it allows you to ignore name changes:
import sys
import csv
import glob
import pandas as pd

# get data file names
path = r"C:\.......\folder_with_excel"
filenames = glob.glob(path + "/*.xlsx")

# Read 'Sheet1' from every workbook found, whatever the file is called.
frames = []
for filename in filenames:
    # ExcelFile takes a single path; the context manager closes the workbook
    # handle once the sheet has been parsed.
    with pd.ExcelFile(filename) as xl_file:
        frames.append(xl_file.parse('Sheet1'))

# Lists have no concat method; pandas.concat stacks the collected frames.
# It raises ValueError when the folder contains no .xlsx files.
DF = pd.concat(frames, ignore_index=True)
Alternatively:
import os
import pandas as pd

path = os.getcwd()
files = os.listdir(path)  # list all the files in your directory

# Compare the whole extension: a three-character slice such as f[-3:] can
# never equal the four-character string 'xlsx'.
files_xls = [f for f in files if f.endswith('.xlsx')]  # make a list of the xlsx

# DataFrame.append was removed in pandas 2.0, so collect the frames and
# concatenate them once. Joining with `path` keeps the read independent of
# the current working directory.
frames = [
    pd.read_excel(os.path.join(path, f), sheet_name='<sheet name>')  # remove sheet_name if you don't need it
    for f in files_xls
]

# Fall back to an empty frame when no workbook is found, as the original
# empty-DataFrame starting value did.
df = pd.concat(frames) if frames else pd.DataFrame()

How do I import a load of csvs into different python dataframes via a loop?

I have a load of csv files. I want to create a loop that allows me to do this;
df_20180731 = pd.read_csv('path/cust_20180731.csv')
for each of about 36 files.
My files are df_20160131, df_20160231 ...... df_20181231 etc. Basically dates by the end of the month.
Thanks
# The snippet calls pandas.read_csv, so the module must be imported under
# that name.
import pandas

# include here all ids
files = ['20160131', '20160231']

# globals() is the module namespace; writing into it creates the variables
# df_<id> dynamically, which is what the question asked for.
_g = globals()
for f in files:
    _g['df_{}'.format(f)] = pandas.read_csv('path/cust_{}.csv'.format(f))

print(df_20160131)
You could do something like:
import glob
import pandas as pd

# Map every matching path to the DataFrame read from it.
datasets = {csv_path: pd.read_csv(csv_path) for csv_path in glob.glob('path/df_*')}
import os
import pandas as pd

# Directory containing all the files (replace with the real location).
directory = '<path of the directory containing all the files>'

# get a list of all the files in the directory
files = os.listdir(directory)

# iterate over all the files and store them in a dictionary
# os.listdir returns bare file names, so each one is joined with the
# directory before reading.
dataframe = {file: pd.read_csv(os.path.join(directory, file)) for file in files}


# if the directory may contain other files,
# you can check the file paths with any logic (extension etc.), in that case
def logic(fname):
    """Return True when *fname* should be loaded as a CSV file."""
    return '.csv' in fname


dataframe = {
    file: pd.read_csv(os.path.join(directory, file))
    for file in files
    if logic(file)
}
# this will create a dictionary of file : dataframe_objects
I hope it helps

Pandas not exporting dataframe to csv

I have a script to output a whole bunch of CSVs to folder c:\Scripts\CSV. This particular script is looping through all of the dataframes and counting the usage of the top 100 words in the data set. The top 100 words and their count are added to a list, the dataframes are concatenated, and then the csv should export. The print contains the correct information, but the script doesn't output any file.
#! python3
# Question script: counts the 100 most frequent words in the 'body' column of
# every CSV under the directory and is meant to export the combined counts.
import pandas as pd
import os
# NOTE(review): in a raw string '\\' is two backslash characters, not one —
# confirm the resulting path is the intended one.
path = r'Scripts\\CSV\\'
directory = os.path.join("c:\\",path)
appended_data = []
for root,dirs,files in os.walk(directory):
for file in files:
if file.endswith(".csv"):
# The path is built from `directory`, not from `root`, so a file found in a
# sub-directory is looked up at the top level.
thread = pd.read_csv(directory + file)
thread.columns = ['num', 'id', 'body', 'title', 'url']
# ''.join concatenates the bodies with no separator before splitting on
# whitespace.
s = pd.Series(''.join(thread['body']).lower().split()).value_counts()[:100]
appended_data.append(s)
thatdata = pd.concat(appended_data)
#print(appended_data)
# NOTE(review): this is an assignment, not a call. It rebinds the to_csv
# attribute to a string and writes nothing to disk.
thatdata.to_csv = (directory + 'somename.csv')
Try using pathlib instead:
from pathlib import Path

import pandas as pd

# Use a concrete Path: pure paths (PureWindowsPath) only manipulate path
# strings and have no glob() method.
directory = Path('c:/Scripts/CSV/')
target_path = directory / 'somename.csv'

appended_data = []
for csv_f in directory.glob('**/*.csv'):
    # The export lives in the same directory; skip the output of an earlier
    # run so it is not counted as an input.
    if csv_f == target_path:
        continue
    # process inputs
    thread = pd.read_csv(csv_f)
    thread.columns = ['num', 'id', 'body', 'title', 'url']
    # One row per word of every body, then the 100 most frequent words.
    words = thread['body'].str.lower().str.split().explode()
    appended_data.append(words.value_counts().head(100))

# pd.concat raises ValueError on an empty list, so only export when at least
# one input file was found.
if appended_data:
    thatdata = pd.concat(appended_data)
    thatdata.to_csv(target_path)

Python Fetching Name Of All CSV Files From Path And Writing Each To Different Folder

I am trying to open all files from a folder, store them in a dataframe and append each csv file with another csv file called Append.csv and am trying to write all the files with their names to a different folder.
For example, I have 5 csv files that are saved in a folder called CSV FILES FOLDER. These files are F1.csv, F2.csv, F3.csv, F4.csv and F5.csv. What I am trying to do is open each file using pandas in a for loop, append Append.csv to it, and then store the result in a different folder called NEW CSV FILES FOLDER as:
F1_APPENDED.csv
F2_APPENDED.csv
F3_APPENDED.csv
F4_APPENDED.csv
In other words, the _APPENDED is added with each file and then the file with the new name having _APPENDED is saved.
I have already defined the path for this folder but cant save it. The code is as below :
# Question script: merges each CSV in a folder with a fixed price file,
# counts rows per 'created_at' value, and tries to save the result elsewhere.
import pandas as pd
import glob
import os.path
import pathlib
path =r'C:\Users\Ahmed Ismail Khalid\Desktop\CSV FILES FOLDER'
allFiles = glob.glob(path + "/*.csv")
path1 = r'C:\Users\Ahmed Ismail Khalid\Desktop\Different Folder\Bitcoin Prices Hourly Based.csv'
outpath = r'C:\Users\Ahmed Ismail Khalid\Desktop\NEW CSV FILES FOLDER'
for f in allFiles:
# NOTE(review): this handle is never read from or closed in the snippet.
file = open(f, 'r')
df1 = pd.read_csv(path1)
df2 = pd.read_csv(f)
output = pd.merge(df1, df2, how="inner", on="created_at")
df3 = output.created_at.value_counts().rename_axis('created_at').reset_index(name='count')
df3 = df3.sort_values(by=['created_at'])
#print(df3,'\n\n')
# NOTE(review): `f` is the full input path returned by glob (directory
# included), so outpath+f glues two complete paths together with no
# separator instead of producing "<outpath>/<name>_APPENDED.csv".
df3.to_csv(outpath+f, encoding='utf-8',index=False)
#print(f,'\n\n')
How can I do this? I tried to look up the official documentation but couldn't understand anything
Any and all help would be appreciated
Thanks
Here, I added a line in the for loop where you can get just the file name. You can use that instead of the full path to the file when you write the file and indicate the output .csv filename.
import pandas as pd
import glob
import os.path
import pathlib

path = r'C:\Users\Ahmed Ismail Khalid\Desktop\CSV FILES FOLDER'
allFiles = glob.glob(path + "/*.csv")
path1 = r'C:/Users/Ahmed Ismail Khalid/Desktop/Different Folder/Bitcoin Prices Hourly Based.csv'
outpath = r'C:/Users/Ahmed Ismail Khalid/Desktop/NEW CSV FILES FOLDER/'

# The price file is the same for every input, so read it once rather than on
# every iteration.
df1 = pd.read_csv(path1)

for f in allFiles:
    # Keep only the base name without directory or extension; the function
    # is os.path.splitext (single "t").
    fname, ext = os.path.splitext(os.path.basename(f))

    # pandas opens and closes the file itself, so no separate open() handle
    # is needed.
    df2 = pd.read_csv(f)
    output = pd.merge(df1, df2, how="inner", on="created_at")

    # Number of merged rows per created_at value, ordered by created_at.
    df3 = output.created_at.value_counts().rename_axis('created_at').reset_index(name='count')
    df3 = df3.sort_values(by=['created_at'])

    # Write "<name>_appended.csv" into the output folder.
    df3.to_csv(os.path.join(outpath, fname + '_appended.csv'), encoding='utf-8', index=False)

Categories

Resources