could anyone advise me how to apply this code to several csv in one folder? Then, save the modified csv to another folder and each separately? In short, I need to automate it.
I need to automatically load the csv file, execute the code, save the newly modified csv file, and then repeat it to the next csv file in the folder.
import pandas as pd
import datetime as dt
import numpy as np
from numpy import nan as Nan
# Load one semicolon-separated measurement file and prepare its columns.
path = "C://Users//Zemi4//Desktop//csv//A-001.csv"
df = pd.read_csv(path, sep=";")

# Both temperature columns must be numeric for the later arithmetic.
for temperature_column in ("ta", "tw"):
    df[temperature_column] = pd.to_numeric(df[temperature_column])

# Parse the "time" strings, then keep only the calendar day of each reading
# (stored as a midnight timestamp).
parsed_times = [dt.datetime.strptime(text, "%d.%m.%Y %H:%M:%S") for text in df["time"]]
df["time_str"] = pd.to_datetime(
    pd.Series([moment.date() for moment in parsed_times], index=df.index)
)
df["time_zaokrouhleny"] = df["time_str"]
def analyza(pozadovane_data, data=None):
    """Build a fixed 24-row hourly table for one calendar day.

    Selects the rows whose ``time_str`` equals ``pozadovane_data``, trims or
    pads them to exactly 24 rows, stamps the rows with consecutive hours
    starting at 01:00 of that day, and adds the ``Validace`` and ``Rozdil``
    columns.

    Args:
        pozadovane_data: The day to extract (compared against ``time_str``).
        data: Frame to read from; it must have the columns ``time_str``,
            ``ta``, ``tw``, ``zone`` and ``time_zaokrouhleny``. Defaults to
            the module-level ``df``.

    Returns:
        A new 24-row DataFrame indexed by ``Cas`` with the columns
        ``Skutecna teplota``, ``Pozadovana teplota``, ``Mistnost``,
        ``time_zaokrouhleny``, ``Cas``, ``Validace`` and ``Rozdil``.
    """
    hours_per_day = 24
    source = df if data is None else data
    wanted = ["ta", "tw", "zone", "time_zaokrouhleny"]

    # Boolean-mask .loc returns a new frame. Keep at most 24 rows and switch
    # to a positional 0..n-1 index so that padding can never overwrite real
    # rows (the source labels may overlap with 0..23).
    new_df = source.loc[source["time_str"] == pozadovane_data, wanted]
    new_df = new_df.head(hours_per_day).reset_index(drop=True)

    if len(new_df) < hours_per_day:
        # Pad with empty rows, then zero-fill the measurements.
        new_df = new_df.reindex(range(hours_per_day))
        for column in ("ta", "tw", "zone"):
            new_df[column] = new_df[column].fillna(0)
        fill_day = new_df["time_zaokrouhleny"].min()
        if pd.isna(fill_day):
            # No rows at all for this day: fall back to the requested day.
            fill_day = pd.Timestamp(pozadovane_data)
        new_df["time_zaokrouhleny"] = new_df["time_zaokrouhleny"].fillna(fill_day)

    # Row at position 0 gets 01:00, position 1 gets 02:00, ... position 23
    # gets 00:00 of the following day.
    new_df["time_oprava"] = [
        dt.datetime.combine(day.date(), dt.time(1, 0)) + dt.timedelta(hours=offset)
        for offset, day in enumerate(new_df["time_zaokrouhleny"])
    ]
    # A row counts as valid only when both temperatures are non-zero
    # (padded rows are zero-filled, hence invalid).
    new_df["Validace"] = (new_df["ta"] != 0) & (new_df["tw"] != 0)
    new_df["Rozdil"] = new_df["ta"] - new_df["tw"]
    new_df = new_df.rename(
        columns={
            "ta": "Skutecna teplota",
            "tw": "Pozadovana teplota",
            "time_oprava": "Cas",
            "zone": "Mistnost",
        }
    )
    new_df.index = new_df["Cas"]
    return new_df
start = dt.datetime(2010, 10, 6)
end = dt.datetime(2010, 12, 27)
# One midnight timestamp per day in [start, end); `end` itself is not processed.
date_range = [start + dt.timedelta(days=x) for x in range((end - start).days)]

# DataFrame.append was removed in pandas 2.0 and copied the whole result on
# every call; build all daily tables first and concatenate them once.
vysledek_df = pd.concat([analyza(d) for d in date_range], ignore_index=True)

# The helper column is not part of the exported file.
vysledek_df.pop('time_zaokrouhleny')
vysledek_df.to_csv('C://Users//Zemi4//Desktop//zpr//A-001.csv', encoding='utf-8', index=False)
The code itself works correctly. Thank you for your advice.
Simplest way is to use glob. Just give the folder_path and output_path as per your requirements and use the sample code below. I commented the code to help you understand the code.
import os
import glob
folder_path = 'path/to/folder/'  # path to folder containing .csv files
output_path = 'path/to/output/folder/'  # path to output folder

# os.path.join works whether or not the folder strings end with a separator.
for file in glob.glob(os.path.join(folder_path, '*.csv')):  # only loads .csv files from the folder
    df = pd.read_csv(file, delimiter=";")  # read .csv file
    # Do something
    os.makedirs(output_path, exist_ok=True)  # to_csv does not create missing folders
    # saves modified .csv file to output_path, one output per input file
    df.to_csv(os.path.join(output_path, 'modified_' + os.path.basename(file)), encoding='utf-8', index=False)
You want to use os.listdir() to find the contents of the directory, then parameterize the file path in a new function. You can then loop over a list of directories retrieved via os.walk() and run the function for each one.
import os
def run(file_directory, output_directory=None):
    """Process every ``.csv`` file in a folder and save each result separately.

    Args:
        file_directory: Folder whose ``.csv`` files are read (``;`` delimited).
        output_directory: Folder the results are written to, one file per
            input, keeping the input file name. Defaults to an ``output``
            sub-folder of ``file_directory``. It is created if missing.
    """
    if output_directory is None:
        output_directory = os.path.join(file_directory, 'output')
    os.makedirs(output_directory, exist_ok=True)

    for name in sorted(os.listdir(file_directory)):
        # os.listdir returns bare names; join them with the folder so the
        # read does not depend on the current working directory.
        path = os.path.join(file_directory, name)
        if not (os.path.isfile(path) and name.lower().endswith('.csv')):
            continue  # skip sub-folders (including the output folder) and other files
        df = pd.read_csv(path, delimiter=";")
        # etc.
        df.to_csv(os.path.join(output_directory, name))
If you need to create a new directory, you can use os.mkdir(newpath)
Can you still advise on how to parameterize the function?
Related
I need a little help appending the data that is generated by the for loop below. Currently, I'm writing it to a DataFrame in the line "df = pd.DataFrame(li_row, columns=col_names)".
But when I have multiple files whose names start with PAJ, I need the resulting DataFrames to be appended into one DataFrame.
Also, the code below is bits and pieces we gathered and amended to suit our needs. Please excuse me in case you feel it's a mess. :)
import xmlschema
import os
import xml.etree.ElementTree as ET
import pandas as pd
dirpath = "C:\\Users\\xxxxx\\PycharmProjects\\pythonProject\\xmls"
filenames = os.listdir(dirpath)
# print(filenames)
for eachfile in filenames:
    fname = eachfile[0:3]  # the three-letter prefix selects the schema
    fullpath = dirpath + '\\' + eachfile
    print(fullpath)
    if fname == 'PAJ':
        xmlschema.validate(fullpath, 'PAJ.xsd')
        # Parse via the full path; the bare file name only resolves when the
        # script happens to run from inside `dirpath`.
        tree = ET.parse(fullpath)
        root = tree.getroot()
        # Get AlertID from header
        cols = {}
        for header in root.findall(".//header/alertId"):
            cols[header.tag] = header.text
        # print(cols)
        # get detailhr to be used for column header names
        col_names = [DtHeader.text for DtHeader in root.findall(".//detailHdr/c")]
        # print(col_names)
        # One output row per child of each report/data element, one cell per
        # grandchild.
        li_row = []
        for Data in root.findall(".//report/data"):
            for child in Data:
                li_row.append([grandchild.text for grandchild in child])
        # print(li_row)
        # create a dataframe with the col_names and row with c and alertid added at the end
        df = pd.DataFrame(li_row, columns=col_names)
        df['alertId'] = cols['alertId']
        print(df)
    elif fname == 'PIE':
        with open(fullpath) as filehandle:
            fileContent = filehandle.read()
        # Rewrite the encoding name in the text before validating it
        # (the content is passed to the validator as a string).
        modFileContent = fileContent.replace("UTF-16", "UTF-8")
        xmlschema.validate(modFileContent, 'PIE.xsd')
If I were to change your current solution as little as possible, I would create a list of paj_data_frames and concatenate them once the loop is done. Look at the pd.concat documentation: https://pandas.pydata.org/docs/user_guide/merging.html
# Sketch: collect one DataFrame per PAJ file, then combine them once.
paj_data_frames = []
for eachfile in filenames:
    # ... (unchanged: derive fname, validate and parse the file) ...
    if fname == 'PAJ':
        # ... (unchanged: build cols, col_names and li_row) ...
        df = pd.DataFrame(li_row, columns=col_names)
        df['alertId'] = cols['alertId']
        paj_data_frames.append(df)
    # ... (unchanged: remaining branches) ...
# pd.concat raises ValueError when the list is empty (no PAJ files found).
final_df = pd.concat(paj_data_frames)
I am trying to classify based on file extension from a local directory to excel sheet.
Like my input should be:
Directory path
My output should be:
excel sheet with different sheets based on extension.
Like if the input directory is having 5 .sh files, 8 .py file and so on.
On the basis of extension, sheets should be created with file names.
I am able to achieve the same but it is a bit hard coded.
Any help would be appreciated if it can be automated without hard coding:
Below is the code i tried and its working fine:
import glob
import pandas as pd
import os
import re

path = r'<path_name>' #base path
files = glob.glob(path + '/**/*.*', recursive=True)

# Extensions that always get a sheet (even when empty), in the original sheet
# order. Any other extension found on disk gets its own sheet after these.
known_extensions = ['hql', 'hive', 'sql', 'ksh', 'csv', 'txt', 'sh', 'py']
files_by_extension = {ext: [] for ext in known_extensions}

for fpath in files:
    if not os.path.isfile(fpath):
        continue  # '*.*' also matches directories whose name contains a dot
    file_name = os.path.basename(fpath)
    # Classify by the real (final) extension. Substring tests such as
    # '.sh' in name also match '.ksh', and '.py' also matches '.pyc'.
    ext = os.path.splitext(file_name)[1].lstrip('.').lower()
    if not ext:
        continue  # e.g. dot-files such as ".bashrc" have no extension
    print(file_name)
    files_by_extension.setdefault(ext, []).append(file_name)

# The context manager saves and closes the workbook; ExcelWriter.save() was
# removed in pandas 2.0.
with pd.ExcelWriter(r'C:\Users\saurabh.arun.kumar\OneDrive - Accenture\Desktop\outfile2.xlsx',
                    engine='xlsxwriter') as writer:
    for ext, names in files_by_extension.items():
        # Excel sheet names are limited to 31 characters and may not contain
        # the characters [ ] : * ? / \
        sheet_name = re.sub(r'[\[\]:*?/\\]', '_', ext)[:31]
        pd.DataFrame(names, columns=['file']).to_excel(writer, sheet_name=sheet_name, index=False)
print ("Executed")
This code only works with the extensions hard-coded in it. I want it to classify the files on its own by reading each extension, and to create new sheets with the file names.
Hope i am able to explain the scenario.
You can split the extension from a files path by using
fname, fext = os.path.splitext("/what/ever/kind/of/file/this.is.txt")
Use that to create a dict of "ext" -> "list of files".
Use the dict to create n dataframes. Write them to excel.
If you only want certain extensions, filter the dict-keys to those you want:
import glob
import pandas as pd
from os import path
p = r'/redacted/location' # fix this to your path
files = glob.glob(p + '/**/*.*', recursive=True)
d = {}  # maps extension (with leading dot) -> list of file names
i = 0 # used to redact my file names - you would simply store fn+fex
for f in files:
    fn, fex = path.splitext(f)
    # filter for extensions you want
    if fex in (".txt", ".xlsx", ".docx"):
        # use d.setdefault(fex,[]).append(f) - I use something
        # to blank out my file names here
        # use collections.defaultdict to get a speed kick if needed
        d.setdefault(fex, []).append(f"file...{i}{fex}")
        i += 1
# create single data frames per file extension from dictionary;
# the extension becomes the only column header
dfs = [pd.DataFrame({key: value}) for key, value in d.items()]
# do your excel writing here - use column header for sheet name etc.
for df in dfs:
    print (df)
Output (files/names redacted):
.docx
0 file...0.docx
1 file...2.docx
2 file...3.docx
3 file...4.docx
4 file...5.docx
5 file...6.docx
6 file...7.docx
7 file...12.docx
8 file...13.docx
9 file...14.docx
10 file...15.docx
11 file...16.docx
.xlsx
0 file...1.xlsx
1 file...8.xlsx
2 file...9.xlsx
3 file...10.xlsx
4 file...11.xlsx
5 file...17.xlsx
You can then use the column header of each single DF to write your excel sheet - something akin to:
# One sheet per DataFrame, named after that frame's single column header.
# Leaving the with-block saves and closes the workbook.
with pd.ExcelWriter('C:/temp/outfile2.xlsx') as writer:
    for df in dfs:
        df.to_excel(writer, sheet_name=df.columns[0])
should do it - can't test that right now.
I have written a script which works but is not very elegant. It merges csv files, outputs a new file, filters that file to the required conditions, then outputs the filtered file, which is the file I want. I then repeat the process for every month.
Rather than altering this code to process every month (I have 5 more years worth of data to go), I would like to automate the path directory part and export csv file names that change from one month (and year) to the next.
See snippet of Jan and Feb below:
import os
import glob
import pandas as pd
import shutil
# --- January 2014: merge the daily CSVs, then keep only the Irish Sea cells ---
path = r"C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\20xx01"
os.chdir(path)  # relative file names below resolve inside the month folder
extension = 'csv'
all_filenames = glob.glob('*.' + extension)

# Stack every daily file into one monthly table and write it next to them.
daily_frames = [pd.read_csv(csv_name) for csv_name in all_filenames]
combined_csv = pd.concat(daily_frames)
combined_csv.to_csv("201401.csv", index=False, encoding='utf-8-sig')

# Move the monthly file into the per-year folder under its final name.
grab1 = path + r'\201401.csv'
move1 = r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\2014\2014-01.csv'
shutil.move(grab1, move1)

# Re-read the moved file and filter it to the bounding box (inclusive bounds).
fd = pd.read_csv(move1)
df = pd.DataFrame(fd)
in_lat_band = df.lat_bin.between(5300, 5500)
in_lon_band = df.lon_bin.between(-650, -250)
irishsea = df[in_lat_band & in_lon_band]
irishsea.to_csv("2014-01_irishsea.csv", index=False, encoding='utf-8-sig')

grab2 = path + r'\2014-01_irishsea.csv'
move2 = r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\2014\2014-01-IrishSea.csv'
shutil.move(grab2, move2)
I then repeat it for Feb data but have to update the path locations.
# --- February 2014: merge the daily CSVs, then keep only the Irish Sea cells ---
path = r"C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\20xx02"
os.chdir(path)  # relative file names below resolve inside the month folder
extension = 'csv'
all_filenames = glob.glob('*.' + extension)

# Stack every daily file into one monthly table and write it next to them.
daily_frames = [pd.read_csv(csv_name) for csv_name in all_filenames]
combined_csv = pd.concat(daily_frames)
combined_csv.to_csv("201402.csv", index=False, encoding='utf-8-sig')

# Move the monthly file into the per-year folder under its final name.
grab1 = path + r'\201402.csv'
move1 = r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\2014\2014-02.csv'
shutil.move(grab1, move1)

# Re-read the moved file and filter it to the bounding box (inclusive bounds).
fd = pd.read_csv(move1)
df = pd.DataFrame(fd)
in_lat_band = df.lat_bin.between(5300, 5500)
in_lon_band = df.lon_bin.between(-650, -250)
irishsea = df[in_lat_band & in_lon_band]
irishsea.to_csv("2014-02_irishsea.csv", index=False, encoding='utf-8-sig')

grab2 = path + r'\2014-02_irishsea.csv'
move2 = r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\2014\2014-02-IrishSea.csv'
shutil.move(grab2, move2)
You can do something like the following. Keep in mind that the second number of range (the stop value) needs to be one value higher than you intend.
# Build one path per month for 2014..2019 (range stops before 2020 and 13).
for year in range(2014, 2020):
    for month in range(1, 13):
        # ":02d" zero-pads single-digit months ("01".."12").
        month_as_string = f"{month:02d}"
        # Backslashes are doubled so they are literal path separators rather
        # than (invalid) escape sequences.
        date = f"{year}\\{year}-{month_as_string}"
        # e.g. YOUR\FILEPATH\HERE\2014\2014-01_irishsea.csv
        pathname = 'YOUR\\FILEPATH\\HERE' + '\\' + date + '_irishsea.csv'
You can learn more about string formatting here https://www.learnpython.org/en/String_Formatting
In below code all the output files are getting written into T1 folder. How to separate those output files into sub folders, with the same name as original sub folders (where the original csv files were) ? Thanks
import pandas as pd
import numpy as np
import glob
import os
path = '/root/Desktop/TT1/'
mystep = 0.4
#define the function
def data_splitter(df, name, output_dir="/root/Desktop/T1"):
    """Split ``df`` into consecutive ``Time`` windows and write one CSV per window.

    Window ``k`` holds the rows with ``k * mystep <= Time < (k + 1) * mystep``;
    windows run from 0 up to the one containing the largest ``Time``. Each
    window (empty ones included) is written to
    ``<output_dir>/<name>_<k>.csv``.

    Args:
        df: Frame with a numeric ``Time`` column.
        name: Prefix for the output file names.
        output_dir: Folder to write into; created if missing.
    """
    # Assign every row to exactly one window. Building the window list with
    # np.arange(0, max_time, mystep) dropped the rows at max_time whenever
    # max_time was an exact multiple of the step.
    window = np.floor(df['Time'] / mystep)
    last_window = window.max()
    if pd.isna(last_window):
        return  # no rows (or no usable Time values): nothing to write
    os.makedirs(output_dir, exist_ok=True)
    for k in range(int(last_window) + 1):
        temp = df[window == k]
        temp.to_csv(os.path.join(output_dir, "{}_{}.csv".format(name, k)))
# use os.walk(path) on the main path to get ALL subfolders inside path
# Visit every sub-folder below `path` (at any depth) and split each CSV in it.
# CSV files lying directly in `path` itself are not visited by this loop.
for root, dirs, _ in os.walk(path):
    for d in dirs:
        path_sub = os.path.join(root, d)  # this is the current subfolder
        for filename in glob.glob(os.path.join(path_sub, '*.csv')):
            df = pd.read_csv(filename)
            name = os.path.split(filename)[1]  # get the name of the current csv file
            data_splitter(df, name)
This should help
Demo:
import pandas as pd
import numpy as np
import glob
import os
path = '/root/Desktop/TT1/'
mystep = 0.4
#define the function
def data_splitter(df, name, dest_folder):
    """Split ``df`` into consecutive ``Time`` windows, one CSV per window.

    Window ``k`` holds the rows with ``k * mystep <= Time < (k + 1) * mystep``.
    Files are written to ``/root/Desktop/<dest_folder>/<name>_<k>.csv``; an
    absolute ``dest_folder`` is used as given.

    Args:
        df: Frame with a numeric ``Time`` column.
        name: Prefix for the output file names.
        dest_folder: Output folder, relative to ``/root/Desktop/``.
    """
    basepath = "/root/Desktop/"
    dest_f = os.path.join(basepath, dest_folder)
    # Assign every row to exactly one window. Building the window list with
    # np.arange(0, max_time, mystep) dropped the rows at max_time whenever
    # max_time was an exact multiple of the step.
    window = np.floor(df['Time'] / mystep)
    last_window = window.max()
    if pd.isna(last_window):
        return  # no rows (or no usable Time values): nothing to write
    # makedirs also creates missing parents and tolerates an existing folder.
    os.makedirs(dest_f, exist_ok=True)
    for k in range(int(last_window) + 1):
        temp = df[window == k]
        temp.to_csv(os.path.join(dest_f, "{}_{}.csv".format(name, k)))
# use os.walk(path) on the main path to get ALL subfolders inside path
# Walk the whole tree below `path` and split every CSV file found.
for root, dirs, files in os.walk(path):
    for f in files:
        if f.endswith(".csv"):
            filename = os.path.join(root, f)
            df = pd.read_csv(filename)
            name = os.path.basename(filename)
            # Only the immediate parent folder name is kept, so equally named
            # folders at different depths share one output folder.
            dest_folder = os.path.basename(os.path.dirname(filename))
            data_splitter(df, name, dest_folder)
A similar approach should work here:
import pandas as pd
import numpy as np
import glob
import os
input_root = '/root/Desktop/TT1'
output_root = '/root/Desktop/T1'
mystep = 0.4
#define the function
def data_splitter(input_file, output_path, output_basename):
    """Split one CSV into consecutive ``Time`` windows, one CSV per window.

    Window ``k`` holds the rows with ``k * mystep <= Time < (k + 1) * mystep``
    and is written to ``<output_path>/<output_basename>_<k>.csv``.

    Args:
        input_file: CSV file to read; it must have a numeric ``Time`` column.
        output_path: Existing folder to write into.
        output_basename: Prefix for the output file names.
    """
    df = pd.read_csv(input_file)
    # Assign every row to exactly one window. Building the window list with
    # np.arange(0, max_time, mystep) dropped the rows at max_time whenever
    # max_time was an exact multiple of the step.
    window = np.floor(df['Time'] / mystep)
    last_window = window.max()
    if pd.isna(last_window):
        return  # no rows (or no usable Time values): nothing to write
    for k in range(int(last_window) + 1):
        temp = df[window == k]
        temp.to_csv(os.path.join(output_path, f"{output_basename}_{k}.csv"))
# use os.walk(path) on the main path to get ALL subfolders inside path
# Walk the input tree and recreate the same sub-folder layout under output_root.
for dirpath, dirnames, filenames in os.walk(input_root):
    for filename in filenames:
        if filename.lower().endswith('.csv'):
            input_file = os.path.join(dirpath, filename)
            # Path of this folder relative to the input root; relpath does not
            # care whether input_root ends with a separator.
            sub_folders = os.path.relpath(dirpath, input_root)
            output_path = os.path.normpath(os.path.join(output_root, sub_folders))
            os.makedirs(output_path, exist_ok=True)  # Ensure the output folder exists
            # Bare name without extension, so results are "<name>_<k>.csv"
            # rather than "<name>.csv_<k>.csv".
            output_basename = os.path.splitext(filename)[0]
            data_splitter(input_file, output_path, output_basename)
This should result in the folder structure being recreated at your output root folder.
I am trying to combine multiple .csv files into one .csv file using the dataframe in pandas. the tricky part about this is, i need to grab multiple files from multiple days. Please let me know if this does not make sense. As it currently stands i cannot figure out how to loop through the directory. Could you offer some assistance?
import csv
import pandas as pd
import datetime as dt
import glob, os
startDate = 20160613
endDate = 20160614
# Work with real dates: adding 1 to a YYYYMMDD integer breaks at month ends
# (20160630 + 1 is not a date).
first_day = dt.datetime.strptime(str(startDate), "%Y%m%d").date()
last_day = dt.datetime.strptime(str(endDate), "%Y%m%d").date()
dateRange = (last_day - first_day).days + 1  # both end days included

# Collect the frames of ALL days first; concatenate once after the loop.
df_list = []
for x in range(dateRange):
    day = (first_day + dt.timedelta(days=x)).strftime("%Y%m%d")
    print(day)
    # os.path.join needs strings, and the path is built BEFORE moving on to
    # the next day so that the start date itself is processed.
    filePath = os.path.join(r"\\export\path", day, "preprocessed")
    # Glob with the full path instead of os.chdir, so the collected names
    # stay valid regardless of the working directory.
    interesting_files = glob.glob(os.path.join(filePath, "trade" + "*.csv"))
    print(interesting_files)
    for filename in sorted(interesting_files):
        df_list.append(pd.read_csv(filename))

if not df_list:
    raise ValueError("no trade*.csv files found between %s and %s" % (startDate, endDate))
full_df = pd.concat(df_list)
saveFilepath = r"U:\Chris\Test_Daily_Fails"
full_df.to_csv(saveFilepath + '\\Files_For_IN' + "_0613_" + ".csv", index = False)
IIUC you can create list all_files and in loop append output from glob to all_files:
all_files = []
# Real date arithmetic: adding 1 to a YYYYMMDD integer breaks at month ends.
first_day = dt.datetime.strptime(str(startDate), "%Y%m%d")
for x in range(dateRange):
    day = (first_day + dt.timedelta(days=x)).strftime("%Y%m%d")
    print(day)
    filePath = os.path.join(r"\\export\path", day, "preprocessed")
    # Keep full paths (no os.chdir) so every collected name can still be
    # opened after the loop has moved on to another day's folder.
    all_files = all_files + glob.glob(os.path.join(filePath, "trade" + "*.csv"))
print(all_files)
Also, you need to first append all values to df_list and then call concat only once (I fixed the indentation of the concat line):
# Read every collected file (all days), then concatenate exactly once.
# NOTE(review): assumes the entries of all_files can be opened from the
# current working directory (i.e. they are full paths) - confirm.
df_list = [pd.read_csv(filename) for filename in sorted(all_files)]
full_df = pd.concat(df_list)