In below code all the output files are getting written into T1 folder. How to separate those output files into sub folders, with the same name as original sub folders (where the original csv files were) ? Thanks
import pandas as pd
import numpy as np
import glob
import os
path = '/root/Desktop/TT1/'
mystep = 0.4
#define the function
def data_splitter(df, name):
max_time = df['Time'].max() # get max value of Time for the current csv file (df)
myrange= np.arange(0, max_time, mystep) # build the threshold range
for k in range(len(myrange)):
# build the upper values
temp = df[(df['Time'] >= myrange[k]) & (df['Time'] < myrange[k] + mystep)]
temp.to_csv("/root/Desktop/T1/{}_{}.csv".format(name, k))
# use os.walk(path) on the main path to get ALL subfolders inside path
for root,dirs,_ in os.walk(path):
for d in dirs:
path_sub = os.path.join(root,d) # this is the current subfolder
for filename in glob.glob(os.path.join(path_sub, '*.csv')):
df = pd.read_csv(filename)
name = os.path.split(filename)[1] # get the name of the current csv file
data_splitter(df, name)
This should help
Demo:
import pandas as pd
import numpy as np
import glob
import os
path = '/root/Desktop/TT1/'
mystep = 0.4
#define the function
def data_splitter(df, name, dest_folder):
max_time = df['Time'].max() # get max value of Time for the current csv file (df)
myrange= np.arange(0, max_time, mystep) # build the threshold range
basepath = "/root/Desktop/"
for k in range(len(myrange)):
# build the upper values
temp = df[(df['Time'] >= myrange[k]) & (df['Time'] < myrange[k] + mystep)]
dest_f = os.path.join(basepath, dest_folder)
if not os.path.isdir(dest_f):
os.mkdir(dest_f)
temp.to_csv(os.path.join(dest_f, "{}_{}.csv".format(name, k)))
# use os.walk(path) on the main path to get ALL subfolders inside path
for root,dirs, files in os.walk(path):
for f in files:
if f.endswith(".csv"):
filename = os.path.join(root, f)
df = pd.read_csv(filename)
name = os.path.split(os.path.basename(filename))[1]
dest_folder = os.path.basename(os.path.dirname(filename))
data_splitter(df, name, dest_folder)
A similar approach should work here:
import pandas as pd
import numpy as np
import glob
import os
input_root = '/root/Desktop/TT1'
output_root = '/root/Desktop/T1'
mystep = 0.4
#define the function
def data_splitter(input_file, output_path, output_basename):
df = pd.read_csv(input_file)
max_time = df['Time'].max() # get max value of Time for the current csv file (df)
myrange = np.arange(0, max_time, mystep) # build the threshold range
for k in range(len(myrange)):
# build the upper values
temp = df[(df['Time'] >= myrange[k]) & (df['Time'] < myrange[k] + mystep)]
temp.to_csv(os.path.join(output_path, f"{output_basename}_{k}.csv"))
# use os.walk(path) on the main path to get ALL subfolders inside path
for dirpath, dirnames, filenames in os.walk(input_root):
for filename in filenames:
if filename.lower().endswith('.csv'):
input_file = os.path.join(dirpath, filename)
sub_folders = dirpath[len(input_root)+1:]
output_path = os.path.join(output_root, sub_folders)
os.makedirs(output_path, exist_ok=True) # Ensure the output folder exists
output_basename = os.path.join(output_path, os.path.splitext(filename)[0] + '.csv')
data_splitter(input_file, output_path, output_basename)
This should result with the folder structure recreated at your output root folder.
Related
I have a data structure format as below:
Dataset:
training-
-Cat
-dog
-monkey
I would like to transfer/move 10 percent of files from each dataset to validation dataset. How can I do it using python?. It should automatically create the directories as well
Dataset:
validation-
-Cat
-dog
-monkey
You can try:
import os
source = 'C:/.../training/'
destination = 'C:/.../validation/'
if not os.path.exists(destination):
# Create a new directory because it does not exist
os.makedirs(destination)
allfiles = os.listdir(source)
for f in allfiles:
os.rename(source + f, destination + f)
Try this, it should help, yet not tested on Windows (only Ubuntu). But you can modify if path string is different on Windows OS.
Tested on : Python = 3.6.13, numpy = 1.19.2
from glob import glob
import os
import numpy as np
import shutil
def copy_folder(src, dst, percent_keep=0.1):
all_files = glob(f"{src}/*")
# select folders
folders = [folder for folder in all_files if os.path.isdir(folder)]
# select files
all_files = [file for file in all_files if os.path.isfile(file)]
print(f"There are {len(folders)} folders in {src.split('training')[-1]}")
print(f"There are {len(all_files)} files in {src.split('training')[-1]}")
for folder in folders:
# iterate through subfolders
copy_folder(folder, dst, percent_keep)
if len(all_files) > 0:
# find path to be attacked to validation path
remaining_path = src.split("training/")[-1]
new_path = os.path.join(dst, "validation", remaining_path) # new path for destination
if not os.path.exists(new_path):
os.makedirs(new_path)
# select samples from all files you have
keep_files = np.random.choice(all_files, int(len(all_files) * percent_keep))
print(f"Copying {len(keep_files)} random files")
for index, file in enumerate(keep_files):
print(f"\rCopying {index+1} / {len(keep_files)}", end="")
shutil.copyfile(file, os.path.join(new_path, file.rsplit("/")[-1]))
print("")
if __name__ == "__main__":
src = "/home/user/Dataset/training" # Should be path to training folder
# should be path of directory one below training folder
# (lets say root) it will attach validation folder later in code
dst = "/home/user/Dataset/"
copy_folder(src, dst, 0.1)
if you dont want to use numpy for selecting random file to copy to validation folder, use random library.
Something like:
keep_files = random.choices(all_files,k=int(len(all_files) * percent_keep) )
If you dont want to use shutils or glob, you can use os library:
os.lisdirs() # instead of glob
os.rename() # instead of shutils (maybe somethind different, not tested)
if you dont want random samples , use :
keep_files = all_files [:int(len(all_files) * percent_keep)]
I have a folder with 50 .csv files. The .csv files are auto-generated and a results/ output from a process-based model (long and automatically named). For example, sandbox_username_vetch_scaleup_IA_1.csv; sandbox_username_vetch_scaleup_IA_2.csv, and it continues till sandbox_username_vetch_scaleup_IA_50.csv.
I am trying to shorten the file names in a way so that the files are names are IA_1, IA_2 ...up to IA_50 and subsequently the new .csv file name gets added as a column to the data frame. Here is what I have tried so far
# import necessary libraries
import pandas as pd
import os
import glob
import sys
from pathlib import Path
import re
data_p = "/Users/Username/Documents/HV_Scale/CWAD"
output_p = "/Users/Username/Documents/HV_Scale/CWAD"
retval = os.getcwd()
print (retval) # see in which folder you are
os.chdir(data_p) # move to the folder with your data
os.getcwd()
filenames = sorted(glob.glob('*.csv'))
fnames = list(filenames) # get the names of all your files
#print(fnames)
#Loop over
for f in range(len(fnames)):
print(f'fname: {fnames[f]}\n')
pfile = pd.read_csv(fnames[f], delimiter=",") # read in file
#extract filename
filename = fnames[f]
parts = filename.split(".") # giving you the number in file name and .csv
only_id = parts[0].split("_") # if there is a bracket included
# get IA from your file
filestate = pfile["IA"][0] # assuming this is on the first row
filestate = str(filestate)
# get new filename
newfilename = only_id[0]+"-"+filestate+parts[1]
# save your file (don't put a slash at the end of your directories on top)
pfile.to_csv(output_p+"/"+newfilename, index = False, header = True)
Here is the code for adding the csv file name as a column
import glob
import os
import shutil
import sys
import pandas as pd
path = '/Users/Username/Documents/HV_Scale/IA_CWAD/short'
all_files = glob.glob(os.path.join(path, "*.csv"))
names = [os.path.basename(x) for x in glob.glob(path+'\*.csv')]
df = pd.DataFrame()
for file_ in all_files:
file_df = pd.read_csv(file_,sep=';', parse_dates=[0], infer_datetime_format=True,header=None )
file_df['file_name'] = file_
df = df.append(file_df)
#However, this adds the old csv file name and not the renamed one
In order to rename and move these files, all you need is:
import glob
import os
import shutil
import sys
SOURCE = '<Your source directory>'
TARGET = '<Your target directory>'
for file in glob.glob(os.path.join(SOURCE, '*_IA_*.csv')):
idx = file.index('_IA_')
filename = file[idx+1:]
target = os.path.join(TARGET, filename)
if os.path.exists(target):
print(f'Target file {target} already exists', file=sys.stderr)
else:
shutil.copy(file, target)
As there's nothing in the OP's question that tries to handle modification of the CSV files, that is left as an exercise for the OP.
Source and target directories should be different otherwise this can lead to ambiguous results
could anyone advise me how to apply this code to several csv in one folder? Then, save the modified csv to another folder and each separately? In short, I need to automate it.
I need to automatically load the csv file, execute the code, save the newly modified csv file, and then repeat it to the next csv file in the folder.
import pandas as pd
import datetime as dt
import numpy as np
from numpy import nan as Nan
path = "C://Users//Zemi4//Desktop//csv//A-001.csv"
df = pd.read_csv(path,delimiter=";")
df['ta'] = pd.to_numeric(df['ta'])
df['tw'] = pd.to_numeric(df['tw'])
df["time_str"] = [dt.datetime.strptime(d, "%d.%m.%Y %H:%M:%S") for d in df["time"]]
df["time_str"] = [d.date() for d in df["time_str"]]
df["time_str"] = pd.to_datetime(df["time_str"])
df["time_zaokrouhleny"]=df["time_str"]
def analyza(pozadovane_data):
new_list = []
new_df = pd.DataFrame(new_list)
new_df=df.loc[df["time_str"] == pozadovane_data,["ta","tw", "zone", "time_zaokrouhleny"]]
counter = new_df.ta.count()
if counter < 24:
for i in range(counter,24):
new_df.loc[i] = [Nan for n in range(4)]
new_df["ta"]= new_df.ta.fillna(0)
new_df["tw"] = new_df.tw.fillna(0)
new_df["zone"] = new_df.zone.fillna(0)
new_df["time_zaokrouhleny"]=new_df.time_zaokrouhleny.fillna(new_df.time_zaokrouhleny.min())
elif counter > 24:
counter_list = list(range(24,counter))
new_df = new_df.drop(new_df.index[counter_list])
new_df["time_oprava"] = [dt.datetime.combine(d.date(),dt.time(1,0)) for d in new_df["time_zaokrouhleny"]]
s = 0
cas_list = []
for d in new_df["time_oprava"]:
d =d + dt.timedelta(hours=s)
#print(d)
#print(s)
cas_list.append(d)
s = s + 1
se = pd.Series(cas_list)
new_df['time_oprava'] = se.values
new_df['Validace'] = (new_df['ta'] != 0) & (new_df['tw'] != 0)
new_df['Rozdil'] = new_df['ta'] - new_df['tw']
new_df.rename(columns={"ta": "Skutecna teplota", "tw": "Pozadovana teplota", "time_oprava": "Cas", "zone": "Mistnost"}, inplace = True)
new_df.index = new_df['Cas']
return new_df
start = dt.datetime(2010,10,6)
end = dt.datetime(2010,12,27)
date_range = []
date_range = [start + dt.timedelta(days=x) for x in range(0,(end-start).days)]
new_list = []
vysledek_df =pd.DataFrame(new_list)
for d in date_range:
pom = analyza(d)
vysledek_df = vysledek_df.append(pom,ignore_index=True)
vysledek_df.pop('time_zaokrouhleny')
vysledek_df.to_csv('C://Users//Zemi4//Desktop//zpr//A-001.csv', encoding='utf-8', index=False)
The code itself works correctly. Thank you for your advice.
Simplest way is to use glob. Just give the folder_path and output_path as per your requirements and use the sample code below. I commented the code to help you understand the code.
import os
import glob
folder_path = 'path/to/folder/' # path to folder containing .csv files
output_path = 'path/to/output/folder/' # path to output folder
for file in glob.glob(folder_path + '*.csv'): # only loads .csv files from the folder
df = pd.read_csv(file, delimiter=";") # read .csv file
# Do something
df.to_csv(output_path + 'modified_' + str(os.path.basename(file)), encoding='utf-8', index=False) # saves modified .csv file to output_path
You want to use os.listdir() to find the contents of the directory, then parameterize the file path in a new function. You can then loop over a list of directories retrieved via os.walk() and run the function for each one.
import os
def run(file_directory):
filelist = os.listdir(file_directory)
for path in filelist:
df = pd.read_csv(path,delimiter=";")
# etc.
df.to_csv(os.path.join(file_directory, 'output.csv'))
If you need to create a new directory, you can use os.mkdir(newpath)
Can you still advise on how to parameterize the function?
I am new with Python but trying to write a code which add a column on multiple .xlsx files and saves this files with the origin name to a new folder.
I have started with some coding beneath, but missing some code in open all files and saving to my DestPath. Would be pleased if any has a solution for this:
from os import listdir, path
import pandas as pd
import xlrd
SourcePath = 'C:\' #Source Path
DestPath = 'C:\' #Destination Path
# Listing up all .xlsx files from Source
def find_xlsx_filenames( path_to_dir, suffix=".xlsx" ):
filenames = listdir(path_to_dir)
return [ filename for filename in filenames if filename.endswith( suffix ) ]
filenames = find_xlsx_filenames(SourcePath)
fname = path.join(SourcePath, filenames[0]) # Tar første fil i mappa.
outname = path.join(outputdata, filenames[0])
for i in range(len(filenames)):
fname = path.join(SourcePath, filenames[i])
df = pd.read_excel(fname) #Read Excel file as a DataFrame
df['new_col'] = 'Sort Data' #Adding a new column named <Sort Data>
#To save it back as Excel
df.to_excel(DestPath, outname) #Write DateFrame back as Excel file
Thanks in Advance
check if this works
import os
import pandas as pd
path = 'C:/'
for roots, dirs, files in os.walk(path):
xlsfile = [ _ for _ in files if _.endswith('.xlsx')]
for xlsf in xlsfile:
df = pd.read_excel(os.path.join(roots, xlsf))
df['Sort Data'] = ' '
df.to_excel(os.path.join(roots, xlsf), index = False)
I have the same problem as here but now I'm trying to do the same with python because it's more suited to the task.
I've started with this:
import os
import shutil
import random
import glob
root_dir = '/home/leonardo/Desktop/python_script/rfe'
output_dir = '/home/leonardo/Desktop/python_script/output_folder'
ref = 200
folders_root_dir = os.listdir(root_dir)
print folders_root_dir
count = len(folders_root_dir)
print count
for i in xrange(count):
folder_inside = root_dir + '/' + folders_root_dir[i]
print folder_inside
number_files_folder_inside = len(os.listdir(folder_inside))
print number_files_folder_inside
if number_files_folder_inside > ref:
ref_copy = round(0.2*number_files_folder_inside)
print ref_copy
# here I have to copy 20% of the files in this folder to the output folder
else:
# here I have to copy all files from the folder to the output_dir
I tried to use os.walk() but I'm new to python and selecting files while the function is working proved to be really tough.
You'll need to import these:
import os
import shutil
import random
You can get all the files in a directory like this:
files = [file for file in os.listdir(dir) if os.path.isfile(os.path.join(dir, file))]
Then use a conditional:
if len(files) < 200:
for file in files:
shutil.copyfile(os.path.join(dir, file), dst)
else:
# Amount of random files you'd like to select
random_amount = 1000
for x in xrange(random_amount):
if len(files) == 0:
break
else:
file = random.choice(files)
shutil.copyfile(os.path.join(dir, file), outputdir)
A more compact solution (also noticing that copyfile does not really do the job properly unless one specifies the target file name as well):
import os
import shutil
import random
def get_file_list(input_dir):
return [file for file in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, file))]
def get_random_files(file_list, N):
return random.sample(file_list, N)
def copy_files(random_files, input_dir, output_dir):
for file in random_files:
shutil.copy(os.path.join(input_dir, file), output_dir)
def main(input_dir, output_dir, N):
file_list = get_file_list(input_dir)
random_files = get_random_files(file_list, N)
copy_files(random_files, input_dir, output_dir)
import os
import shutil
import random
root_dir = '/home/leonardo/Desktop/python_script/qar'
output_dir = '/home/leonardo/Desktop/python_script/output_folder'
ref = 1
for root, dirs, files in os.walk(root_dir):
number_of_files = len(os.listdir(root))
if number_of_files > ref:
ref_copy = int(round(0.2 * number_of_files))
for i in xrange(ref_copy):
chosen_one = random.choice(os.listdir(root))
file_in_track = root
file_to_copy = file_in_track + '/' + chosen_one
if os.path.isfile(file_to_copy) == True:
shutil.copy(file_to_copy,output_dir)
print file_to_copy
else:
for i in xrange(len(files)):
track_list = root
file_in_track = files[i]
file_to_copy = track_list + '/' + file_in_track
if os.path.isfile(file_to_copy) == True:
shutil.copy(file_to_copy,output_dir)
print file_to_copy
print 'Finished !'
The final code has this face
thank you guys for the help !
cheers !
I want this for splitting my dataset to train,test and validation.
here is my code :
import os
import shutil
import random
import numpy as np
dir = r'E:\down\imgs'
train_dir = r'E:/train_test_split/train'
test_dir = r'E:/train_test_split/test'
valid_dir = r'E:/train_test_split/validation'
files = [file for file in os.listdir(dir) if os.path.isfile(os.path.join(dir, file))]
train_count = np.round(50/100*len(files))
test_count = np.round(30/100*len(files))
valid_count = np.round(20/100*len(files))
rndnums = list(random.sample(range(0, len(files)), len(files)))
print("len(files)",len(files))
# print("all",len(files))
# print("train",np.round(train*len(files)))
# print("test",np.round(test*len(files)))
# print("valid",np.round(valid*len(files)))
#
# print("sum",np.round(train*len(files)) + np.round(test*len(files)) + np.round(valid*len(files)))
# Amount of random files you'd like to select
##train_files
print(rndnums)
train_file_index = rndnums[0:int(train_count)+1]
train_file_name = [files[i] for i in train_file_index]
test_file_index = rndnums[int(train_count)+1:int(train_count + test_count)+1]
test_file_name = [files[i] for i in test_file_index]
valid_file_index = rndnums[int(train_count + test_count)+1:]
valid_file_name = [files[i] for i in valid_file_index]
for x in train_file_name:
file = x
shutil.copyfile(os.path.join(dir, file), os.path.join(train_dir, file))
##test_files
for y in test_file_name:
file = y
shutil.copyfile(os.path.join(dir, file), os.path.join(test_dir, file))
##valid_files
for z in valid_file_name:
file = z
shutil.copyfile(os.path.join(dir, file), os.path.join(valid_dir, file))
maybe something like (untested)
import os
THRESHOLD = 200
root_dir = "\home..."
output_dir = "\home....."
for top, dirs, nondirs in os.walk(root_dir):
for name in nondirs[:THRESHOLD]:
path = os.path.join(top, name)
destination = os.path.join(output_dir, name)
os.rename(path, destination)
import random
import shutil
import os
rootdir = '/home/leonardo/Desktop/python_script/qar'
outdir = '/home/leonardo/Desktop/python_script/output_folder'
ref = 200
dirsAndFiles = {} # here we store a structure {folder: [file1, file2], folder2: [file2, file4] }
dirs = [x[0] for x in os.walk(rootdir)] # here we store all sub-dirs
for dir in dirs:
dirsAndFiles[dir] = [f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]
for (dir, files) in dirsAndFiles.iteritems():
if len(files) > ref:
for i in xrange(int(0.2*len(files))): # copy 20% of files
fe = random.choice(files)
files.remove(fe)
shutil.copy(os.path.join(dir, fe), outdir)
else: # copy all files
for file in files:
shutil.copy(os.path.join(dir, file), outdir)