Transferring images into another data directory - Python

I have a data structure as below:
Dataset:
    training/
        Cat
        dog
        monkey
I would like to transfer/move 10 percent of the files from each class folder into a validation dataset. How can I do it using Python? It should automatically create the directories as well:
Dataset:
    validation/
        Cat
        dog
        monkey

You can try:
import os

source = 'C:/.../training/'
destination = 'C:/.../validation/'

if not os.path.exists(destination):
    # Create a new directory because it does not exist
    os.makedirs(destination)

allfiles = os.listdir(source)
for f in allfiles:
    os.rename(source + f, destination + f)
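Note that the snippet above moves everything under training/ in one go. A minimal sketch (untested) of restricting the move to a random 10 percent of each class folder, still using only the standard library:

import os
import random
import shutil

source = 'C:/.../training/'        # assumed training root
destination = 'C:/.../validation/' # assumed validation root

for class_name in os.listdir(source):
    class_dir = os.path.join(source, class_name)
    if not os.path.isdir(class_dir):
        continue
    dest_dir = os.path.join(destination, class_name)
    os.makedirs(dest_dir, exist_ok=True)   # create validation/<class> if missing
    files = os.listdir(class_dir)
    n_move = int(len(files) * 0.1)         # 10 percent of this class
    for f in random.sample(files, n_move):
        shutil.move(os.path.join(class_dir, f), os.path.join(dest_dir, f))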

Try this; it should help. It has not been tested on Windows (only Ubuntu), but you can modify it if the path strings are different on Windows.
Tested on: Python 3.6.13, numpy 1.19.2
from glob import glob
import os
import numpy as np
import shutil


def copy_folder(src, dst, percent_keep=0.1):
    all_files = glob(f"{src}/*")
    # select folders
    folders = [folder for folder in all_files if os.path.isdir(folder)]
    # select files
    all_files = [file for file in all_files if os.path.isfile(file)]
    print(f"There are {len(folders)} folders in {src.split('training')[-1]}")
    print(f"There are {len(all_files)} files in {src.split('training')[-1]}")

    for folder in folders:
        # iterate through subfolders
        copy_folder(folder, dst, percent_keep)

    if len(all_files) > 0:
        # find the path to be attached to the validation path
        remaining_path = src.split("training/")[-1]
        new_path = os.path.join(dst, "validation", remaining_path)  # new path for destination
        if not os.path.exists(new_path):
            os.makedirs(new_path)

        # select samples from all files you have
        keep_files = np.random.choice(all_files, int(len(all_files) * percent_keep))
        print(f"Copying {len(keep_files)} random files")
        for index, file in enumerate(keep_files):
            print(f"\rCopying {index + 1} / {len(keep_files)}", end="")
            shutil.copyfile(file, os.path.join(new_path, file.rsplit("/")[-1]))
        print("")


if __name__ == "__main__":
    src = "/home/user/Dataset/training"  # should be the path to the training folder
    # dst should be the path of the directory one level above the training folder
    # (let's say root); the code attaches the validation folder to it later
    dst = "/home/user/Dataset/"
    copy_folder(src, dst, 0.1)
If you don't want to use numpy for selecting random files to copy to the validation folder, use the random library.
Something like:
keep_files = random.choices(all_files, k=int(len(all_files) * percent_keep))
If you don't want to use shutil or glob, you can use the os library:
os.listdir()  # instead of glob
os.rename()   # instead of shutil (maybe something different, not tested)
If you don't want random samples, use:
keep_files = all_files[:int(len(all_files) * percent_keep)]
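Note that both np.random.choice (which samples with replacement by default) and random.choices can pick the same file more than once; random.sample draws without replacement, which is usually what you want here. A minimal sketch:

import random

# hypothetical file list and ratio, standing in for all_files / percent_keep above
all_files = ["img_001.jpg", "img_002.jpg", "img_003.jpg", "img_004.jpg"]
percent_keep = 0.5

# draws without replacement, so no file is selected twice
keep_files = random.sample(all_files, int(len(all_files) * percent_keep))
print(keep_files)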

Related

Run Python script on all files in the directory after a file has run, create a new folder and name the folder after the file

I wrote a Python script TestData.py that uses Pandas and NumPy to test a CSV for data anomalies. It takes one CSV as input and outputs 4 new ones. For each input file that needs testing I do the following:
Copy the name of the unknown file. In this example: unknownfilename1.csv
Create a folder.
Rename the New Folder by pasting in unknownfilename1.csv, removing the .csv
Paste unknownfilename1.csv into data = pd.read_csv("unknownfilename0.csv")
Drag TestData.py into the folder unknownfilename1
Finally, run TestData.py
import pandas as pd
import numpy as np
# Import raw data
data = pd.read_csv("unknownfilename1.csv", encoding='latin-1' )
#################################################
# Over 500 lines of code using Pandas and Numpy #
#################################################
# failed at least one testcase, needs to be fixed before importing.
failed.to_csv("C:/users/path/Failed.csv", index = False)
# Output passed rows.
passed.to_csv("C:/users/path/Passed.csv", index = False)
# Ready to import.
newimpomatic.to_csv("C:/users/path/Import.csv", index = False)
# Duplicates IDs
duplicated.to_csv("C:/users/path/duplicated.csv", index = False)
I would like each file to be tested in:
C:/users/path/unknownfilename1.csv
C:/users/path/unknownfilename2.csv
C:/users/path/unknownfilename3.csv
To output:
C:/users/path/unknownfilename1/Failed.csv
C:/users/path/unknownfilename1/Passed.csv
C:/users/path/unknownfilename1/Import.csv
C:/users/path/unknownfilename1/duplicated.csv
C:/users/path/unknownfilename2/Failed.csv
C:/users/path/unknownfilename2/Passed.csv
C:/users/path/unknownfilename2/Import.csv
C:/users/path/unknownfilename2/duplicated.csv
C:/users/path/unknownfilename3/Failed.csv
C:/users/path/unknownfilename3/Passed.csv
C:/users/path/unknownfilename3/Import.csv
C:/users/path/unknownfilename3/duplicated.csv
If I have hundreds of different files in a folder, what is the easiest way to add something to my script so that it tests all the files and, after each file is tested, creates a new folder named after the file that was tested?
The Path class in the Python standard-library module pathlib is great at this, and at working with file/folder locations in general. With Path.glob(pattern: str) you can yield all matches to a particular file pattern in a directory and iterate over those matches.
https://docs.python.org/3.9/library/pathlib.html#pathlib.Path.glob
You can also use Path to grab the name of the file and to create a new directory in which to place your output CSVs.
The file below assumes it is in the same directory as all of the original CSVs, but that is changeable. I call that directory base_dir, equivalent to what you listed as C:/users/path/
/users/path/main.py:
from pathlib import Path

import pandas as pd
import numpy as np

failed_csv = 'Failed.csv'
passed_csv = 'Passed.csv'
import_csv = 'Import.csv'
dup_csv = 'duplicated.csv'


def get_root() -> Path:
    return Path(__file__).resolve().parent


def process(csv_file: Path, out_dir: Path) -> None:
    data = pd.read_csv(csv_file, encoding='latin-1')

    ###
    ### Do existing processing of data DataFrame
    ###

    # Save files. These print statements will show the final
    # file path for each of the output csvs.
    print(out_dir / failed_csv)  # '/users/path/my_file/Failed.csv'
    print(out_dir / passed_csv)  # '/users/path/my_file/Passed.csv'
    print(out_dir / import_csv)  # '/users/path/my_file/Import.csv'
    print(out_dir / dup_csv)     # '/users/path/my_file/duplicated.csv'

    failed.to_csv(out_dir / failed_csv, index=False)
    passed.to_csv(out_dir / passed_csv, index=False)
    newimpomatic.to_csv(out_dir / import_csv, index=False)
    duplicated.to_csv(out_dir / dup_csv, index=False)


def main(base_dir: Path) -> None:
    print(f'Processing files in {base_dir}: \n')
    n_process = 0
    for csv_file in base_dir.glob('*.csv'):
        # ex. csv_file = "/users/path/my_file.csv"
        name: str = csv_file.stem           # name = "my_file"
        output_dir: Path = base_dir / name  # output_dir = "/users/path/my_file"

        print(f'Creating directory "{output_dir}"')
        Path.mkdir(output_dir, exist_ok=True)

        print(f'Processing "{csv_file}"')
        process(csv_file=csv_file, out_dir=output_dir)
        print(f'Completed processing\n')

        n_process += 1
    print(f'\nProcessed {n_process} files')


if __name__ == '__main__':
    root = get_root()  # root = "users/path"
    main(base_dir=root)

Renaming multiple csv files within a folder in Python

I have a folder with 50 .csv files. The .csv files are auto-generated results output from a process-based model, with long, automatically generated names. For example, sandbox_username_vetch_scaleup_IA_1.csv, sandbox_username_vetch_scaleup_IA_2.csv, and so on up to sandbox_username_vetch_scaleup_IA_50.csv.
I am trying to shorten the file names so that they become IA_1, IA_2, ... up to IA_50, and then add the new .csv file name as a column to the data frame. Here is what I have tried so far:
# import necessary libraries
import pandas as pd
import os
import glob
import sys
from pathlib import Path
import re

data_p = "/Users/Username/Documents/HV_Scale/CWAD"
output_p = "/Users/Username/Documents/HV_Scale/CWAD"

retval = os.getcwd()
print(retval)     # see in which folder you are
os.chdir(data_p)  # move to the folder with your data
os.getcwd()

filenames = sorted(glob.glob('*.csv'))
fnames = list(filenames)  # get the names of all your files
#print(fnames)

#Loop over
for f in range(len(fnames)):
    print(f'fname: {fnames[f]}\n')
    pfile = pd.read_csv(fnames[f], delimiter=",")  # read in file
    #extract filename
    filename = fnames[f]
    parts = filename.split(".")    # giving you the number in file name and .csv
    only_id = parts[0].split("_")  # if there is a bracket included
    # get IA from your file
    filestate = pfile["IA"][0]     # assuming this is on the first row
    filestate = str(filestate)
    # get new filename
    newfilename = only_id[0] + "-" + filestate + parts[1]
    # save your file (don't put a slash at the end of your directories on top)
    pfile.to_csv(output_p + "/" + newfilename, index=False, header=True)
Here is the code for adding the csv file name as a column
import glob
import os
import shutil
import sys
import pandas as pd

path = '/Users/Username/Documents/HV_Scale/IA_CWAD/short'
all_files = glob.glob(os.path.join(path, "*.csv"))
names = [os.path.basename(x) for x in glob.glob(path+'\*.csv')]

df = pd.DataFrame()
for file_ in all_files:
    file_df = pd.read_csv(file_, sep=';', parse_dates=[0], infer_datetime_format=True, header=None)
    file_df['file_name'] = file_
    df = df.append(file_df)

#However, this adds the old csv file name and not the renamed one
In order to rename and move these files, all you need is:
import glob
import os
import shutil
import sys

SOURCE = '<Your source directory>'
TARGET = '<Your target directory>'

for file in glob.glob(os.path.join(SOURCE, '*_IA_*.csv')):
    idx = file.index('_IA_')
    filename = file[idx+1:]
    target = os.path.join(TARGET, filename)
    if os.path.exists(target):
        print(f'Target file {target} already exists', file=sys.stderr)
    else:
        shutil.copy(file, target)
As there's nothing in the OP's question that tries to handle modification of the CSV files, that is left as an exercise for the OP.
Source and target directories should be different; otherwise this can lead to ambiguous results.
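To also cover the second part of the question (adding the new, shortened name as a column), a minimal sketch (untested) that rewrites each file with pandas instead of copying it with shutil; SOURCE and TARGET are the same placeholders as in the snippet above:

import glob
import os
import sys

import pandas as pd

SOURCE = '<Your source directory>'
TARGET = '<Your target directory>'

for file in glob.glob(os.path.join(SOURCE, '*_IA_*.csv')):
    idx = file.index('_IA_')
    filename = file[idx + 1:]               # e.g. "IA_1.csv"
    target = os.path.join(TARGET, filename)
    if os.path.exists(target):
        print(f'Target file {target} already exists', file=sys.stderr)
        continue
    df = pd.read_csv(file)
    df['file_name'] = os.path.splitext(filename)[0]  # add the new name, e.g. "IA_1"
    df.to_csv(target, index=False)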

How to rename all files to include the directory name?

I'm trying to use a for loop in the code below to go through a list of files and rename them with the file directory's name.
import re  # add this to your other imports
import os

for files in os.walk("."):
    for f_new in files:
        folder = files.split(os.sep)[-2]
        print(folder)

        name_elements = re.findall(r'(Position)(\d+)', f_new)[0]
        name = name_elements[0] + str(int(name_elements[1]))
        print(name)  # just for demonstration

        dst = folder + '_' + name
        print(dst)
        os.rename('Position014 (RGB rendering) - 1024 x 1024 x 1 x 1 - 3 ch (8 bits).tif', dst)
Use pathlib:
Path.rglob: This is like calling Path.glob() with '**/' added in front of the given relative pattern.
.parent or .parents[0]: .parents is an immutable sequence providing access to the logical ancestors of the path.
If you want different parts of the path, index parents[] differently:
file.parents[0].stem returns 'test1' or 'test2' depending on the file
file.parents[1].stem returns 'photos'
file.parents[2].stem returns 'stack_overflow'
.stem: The final path component, without its suffix.
.suffix: The file extension of the final component.
.rename: Rename this file or directory to the given target.
The following code finds only .tiff files. Use *.* to get all files.
If you only want the first 10 characters of file_name:
file_name = file_name[:10]
from pathlib import Path

# set path to files
p = Path('e:/PythonProjects/stack_overflow/photos/')

# get all files in subdirectories with a tiff extension
files = list(p.rglob('*.tiff'))

# print files example
[WindowsPath('e:/PythonProjects/stack_overflow/photos/test1/test.tiff'), WindowsPath('e:/PythonProjects/stack_overflow/photos/test2/test.tiff')]

# iterate through files
for file in files:
    file_path = file.parent      # get only path
    dir_name = file.parent.stem  # get the directory name
    file_name = file.stem        # get the file name
    suffix = file.suffix         # get the file extension
    file_name_new = f'{dir_name}_{file_name}{suffix}'  # make the new file name
    file.rename(file_path / file_name_new)             # rename the file

# output files renamed
[WindowsPath('e:/PythonProjects/stack_overflow/photos/test1/test1_test.tiff'), WindowsPath('e:/PythonProjects/stack_overflow/photos/test2/test2_test.tiff')]

Placing randomly selected images into new folder

I have a directory that contains a large number of subdirectories.
Within each of these subdirectories are different jpegs and pngs.
I want to:
Select X random images from these subdirectories
Create a new folder and copy these selected random images into it.
Thanks to help received here already, I can print out a random selection of images using os.walk and random.choice.
import os
import random
import shutil

files_list = []
for root, dirs, files in os.walk("/Path/to/Directory"):
    for file in files:
        #all
        if file.endswith(".jpg") or file.endswith(".png") or file.endswith(".jpeg"):
            files_list.append(os.path.join(root, file))

#print images
#lets me count and print the amount of jpeg,jpg,png
file_count = len(files_list)
print file_count
print files_list

print(random.sample(files_list, 2))  #prints two random files from list
However, my issue is with actually selecting random images (not just their names).
I have tried to create a variable imagePath that uses os.walk
#creates a variable imagePath that lets me access all img files in different folders
imagePath = os.walk("/Path/to/Directory")
and a new variable to randomly select a single image from imagePath
#create a variable that lets me choose random image from imagePath
randomImages = random.choice(os.listdir(imagePath))
and then created a new directory and used shutil.copy to move the randomly selected image into this new directory
#creates a new directory
os.mkdir('testDirectory')
#moves the randomly selected image into new directory
shutil.copy(randomImages, testDirectory)
However, I am getting the following error:
Traceback (most recent call last):
  File "crawl.py", line 28, in <module>
    randomImages = random.choice(os.listdir(imagePath))
TypeError: coercing to Unicode: need string or buffer, generator found
I have also tried
for root, dirs, files in os.walk("/Path/to/Directory", topdown=False):
    imagePath = ("/Path/to/Directory")  #creates a variable that lets me access all img files in different folders
    randomImages = random.choice(os.listdir(imagePath))
    print randomImages
But this returns a random selection of subdirectories (not the images within them), along with .DS_Store files.
Here is the code; I guess you want to move the files, not make another copy.
import os
import random
import shutil

files_list = []
for root, dirs, files in os.walk("<SOURCE_DIR>"):
    for file in files:
        #all
        if file.endswith(".jpg") or file.endswith(".png") or file.endswith(".jpeg"):
            files_list.append(os.path.join(root, file))

#print images
#lets me count and print the amount of jpeg,jpg,png
file_count = len(files_list)
print file_count
# print files_list

filesToCopy = random.sample(files_list, 2)  #selects two random files from the list

destPath = "<DESTINATION_DIR>"

# if destination dir does not exist, create it
if os.path.isdir(destPath) == False:
    os.makedirs(destPath)

# iterate over all random files and move them
for file in filesToCopy:
    shutil.move(file, destPath)
You should be able to feed shutil.copy() a source and destination file path. It seems to me that you have a list of files already so you can just copy them.
import os
import random
import shutil

files_list = []
for root, dirs, files in os.walk("/Path/to/Directory"):
    for file in files:
        #all
        if file.endswith(".jpg") or file.endswith(".png") or file.endswith(".jpeg"):
            files_list.append(os.path.join(root, file))

#print images
#lets me count and print the amount of jpeg,jpg,png
file_count = len(files_list)
print file_count
print files_list

selected_files = random.sample(files_list, 2)  #assign to a list

dest_path = "/path/to/new/folder/"
os.mkdir(dest_path)
for src_path in selected_files:
    shutil.copy(src_path, os.path.join(dest_path, os.path.basename(src_path)))

Organizing data by filetype

I am trying to sort a large number of files based on their file extensions. A lot of the files are .doc, .docx, .xls, etc.
This is what I was thinking, but if there is a simpler way to do things, let me know! I do have multiple files with the same extension, so I don't want it to create a new folder for that extension every time and overwrite the previous file. I also have a much larger list, but for this example I don't believe all of them are needed. The OS is macOS.
import os, shutil

extList = ['.doc', '.docx', '.xls']
for ext in extList:
    os.mkdir(path + '/' + ext + '_folder')
    for file in os.listdir(filepath):
        if file.endswith(ext):  #missing an indent
            print(file)
            shutil.copyfile(file + '/' + ext + '_folder' + file)
Also, if I run into a file that I do not have on my list, I would like it to go into a folder named 'noextlist'.
Here is what I was able to create quickly
import os, re, shutil

DocFolder = r'...'   #Your doc folder path
DocxFolder = r'...'  #Your docx folder path
XlsFolder = r'...'   #Your xls folder path
MiscFolder = r'...'  #Your misc folder path

for root, dirs, files in os.walk(r'...'):  #Your folder path you want to sort
    for file in files:
        if file.endswith(".doc"):
            sourceFolder = os.path.join(root, file)
            print sourceFolder
            shutil.copy2(sourceFolder, DocFolder)
        elif file.endswith(".docx"):
            sourceFolder = os.path.join(root, file)
            print sourceFolder
            shutil.copy2(sourceFolder, DocxFolder)
        elif file.endswith(".xls"):
            sourceFolder = os.path.join(root, file)
            print sourceFolder
            shutil.copy2(sourceFolder, XlsFolder)
        else:
            sourceFolder = os.path.join(root, file)
            print sourceFolder
            shutil.copy2(sourceFolder, MiscFolder)
Edit: The main feature here is the for root, dirs, files in os.walk loop. This allows the program to traverse the provided path, searching all files, including the ones in subfolders, and sort them accordingly.
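For reference, a minimal sketch of what os.walk yields (the directory layout here is hypothetical):

import os

# Suppose the tree looks like:
#   docs/
#       report.doc
#       sub/
#           sheet.xls
for root, dirs, files in os.walk("docs"):
    print(root, dirs, files)

# Expected output (roughly):
#   docs ['sub'] ['report.doc']
#   docs/sub [] ['sheet.xls']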
import errno
import shutil
from os import listdir, mkdir
from os.path import splitext, join

# set for fast lookup
extList = set(['.doc', '.docx', '.xls'])
# source path
filepath = ...
# dest path
path = ...

for f in listdir(filepath):
    # extract extension from file name
    ext = splitext(f)[1]
    if ext in extList:
        dir_ = join(path, "{}_folder".format(ext))
        try:
            mkdir(dir_)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise  # raise if any other error than "already exists"
        dest = join(dir_, f)
    else:
        dest = join(path, "noextlist_folder", f)
    shutil.copy2(join(filepath, f), dest)
If I understand correctly, you like your solution but you need a way to rename files with duplicate names so that the extras don't disappear. You can check if the destination file already exists and construct a variant name by adding _1, _2, etc. to the filename until you find something unused.
newpathname = path + '/' + ext + '_folder' + "/" + file
n = 0
while os.path.exists(newpathname):
    n += 1
    base, ext = os.path.splitext(newpathname)
    newpathname = "%s_%d%s" % (base, n, ext)
shutil.copyfile(filepath + "/" + file, newpathname)
But your code has some other glitches, so here's a rewritten scanner. It uses os.walk() to descend into several levels of subdirectories (you don't say if that's needed or not), and it collects files of all extensions in one pass. And it constructs variant names as before.
import os, shutil
from os.path import join as joinpath

extList = ['.doc', '.docx', '.xls']

# Make sure the destination directories exist
for ext in extList:
    extdir = joinpath(path, ext[1:] + "_folder")
    if not os.path.exists(extdir):
        os.mkdir(extdir)

for dirname, _dirs, files in os.walk(filepath):
    for file in files:
        base, ext = os.path.splitext(file)
        if ext not in extList:
            continue
        destpath = joinpath(path, ext[1:] + "_folder")
        n = 0
        newpathname = joinpath(destpath, file)
        # If the new name is in use, find an unused variant
        while os.path.exists(newpathname):
            n += 1
            newfile = "%s_%d%s" % (base, n, ext)
            newpathname = joinpath(destpath, newfile)
        shutil.copy(joinpath(dirname, file), newpathname)  # or other copy method

Categories

Resources