Copying random files from a file tree - python

I have the same problem as described here, but now I'm trying to solve it with Python because it's better suited to the task.
I've started with this:
import os
import shutil
import random
import glob
# Tree to sample from and the folder that should receive the copies.
root_dir = '/home/leonardo/Desktop/python_script/rfe'
output_dir = '/home/leonardo/Desktop/python_script/output_folder'
# Folders with more than `ref` entries are sampled instead of copied whole.
ref = 200
folders_root_dir = os.listdir(root_dir)
print(folders_root_dir)
count = len(folders_root_dir)
print(count)
for folder_name in folders_root_dir:
    folder_inside = os.path.join(root_dir, folder_name)
    print(folder_inside)
    number_files_folder_inside = len(os.listdir(folder_inside))
    print(number_files_folder_inside)
    if number_files_folder_inside > ref:
        # round() can return a float; int() makes the result usable as a count.
        ref_copy = int(round(0.2 * number_files_folder_inside))
        print(ref_copy)
        # here I have to copy 20% of the files in this folder to the output folder
    else:
        # here I have to copy all files from the folder to the output_dir
        pass
I tried to use os.walk() but I'm new to python and selecting files while the function is working proved to be really tough.

You'll need to import these:
import os
import shutil
import random
You can get all the files in a directory like this:
# Keep only the directory entries that are regular files (sub-directories are dropped).
files = [entry for entry in os.listdir(dir) if os.path.isfile(os.path.join(dir, entry))]
Then use a conditional:
# `files`, `dir` (source directory) and `dst` (output directory) are expected
# to be defined by the surrounding code.
if len(files) < 200:
    # Small folder: take everything.
    selected = files
else:
    # Amount of random files you'd like to select
    random_amount = 1000
    # random.sample never returns the same file twice; the request is capped at
    # the number of available files so it cannot raise ValueError.
    selected = random.sample(files, min(random_amount, len(files)))
for name in selected:
    # shutil.copy accepts a directory as destination and keeps the file name,
    # whereas shutil.copyfile needs the complete destination file path.
    shutil.copy(os.path.join(dir, name), dst)

A more compact solution (also noticing that copyfile does not really do the job properly unless one specifies the target file name as well):
import os
import shutil
import random
def get_file_list(input_dir):
    """Return the names of the regular files located directly in ``input_dir``.

    Entries that are not files (for example sub-directories) are left out.
    Only bare names are returned, in the order given by ``os.listdir``.
    """
    regular_files = []
    for entry in os.listdir(input_dir):
        if os.path.isfile(os.path.join(input_dir, entry)):
            regular_files.append(entry)
    return regular_files
def get_random_files(file_list, N):
    """Return up to ``N`` distinct entries chosen at random from ``file_list``.

    Args:
        file_list: Sequence of file names to choose from.
        N: Number of entries wanted.

    Returns:
        A new list with ``min(N, len(file_list))`` entries, without repeats.

    Raises:
        ValueError: If ``N`` is negative (raised by ``random.sample``).
    """
    # random.sample raises ValueError when asked for more items than exist,
    # so the request is capped at the size of the list.
    return random.sample(file_list, min(N, len(file_list)))
def copy_files(random_files, input_dir, output_dir):
    """Copy every file named in ``random_files`` from ``input_dir`` to ``output_dir``.

    ``shutil.copy`` is given the output directory itself, so each copy keeps
    its original file name.
    """
    for name in random_files:
        source_path = os.path.join(input_dir, name)
        shutil.copy(source_path, output_dir)
def main(input_dir, output_dir, N):
    """Copy ``N`` randomly selected files from ``input_dir`` into ``output_dir``.

    Lists the files with ``get_file_list``, picks the sample with
    ``get_random_files`` and copies it with ``copy_files``.
    """
    chosen = get_random_files(get_file_list(input_dir), N)
    copy_files(chosen, input_dir, output_dir)

import os
import shutil
import random
root_dir = '/home/leonardo/Desktop/python_script/qar'
output_dir = '/home/leonardo/Desktop/python_script/output_folder'
# Folders with more than `ref` files are sampled; smaller ones are copied whole.
ref = 1
for root, dirs, files in os.walk(root_dir):
    # os.walk already separates files from sub-directories, so counting
    # `files` avoids treating sub-directories as files.
    number_of_files = len(files)
    if number_of_files > ref:
        ref_copy = int(round(0.2 * number_of_files))
        # random.sample picks without replacement: exactly `ref_copy` distinct
        # files are selected, none of them twice.
        to_copy = random.sample(files, ref_copy)
    else:
        to_copy = files
    for file_name in to_copy:
        file_to_copy = os.path.join(root, file_name)
        # Guards against entries that vanished or are broken links.
        if os.path.isfile(file_to_copy):
            shutil.copy(file_to_copy, output_dir)
            print(file_to_copy)
print('Finished !')
This is what the final code looks like.
thank you guys for the help !
cheers !

I needed this to split my dataset into train, test and validation sets.
Here is my code:
import os
import shutil
import random
import numpy as np
# Source folder and the three folders that receive the split.
dir = r'E:\down\imgs'
train_dir = r'E:/train_test_split/train'
test_dir = r'E:/train_test_split/test'
valid_dir = r'E:/train_test_split/validation'
files = [name for name in os.listdir(dir) if os.path.isfile(os.path.join(dir, name))]
total = len(files)
# 50% train and 30% test; validation receives whatever is left, so the three
# parts always add up to len(files).
train_count = int(round(50 / 100.0 * total))
test_count = int(round(30 / 100.0 * total))
valid_count = total - train_count - test_count
# A random permutation of all indices: slicing it yields disjoint random subsets.
rndnums = random.sample(range(total), total)
print("len(files)", len(files))
print(rndnums)
# Slice boundaries follow the counts exactly (no extra "+1", which would hand
# train one file too many and shrink the validation set).
train_file_index = rndnums[:train_count]
train_file_name = [files[i] for i in train_file_index]
test_file_index = rndnums[train_count:train_count + test_count]
test_file_name = [files[i] for i in test_file_index]
valid_file_index = rndnums[train_count + test_count:]
valid_file_name = [files[i] for i in valid_file_index]
# copyfile needs the full destination path, hence the join with the file name.
for target_dir, names in (
    (train_dir, train_file_name),
    (test_dir, test_file_name),
    (valid_dir, valid_file_name),
):
    for name in names:
        shutil.copyfile(os.path.join(dir, name), os.path.join(target_dir, name))

maybe something like (untested)
import os
import shutil

# Maximum number of files taken from each directory.
THRESHOLD = 200
# Placeholder paths; forward slashes avoid the invalid "\h" escape sequence.
root_dir = "/home..."
output_dir = "/home....."
for top, dirs, nondirs in os.walk(root_dir):
    # Only the first THRESHOLD files of each directory are handled.
    for name in nondirs[:THRESHOLD]:
        path = os.path.join(top, name)
        destination = os.path.join(output_dir, name)
        # Copy instead of os.rename: rename would move the file out of the
        # source tree.
        shutil.copy(path, destination)

import random
import shutil
import os
rootdir = '/home/leonardo/Desktop/python_script/qar'
outdir = '/home/leonardo/Desktop/python_script/output_folder'
# Folders with more than `ref` files are sampled; the others are copied whole.
ref = 200
dirsAndFiles = {}  # here we store a structure {folder: [file1, file2], folder2: [file2, file4] }
dirs = [x[0] for x in os.walk(rootdir)]  # here we store all sub-dirs
for folder in dirs:
    dirsAndFiles[folder] = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]
# .items() works on both Python 2 and Python 3 (iteritems is Python 2 only).
for folder, files in dirsAndFiles.items():
    if len(files) > ref:
        # copy 20% of files; random.sample selects distinct entries in one
        # call instead of repeated choice() + remove().
        selected = random.sample(files, int(0.2 * len(files)))
    else:
        # copy all files
        selected = files
    for name in selected:
        shutil.copy(os.path.join(folder, name), outdir)

Related

Change filename for multiple files

I want to change filename for all my files in a folder. They all end with a date and time like "filename 2019-05-20 1357" and I want the date first for all files. How can I do that simplest way?
#!/usr/bin/python3
import shutil, os, re
# Group 1: everything before the trailing stamp; group 2: "YYYY-MM-DD HHMM".
stamp_at_end = re.compile(r"^(.*) (\d{4}-\d{2}-\d{2} \d{4})$")
for entry in os.listdir():
    found = stamp_at_end.match(entry)
    if found is None:
        # Names that do not end with a date/time stamp are left alone.
        continue
    stem, stamp = found.groups()
    # Put the stamp in front of the rest of the name.
    shutil.move(entry, "{} {}".format(stamp, stem))
Quick and roughly tested version
Here is my Implementation:
from datetime import datetime
import os
path = '/Users/name/desktop/directory'
for entry in os.listdir(path):
    # The current date and time, formatted as "DD-MM-YYYY HHMM", is put
    # directly in front of the existing file name.
    prefix = datetime.now().strftime("%d-%m-%Y %H%M")
    old_path = os.path.join(path, entry)
    new_path = os.path.join(path, prefix + str(entry))
    os.rename(old_path, new_path)
Output Format:
20-05-2019 1749filename.ext
import os
import re
import shutil
dir_path = ''  # give the dir name
# Matches a date written as YYYY-MM-DD.
comp = re.compile(r'\d{4}-\d{2}-\d{2}')
for file_name in os.listdir(dir_path):
    # splitext keeps the leading dot in `ext` and returns '' when there is no
    # extension, so no stray "." is appended to extension-less names.
    name, ext = os.path.splitext(file_name)
    data = comp.findall(name)
    if data:
        date = data[0]
        # Remove the date(s) from the name and join what remains.
        rest_name = ' '.join(comp.split(name)).strip()
        new_name = '{} {}{}'.format(date, rest_name, ext)
        print('changing {} to {}'.format(file_name, new_name))
        # The source must be the complete file name, extension included;
        # the stem alone does not exist on disk.
        shutil.move(os.path.join(dir_path, file_name), os.path.join(dir_path, new_name))
    else:
        print('file {} is not change'.format(file_name))

Python: How to get the full path of a file in order to move it?

I had files that were in zip archives. I unzipped them with 7-Zip, so they are now in folders named after the zip files.
Each of these folders has either a .otf or .ttf (some have both) that I want out of them and moved to another folder.
I have tried a few methods of getting the full path of the files but every one of them leaves out the folder that the file is actually in.
Here is my latest try:
import os
import shutil
from pathlib import Path
result = []
for root, dirs, files in os.walk("."):
    for f in files:
        # Only font files are of interest.
        if not f.endswith((".otf", ".ttf")):
            continue
        print(f)
        # NOTE: Path(f) is built from the bare file name, so absolute()
        # resolves it against the current working directory, not `root`.
        p = Path(f).absolute()
        parent_dir = p.parents[1]
        p.rename(parent_dir / p.name)
Other attempts:
# parent_dir = Path(f).parents[1]
# shutil.move(f, parent_dir)
#print("OTF: " + f)
# fn = f
# f = f[:-4]
# f += '\\'
# f += fn
# result.append(os.path.realpath(f))
#os.path.relpath(os.path.join(root, f), "."))
I know this is something simple but I just can't figure it out. Thanks!
You should join the file name with the path name root:
# Suffixes of the font files that should be moved.
font_suffixes = (".otf", ".ttf")
for root, dirs, files in os.walk("."):
    for f in files:
        if f.endswith(font_suffixes):
            # Joining with `root` keeps the folder the file really lives in.
            p = Path(os.path.join(root, f)).absolute()
            # parents[1] is the directory that contains the file's own folder.
            parent_dir = p.parents[1]
            p.rename(parent_dir / p.name)
for root, dirs, files in os.walk("."):
    for f in files:
        # Join with `root` first: abspath() of the bare name would resolve it
        # against the current directory and lose the containing folder.
        print(os.path.abspath(os.path.join(root, f)))
You can use os.path.abspath() to get the full path of a file.
You would also need to still filter for the certain file types.

How would I exclude directories from os.listdir results?

I'm making a script that will encode files within a directory using b64/b16 and I'm using os.listdir to do so, but it also lists directories which causes problems since now it's trying to encode directories as if it were a file.
How would I be able to exclude directories from os.listdir results?
import os
import sys
import base64
import codecs
import time
import string
import glob
#C:\\Users\\Fedora\\Desktop\\Win 10
path = "C:\\Users\\Fedora\\Desktop\\Win 10"
# os.listdir returns every entry of the folder, directories included.
dirs = os.listdir(path)
files = list(dirs)
filecount = len(files)
fileprogress = 0
# The data goes through these encoders in order.
encoding_passes = (
    base64.b16encode,
    base64.b64encode,
    base64.b16encode,
    base64.b64encode,
    base64.b16encode,
)
for fileprogress, x in enumerate(files, 1):
    os.system("cls")
    print("File " + str(fileprogress) + "/" + str(filecount))
    print("Encrypting " + x + "...")
    source_path = path + "\\" + x
    with open(source_path, "rb") as inputfile:
        data = inputfile.read()
    for encode in encoding_passes:
        data = encode(data)
    # The result is written next to the input with a ".crypt" suffix.
    with open(source_path + ".crypt", "wb") as outputfile:
        outputfile.write(data)
use filter
filepath = "C:\\Users\\Fedora\\Desktop\\Win 10"
# List the same directory that the isfile() check below joins against.
dirs = os.listdir(filepath)
# list() gives a real list on Python 3 too, where filter() is lazy, so the
# result can be measured with len() and iterated more than once.
files = list(filter(lambda x: os.path.isfile(os.path.join(filepath, x)), dirs))
or list comprehension with os.path.isfile()
filepath = "C:\\Users\\Fedora\\Desktop\\Win 10"
# List the same directory that the isfile() check below joins against.
dirs = os.listdir(filepath)
# Keep only the entries that are regular files.
files = [x for x in dirs if os.path.isfile(os.path.join(filepath, x))]
You can use os.path.isdir function to check if the current file is a directory.
Also, it is much better to use string formatting operations instead of string concatenation: not
print("File " + str(fileprogress) + "/" + str(filecount))
but
print("File {}/{}".format(fileprogress, filecount))
Such code is much easier to understand and modify.
Instead of using os.listdir() you can use os.walk(), which returns separate lists for files and directories.
python-oswalk-example
import os
path = "C:\\Users\\Fedora\\Desktop\\Win 10"
# A separate loop variable keeps `path` pointing at the starting folder.
for (current_dir, dirs, files) in os.walk(path):
    # print() with a single argument works on both Python 2 and Python 3.
    print(current_dir)
    print(dirs)
    print(files)
pythoncentral os-walk
#Import the os module, for the os.walk function
import os
#Set the directory you want to start from
path = "C:\\Users\\Fedora\\Desktop\\Win 10"
# Each step of os.walk yields one directory together with its sub-directories
# and its files.
for current_dir, sub_dirs, file_names in os.walk(path):
    print('Found directory: %s' % current_dir)
    # The files of the directory are listed below it, indented with a tab.
    for file_name in file_names:
        print('\t%s' % file_name)

Zipping file with shutil module

I use the code below to move files into their specific folders, but I don't know how to zip those folders at the end.
Note: I want to use the shutil module to zip the files.
import shutil
import os
source = "/tmp/"
destination1 = "/tmp/music/"
destination2 = "/tmp/picture/"
destination3 = "/tmp/video/"
# Create the target folders that do not exist yet.
for folder in (destination1, destination2, destination3):
    if not os.path.exists(folder):
        os.makedirs(folder)
# Exact (case-sensitive) suffixes handled for each category.
music_suffixes = (".MP3", ".wma", ".WMA", ".mp3")
picture_suffixes = (".png", ".PNG", ".jpg", ".JPG", ".GIF", ".gif")
video_suffixes = (".MP4", ".mp4", ".WMV", ".FLV", ".flv", ".wmv")
for f in os.listdir(source):
    if f.endswith(music_suffixes):
        shutil.move(source + f, destination1)
    if f.endswith(picture_suffixes):
        shutil.move(source + f, destination2)
    if f.endswith(video_suffixes):
        shutil.move(source + f, destination3)
#now zipping:
shutil.make_archive("archive", 'zip', "/tmp/", "music" + "video" + "picture")
"music"+"video"+"picture"
gives you
'musicvideopicture'
The simplest way is to create the directory /tmp/archive/ and put music, video and picture inside it,
and then
shutil.make_archive("archive",'zip',"/tmp/archive")
Edit:
consider using gztar :)
Edit2:
import shutil
import os
source = "/tmp/"
dest_base = "/tmp/archive/"
destination1 = dest_base + "music/"
destination2 = dest_base + "picture/"
destination3 = dest_base + "video/"
# Lower-case extensions (without the dot) belonging to each category.
audio_ext = ('mp3', 'wma')
pictu_ext = ('png', 'jpg', 'gif')
video_ext = ('mp4', 'wmv', 'flv', 'avi')
for folder in (destination1, destination2, destination3):
    if not os.path.isdir(folder):
        os.makedirs(folder)
for f in os.listdir(source):
    source_path = source + f
    # Only regular files are sorted; directories stay in place.
    if not os.path.isfile(source_path):
        continue
    # splitext yields '' for names without an extension, so a file that is
    # literally called "mp3" is not mistaken for an audio file.
    ext = os.path.splitext(f)[1][1:].lower()
    if ext in audio_ext:
        shutil.move(source_path, destination1)
    elif ext in pictu_ext:
        shutil.move(source_path, destination2)
    elif ext in video_ext:
        shutil.move(source_path, destination3)
#now zipping:
shutil.make_archive("archive", 'gztar', "/tmp/archive")

Flatten complex directory structure in Python

I want to move files from a complex directory structure to just one place. For example i have this deep hierarchy:
foo/
foo2/
1.jpg
2.jpg
...
I want it to be:
1.jpg
2.jpg
...
My current solution:
def move(destination):
    """Move the paths found by ``glob_recursive`` into ``destination``.

    Only paths that contain ``os.path.join(destination, '\\')`` are moved.
    The search runs inside the ``directory(destination)`` context.
    """
    marker = os.path.join(destination, '\\')
    with directory(destination):
        candidates = glob_recursive(path='.')
        # Same test as ``x.find(marker) > -1``.
        files_to_move = [found for found in candidates if marker in found]
        for found in files_to_move:
            shutil.move(found, destination)
Definitions: directory and glob_recursive. Note, that my code only moves files to their common parent directory, not an arbitrary destination.
How can i move all files from a complex hierarchy to a single place succinctly and elegantly?
I don't like testing the name of the file about to be moved to see if we're already in the destination directory. Instead, this solution only scans the subdirectories of the destination
import os
import itertools
import shutil
def move(destination):
    """Move every file below the sub-directories of ``destination`` into it.

    Files that already sit directly in ``destination`` are not touched, and
    the emptied sub-directories are left in place.
    """
    # Dropping the first os.walk() result skips ``destination`` itself.
    nested_levels = itertools.islice(os.walk(destination), 1, None)
    all_files = []
    for root, _dirs, files in nested_levels:
        all_files.extend(os.path.join(root, filename) for filename in files)
    # The moves happen after the walk so the tree does not change under it.
    for path in all_files:
        shutil.move(path, destination)
Explanation: os.walk walks recursively the destination in a "top down" manner. whole filenames are constructed with the os.path.join(root, filename) call. Now, to prevent scanning files at the top of the destination, we just need to ignore the first element of the iteration of os.walk. To do that I use islice(iterator, 1, None). One other more explicit way would be to do this:
def move(destination):
    """Move every file below the sub-directories of ``destination`` into it.

    Files that already sit directly in ``destination`` are not touched, and
    the emptied sub-directories are left in place.
    """
    walker = os.walk(destination)
    # Consume the first result, which describes ``destination`` itself.
    next(walker, None)
    all_files = []
    for root, _dirs, files in walker:
        all_files += [os.path.join(root, filename) for filename in files]
    for path in all_files:
        shutil.move(path, destination)
this would do, it also renames files if they collide (I commented out the actual move and replaced with a copy):
import os
import sys
import string
import shutil
#Generate the file paths to traverse, or a single path if a file name was given
def getfiles(path):
    """Yield the path of every file below ``path``.

    When ``path`` is not a directory it is yielded unchanged as the only item.
    """
    if not os.path.isdir(path):
        yield path
        return
    for root, dirs, files in os.walk(path):
        for name in files:
            yield os.path.join(root, name)
destination = "./newdir/"
fromdir = "./test/"
# shutil.copy fails when the target folder is missing, so create it first.
if not os.path.isdir(destination):
    os.makedirs(destination)
for f in getfiles(fromdir):
    # os.path.basename replaces string.split(f, '/')[-1]; the function
    # string.split no longer exists on Python 3.
    filename = os.path.basename(f)
    if os.path.isfile(destination + filename):
        # Name collision: build a unique name from the path relative to
        # `fromdir`, with the separators turned into underscores.
        filename = os.path.relpath(f, fromdir).replace(os.sep, "_")
    #os.rename(f, destination+filename)
    shutil.copy(f, destination + filename)
Run recursively through directory, move the files and launch move for directories:
import shutil
import os
def move(destination, depth=None):
    """Recursively move all files below ``destination`` into ``destination``.

    Args:
        destination: Directory that receives the files.
        depth: List of directory names leading from ``destination`` to the
            folder currently being processed; ``None`` or empty means
            ``destination`` itself. Used by the recursive calls.

    Raises:
        shutil.Error: If a file with the same name already exists in
            ``destination`` (raised by ``shutil.move``).
    """
    if not depth:
        depth = []
    # os.path.join takes the components as separate arguments, not as a list.
    current = os.path.join(destination, *depth)
    for file_or_dir in os.listdir(current):
        # Bare names from os.listdir must be joined with their folder before
        # they can be tested or moved.
        full_path = os.path.join(current, file_or_dir)
        if os.path.isfile(full_path):
            # Files at the top level are already where they belong.
            if depth:
                shutil.move(full_path, destination)
        else:
            move(destination, depth + [file_or_dir])
import os.path, shutil
def move(src, dest):
    """Move everything ``glob_recursive`` finds under ``src`` into ``dest``.

    Paths whose containing directory already is ``dest`` are skipped.
    Assumes ``glob_recursive`` yields file paths — TODO confirm against its
    definition.
    """
    def not_in_dest(path):
        # Compare the directory that holds the file with dest; the original
        # predicate compared the file itself and lacked the negation.
        parent = os.path.dirname(path) or os.curdir
        return not os.path.samefile(parent, dest)

    files_to_move = [path for path in glob_recursive(path=src) if not_in_dest(path)]
    for f in files_to_move:
        shutil.move(f, dest)
Source for glob_recursive. Does not change name of file, if they collide.
samefile is a safe way to compare paths. But it doesn't work on Windows, so check How to emulate os.path.samefile behaviour on Windows and Python 2.7?.
def splitPath(p):
    """Split path ``p`` into a list of its components.

    The path is peeled from the right with ``os.path.split`` until either the
    head or the tail comes back empty; the last tail seen becomes the first
    list element.
    """
    parts = []
    while True:
        head, tail = os.path.split(p)
        parts.append(tail)
        if not head or not tail:
            break
        p = head
    parts.reverse()
    return parts
def safeprint(s):
try:
print(s)
except UnicodeEncodeError:
if sys.version_info >= (3,):
print(s.encode('utf8').decode(sys.stdout.encoding))
else:
print(s.encode('utf8'))
def flatten(root, doit):
    """Move every file below ``root`` into ``root`` and remove the sub-folders.

    Args:
        root: Top folder that receives the files.
        doit: When false nothing is changed on disk; the messages and counts
            are still produced as a simulation.
    """
    SEP = "¦"
    REPL = "?"
    moved_files = 0
    removed_folders = 0
    if not doit:
        print("Simulating:")
    # Bottom-up walk: a folder is handled after all of its sub-folders.
    for path, dirs, files in os.walk(root, topdown=False):
        if path == root:
            continue
        # Name prefix made of the folder's path components (the first one is
        # skipped), each followed by SEP; it is only used for the message.
        prefix = "".join(part.replace(SEP, REPL) + SEP for part in splitPath(path)[1:])
        for f in files:
            new_name = prefix + f.replace(SEP, REPL)
            safeprint("Moved: " + new_name)
            if doit:
                shutil.move(os.path.join(path, f), os.path.join(root, f))
                # Uncomment, if you want filenames to be based on folder hierarchy.
                #shutil.move(os.path.join(path, f), os.path.join(root, new_name))
            moved_files += 1
        safeprint("Removed: " + path)
        if doit:
            os.rmdir(path)
        removed_folders += 1
    if doit:
        print("Done.")
    else:
        print("Simulation complete.")
    print("Moved files:", moved_files)
    print("Removed folders:", removed_folders)
# Flatten this folder for real: with doit=True, flatten() moves the files and
# removes the sub-folders instead of only printing what it would do.
directory_path = r"C:\Users\jd\Documents\myFtpData"
flatten(directory_path, True)
Adding on to the answers, I believe my answer will satisfy all your needs, the other answers fail when there is a subdirectory and file with the same filename as the upper directory.
This was SOLVED here, Also look at my Github Repo for Structured File Copy and Flattened File Copy:
import os, fnmatch, shutil
# Settings for the copy; the output folder is created right away.
PATTERN = '*.txt' # fnmatch (shell-style wildcard) pattern for the file names, not a regex
INPUT_FOLDER = "A" # os.getcwd()
INPUT_FOLDER = os.path.abspath(INPUT_FOLDER)
# When True, paths in the output are taken relative to the parent of
# INPUT_FOLDER, so the input folder's own name appears in the output tree.
include_input_foldername = False
# Suffix added to the output folder name when the input folder name is included.
prepend = "_included" if include_input_foldername else ""
OUTPUT_FOLDER = f"Structured_Copy_{os.path.basename(INPUT_FOLDER)}{prepend}"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
def find(pattern, path):
    """Return the paths of all files under ``path`` whose name matches ``pattern``.

    ``pattern`` is matched with ``fnmatch.fnmatch`` against the bare file
    name; the returned paths are joined with the directory they were found in.
    """
    return [
        os.path.join(root, name)
        for root, dirs, files in os.walk(path)
        for name in files
        if fnmatch.fnmatch(name, pattern)
    ]
all_files = find(PATTERN, INPUT_FOLDER)
# Folder the copied paths are made relative to: the parent of INPUT_FOLDER when
# the input folder's own name should show up in the output, otherwise
# INPUT_FOLDER itself.
if include_input_foldername:
    relative_base = os.path.dirname(INPUT_FOLDER)
else:
    relative_base = INPUT_FOLDER
for each_path in all_files:
    relative_path = os.path.relpath(each_path, relative_base)
    flattened_relative_fullpath = os.path.join(OUTPUT_FOLDER, relative_path)
    # Recreate the sub-folder of the file below OUTPUT_FOLDER before copying.
    target_folder = os.path.dirname(flattened_relative_fullpath)
    os.makedirs(target_folder, exist_ok=True)
    shutil.copy(each_path, flattened_relative_fullpath)
    print(f"Copied {each_path} to {flattened_relative_fullpath}")
print(f"Finished Copying {len(all_files)} Files from : {INPUT_FOLDER} to : {OUTPUT_FOLDER}")

Categories

Resources