Update:
Basically, I want to divide a large folder into 2 groups.
The large folder is made up of 120 subfolders. An example subfolder name would be n02085620-Chihuahua. n02085620-Chihuahua has 152 .jpg files. An example file in it would be n02085620_7.jpg. You see n02085620 is the same.
Names of files that should go to group #1 are given in sample_submission.csv. Other files should invariably go to group #2. Each file should be inside its respective mother folder.
Initial post:
I want to copy 8580 image (.jpg) files into 120 folders.
I'm using shutil to achieve that.
My main folder is
source = '/Users/turuud/Desktop/TUT/Dscience_exp/Images/'
while my destination folder looks like this:
test_path = '/Users/turuud/Desktop/TUT/Dscience_exp/test/'
The destination folder should include 120 sub-folders. I'm using os.path.join to achieve that.
The problem is that I'm not getting 120 directories with embedded images belonging to them. Instead, I'm getting 120 binary files that can't be used the way they are.
Below is the entire code:
import os
import csv
import shutil
# Root folder whose sub-folders hold the image files.
source = '/Users/turuud/Desktop/TUT/Dscience_exp/Images/'
# CSV file; the first column of each row is compared against image file names.
testcsv = '/Users/turuud/Desktop/TUT/Dscience_exp/sample_submission.csv'
# Destination root under which the per-folder copies are expected to appear.
test_path = '/Users/turuud/Desktop/TUT/Dscience_exp/test/'
with open(testcsv, 'r') as file:
    csvreader = csv.reader(file)
    next(csvreader)  # discard the first row (presumably a header)
    for row in csvreader:
        # For every CSV row, every sub-folder of `source` is scanned again.
        for directory in os.listdir(source):
            if os.path.isdir(os.path.join(source, directory)):
                for img_file in os.listdir(os.path.join(source, directory)):
                    if str(row[0]) == str(img_file):
                        print(f"{row[0]} and {img_file} are exactly same")
                        # NOTE: nothing in this script creates test_path/<directory>.
                        # When the destination is not an existing directory,
                        # shutil.copy treats it as the target *file* name, so a
                        # regular file called <directory> is written instead.
                        shutil.copy(os.path.join(source, directory, img_file), os.path.join(test_path, directory))
In the folder "test", I want to have directories that include their respective files. No binary files.
This is the result when last line of the code:
shutil.copy(os.path.join(source, directory, img_file), os.path.join(test_path, directory))
is changed to
shutil.copy(os.path.join(source, directory, img_file), test_path)
click here
I wanted them embedded in their mother folders.
I'd start by simplifying the task. Let's say we have the following file structure:
│ rules.csv
│
└───source
├───1
│ 1.jpg
│ 2.jpg
│
├───2
│ 3.jpg
│ 4.jpg
│
├───3
│ 5.jpg
│ 6.jpg
│
├───4
│ 7.jpg
│ 8.jpg
│
└───5
10.jpg
9.jpg
And rules.csv has the following content:
1.jpg
2.jpg
5.jpg
9.jpg
Our Python script should create a folder named result containing two subfolders, group_1 and group_2, with the following structure:
└───result
├───group_1
│ ├───1
│ │ 1.jpg
│ │ 2.jpg
│ │
│ ├───3
│ │ 5.jpg
│ │
│ └───5
│ 9.jpg
│
└───group_2
├───2
│ 3.jpg
│ 4.jpg
│
├───3
│ 6.jpg
│
├───4
│ 7.jpg
│ 8.jpg
│
└───5
10.jpg
To do this, we first need to load the CSV and save the first column of every row (which is the file name); I'll use a set as the container. Then we iterate over the child folders of the source folder and copy the files whose names exist in the CSV to the group_1 folder and all the others to the group_2 folder.
import csv
from pathlib import Path
from shutil import copy2
# Folder whose immediate sub-folders contain the files to split into groups.
source_dir = Path(r"C:\Users\Ori\Pictures")
# CSV whose first column lists the file names that belong to group 1.
rules_path = r"C:\Users\Ori\Documents\rules.csv"
# Output root; the original sub-folder layout is recreated below each group.
result_dir = Path(r"C:\Users\Ori\Pictures\sorted")
group_1_dir = result_dir / "group_1"
group_2_dir = result_dir / "group_2"

# The csv module expects files opened with newline="".
with open(rules_path, newline="") as f:
    reader = csv.reader(f)
    # next(reader)  # uncomment when the CSV starts with a header row
    # Blank lines produce empty rows; skip them instead of failing on row[0].
    group_1_files = {row[0] for row in reader if row}

for path in source_dir.iterdir():
    if not path.is_dir():
        continue
    for file in path.iterdir():
        if not file.is_file():
            continue
        # Files named in the CSV go to group 1, everything else to group 2.
        group_dir = group_1_dir if file.name in group_1_files else group_2_dir
        # relative_to() keeps the "<sub-folder>/<file>" part of the path.
        destination = group_dir / file.relative_to(source_dir)
        destination.parent.mkdir(parents=True, exist_ok=True)
        copy2(file, destination)
Alternatively, you can use .glob() to find all the .jpg files; this is more flexible, because the pattern can be adapted if the files are nested more than two levels deep (for example `**/*.jpg`):
import csv
from pathlib import Path
from shutil import copy2
# Folder whose immediate sub-folders contain the .jpg files to split into groups.
source_dir = Path(r"C:\Users\Ori\Pictures")
# CSV whose first column lists the file names that belong to group 1.
rules_path = r"C:\Users\Ori\Documents\rules.csv"
# Output root; the original sub-folder layout is recreated below each group.
result_dir = Path(r"C:\Users\Ori\Pictures\sorted")
group_1_dir = result_dir / "group_1"
group_2_dir = result_dir / "group_2"

# The csv module expects files opened with newline="".
with open(rules_path, newline="") as f:
    reader = csv.reader(f)
    # next(reader)  # uncomment when the CSV starts with a header row
    # Blank lines produce empty rows; skip them instead of failing on row[0].
    group_1_files = {row[0] for row in reader if row}

# "*/*.jpg" matches .jpg entries exactly one folder below source_dir. A forward
# slash is understood by pathlib on every platform, unlike a backslash.
for jpg in source_dir.glob("*/*.jpg"):
    if not jpg.is_file():
        continue
    group_dir = group_1_dir if jpg.name in group_1_files else group_2_dir
    destination = group_dir / jpg.relative_to(source_dir)
    destination.parent.mkdir(parents=True, exist_ok=True)
    # Copy the matched path itself (the loop variable is `jpg`).
    copy2(jpg, destination)
Related
I was creating a simple python script that iterates through files in a directory and changes the names of the files based on a num variable that increases by one after each iteration.
Here is the code used, with the directory and files involved.
import os
# Folder whose files should be renamed to JAN-1.txt, JAN-2.txt, ...
directory = 'JAN'
num = 1
for filename in os.listdir(directory):
    # os.listdir returns bare names; `f` is the path including the folder.
    f = os.path.join(directory, filename)
    new_f = f'JAN-{num}.txt'.format(num=num)
    # NOTE: this tests the bare name, which is resolved against the current
    # working directory rather than `directory`; `f` is not used here.
    if os.path.isfile(filename):
        os.rename(filename, new_f)
    # NOTE(review): indentation reconstructed so that the counter and the print
    # run for every entry, matching the described output - confirm.
    num += 1
    print(new_f)
Current Files
├── _JAN
│ ├── first.txt
│ ├── second.txt
│ ├── third.txt
│ └── fourth.txt
Desired Changes to Files
├── _JAN
│ ├── JAN-1.txt
│ ├── JAN-2.txt
│ ├── JAN-3.txt
│ └── JAN-4.txt
I have tried a few iterations and the only output received is each new file name printed to the console with no changes to the file names within the JAN directory.
Any help would be much appreciated.
As the comments say, you need to check f (the full path) rather than filename, and you also need to build the new name inside the directory:
import os
# Folder whose files are renamed to JAN-1.txt, JAN-2.txt, ...
directory = 'JAN'
num = 1
for filename in os.listdir(directory):
    # os.listdir returns bare names, so build the path inside `directory`.
    f = os.path.join(directory, filename)
    if not os.path.isfile(f):
        # Sub-directories are left alone and do not consume a number.
        continue
    # The f-string already interpolates `num`; no extra .format() is needed.
    new_f = os.path.join(directory, f'JAN-{num}.txt')
    os.rename(f, new_f)
    num += 1
    print(new_f)
You can also use enumerate instead of num:
# Select the regular files first so that directories do not consume a number
# and leave gaps in the JAN-<i>.txt sequence.
file_names = [
    name for name in os.listdir(directory)
    if os.path.isfile(os.path.join(directory, name))
]
for i, filename in enumerate(file_names, 1):
    f = os.path.join(directory, filename)
    new_f = os.path.join(directory, f'JAN-{i}.txt')
    os.rename(f, new_f)
    print(new_f)
additional note: there is no need to use .format when you use f'' already.
import os
directory = 'JAN'
# Work from inside the folder so that the bare names returned by os.listdir()
# resolve correctly.
os.chdir(directory)
num = 1
for filename in os.listdir():
    new_f = f'{directory}-{num}.txt'
    if not os.path.isfile(filename):
        # Only regular files are renamed and counted.
        continue
    os.rename(filename, new_f)
    num += 1
Have a look at os.scandir. It returns an iterator of os.DirEntry objects, which are easy to work with (they have attributes such as name, path, is_file, ...); in particular, you avoid the pitfall of os.listdir, which returns base names rather than paths.
This assumes that the files are already in the right order in the directory.
import os
directory = 'JAN'
new_f_template = os.path.join(directory, 'JAN-{}.txt')
# Snapshot the regular files before renaming anything. Changing a directory
# while its scandir iterator is still live may list renamed entries again, and
# filtering first keeps directories from consuming a number.
with os.scandir(directory) as entries:
    files = [entry for entry in entries if entry.is_file()]
for i, f in enumerate(files, 1):
    new_f = new_f_template.format(i)
    os.rename(f.path, new_f)
    print(f"{f.name} -> {new_f}")
How can I efficiently calculate the size of every subfolder and file in a given directory?
The code I have so far does what I want, but it is inefficient and slow because of how I have to calculate the parent folder size.
Here's my current timing:
Section 1: 0.53 s
Section 2: 30.71 s
Code:
import os
import time
import collections
def folder_size(directory):
    """Return per-file and per-directory sizes (in bytes) below *directory*.

    Returns a ``(file_size, parent_size)`` tuple of defaultdicts:
    ``file_size`` maps each absolute file path to ``os.path.getsize`` of it,
    and ``parent_size`` maps each absolute directory path to the summed size
    of every file whose leading path components equal that directory's.
    The elapsed time of each of the two sections is printed.
    """
    parents = []
    file_size = collections.defaultdict(int)
    parent_size = collections.defaultdict(int)
    t0 = time.time()
    #### Section 1 ####
    # Walk the tree once, recording every directory and every file size.
    for root, dirs, files in os.walk(directory):
        root = os.path.abspath(root)
        parents.append(root)
        for f in files:
            f = os.path.join(root, f)
            file_size[f] += os.path.getsize(f)
    ###################
    t1 = time.time()
    print(f'walk time: {round(t1-t0, 2)}')
    #### Section 2 ####
    # For every directory, scan *all* files and add those located below it.
    # This is a directories x files nested loop with a split() per pair.
    for parent in parents:
        parent_split = parent.split(os.sep)
        for filename, value in file_size.items():
            # Compare whole path components so that e.g. folder_P1 does not
            # also match folder_P10.
            parent_for_file = filename.split(os.sep)[:len(parent_split)]
            if parent_split == parent_for_file:
                parent_size[parent] += value
    ###################
    t2 = time.time()
    print(f'parent size time: {round(t2-t1, 2)}')
    return file_size, parent_size
Section 2 of the code is inefficient for a couple reasons:
Inefficiency #1
I need to capture folders where there are no files. For example, in a folder structure like this:
TopFolder
├── FolderA
│ ├── folder_P1
│ │ ├── folder_P1__file_1.txt
│ │ └── folder_P1__file_2.txt
│ ├── folder_P10
│ │ ├── folder_P10__file_1.txt
│ │ └── folder_P10__file_2.txt
.
.
.
I want to end up with a size (in bytes) for each directory, like this:
'..../TopFolder': 114000,
'..../TopFolder/FolderA': 38000,
'..../TopFolder/FolderA/folder_P1': 38,
'..../TopFolder/FolderA/folder_P10': 38,
.
.
.
In order to get the total size for folders that have subfolders, like TopFolder and FolderA, I stored the parents separately, so I could go back and calculate their size based on the file sizes.
Inefficiency #2
The code is really slow because I have to split() the strings to determine the parent (confirmed with the cProfile module). I have to do this because if I do something like the snippet below, certain folder sizes will be calculated incorrectly. I also tried using re.split(), but that's even slower.
#### Section 2 ####
...
for parent in parents:
    for filename, value in file_size.items():
        # Plain substring test: '.../folder_P1' is also contained in
        # '.../folder_P10/...', so sibling folders sharing a prefix are
        # counted too.
        if parent in filename:
            parent_size[parent] += value
...
###################
Here's the wrong output with if parent in filename:
'..../TopFolder': 114000,
'..../TopFolder/FolderA': 38000,
'..../TopFolder/FolderA/folder_P1': 4256,
'..../TopFolder/FolderA/folder_P10': 456,
'..../TopFolder/FolderA/folder_P100': 76,
'..../TopFolder/FolderA/folder_P1000': 38,
.
.
.
Here's the correct output with the original code:
'..../TopFolder': 114000,
'..../TopFolder/FolderA': 38000,
'..../TopFolder/FolderA/folder_P1': 38,
'..../TopFolder/FolderA/folder_P10': 38,
'..../TopFolder/FolderA/folder_P100': 38,
'..../TopFolder/FolderA/folder_P1000': 38,
.
.
.
Section 2 either needs to be improved so it runs faster, or Section 2 needs to be incorporated into Section 1. I've searched the internet for ideas, but have only been able to find info on calculating the top level directory size and am running out of ideas.
Here's the code I used to create a sample directory structure:
import os
# Build TopFolder/Folder{A,B,C}/folder_P1..folder_P1000, each with two small
# text files, as a test tree for the size calculation.
folder = 'TopFolder'
subfolders = ['FolderA', 'FolderB', 'FolderC']
for i in range(1000):
    for subfolder in subfolders:
        path = os.path.join(folder, subfolder, f'folder_P{i + 1}')
        # exist_ok makes the script safe to re-run on an existing tree.
        os.makedirs(path, exist_ok=True)
        for k in range(2):
            with open(os.path.join(path, f'folder_P{i + 1}__file_{k + 1}.txt'), 'w') as file_out:
                file_out.write(f'Hello from file {k + 1}!\n')
With os.walk you don't get to use the file entry objects generated by os.scandir, which os.walk calls internally. Write a recursive function yourself with os.scandir, so you can use the stat object of each file entry rather than having to make a separate system call with os.path.getsize for each file. You should also not parse the path just to look for a parent directory name since you already have the parent directory name when you list a directory with that name.
The following example takes only 0.2 seconds to produce the desired output for your test directory structure on repl.it:
import os
def folder_size(directory):
    """Return the size in bytes of every file and directory below *directory*.

    Returns a ``(file_size, parent_size)`` tuple of dicts. ``file_size`` maps
    the path of every non-directory entry to its size; ``parent_size`` maps
    *directory* and every directory below it to the total size of the entries
    it contains, recursively. Keys are built from *directory* exactly as
    passed in (``os.DirEntry.path``), so they are relative if *directory* is.

    Symbolic links are not followed: a link is recorded as a file with the
    size of the link itself. This keeps link cycles from recursing forever
    and broken links from raising.
    """
    file_size = {}
    parent_size = {}

    def _folder_size(directory):
        """Record sizes for *directory* and return its total size."""
        total = 0
        # The context manager closes the directory handle promptly.
        with os.scandir(directory) as entries:
            for entry in entries:
                if entry.is_dir(follow_symlinks=False):
                    total += _folder_size(entry.path)
                else:
                    # DirEntry.stat() avoids building the path and calling
                    # os.path.getsize separately for every file.
                    size = entry.stat(follow_symlinks=False).st_size
                    total += size
                    file_size[entry.path] = size
        parent_size[directory] = total
        return total

    _folder_size(directory)
    return file_size, parent_size
file_size, parent_size = folder_size('TopFolder')
Demo: https://replit.com/#blhsing/SparseStainedNature
In one directory there are several folders that their names are as follows: 301, 302, ..., 600.
Each of these folders contain two folders with the name of A and B. I need to copy all the image files from A folders of each parent folder to the environment of that folder (copying images files from e.g. 600>A to 600 folder) and afterwards removing A and B folders of each parent folder. I found the solution from this post but I don't know how to copy the files into parent folders instead of sub-folder and also how to delete the sub-folders after copying and doing it for several folders.
import shutil
import os, sys
# Path of the running script, as given on the command line.
exepath = sys.argv[0]
# "Files" and "Credits" folders located next to the script (Windows separators).
directory = os.path.dirname(os.path.abspath(exepath))+"\\Files\\"
credit_folder = os.path.dirname(os.path.abspath(exepath))+"\\Credits\\"
# The second chdir replaces the first; the working directory ends up in "Files".
os.chdir(credit_folder)
os.chdir(directory)
Source = credit_folder
Target = directory
files = os.listdir(Source)
folders = os.listdir(Target)
# Copy every entry of Source into every entry of Target.
for file in files:
    SourceCredits = os.path.join(Source,file)
    for folder in folders:
        TargetFolder = os.path.join(Target,folder)
        shutil.copy2(SourceCredits, TargetFolder)
print(" \n ===> Credits Copy & Paste Sucessfully <=== \n ")
@hellohawii gave an excellent answer. The following code also works; you only need to change the value of Source when using it.
import shutil
import os, sys
from tqdm import tqdm
exepath = sys.argv[0] # path of the running script, as given on the command line
Source = os.path.dirname(os.path.abspath(exepath))+"\\Credits\\" # "Credits" folder next to the script
# Source = your_path_of_folders
files = os.listdir(Source) # names of the entries directly inside Source; presumably ['A', 'B'] - confirm
def get_parent_dir(path=None, offset=-1):
    """Return the directory ``abs(offset)`` levels above *path*.

    When *path* is empty or None, this module's ``__file__`` is used as the
    starting point. Only the magnitude of *offset* matters; 0 returns the
    starting path unchanged.
    """
    current = path or __file__
    levels_left = abs(offset)
    while levels_left > 0:
        current = os.path.dirname(current)
        levels_left -= 1
    return current
def del_files0(dir_path):
    """Recursively remove the directory *dir_path* and everything in it."""
    shutil.rmtree(dir_path, ignore_errors=False)
for file_path in files:
    current_path = os.path.join(Source, file_path) # current_path
    if file_path == 'A': # select the folder to copy
        file_list = os.listdir(current_path) # get file_list of selected folder
        parent_path = get_parent_dir(current_path) # get parent dir path, namely target path
        for file in tqdm(file_list):
            # os.listdir yields bare names; join them with the folder so the
            # copy does not depend on the current working directory.
            shutil.copy(os.path.join(current_path, file), parent_path)
        del_files0(current_path) # delete current path(folder)
print(" \n ===> Credits Copy & Paste & delete Successfully <=== \n ")
I recommend using pathlib.
from pathlib import Path
import shutil
from tqdm import tqdm
folder_to_be_sorted = Path("/your/path/to/the/folder")
for folder_named_number_i in tqdm(list(folder_to_be_sorted.iterdir())):
    # folder_named_number_i is 301, 302, ..., 600
    A_folder = folder_named_number_i / "A"
    B_folder = folder_named_number_i / "B"
    if not A_folder.is_dir():
        # Stray file, or a folder without "A": nothing to flatten here.
        continue
    # move files (snapshot the listing, since moving changes the directory)
    for image_i in list(A_folder.iterdir()):
        shutil.move(str(image_i), str(folder_named_number_i))
    # remove directories
    shutil.rmtree(str(A_folder))
    if B_folder.is_dir():
        shutil.rmtree(str(B_folder))
os.path is a lower-level module. I am posting another version here, since you are using the os module in your question.
import shutil
import os
from tqdm import tqdm
folder_to_be_sorted = "/your/path/to/the/folder"
for folder_named_number_name in tqdm(os.listdir(folder_to_be_sorted)):
    folder_named_number_i = os.path.join(folder_to_be_sorted, folder_named_number_name)
    # folder_named_number_i is 301, 302, ..., 600
    A_folder = os.path.join(folder_named_number_i, "A")
    B_folder = os.path.join(folder_named_number_i, "B")
    if not os.path.isdir(A_folder):
        # Stray file, or a folder without "A": nothing to flatten here.
        continue
    # move files (os.listdir returns a full list, so moving while looping is safe)
    for image_i_name in os.listdir(A_folder):
        image_i = os.path.join(A_folder, image_i_name)
        shutil.move(image_i, folder_named_number_i)
    # remove directories
    shutil.rmtree(A_folder)
    if os.path.isdir(B_folder):
        shutil.rmtree(B_folder)
With the code above, I assume you want to transform
# /your/path/to/the/folder
# │
# └───301
# │ │
# │ └───A
# │ │ └───image_301_A_1.png
# │ │ └───image_301_A_2.png
# │ │ └───image_301_A_3.png
# │ │ └───...(other images)
# │ │
# │ └───B
# │ └───image_301_B_1.png
# │ └───image_301_B_2.png
# │ └───image_301_B_3.png
# │ └───...(other images)
# │
# └───302(like 301)
# :
# :
# └───600(like 301)
to:
# /your/path/to/the/folder
# │
# └───301
# │ │
# │ └───image_301_A_1.png
# │ └───image_301_A_2.png
# │ └───image_301_A_3.png
# │ └───...(other images in folder 301/A/)
# │
# └───302(like 301)
# :
# :
# └───600(like 301)
I am writing a script that takes 2 JSON files, each of which is just a basic object of key/value pairs, and compares the lengths of the files (number of keys) as well as whether the keys match.
I will have an environment folder called vars which is where all of the JSON files will exist. I am able to run my script to compare one file to the config file, but I am wondering if there is a way to check each file in the vars folder one after another. i.e. I first go through the script for file1.json against config.json, return the desired result, then run the script for file2.json against config.json immediately afterwards. I am unsure of how I can go about this.
My project structure looks like this, where there will always be a config.json being compared against the rest of the JSON files:
.
├── vars/
│   ├── file1.json
│   ├── file2.json
│   └── config.json
└── main.py
Example of JSON files:
{
"appSettings": {
"FileOne": "file1.json",
"Audience": "https://{}",
"Domain": "url.com",
"AuthKey": "1234",
"AuthURL": "https://{}"
}
}
-----------------------------
{
"appSettings": {
"FileTwo": "file2.json",
"Audience": "https://{}",
"Domain": "url.com",
"AuthKey": "1234",
"AuthURL": "https://{}"
}
}
The goal is to load up these files and compare them as they should be identical. If I only have the 2 files in my vars folder (file1.json & config.json), the script works perfectly fine. If I was to add a second JSON file to compare (file2.json), it will only run against one of the files.
import os
import json
path = './vars'
def main():
    """Collect the JSON file names under `path` and check them against the config file."""
    json_files = []
    param_config_file = ''
    for root, dirs, files in os.walk(path):
        for file in files:
            # Any file whose name contains 'config' is taken as the config
            # file; the last such name seen wins.
            if ('config' in file):
                param_config_file = file
            # Every other *.json file name is queued for comparison.
            if (file.endswith('.json') and 'config' not in file):
                json_files.append(file)
    # Both checks receive the whole list of file names in a single call.
    check_file_lengths(json_files, param_config_file)
    return check_setting_files(json_files, param_config_file)
def return_func_settings(func_files):
    """Load a JSON file named in *func_files* and return its name and 'appSettings'.

    Returns a dict with the keys 'file_name' and 'settings'.

    NOTE(review): the indentation of this listing was lost. With the `return`
    inside the loop, as reconstructed here, only the first file in
    *func_files* is ever loaded - confirm against the original script.
    """
    app_settings = ''
    for file in func_files:
        # File names are resolved relative to the module-level `path`.
        f = open(f'{path}/{file}')
        app_settings = json.load(f)
        # print({'file_name': file, 'settings': app_settings['appSettings']})
        return {'file_name': file, 'settings': app_settings['appSettings']}
def return_config_settings(config_file):
    """Load *config_file* from `path` and return its name and 'appSettings'.

    Returns a dict with the keys 'file_name' and 'settings'. Raises KeyError
    when the JSON document has no 'appSettings' key.
    """
    # The context manager closes the file handle once the JSON is parsed.
    with open(f'{path}/{config_file}') as f:
        config_app_settings = json.load(f)
    return {'file_name': config_file, 'settings': config_app_settings['appSettings']}
def check_file_lengths(func_file, config):
    """Print which files are being compared and warn when their key counts differ."""
    func_info = return_func_settings(func_file)
    config_info = return_config_settings(config)
    print(f"Checking '{func_info['file_name']}' against '{config_info['file_name']}'")
    same_length = len(func_info['settings']) == len(config_info['settings'])
    if not same_length:
        print("WARNING: The file sizes are different! Debuging...")
def check_setting_files(json, config):
    """Verify that the settings file and the config file have the same keys.

    The file with more settings is treated as the reference, and every one of
    its keys must exist in the other file. Each missing key is printed; if any
    key is missing an Exception listing all of them is raised, otherwise a
    confirmation message is printed.

    The result depends on *all* keys. Tracking a single flag per iteration
    would let a present key that follows a missing one hide the mismatch.
    """
    # NOTE: the parameter name `json` shadows the json module inside this
    # function; it is kept for interface compatibility.
    func_settings = return_func_settings(json)
    config_settings = return_config_settings(config)
    if len(func_settings['settings']) >= len(config_settings['settings']):
        main_file, sub_file = func_settings, config_settings
    else:
        main_file, sub_file = config_settings, func_settings
    missing_settings = [
        app_setting for app_setting in main_file['settings']
        if app_setting not in sub_file['settings']
    ]
    for app_setting in missing_settings:
        print(
            f"App setting '{app_setting}' doesn't exist in {sub_file['file_name']}")
    if missing_settings:
        raise Exception(
            f"App settings: {missing_settings} are missing in '{sub_file['file_name']}'")
    print(
        f"The app settings in '{main_file['file_name']}' are in line with '{sub_file['file_name']}'")
main()
I put together a Repl to demonstrate what I'm trying to do: https://replit.com/#AlexWhitmore/python-stuff?v=1
Using os.listdir, you can iterate through the files in a folder and match their names to decide what to do with each one:
import os
# Get the list of all files and directories
path = "YOUR_PATH/vars"
dir_list = os.listdir(path)
# need to read from each file
for name in dir_list:
    if name == "file.json":
        # read content and do the thing
        pass  # placeholder so that the snippet is valid Python
If I have a folder which has multiple subfolders, each has same structure sub1, sub2 and sub3:
├─a
│ ├─sub1
│ ├─sub2
│ └─sub3
├─b
│ ├─sub1
│ ├─sub2
│ └─sub3
└─c
├─sub1
├─sub2
└─sub3
...
I want to copy sub1 and sub2 if there are image files inside and ignore other subfolders (sub3...) while keeping tree structure of directory. This is expected result:
├─a
│ ├─sub1
│ └─sub2
├─b
│ ├─sub1
│ └─sub2
└─c
├─sub1
└─sub2
...
What should I do to get the expected result? I have tried the following code, but it copies sub1, sub2 and sub3, and I got TypeError: copy() takes no arguments (1 given).
Thanks to @PrasPJ.
import os
import os.path
import shutil
import fnmatch
src_dir= ['C:/Users/User/Desktop/source/input'] # List of source dirs
included_subdirs = ['sub1', 'sub2'] # subdir to include from copy
dst_dir = 'C:/Users/User/Desktop/source/output' # folder for the destination of the copy
for root_path in src_dir:
    #print(root_path)
    for root, dirs, files in os.walk(root_path): # recurse walking
        for dir in included_subdirs:
            #print(dir)
            if dir in dirs:
                source = os.path.join(root,dir)
                # Map the source path onto the destination tree.
                f_dst = source.replace(root_path, dst_dir)
                print (source, f_dst)
                for file in os.listdir(source):
                    print(file)
                    # NOTE: copytree runs once per matching file; the loop has
                    # no break after the first match.
                    if file.endswith(".jpg") or file.endswith(".jpeg"):
                        shutil.copytree(source, f_dst)
Reference related:
https://www.daniweb.com/programming/software-development/threads/356380/how-to-copy-folders-directories-using-python
Python shutil copytree: use ignore function to keep specific files types
How to copy contents of a subdirectory in python
Hi, you are trying to use the dictionary's copy function, which is wrong here.
import os
import os.path
import shutil
import fnmatch
list_of_dirs_to_copy = ['C:/Users/User/Desktop/test/input'] # List of source dirs
included_subdirs = ['sub1', 'sub2'] # subdir to include from copy
dest_dir = 'C:/Users/User/Desktop/test/output' # folder for the destination of the copy
for root_path in list_of_dirs_to_copy:
    print(root_path)
    for root, dirs, files in os.walk(root_path): # recurse walking
        for subdir in included_subdirs:
            print(subdir)
            if subdir in dirs:
                source = os.path.join(root, subdir)
                # Swap only the leading root_path for dest_dir, so the same
                # text further down the path is left untouched.
                f_dest = source.replace(root_path, dest_dir, 1)
                print(source, f_dest)
                shutil.copytree(source, f_dest)
import os
import os.path
import shutil
import fnmatch
list_of_dirs_to_copy = ['C:/Users/User/Desktop/test/input'] # List of source dirs
included_subdirs = ['sub1', 'sub2'] # subdir to include from copy
dest_dir = 'C:/Users/User/Desktop/test/output' # folder for the destination of the copy
for root_path in list_of_dirs_to_copy:
    print(root_path)
    for root, dirs, files in os.walk(root_path): # recurse walking
        for subdir in included_subdirs:
            print(subdir)
            if subdir in dirs:
                source = os.path.join(root, subdir)
                for f in os.listdir(source):
                    print(os.path.join(source, f))
                    # str.endswith accepts a tuple of suffixes.
                    if os.path.isfile(os.path.join(source, f)) and f.endswith(("jpg", "jpeg")):
                        # Swap only the leading root_path for dest_dir.
                        f_dest = source.replace(root_path, dest_dir, 1)
                        print(source, f_dest)
                        shutil.copytree(source, f_dest)
                        break # found any one matching file
Here is the update
for root_path in list_of_dirs_to_copy:
    for root, dirs, files in os.walk(root_path): # recurse walking
        # source -> destination for each included sub-folder that holds images
        path_to_copy = {}
        for subdir in included_subdirs:
            if subdir in dirs:
                source = os.path.join(root, subdir)
                has_image = any(
                    os.path.isfile(os.path.join(source, f)) and f.endswith(("jpg", "jpeg"))
                    for f in os.listdir(source)
                )
                if has_image:
                    # Swap only the leading root_path for dest_dir.
                    path_to_copy[source] = source.replace(root_path, dest_dir, 1)
        # Copy only when every included sub-folder of this parent has images.
        if len(path_to_copy) == len(included_subdirs):
            # dict.items() replaces the Python 2-only iteritems().
            for source, f_dest in path_to_copy.items():
                shutil.copytree(source, f_dest)