How can I classify files based on their extension in Python?

How can I classify files based on their extension in Python? - python

I want to move files into folders based on their extensions and categorize them.
I've tried shutil.move() to categorize it. But it gives an error like this:
shutil.Error: Cannot move a directory 'C:\Users\user\Desktop\deneme' into itself 'None'.
How can I fix the problem?
My code:
import os
from os import path
import pathlib
import shutil
# NOTE(review): the pasted code carried no indentation; the nesting below is a
# best-effort reconstruction - confirm against the author's original.
path_ = input("Directory: ")
file_list = os.listdir(path_)
os.chdir(path_)
current_directory = os.getcwd()
e = []  # extension (text after the last dot, dot removed) of every entry in file_list
a = 0
for i in file_list:
    ext = os.path.splitext(i)[1][1:]
    e.append(ext)
    # print("Ext:", ext)
for j in range(len(e)):
    if e[j] == ",":
        j = j + 1
        continue
    # A folder named after the index is created, then renamed to the extension.
    os.mkdir(str(j))
    os.rename(str(j), e[j])
    new_folder = e[j]
    for f in os.listdir(current_directory):
        # os.chdir() returns None, so new_directory is None and
        # str(new_directory) below is the literal text 'None'.
        new_directory = os.chdir(new_folder)
        if f == ",":
            # f is a str, so "f += 1" would raise TypeError if this branch ran.
            f +=1
            continue
        # The source argument is the whole working directory, not the file f;
        # together with the 'None' destination this matches the reported
        # "Cannot move a directory ... into itself 'None'" error.
        shutil.move(os.path.join(current_directory), str(new_directory))
        #print("it is moved")
        print(os.path.dirname(os.path.abspath(str(e[j]))))

Try os.path.splitext(filename)[1] if you want to find the file extension. os.path.splitext returns a (root, ext) tuple: element 0 is the file name without its extension and element 1 is the extension, including the leading dot.

Related

move files to subdirectories that are named on part of the filenames

I have a few data files in a directory, and I want to move them to the subdirectories based on their filenames. Let's say we created the first directory named "20220322_170444," and it should contain the first four files only because in the next file the "el" is less than the previous one, so the second folder, let's say is "20220322_170533", then it should contain next eight files until the el becomes less again than the previous name.
example data
files =[
'cfrad.20220322_170444.122_COW1_v2_s02_el3.40_SUR.nc',
'cfrad.20220322_170456.550_COW1_v2_s03_el4.22_SUR.nc',
'cfrad.20220322_170508.975_COW1_v2_s04_el5.09_SUR.nc',
'cfrad.20220322_170521.397_COW1_v2_s05_el5.99_SUR.nc',
'cfrad.20220322_170533.811_COW1_v2_s06_el0.45_SUR.nc',
'cfrad.20220322_170546.228_COW1_v2_s07_el1.20_SUR.nc',
'cfrad.20220322_170558.648_COW1_v2_s08_el1.90_SUR.nc',
'cfrad.20220322_170611.072_COW1_v2_s09_el2.61_SUR.nc',
'cfrad.20220322_170623.503_COW1_v2_s10_el3.40_SUR.nc',
'cfrad.20220322_170635.923_COW1_v2_s11_el4.21_SUR.nc',
'cfrad.20220322_170648.341_COW1_v2_s12_el5.09_SUR.nc',
'cfrad.20220322_170700.765_COW1_v2_s13_el5.99_SUR.nc',
'cfrad.20220322_170713.179_COW1_v2_s14_el0.45_SUR.nc',
'cfrad.20220322_170725.604_COW1_v2_s15_el1.20_SUR.nc',
'cfrad.20220322_170738.030_COW1_v2_s16_el1.90_SUR.nc',
'cfrad.20220322_170750.461_COW1_v2_s17_el2.61_SUR.nc',
'cfrad.20220322_170802.877_COW1_v2_s18_el3.40_SUR.nc',
'cfrad.20220322_170815.301_COW1_v2_s19_el4.22_SUR.nc',
'cfrad.20220322_170827.715_COW1_v2_s20_el8.01_SUR.nc',
'cfrad.20220322_170840.144_COW1_v2_s21_el11.02_SUR.nc']
# Write a small two-value text file on disk for every example name.
for file in files:
    np.savetxt(fname=file, X=np.array([1,1]))
What I tried is
import numpy as np
from datetime import datetime
import glob, os, re
import shutil
# Group consecutive file names into "sweeps": a new group starts whenever the
# elevation marker of the next file compares lower than the current one.
sweeps = []  # finished groups, each a list of file names
temp = []  # the group currently being filled
# Only the first 19 names are visited because every step also reads files[i+1].
for i, file in enumerate(files[:19]):
    match_str = re.search(r'\d{4}\d{2}\d{2}_\d{2}\d{2}\d{2}', file)
    res = datetime.strptime(match_str.group(), '%Y%m%d_%H%M%S')
    print(res.strftime("%Y%m%d_%H%M%S"))
    el_pos = int(files[i].find('el'))
    # el_pos points at the 'e', so this slice is the 'l' plus the first digit
    # (e.g. "l3"); the comparison further down is therefore between two-character
    # strings, not between numbers.
    st_pos = files[i][el_pos+1:el_pos+3]
    el_pos1 = int(files[i+1].find('el'))
    end_pos = files[i+1][el_pos1+1:el_pos1+3]
    # print(files[i][s_pos+1:s_pos+3],files[i+1][s_pos1+1:s_pos1+3])
    temp.append(files[i])
    print("len(files):",len(files),i)
    print(st_pos,end_pos)
    # print()
    if st_pos>end_pos:
        print("temp len: ", len(temp))
        sweeps.append(temp)
        temp = []
    elif len(files)-i==2:
        # Last visited index: flush the group that is still open.
        # NOTE(review): files[19] itself is never appended to temp, so the last
        # file does not appear in any sweep - confirm whether that is intended.
        print('entered')
        sweeps.append(temp)
I now have a list named sweeps that contains the desired files. How can I now move these files into directories? The directories should be named after the date, as I stated above. The date string in res.strftime("%Y%m%d_%H%M%S") can be used to create the directories.

Some string splitting can do this for you.
import os
import re
import shutil

files = [
    "cfrad.20220322_170444.122_COW1_v2_s02_el3.40_SUR.nc",
    "cfrad.20220322_170456.550_COW1_v2_s03_el4.22_SUR.nc",
    "cfrad.20220322_170508.975_COW1_v2_s04_el5.09_SUR.nc",
    "cfrad.20220322_170521.397_COW1_v2_s05_el5.99_SUR.nc",
    "cfrad.20220322_170533.811_COW1_v2_s06_el0.45_SUR.nc",
    "cfrad.20220322_170546.228_COW1_v2_s07_el1.20_SUR.nc",
    "cfrad.20220322_170558.648_COW1_v2_s08_el1.90_SUR.nc",
    "cfrad.20220322_170611.072_COW1_v2_s09_el2.61_SUR.nc",
    "cfrad.20220322_170623.503_COW1_v2_s10_el3.40_SUR.nc",
    "cfrad.20220322_170635.923_COW1_v2_s11_el4.21_SUR.nc",
    "cfrad.20220322_170648.341_COW1_v2_s12_el5.09_SUR.nc",
    "cfrad.20220322_170700.765_COW1_v2_s13_el5.99_SUR.nc",
    "cfrad.20220322_170713.179_COW1_v2_s14_el0.45_SUR.nc",
    "cfrad.20220322_170725.604_COW1_v2_s15_el1.20_SUR.nc",
    "cfrad.20220322_170738.030_COW1_v2_s16_el1.90_SUR.nc",
    "cfrad.20220322_170750.461_COW1_v2_s17_el2.61_SUR.nc",
    "cfrad.20220322_170802.877_COW1_v2_s18_el3.40_SUR.nc",
    "cfrad.20220322_170815.301_COW1_v2_s19_el4.22_SUR.nc",
    "cfrad.20220322_170827.715_COW1_v2_s20_el8.01_SUR.nc",
    "cfrad.20220322_170840.144_COW1_v2_s21_el11.02_SUR.nc",
]

# Captures the elevation angle embedded in a name, e.g. "_el3.40_" -> "3.40".
EL_PATTERN = re.compile(r"_el(\d+(?:\.\d+)?)_")


def parse_elevation(filename):
    """Return the elevation angle encoded in *filename* as a float.

    The full decimal value is used; keeping only the integer part would miss
    a drop such as el0.90 -> el0.45.

    Raises:
        ValueError: if the name contains no "_el<number>_" part.
    """
    match = EL_PATTERN.search(filename)
    if match is None:
        raise ValueError(f"no elevation found in {filename!r}")
    return float(match.group(1))


basepath = "."

# Create placeholder files so the example can be run as-is.
for f in files:
    with open(os.path.join(basepath, f), "w") as of:
        of.write("\n")

# Start at infinity so the very first file always opens a new directory.
el = float("inf")
curr_dir = None
for f in files:
    new_el = parse_elevation(f)
    if new_el < el:
        # The elevation dropped: a new sweep starts, named after the
        # timestamp part of this file's name.
        curr_dir = f.split(".")[1]
        print(curr_dir)
        # Create the directory under basepath, where the file is moved to.
        os.makedirs(os.path.join(basepath, curr_dir), exist_ok=True)
    # Remember this elevation for the comparison with the next file.
    el = new_el
    shutil.move(os.path.join(basepath, f), os.path.join(basepath, curr_dir, f))

Move files recursively to new directory shutil.move

I am trying to move all .jpg files, recursively from the CWD, into a single new directory, but I am unable to make shutil.move recursive. Any hints regarding the last line of the code?
import glob
import shutil
import os
from_dir = input('Enter recursive FROM directory (if CWD, enter .): ')
to_dir = input('Enter TO directory (if CWD, enter .): ')
# Make sure the destination exists before anything is moved into it.
if not os.path.exists(to_dir):
    os.makedirs(to_dir)
# NOTE(review): "*.jpg" joined onto from_dir only matches files directly inside
# from_dir; glob descends into subdirectories only when the pattern contains
# "**" and recursive=True is passed.
for imagfile in glob.iglob(os.path.join(from_dir, "*.jpg")):
    shutil.move(imagfile, to_dir)
I tried these, do not work:
#shutil.move(os.path.join(root, imagfile), os.path.join(to_dir, imagfile))
#shutil.move(from_dir, imagfile, to_dir)
#shutil.move(os.path.join(from_dir, imagfile), to_dir)
#shutil.move(imagfile, to_dir+imagfile)
#shutil.move(from_dir+imagfile, to_dir+imagfile)

Try This:
import os, time, inspect, shutil
import sys

# Folder that contains this script; "." as target is resolved against it.
main_path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
from_dir = ""  # write your source dir
to_dir = ""  # write your target dir
# File-name endings (compared case-insensitively) that count as JPEG images.
IMAGE_SUFFIXES = (".jpg", ".jpeg")


def check_target():
    """Validate the module-level ``to_dir`` and make it an absolute, existing directory.

    A blank ``to_dir`` prints a hint and exits with status 1. "." is resolved
    against the folder holding this script; any other value is made absolute.
    The directory, including missing parents, is created when it does not exist.
    """
    global to_dir
    if to_dir.strip() == "":
        print("Plz Insert a Valid 'to_dir' path!")
        sys.exit(1)
    if to_dir == ".":
        to_dir = main_path
    else:
        to_dir = os.path.abspath(to_dir)
    os.makedirs(to_dir, exist_ok=True)


check_target()
for dirpath, _, filenames in os.walk(from_dir):
    for items in filenames:
        if not items.lower().endswith(IMAGE_SUFFIXES):
            continue
        file_full_path = os.path.abspath(os.path.join(dirpath, items))
        check_address = os.path.join(to_dir, items)
        if file_full_path == check_address:
            # The file already sits in the target (target inside the source
            # tree); it is not a duplicate, there is just nothing to do.
            continue
        if os.path.isfile(check_address):
            print("WARNING Duplicate File Names : {0}".format(check_address))
            continue
        try:
            shutil.move(file_full_path, check_address)
        except (OSError, shutil.Error) as err:
            # Report the failing file and the reason, then carry on with the rest.
            print("Something Went Wrong On " + file_full_path + ": " + str(err))
Good Luck ...

Maybe you can recursively collect all the directories first and then call shutil.move for the files in each of them.
import os

root = "/Users/xyz/Documents/fun/test"
# os.walk yields one (dirpath, dirnames, filenames) tuple for root and for
# every directory below it.
g = os.walk(root)
# Keep only the directory path of each tuple.
dirs = [entry[0] for entry in g]
print(dirs)

How do I get every file of an extension from a directory to another? I wrote a code but I'm getting an exception

This piece of code is my first attempt at creating a program. I'm getting an error when running it that reads:
PermissionError: [WinError 32] The process cannot access the file
because it is being used by another process:
'C:\Users\gabri\Desktop\' -> 'C:\Users\gabri\Desktop\Planilhas
Excel\'
What am I doing wrong? The goal of this program is to get all excel, then pdf, then word files and put them in folders created by the program.
import os
from glob import glob
# import cx_Freeze
print("Digite o diretório de origem.")
dirOrigem = input()
os.chdir(dirOrigem)
excel_files = glob('*.xlsx')
# NOTE(review): ''.join(...) glues every '*.xls' match into ONE string and
# append() adds it as a single entry. With no match that entry is '', and the
# rename below then targets dirOrigem + '\\' itself - which fits the reported
# WinError 32 on the Desktop folder. list.extend(glob('*.xls')) adds each name
# separately.
excel_files.append(''.join(glob('*.xls')))
dirDestinoXL = dirOrigem + '\\' + 'Planilhas Excel'
if not os.path.exists(dirDestinoXL):
    os.makedirs(dirDestinoXL)
for i in excel_files:
    os.rename(f'{dirOrigem}\\{"".join(i)}', f'{dirDestinoXL}\\{"".join(i)}')
os.chdir(dirOrigem)
pdf_files = glob('*.pdf')
dirDestinoPDF = dirOrigem + '\\' + 'PDF'
if not os.path.exists(dirDestinoPDF):
    os.makedirs(dirDestinoPDF)
for p in pdf_files:
    os.rename(f'{dirOrigem}\\{"".join(p)}', f'{dirDestinoPDF}\\{"".join(p)}')
os.chdir(dirOrigem)
word_files = glob('*.doc')
# NOTE(review): append() adds the whole '*.docx' result list as one nested
# element; "".join(d) in the loop below yields a real file name only when that
# list holds exactly one match.
word_files.append(glob('*.docx'))
dirDestinoWord = dirOrigem + '\\' + 'Word'
if not os.path.exists(dirDestinoWord):
    os.makedirs(dirDestinoWord)
for d in word_files:
    os.rename(f'{dirOrigem}\\{"".join(d)}', f'{dirDestinoWord}\\{"".join(d)}')

I tried your program and it doesn't work as it is on my computer. I changed some lines and it works. Hope it helps
import os
from glob import glob
dirOrigem = r'C:\Users\fchal\Desktop\temp' # here I changed the code just because I didn't want to bother using input()
os.chdir(dirOrigem)
# extend() adds every matching name as its own list entry.
excel_files = glob('*.xlsx')
excel_files.extend(glob('*.xls'))
# Build the destination with os.path.join, like the rename below does.
dirDestinoXL = os.path.join(dirOrigem, 'xlsfile')
# exist_ok makes a second run a no-op instead of needing a separate check.
os.makedirs(dirDestinoXL, exist_ok=True)
for i in excel_files:
    # The names are relative to dirOrigem, which is the working directory.
    os.rename(i, os.path.join(dirDestinoXL, i))
# same procedure for pdf and word files

I know that glob can be a mess sometimes. And if the files are open, you can get errors. Here's what I would do:
import os
def move_files_with_extension(from_dir, to_dir, *extensions):
    """Move every file below *from_dir* whose name ends with one of *extensions* into *to_dir*.

    *to_dir* is created when it does not exist yet, so it may be a new
    subfolder of *from_dir*. Files that already live inside *to_dir* are left
    alone. All matches end up directly in *to_dir* under their base name.

    Raises:
        ValueError: if *from_dir* is not a directory, or *to_dir* exists but
            is not a directory.
    """
    import shutil  # local import: the snippet only imports os at the top

    if not os.path.isdir(from_dir):
        raise ValueError('{} is not a real directory'.format(from_dir))
    if os.path.exists(to_dir) and not os.path.isdir(to_dir):
        raise ValueError('{} is not a real directory'.format(to_dir))
    os.makedirs(to_dir, exist_ok=True)

    destination_prefix = os.path.abspath(to_dir) + os.sep
    for file_path in all_files_with_extensions_in(from_dir, *extensions):
        # The walk also visits to_dir when it lies below from_dir; skip what
        # is already there.
        if os.path.abspath(file_path).startswith(destination_prefix):
            continue
        # shutil.move also works when source and target are on different drives.
        shutil.move(file_path, os.path.join(to_dir, os.path.basename(file_path)))


def all_files_with_extensions_in(dir, *extensions):
    """Return the paths of all files below *dir* whose names end with any of *extensions*.

    The search is recursive; each returned path starts with *dir*.
    """
    return [
        os.path.join(dir_path, file_name)
        for dir_path, _dir_names, file_names in os.walk(dir)
        for file_name in file_names
        # str.endswith accepts the tuple of extensions directly.
        if file_name.endswith(extensions)
    ]
and then you can do:
# Read the source directory, then move its .xls/.xlsx files into a
# 'Planilhas Excel' folder inside it.
dirOrigem = input()
excel_location = os.path.join(dirOrigem, 'Planilhas Excel')
move_files_with_extension(dirOrigem, excel_location, '.xls', '.xlsx')
and so on

Python iterating through folders and \ characters

I have an old project I wanted to post up on gh-pages and I have a bunch of html files in a bunch of folders.
So I've been piecing together a piece of python that would create an index page displaying all the contained html hyperlinks so that the content will be browse-able on gh-pages similarly to how it is done with full on web servers such as Apache.
To get started I have all the content printing in one file but unfortunately python is throwing file locations as py r'Strings' where \ is escaped with a \.
I have been trying to prevent this from causing IO errors but have been getting a little stuck.
import os
class indexer:
    # First attempt at building an HTML index of the .html files in a folder
    # tree (Python 2 syntax). The NOTE(review) comments mark where it misbehaves.
    path = "~"
    prod = []
    def __init__(self,p):
        self.path=p
    def HtmlFrek(self,k):
        # Recursively build the HTML for folder k; changes into k while working.
        print("rek")
        os.chdir(k)
        ret="<h1>"+k+"</h1>"
        files = [f for f in os.listdir('.') if os.path.isfile(f) and f.split(".")[len(f.split("."))-1]=="html"]
        for t in files:
            # NOTE(review): str.replace returns a new string; both results are
            # discarded, so t is unchanged.
            t.replace(".","")
            t.replace("\\","/")
            # NOTE(review): this appends the folder name k, not the file name t.
            ret+= ""+k+"\n"
        # NOTE(review): os.walk('.') yields '.' and EVERY nested directory path
        # (e.g. './a/b'), not only the direct children, while the chdir("..")
        # below climbs a single level only.
        folders = [x[0] for x in os.walk('.')]
        for k in folders:
            print k
            if(k == '.'):
                continue
            print k
            ret+="<div class='blue1'>"
            ret+=self.HtmlFrek(k)
            # NOTE(review): "=" replaces everything collected so far; "+=" would append.
            ret = "</div>"
        os.chdir("..")
        return(ret)
    def HtmlProd(self):
        # Build the whole page as a list of strings and store it in self.prod.
        print("start")
        ret = []
        ret.append("""<!DOCTYPE html><html>""")
        ret.append("<div class = 'ClearShadeLeft'>")
        folders = [x[0] for x in os.walk('.')]
        for k in folders:
            ret[1]+="<div class='blue1'>"
            ret[1]+=self.HtmlFrek(k)
            # NOTE(review): assignment again overwrites ret[1] instead of appending.
            ret[1] = "</div>"
        ret[1] = "</div>"
        ret.append("""<\html><html>""")
        self.prod = ret
        return(ret)
# Build the index for the current directory and print the pieces (Python 2 prints).
i = indexer(".")
i.HtmlProd()
print i.prod
for k in i.prod:
    print k
print()
Edit: I think the answer here is to replace os.walk with [f for f in os.listdir(somedir) if os.path.isfile(f)].
Another Edit:
This version of the code works...
import os
class indexer:
    """Build one HTML page that indexes the .html and .png files below the current directory.

    Attributes:
        path: the value passed to the constructor. It is only stored; indexing
            always starts from the current working directory.
        site, proj: URL parts used as the prefix of every image reference.
        prod: the HTML returned by the last HtmlProd() call.
        loc: names of the folders entered so far, outermost first.
    """

    path = "~"
    site = "http://krewn.github.io"
    proj = "Reprogramming"
    prod = []
    loc = []

    def __init__(self, p):
        self.path = p
        # Per-instance containers: the class-level lists above would otherwise
        # be shared (and mutated) by every indexer object.
        self.prod = []
        self.loc = []

    def fprep(self, name):
        """Return the file name to embed in the page.

        The name is returned unchanged. str.replace returns a new string, so
        the discarded replace calls of the earlier version never altered it,
        and really stripping the dots would break every "name.html" entry.
        """
        return name

    def refPrep(self):
        """Return the site URL of the folder currently being indexed."""
        return "/".join([self.site, self.proj] + self.loc)

    def _files_with_extension(self, extension):
        """Return the sorted files of the current directory whose text after the last dot is *extension*."""
        return sorted(
            f for f in os.listdir(".")
            if os.path.isfile(f) and f.split(".")[-1] == extension
        )

    def _subfolders(self):
        """Return the sorted non-file entries of the current directory whose names contain no dot."""
        # Skipping dotted names also keeps folders such as ".git" out of the index.
        return sorted(
            f for f in os.listdir(".")
            if not os.path.isfile(f) and "." not in f
        )

    def HtmlFrek(self, adir):
        """Return the HTML fragment for folder *adir* and everything below it.

        *adir* is relative to the current working directory. The method changes
        into it while working and restores the previous directory (and
        ``self.loc``) before returning, even when an error occurs.
        """
        previous_dir = os.getcwd()
        os.chdir(adir)
        self.loc.append(adir)
        try:
            parts = ["<h2>" + adir + "</h2>"]
            for t in self._files_with_extension("html"):
                parts.append(self.fprep(t) + "<br>\n")
            ref = self.refPrep()
            for image in self._files_with_extension("png"):
                parts.append("<img src=" + ref + "/" + self.fprep(image) + ">\n")
            for k in self._subfolders():
                parts.append("<div class='blue1'>" + self.HtmlFrek(k) + "</div>")
            return "".join(parts)
        finally:
            self.loc.pop()
            os.chdir(previous_dir)

    def HtmlProd(self):
        """Build the complete page for the current directory, store it in ``self.prod`` and return it."""
        print("start")
        parts = ["<!DOCTYPE html><html>", "<div>"]
        for t in self._files_with_extension("html"):
            parts.append(self.fprep(t) + "<br>\n")
        for k in self._subfolders():
            print(k)
            parts.append("<div>" + self.HtmlFrek(k) + "</div>")
        parts.append("</div>")
        parts.append("</html>")
        self.prod = "".join(parts)
        return self.prod
# Build the index for the current directory and save it as index.html.
i = indexer(".")
q = i.HtmlProd()
# print(i.prod)
# The with-block closes the file even if the write fails.
with open("index.html", "w") as w:
    w.write(q)

Doxygen is your friend for this sort of thing. You give doxygen a source code folder. You specify what file extensions that you consider to be source. Then it goes off and builds an index of everything in the folder. You can output this in html or as a pdf.

Copying random files from a file tree

I have the same problem as here but now I'm trying to do the same with python because it's more suited to the task.
I've started with this:
import os
import shutil
import random
import glob
root_dir = '/home/leonardo/Desktop/python_script/rfe'
output_dir = '/home/leonardo/Desktop/python_script/output_folder'
ref = 200  # folders with more entries than this are only sampled
folders_root_dir = os.listdir(root_dir)
print folders_root_dir
count = len(folders_root_dir)
print count
for i in xrange(count):
    folder_inside = root_dir + '/' + folders_root_dir[i]
    print folder_inside
    # Counts every entry of the folder, subfolders included.
    number_files_folder_inside = len(os.listdir(folder_inside))
    print number_files_folder_inside
    if number_files_folder_inside > ref:
        ref_copy = round(0.2*number_files_folder_inside)
        print ref_copy
        # here I have to copy 20% of the files in this folder to the output folder
    else:
        # NOTE(review): a branch holding only a comment is a syntax error;
        # it needs at least "pass" until the copy code is written.
        # here I have to copy all files from the folder to the output_dir
I tried to use os.walk() but I'm new to python and selecting files while the function is working proved to be really tough.

You'll need to import these:
import os
import shutil
import random
You can get all the files in a directory like this:
files = [file for file in os.listdir(dir) if os.path.isfile(os.path.join(dir, file))]
Then use a conditional:
if len(files) < 200:
    # Few enough files: copy every one of them.
    # shutil.copy accepts a directory as destination; copyfile needs a full file name.
    for file in files:
        shutil.copy(os.path.join(dir, file), dst)
else:
    # Amount of random files you'd like to select
    random_amount = 1000
    # random.sample never returns the same file twice; the cap keeps it from
    # raising when fewer than random_amount files exist.
    for file in random.sample(files, min(random_amount, len(files))):
        shutil.copy(os.path.join(dir, file), dst)

A more compact solution (also noticing that copyfile does not really do the job properly unless one specifies the target file name as well):
import os
import shutil
import random
def get_file_list(input_dir):
    """Return the names of the regular files directly inside *input_dir*.

    Subdirectories are left out and the search does not descend into them.
    """
    return [
        name
        for name in os.listdir(input_dir)
        if os.path.isfile(os.path.join(input_dir, name))
    ]
def get_random_files(file_list, N):
    """Return up to *N* distinct entries of *file_list*, chosen at random.

    When *N* exceeds the number of entries, all of them are returned (in
    random order) instead of raising ValueError.
    """
    # random.sample refuses to draw more items than the population holds.
    return random.sample(file_list, min(N, len(file_list)))
def copy_files(random_files, input_dir, output_dir):
    """Copy each file named in *random_files* from *input_dir* into *output_dir*.

    *output_dir* is created first when missing. Without an existing directory,
    shutil.copy would treat *output_dir* as a file name and overwrite that one
    file for every source.
    """
    os.makedirs(output_dir, exist_ok=True)
    for name in random_files:
        shutil.copy(os.path.join(input_dir, name), output_dir)
def main(input_dir, output_dir, N):
    """Copy N randomly chosen files from input_dir into output_dir.

    Lists the files of input_dir, draws N of them with get_random_files and
    hands the selection to copy_files.
    """
    file_list = get_file_list(input_dir)
    random_files = get_random_files(file_list, N)
    copy_files(random_files, input_dir, output_dir)

import os
import shutil
import random
root_dir = '/home/leonardo/Desktop/python_script/qar'
output_dir = '/home/leonardo/Desktop/python_script/output_folder'
ref = 1  # folders holding more files than this are sampled instead of copied whole

# shutil.copy needs an existing folder; otherwise it writes a single file
# named like output_dir. Only create it when there is a tree to read from.
if os.path.isdir(root_dir) and not os.path.isdir(output_dir):
    os.makedirs(output_dir)

for root, dirs, files in os.walk(root_dir):
    if len(files) > ref:
        # Large folder: a random 20 % of its files. sample() draws from the
        # file names only and never repeats one, so the full share is copied.
        chosen = random.sample(files, int(round(0.2 * len(files))))
    else:
        # Small folder: every file.
        chosen = files
    for name in chosen:
        file_to_copy = os.path.join(root, name)
        if os.path.isfile(file_to_copy):
            shutil.copy(file_to_copy, output_dir)
            print(file_to_copy)
print('Finished !')
This is what the final code looks like.
thank you guys for the help !
cheers !

I want this for splitting my dataset to train,test and validation.
here is my code :
import os
import shutil
import random
import numpy as np
dir = r'E:\down\imgs'
train_dir = r'E:/train_test_split/train'
test_dir = r'E:/train_test_split/test'
valid_dir = r'E:/train_test_split/validation'
files = [file for file in os.listdir(dir) if os.path.isfile(os.path.join(dir, file))]
# 50 % train and 30 % test; validation receives whatever is left (about 20 %),
# so the three parts always add up to len(files).
train_count = int(np.round(50/100*len(files)))
test_count = int(np.round(30/100*len(files)))
valid_count = len(files) - train_count - test_count
# A random permutation of all file indices.
rndnums = random.sample(range(len(files)), len(files))
print("len(files)",len(files))
print(rndnums)
# Plain slice bounds: an extra "+1" here would hand the training set one file
# too many and shift the other two parts.
train_file_index = rndnums[:train_count]
train_file_name = [files[i] for i in train_file_index]
test_file_index = rndnums[train_count:train_count + test_count]
test_file_name = [files[i] for i in test_file_index]
valid_file_index = rndnums[train_count + test_count:]
valid_file_name = [files[i] for i in valid_file_index]
for target_dir, names in (
    (train_dir, train_file_name),
    (test_dir, test_file_name),
    (valid_dir, valid_file_name),
):
    # copyfile does not create the destination folder.
    os.makedirs(target_dir, exist_ok=True)
    for file in names:
        shutil.copyfile(os.path.join(dir, file), os.path.join(target_dir, file))

maybe something like (untested)
import os
import shutil

THRESHOLD = 200
# Placeholders - fill in the real paths. Forward slashes avoid the invalid
# "\h" escape sequence.
root_dir = "/home..."
output_dir = "/home....."
for top, dirs, nondirs in os.walk(root_dir):
    # Take at most THRESHOLD files from each directory of the tree.
    for name in nondirs[:THRESHOLD]:
        path = os.path.join(top, name)
        destination = os.path.join(output_dir, name)
        # Copy rather than rename, so the source tree stays intact.
        shutil.copy(path, destination)

import random
import shutil
import os
rootdir = '/home/leonardo/Desktop/python_script/qar'
outdir = '/home/leonardo/Desktop/python_script/output_folder'
ref = 200  # folders holding more files than this are sampled instead of copied whole
dirsAndFiles = {}  # here we store a structure {folder: [file1, file2], folder2: [file2, file4] }
dirs = [x[0] for x in os.walk(rootdir)]  # here we store all sub-dirs
for dir in dirs:
    dirsAndFiles[dir] = [f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]
# .items() works on Python 2 and 3 alike (iteritems is Python 2 only).
for (dir, files) in dirsAndFiles.items():
    if len(files) > ref:
        # copy 20% of files; sample() picks distinct names in one call
        chosen = random.sample(files, int(0.2*len(files)))
    else:
        # copy all files
        chosen = files
    for file in chosen:
        shutil.copy(os.path.join(dir, file), outdir)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

How can I classify files based on their extension in Python? - python

Try os.path.splitext(filename)[1] if you want to find the file extension. os.path.splitext returns a (root, ext) tuple: element 0 is the file name without its extension and element 1 is the extension, including the leading dot.

Related

move files to subdirectories that are named on part of the filenames

Move files recursively to new directory shutil.move

How do I get every file of an extension from a directory to another? I wrote a code but I'm getting an exception

Python iterating through folders and \ characters

Copying random files from a file tree

Categories

Resources