Goal: Clean up duplicate data for specified file types in a drive. Amongst a series of duplicates, the file that remains should be the most recently modified.
Problem: It removes duplicates, but it does not discriminate by modified date/time.
I got this code from G4G. I've only added the "if file.endswith" part.
https://www.geeksforgeeks.org/deleting-duplicate-files-using-python/
from tkinter.filedialog import askdirectory
# Importing required libraries.
from tkinter import Tk
import os
import hashlib
from pathlib import Path
import time
# Only duplicates whose file name ends with one of these suffixes are deleted.
TARGET_EXTENSIONS = (".txt", ".bmp")


def _md5_of(path, chunk_size=1 << 16):
    """Return the hex MD5 digest of the file at *path*.

    The file is read in chunks so large files are never held in memory at
    once, and the handle is closed as soon as hashing finishes.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def remove_duplicates(folder, extensions=TARGET_EXTENSIONS):
    """Delete duplicate files below *folder*, keeping the newest copy.

    Every file under *folder* (recursively) is grouped by the MD5 hash of its
    content. Within each group the most recently modified file is kept; when
    several copies share the newest modification time, the first one found is
    kept. Every other copy is deleted, but only if its name ends with one of
    *extensions* (case-sensitive), so other file types are never removed.

    Args:
        folder: Directory to scan.
        extensions: A suffix or iterable of suffixes eligible for deletion.

    Returns:
        A list of ``pathlib.Path`` objects for the files that were deleted.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(extensions)  # str.endswith only accepts a str or tuple

    # hash -> list of (modification time, path) for every file with that content
    groups = {}
    for root, _folders, files in os.walk(folder):
        for file in files:
            path = Path(os.path.join(root, file))
            try:
                entry = (path.stat().st_mtime, path)
                file_hash = _md5_of(path)
            except OSError as error:
                # Locked or unreadable files are common on a whole drive;
                # skip them instead of aborting the entire run.
                print(f"{path} skipped: {error}")
                continue
            groups.setdefault(file_hash, []).append(entry)

    deleted = []
    for entries in groups.values():
        # max() returns the first maximal element, so ties keep the first found.
        newest = max(entries, key=lambda item: item[0])
        for entry in entries:
            path = entry[1]
            if entry is newest or not path.name.endswith(suffixes):
                continue
            os.remove(path)
            print(f"{path} has been deleted")
            deleted.append(path)
    return deleted


def remove_duplicates_in_each_subfolder(parent, extensions=TARGET_EXTENSIONS):
    """Run :func:`remove_duplicates` separately on each immediate subfolder.

    Duplicates are only compared inside the same top-level subfolder of
    *parent*, never across two different subfolders. Files that sit directly
    in *parent* are left untouched.

    Returns:
        A list of every deleted ``pathlib.Path``.
    """
    deleted = []
    for entry in sorted(Path(parent).iterdir()):
        if entry.is_dir():
            deleted.extend(remove_duplicates(entry, extensions))
    return deleted


if __name__ == "__main__":
    # We don't want the GUI window of
    # tkinter to be appearing on our screen
    Tk().withdraw()
    # Dialog box for selecting a folder.
    selected_folder = askdirectory(title="Select a folder")
    # Nothing is selected when the dialog is cancelled; do nothing then.
    if selected_folder:
        # Use remove_duplicates_in_each_subfolder(selected_folder) instead to
        # treat every immediate subfolder as its own, independent scope.
        remove_duplicates(selected_folder)
Bonus Problem: In a drive that I am clearing of duplicates, there are 1148 folders. However, I do not want duplicates between the immediate different folders in the drive to be considered for removal. Is there a way to iterate this script over each folder instead of selecting each folder one by one?
Related
I use this script to make zipped backups of an important folder. After the 5th backup, the oldest zip files are moved to the Recycle Bin, where everyone can see them, so I am looking for a setpassword option to protect the deleted zips or, even better, a way to delete the old zips permanently (not move them to the Recycle Bin).
from datetime import datetime
from pathlib import Path
import zipfile
OBJECT_TO_BACKUP = '/home/etre/test/'  # The file or directory to backup
BACKUP_DIRECTORY = '/home/etre/test-backup/'  # The location to store the backups in
MAX_BACKUP_AMOUNT = 5  # The maximum amount of backups to have in BACKUP_DIRECTORY


def _existing_backups(backup_directory_path):
    """Return the backup zips in *backup_directory_path*, oldest first by name."""
    return sorted(
        (
            x for x in backup_directory_path.iterdir()
            if x.is_file() and x.suffix == '.zip' and x.name.startswith('backup-')
        ),
        key=lambda f: f.name,
    )


def create_backup(object_to_backup, backup_directory, max_backups=MAX_BACKUP_AMOUNT):
    """Zip *object_to_backup* into *backup_directory* and prune old backups.

    A new ``backup-<timestamp>-<name>.zip`` is written first. Only after it
    has been written successfully are the oldest existing backups (sorted by
    file name, which starts with the timestamp) deleted, so that at most
    *max_backups* archives remain. A failed run therefore never costs an
    existing backup.

    NOTE: the standard-library ``zipfile`` module can decrypt archives but
    cannot create encrypted ones; ``ZipFile.setpassword()`` only sets the
    default password used when *reading*. Password-protecting these backups
    needs an external tool or a third-party library.

    Args:
        object_to_backup: File or directory to archive.
        backup_directory: Directory that receives the zip (created if missing).
        max_backups: Maximum number of backup zips to keep, at least 1.

    Returns:
        The ``pathlib.Path`` of the newly created archive.

    Raises:
        ValueError: If *max_backups* is smaller than 1.
        FileNotFoundError: If *object_to_backup* does not exist.
    """
    if max_backups < 1:
        raise ValueError(f'max_backups must be at least 1, got {max_backups}')

    object_to_backup_path = Path(object_to_backup)
    backup_directory_path = Path(backup_directory)

    # Raise explicitly: an assert would be stripped when running with -O.
    if not object_to_backup_path.exists():
        raise FileNotFoundError(f'Nothing to back up at {object_to_backup_path}')

    # Validate the backup directory exists and create if required
    backup_directory_path.mkdir(parents=True, exist_ok=True)

    # Create zip file (for both file and folder options)
    backup_file_name = f'backup-{datetime.now().strftime("%Y%m%d%H%M%S")}-{object_to_backup_path.name}.zip'
    backup_file_path = backup_directory_path / backup_file_name
    try:
        # The context manager closes the archive even if writing fails.
        with zipfile.ZipFile(str(backup_file_path), mode='w') as zip_file:
            if object_to_backup_path.is_file():
                # If the object to write is a file, write the file
                zip_file.write(
                    object_to_backup_path.absolute(),
                    arcname=object_to_backup_path.name,
                    compress_type=zipfile.ZIP_DEFLATED,
                )
            elif object_to_backup_path.is_dir():
                # If the object to write is a directory, write all the files
                for file in object_to_backup_path.glob('**/*'):
                    if file.is_file():
                        zip_file.write(
                            file.absolute(),
                            arcname=str(file.relative_to(object_to_backup_path)),
                            compress_type=zipfile.ZIP_DEFLATED,
                        )
    except BaseException:
        # Do not leave a truncated archive behind that would later be
        # counted as a valid backup; the error is re-raised unchanged.
        if backup_file_path.exists():
            backup_file_path.unlink()
        raise

    # Enforce max backups: drop the oldest archives, never the one just made.
    older_backups = [
        backup for backup in _existing_backups(backup_directory_path)
        if backup != backup_file_path
    ]
    excess = len(older_backups) - (max_backups - 1)
    for backup_to_delete in older_backups[:max(excess, 0)]:
        # unlink() removes the file directly; Python itself never routes it
        # through a desktop trash / Recycle Bin.
        backup_to_delete.unlink()

    return backup_file_path


if __name__ == "__main__":
    create_backup(OBJECT_TO_BACKUP, BACKUP_DIRECTORY, MAX_BACKUP_AMOUNT)
I tried this, but it does not work:
` zip_file.write(
object_to_backup_path.absolute(),
arcname=object_to_backup_path.name,
compress_type=zipfile.ZIP_DEFLATED
setpassword(b'1234')
`
Official Python Zip File documentation is available here
You may find the following code useful:
from zipfile import ZipFile
import zipfile
# Open the archive in a context manager so its file handle is always closed,
# even if extraction raises (for example on a wrong password).
with ZipFile('test.zip') as myzip:
    # Default password used for any encrypted member read from this archive.
    myzip.setpassword(b"asasasasasas")
    # pwd= supplies the password for this single call; either this or the
    # setpassword() call above would be enough on its own.
    myzip.extract(member='Roughwork/pretify.html', pwd=b"asasasasasas")
Syntax:
ZipFile.extract(member, path=None, pwd=None)
Parameters:
member: the full name (or the ZipInfo object) of the file to be extracted.
path: the directory to extract to; if path is None, the member is extracted to the current working directory.
pwd: the password used for encrypted files, By default pwd is None.
useful link..
So, I made this little "application" that checks if a file ends with a specific extension(.png or .jpg), but my issue is if I turn it into a loop and I download something while the loop is running it won't move the file to the intended location. It only moves the file on startup.
import os
import shutil
# Folder to watch; left empty here as a placeholder.
DownloadsDir = ""
# NOTE: the directory is listed exactly once, before the loop starts, so
# Downloadslst never contains files that appear while the loop is running.
Downloadslst = os.listdir(DownloadsDir)
# Destination folder for the images; also a placeholder.
ImageFolder = ''
# Loops forever over the same, already captured list of names.
while True:
for files in Downloadslst:
# endswith() accepts a tuple, so either suffix matches.
if files.endswith(('.png','.jpg')):
# Plain string concatenation: DownloadsDir must end with a path separator.
shutil.move(DownloadsDir + files, ImageFolder)
print("File moved succefully.")
os.listdir(...) is a one-time operation that lists all the files in the current directory at the point of calling. The collection is fixed, not dynamically updated; it's just a simple list. And lists don't update themselves based on a seemingly random condition, like when the files inside a directory change. If you want your list to stay updated, you need to call the function multiple times.
I would do:
# Suffixes that mark a file as an image to be moved (note the leading dots).
IMAGE_SUFFIXES = ('.png', '.jpg')

processed = set()   # names already examined, so they are not re-checked
to_process = set()  # names first seen during the current pass

# has_smth_to_download and download_some_files() are placeholders for the
# caller's own "is there more work" condition and download step.
while has_smth_to_download:
    download_some_files()
    # Re-list the directory on every pass to pick up newly arrived files.
    for item in os.listdir(DownloadsDir):
        if item not in processed:
            to_process.add(item)
            processed.add(item)
    for files in to_process:
        if files.endswith(IMAGE_SUFFIXES):
            # os.path.join works whether or not DownloadsDir ends with a separator.
            shutil.move(os.path.join(DownloadsDir, files), ImageFolder)
            print("File moved succefully.")
            # The name is free again once the file has left the folder, so a
            # later download that reuses it must not be skipped.
            processed.discard(files)
    to_process.clear()
try this:
import os
import shutil
import time

DownloadsDir = ""
ImageFolder = ''
# Pause between scans so the loop does not keep a CPU core busy.
POLL_INTERVAL_SECONDS = 1

while True:
    # List the folder again on every pass; a list captured once before the
    # loop would never show files downloaded later.
    Downloadslst = os.listdir(DownloadsDir)
    for files in Downloadslst:
        if files.endswith(('.png','.jpg')):
            # os.path.join works whether or not DownloadsDir ends with a separator.
            shutil.move(os.path.join(DownloadsDir, files), ImageFolder)
            print("File moved succefully.")
    time.sleep(POLL_INTERVAL_SECONDS)
I am currently trying to make a program for file organisation. I want my script to open a filedialog that asks for a sourcefolder where the files will be organised in based on their extension. After a sourcefolder has been selected, I want my script to make directories in that sourcefolder. I am a bit stuck on that last part.
import os
import shutil
from os import listdir
from os.path import isfile, join
import tkinter as tk
from tkinter import filedialog
from tkinter import *
# Introduce the program, leave one blank line, then prompt for the folder.
for message in (
    'This is a program that organises files in a given directory',
    '',
    'Please select the folder in which you want to organise your files',
):
    print(message)

''' Source Folder '''
# A Tk root is created and immediately withdrawn, then the directory chooser
# is shown; source_path receives whatever the dialog returns.
root = Tk()
root.withdraw()
source_path = filedialog.askdirectory()
The part above works just fine.
''' Create destination folders in the source folder '''
# The dialog yields an empty value when it is cancelled; stop rather than
# create the folders relative to the current working directory.
if not source_path:
    raise SystemExit('No folder selected.')

# Join each folder name onto the selected directory. A literal such as
# r'source_path\Images' would instead name a directory called "source_path"
# relative to the working directory, because the variable is never used.
newpath1 = os.path.join(source_path, 'Images')
newpath2 = os.path.join(source_path, 'Documents')
newpath3 = os.path.join(source_path, 'Else')

for destination in (newpath1, newpath2, newpath3):
    # exist_ok=True makes the separate "does it already exist" check unnecessary.
    os.makedirs(destination, exist_ok=True)
This, however, does not. It just removes files, or places them somewhere I cannot find them.
Any help would be greatly appreciated.
TLDR: how to make directories in a sourcefolder submitted with filedialog.
Can anyone tell me how I can get the for loop at the end of this program
NOT to rename folders, only files? I'm at a loss on this one.
I'm assuming the command to use is if os.path.isdir(file)
but I cant seem to get it to work no matter where I put it, same goes for os.path.isfile().
I'm still at the stage (my first week) where I am confused by most commands and functions, though I did dabble in ZX BASIC\AMOS and STOS in the 80s\90s so I have a rudimentary understanding of variables etc.
#FRenum-v.05
#renumbers a folder of files 01 onward preserving file extensions.
#steve Shambles. june 2018, my 2nd ever python program
from tkinter import filedialog
from tkinter import *
import os
import os.path
import subprocess
# Let the user pick the directory whose entries will be renumbered.
root = Tk()
root.withdraw() #stop tk window opening
folder_selected = filedialog.askdirectory() #open file requestor
# Change into the selected folder so the bare names used below resolve there.
os.chdir (folder_selected)
# Read the selected directory.
# NOTE: os.listdir returns every entry name, sub-folders included, and
# nothing below filters them out, so folders are renamed as well as files.
files = os.listdir(folder_selected)
# inc is the counter used to build the next numbered name
inc = 1
for file in files:
# store the file extension (including the dot) in file_ext
file_ext = os.path.splitext(file)[1]
# build the new filename: a literal "0", then the value of inc, then the
# extension (so 1 -> "01", but 10 -> "010"; this is not fixed-width padding)
created_file=("0"+str(inc)+ file_ext)
# only rename when the target name is not already taken
if not os.path.exists(created_file):
os.rename(file,created_file)
# move on to the next number
inc = inc+1 #add to counter
# Display contents of folder in explorer (uses the Windows explorer.exe path)
# https://stackoverflow.com/questions/50892257/beginner-opening-explorer-to-show-folder-contents
# The forward slashes in folder_selected are converted to backslashes first.
subprocess.Popen(["C:\\Windows\\explorer.exe", folder_selected.replace('/', '\\')])
# thanks to Michael for this line of code
Import the helpers below; you can add this line at the beginning of the script:
from os.path import join , isfile
add this line before the loop:
# Keep only the entries of folder_selected that are regular files, so the
# rename loop that follows never sees a sub-folder.
files = [entry for entry in files if isfile(join(folder_selected, entry))]
Example of PDF: "Smith#00$Consolidated_Performance.pdf"
The goal is to add a bookmark to page 1 of each PDF based on the filename.
(Bookmark name in example would be "Consolidated Performance")
import os
from openpyxl import load_workbook
from PyPDF2 import PdfFileMerger
cdir = "Directory of PDF"  # Current directory
# Bare file names (no directory part) of every entry in cdir ending in ".pdf".
pdfcdir = [entry for entry in os.listdir(cdir) if entry.endswith(".pdf")]
def addbookmark(f):
    """Add a page-1 bookmark, named after the file, to the PDF at path *f*.

    The bookmark title is the part of the file name (without extension) that
    follows the first "$", with underscores replaced by spaces; for example
    "Smith#00$Consolidated_Performance.pdf" gives "Consolidated Performance".
    If the name contains no "$", the whole name is used.

    The result is written to a temporary file next to *f* and then moved over
    the original, so the PDF is never opened for writing while it is still
    being read, and a failure leaves the original file untouched.

    Args:
        f: Path to the PDF file to update in place.
    """
    name = os.path.splitext(os.path.basename(f))[0]  # Split filename from .pdf extension
    # partition() does not raise when "$" is missing, unlike str.index().
    _, found, tail = name.partition("$")
    bookmarkname = (tail if found else name).replace("_", " ")  # replace underscores with spaces

    temp_path = f + ".tmp"
    output = PdfFileMerger()
    try:
        # Keep the source open until write() has finished, so this stays
        # correct even if the merger reads pages from the stream lazily.
        with open(f, 'rb') as source:
            output.addBookmark(bookmarkname, 0, parent=None)  # Add bookmark
            output.append(source)
            with open(temp_path, 'wb') as target:
                output.write(target)
    except Exception:
        # Remove the partial output before re-raising the error.
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise
    finally:
        output.close()
    # Swap the finished file into place only after everything succeeded.
    os.replace(temp_path, f)
# NOTE: pdfcdir holds bare file names from os.listdir(cdir), so each f passed
# here is opened relative to the current working directory, not inside cdir.
for f in pdfcdir:
addbookmark(f)
The UDF works fine when applied to individual PDFs, but it won't add the bookmarks when put into the loop at the bottom of the code. Any ideas on how to make the UDF loop through all PDFs within pdfcdir?
I'm pretty sure that the issue you're having has nothing to do with the loop. Rather, you're passing just the filenames and not including the directory path. It's trying to open these files in the script's current working directory (the directory the script is in, by default) rather than in the directory you read the filenames from.
So, join the directory name with each file name when calling your function.
# Prefix every bare file name with its directory before handing it over, so
# the file is opened from cdir regardless of the working directory.
for pdf_name in pdfcdir:
    addbookmark(os.path.join(cdir, pdf_name))