As part of a wider project (to learn) I am building a script to discover discovering the files recursively in a folder. Then adding the filename (including the path) and the size in bytes to a CSV file.
I've then loaded that CSV file as a python dictionary.
What I would like to do now, is have python parse over each value in the dictionary (which is the size) and compare it to all others in the dictionary. If it finds a match, I want it to show me which keys (file name) have the matching values. I'll then do an MD5 hash on those that appear to have the same size.
The code below is as far as I've got - can anyone assist please?
#!/usr/bin/env python3
import argparse
import os
import sys
import csv
import fnmatch
def verify_args():
parser = argparse.ArgumentParser(description='Compare files recursively.')
parser.add_argument('path', help='Location to begin file comparison from.')
check = parser.parse_args()
if os.path.isdir(check.path):
print(check.path,'is a valid path - continuing' + '\n')
else:
print(check.path,'is an invalid path - exiting' + '\n')
sys.exit()
return parser.parse_args()
def listfiles(file_path):
print ('Starting comparison')
pattern = '*'
with open('/tmp/foo','w') as fo:
fo.write('file,size' + '\n')
for root, dirs, files in os.walk(file_path):
for filename in fnmatch.filter(files, pattern):
fo.write(os.path.join(root, filename) + ',' + str(os.path.getsize(os.path.join(root, filename))) + '\n')
files = {}
with open('/tmp/foo') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
files[row['file']] = row['size']
x = files.keys()
print(x)
# Not sure now what to do
def main():
args = verify_args()
file_path = args.path
listfiles(file_path)
if __name__ == '__main__':
main()
Related
My code currently unzips one zip folder and finds the file called file.txt and extracts it. Now I need to unzip multiple folders that have the extension .zip. I have tried to use code similar to what I need it to do but the problem is that now I have to find a file called file.txt in each of those .zip folders and extract that file only . Also to store file.txt into a separate folder that has the same name where it came from. Thank you in advance for your time.
import re
import os
from zipfile import ZipFile
def pain():
print("\t\t\tinput_files.zip has been unzipped")
with ZipFile('input_files.zip', 'r') as zipObj:
zipObj.extractall()
listOfFileNames = zipObj.namelist()
for fileName in listOfFileNames:
if fileName.endswith('.txt'):
zipObj.extract(fileName, 'storage')
outfile = "output2.txt" #this will be the filename that the code will write to
baconFile = open(outfile,"wt")
file_name1 = "file.txt"
print('Filename\tLine\tnumber of numbers\tstring separated by a comma\twhite space found\ttab found\tcarriage return found\n') #This prints the master column in the python shell and this is the way the code should collect the data
baconFile.write('Filename\tLine\tnumber of numbers\tstring separated by a comma\twhite space found\ttab found\tcarriage return found\n') #This prints the master column in the output file and this is the way the code should collect the data
#for filename in os.listdir(os.getcwd() + "/input_files"):
for filename in os.listdir('C:\Users\M29858\Desktop\TestPy\Version10\input_files'):
with open("input_files/" + filename, 'r') as f:
if file_name1 in filename:
output_contents(filename, f, baconFile)
baconFile.close() #closes the for loop that the code is writing to
def output_contents(filename, f, baconFile): #using open() function to open the file inside the directory
index = 0
for line in f:
#create a list of all of the numerical values in our line
content = line.split(',') #this will be used to count the amount numbers before and after comma
whitespace_found = False
tab_found = False
false_string = "False (end of file)"
carriage_found = false_string
sigfigs = ""
index += 1 #adds 1 for every line if it finds what the command wants
if " " in line: #checking for whitespace
whitespace_found = True
if "\t" in line: #checking for tabs return
tab_found = True
if '\n' in line: #checking if there is a newline after the end of each line
carriage_found = True
sigfigs = (','.join(str(len(g)) for g in re.findall(r'\d+\.?(\d+)?', line ))) #counts the sigsfigs after decimal point
print(filename + "\t{0:<4}\t{1:<17}\t{2:<27}\t{3:17}\t{4:9}\t{5:21}"
.format(index, len(content), sigfigs, str(whitespace_found), str(tab_found), str(carriage_found))) #whatever is inside the .format() is the way it the data is stored into
baconFile.write('\n')
baconFile.write( filename + "\t{0:<4}\t{1:<17}\t{2:<27}\t{3:17}\t{4:9}\t{5:21}"
.format(index, len(content), sigfigs, str(whitespace_found), str(tab_found), str(carriage_found)))
if __name__ == '__main__':
pain()
#THIS WORKS
import glob
import os
from zipfile import ZipFile
def main():
for fname in glob.glob("*.zip"): # get all the zip files
with ZipFile(fname) as archive:
# if there's no file.txt, ignore and go on to the next zip file
if 'file.txt' not in archive.namelist(): continue
# make a new directory named after the zip file
dirname = fname.rsplit('.',1)[0]
os.mkdir(dirname)
extract file.txt into the directory you just created
archive.extract('file.txt', path=dirname)
The code I am working with takes in a .pdf file, and outputs a .txt file. My question is, how do I create a loop (probably a for loop) which runs the code over and over again on all files in a folder which end in ".pdf"? Furthermore, how do I change the output each time the loop runs so that I can write a new file each time, that has the same name as the input file (ie. 1_pet.pdf > 1_pet.txt, 2_pet.pdf > 2_pet.txt, etc.)
Here is the code so far:
path="2_pet.pdf"
content = getPDFContent(path)
encoded = content.encode("utf-8")
text_file = open("Output.txt", "w")
text_file.write(encoded)
text_file.close()
The following script solve your problem:
import os
sourcedir = 'pdfdir'
dl = os.listdir('pdfdir')
for f in dl:
fs = f.split(".")
if fs[1] == "pdf":
path_in = os.path.join(dl,f)
content = getPDFContent(path_in)
encoded = content.encode("utf-8")
path_out = os.path.join(dl,fs[0] + ".txt")
text_file = open(path_out, 'w')
text_file.write(encoded)
text_file.close()
Create a function that encapsulates what you want to do to each file.
import os.path
def parse_pdf(filename):
"Parse a pdf into text"
content = getPDFContent(filename)
encoded = content.encode("utf-8")
## split of the pdf extension to add .txt instead.
(root, _) = os.path.splitext(filename)
text_file = open(root + ".txt", "w")
text_file.write(encoded)
text_file.close()
Then apply this function to a list of filenames, like so:
for f in files:
parse_pdf(f)
One way to operate on all PDF files in a directory is to invoke glob.glob() and iterate over the results:
import glob
for path in glob.glob('*.pdf')
content = getPDFContent(path)
encoded = content.encode("utf-8")
text_file = open("Output.txt", "w")
text_file.write(encoded)
text_file.close()
Another way is to allow the user to specify the files:
import sys
for path in sys.argv[1:]:
...
Then the user runs your script like python foo.py *.pdf.
You could use a recursive function to search the folders and all subfolders for files that end with pdf. Than take those files and then create a text file for it.
It could be something like:
import os
def convert_PDF(path, func):
d = os.path.basename(path)
if os.path.isdir(path):
[convert_PDF(os.path.join(path,x), func) for x in os.listdir(path)]
elif d[-4:] == '.pdf':
funct(path)
# based entirely on your example code
def convert_to_txt(path):
content = getPDFContent(path)
encoded = content.encode("utf-8")
file_path = os.path.dirname(path)
# replace pdf with txt extension
file_name = os.path.basename(path)[:-4]+'.txt'
text_file = open(file_path +'/'+file_name, "w")
text_file.write(encoded)
text_file.close()
convert_PDF('path/to/files', convert_to_txt)
Because the actual operation is changeable, you can replace the function with whatever operation you need to perform (like using a different library, converting to a different type, etc.)
Suppose I have a text file aiq_hits.txt.
Each line in this file corresponds a filename
ant1.aiq
ant2.aiq
ant3.aiq
ant4.aiq
I want to match each line of my textfile (ant1.aiq,ant2.aiq and so on) with filenames which are present at some specific place(R:\Sample) and extract matching files into some other place (R:\sample\wsa).
I have an idea that I need to use functions like os.walk() and fnmatch.fnmatch(), shutil.copy() but I am not able to implement them
My code:
import os
import shutil
import fnmatch
with open("aiq_hits.txt","r") as in_file:
for line in in_file:
I am stuck here
import os
import shutil
sourceDir = "R:\\Sample"
targetDir = "R:\\Sample\\wsa"
existingFiles = set(f for f in os.listdir(sourceDir) if os.path.isfile(os.path.join(sourceDir, f)))
infilepath = "aiq_hits.txt"
with open(infilepath) as infile:
for line in infile:
fname = line.strip()
if fname not in existingFiles: continue
shutil.move(os.path.join(sourceDir, fname), os.path.join(targetDir, fname))
I hope this will suffice:
import os
def match_files(url,file_read, dest):
f = open(file_read, 'rb')
file_list = os.listdir(url)
print(file_list)
saved_path = os.getcwd()
print("Current working directory is " + saved_path)
os.chdir(url)
match = []
for file_name in f:
file_name = file_name.strip()
if file_name in file_list:
match.append(file_name)
os.rename(os.path.join(url, file_name), os.path.join(dest, file_name))
os.chdir(saved_path)
print match
here, url is source directory or folder from which u want to match files, file_read is the name of file (with path) in which list of file names is given, dest is the destination folder.
this code moves the matching files from url to dest, i.e. these files won't remin in url after running the code.
Alternatively you could use the glob module which allows you to enter in a expression for the file name\extension which will then return a list that you can loop over.
I'd use this module if the source directory can have files with the same extension that you want to exclude from being looped over
Also I'm assuming that the file name list is not large and so storing it in a list wont be an issue
eg (I haven't tested the below )
from glob import glob
import os
import shutil
src = 'R:\\Sample'
dst = "R:\\Sample\\wsa"
in_file_list = "aiq_hits.txt"
list_Of_files = glob(os.path.join(src, 'ant*.aiq'))
data = []
with open(in_file_list) as reader:
data += reader.readlines()
for row in list_Of_files:
file_path, file_name = os.path.split(row)
if file_name in data:
shutil.copy2(row, os.path.join(dst, file_name))
# or if you want to move the file
# shutil.move(row, os.path.join(dst, file_name))
I have a piece of code i wrote for school:
import os
source = "/home/pi/lab"
dest = os.environ["HOME"]
for file in os.listdir(source):
if file.endswith(".c")
shutil.move(file,dest+"/c")
elif file.endswith(".cpp")
shutil.move(file,dest+"/cpp")
elif file.endswith(".sh")
shutil.move(file,dest+"/sh")
what this code is doing is looking for files in a source directory and then if a certain extension is found the file is moved to that directory. This part works. If the file already exists in the destination folder of the same name add 1 at end of the file name, and before the extension and if they are multiples copies do "1++".
Like this: test1.c,test2.c, test3.c
I tried using os.isfile(filename) but this only looks at the source directory. and I get a true or false.
To test if the file exists in the destination folder you should os.path.join the dest folder with the file name
import os
import shutil
source = "/home/pi/lab"
dest = os.environ["HOME"]
# Avoid using the reserved word 'file' for a variable - renamed it to 'filename' instead
for filename in os.listdir(source):
# os.path.splitext does exactly what its name suggests - split the name and extension of the file including the '.'
name, extension = os.path.splitext(filename)
if extension == ".c":
dest_filename = os.path.join(dest, filename)
if not os.path.isfile(dest_filename):
# We copy the file as is
shutil.copy(os.path.join(source, filename) , dest)
else:
# We rename the file with a number in the name incrementing the number until we find one that is not used.
# This should be moved to a separate function to avoid code duplication when handling the different file extensions
i = 0
dest_filename = os.path.join(dest, "%s%d%s" % (name, i, extension))
while os.path.isfile(dest_filename):
i += 1
dest_filename = os.path.join(dest, "%s%d%s" % (name, i, extension))
shutil.copy(os.path.join(source, filename), dest_filename)
elif extension == ".cpp"
...
# Handle other extensions
If you want to have put the renaming logic in a separate function using glob and re this is one way:
import glob
import re
...
def rename_file(source_filename, source_ext):
filename_pattern = os.path.join(dest, "%s[0-9]*%s"
% (source_filename, source_ext))
# Contains file such as 'a1.c', 'a2.c', etc...
existing_files = glob.glob(filename_pattern)
regex = re.compile("%s([0-9]*)%s" % (source_filename, source_ext))
# Retrieve the max of the index used for this file using regex
max_index = max([int(match.group(1))
for match in map(regex.search, existing_files)
if match])
source_full_path = os.path.join(source, "%s%s"
% (source_filename, source_ext))
# Rebuild the destination filename with the max index + 1
dest_full_path = os.path.join(dest, "%s%d%s"
% (source_filename,
(max_index + 1),
source_ext))
shutil.copy(source_full_path, dest_full_path)
...
# If the file already exists i.e. replace the while loop in the else statement
rename_file(name, extension)
I din't test the code. But something like this should do the job:-
i = 0
filename = "a.txt"
while True:
if os.isfile(filename):
i+= 1
break
if i:
fname, ext = filename.split('.')
filename = fname + str(i) + '.' + ext
My program runs smoothly but I want my files from ftp to be zip in my local drive
The problem is only 1 file is being zipped after calling my main() function
Here's my code:
import os
import upload
import download
import zipfile
import ConfigParser
import ftputil
def main():
#create a folder Temp on d drive for later use
path = r'D:\Temp'
os.mkdir(path)
#parse all the values at config.ini file
config = ConfigParser.ConfigParser()
config.readfp(open('config.ini'))
server = config.get('main', 'Server')
username = config.get('main', 'Username')
password = config.get('main', 'Password')
uploads = config.get('main', 'Upload folder')
downloads = config.get('main', 'Download folder')
#connect to ftp
ftp = ftputil.FTPHost(server, username, password)
dirlist = ftp.listdir(downloads)
for list in dirlist:
ftp.chdir(downloads)
target = os.path.join(path, list)
ftp.download(list, target)
#########################################################
# THis section is where algo fails but the program run#
########################################################
#zipping files
absolute_path = r'D:\Temp'
dirlist = os.listdir(absolute_path)
filepath = r'D:\Temp\project2.zip'
for list in dirlist:
get_file = os.path.join(absolute_path, list)
zip_name = zipfile.ZipFile(filepath, 'w')
zip_name.write(get_file, 'Project2b\\' + list)
if __name__ == '__main__':
print "cannot be"
When you do this :
for list in dirlist:
get_file = os.path.join(absolute_path, list)
zip_name = zipfile.ZipFile(filepath, 'w')
zip_name.write(get_file, 'Project2b\\' + list)
you recreate a ZipFile for each file you want to zip, the "w" mode means you recreate it from scratch.
Try this (create the zip file before the loop) :
zip_name = zipfile.ZipFile(filepath, 'w')
for list in dirlist:
get_file = os.path.join(absolute_path, list)
zip_name.write(get_file, 'Project2b\\' + list)
Or this, it will open the zipfile in append mode:
for list in dirlist:
get_file = os.path.join(absolute_path, list)
zip_name = zipfile.ZipFile(filepath, 'a')
zip_name.write(get_file, 'Project2b\\' + list)
Have a look at the shutil module. There is an example using shutil.make_archive():
http://docs.python.org/library/shutil.html
If you have a lot of files you can zip them in parallel:
import zipfile
from pathlib import Path, WindowsPath
from typing import List, Text
import logging
from time import time
from concurrent.futures import ThreadPoolExecutor
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%H:%M:%S", level=logging.DEBUG
)
PATH = (r"\\some_directory\subdirectory\zipped")
def file_names() -> List[WindowsPath]:
p = Path(PATH)
file_names = list(p.glob("./*.csv"))
logging.info("There are %d files", len(file_names))
return file_names
def zip_file(file: WindowsPath) -> None:
zip_file_name = Path(PATH, f"{file.stem}.zip")
with zipfile.ZipFile(zip_file_name, "w") as zip:
zip.write(file, arcname=file.name, compress_type=zipfile.ZIP_DEFLATED)
def main(files: List[Text]) -> None:
t0 = time()
number_of_files = len(files)
with ThreadPoolExecutor() as executor:
for counter, _ in enumerate(executor.map(zip_file, files), start=1):
# update progress every 100 files
if counter % 100 == 0:
logging.info(
"Processed %d/%d. TT: %d:%d",
counter,
number_of_files,
*divmod(int(time() - t0), 60),
)
logging.info(
"Finished zipping %d files. Total time: %d:%d",
len(files),
*divmod(int(time() - t0), 60),
)
if __name__ == "__main__":
files = file_names()
main(files)
Best way to do this is by putting debug statements at your for loops, there are two possibilities;
one is that the first forloop only downloads one file from the ftp folder
two is that the first loop downloads all files but second loop zips only one of them
use print statements to see which files are downloaded/zipped at the loops, good luck