ZIP folder with subfolder in python - python

I need to zip a folder that containts an .xml file and a .fgdb file by using python. Could anyone help me? I tried a few scripts I found on internet but there is always some technical issue (such as creating an empty zip file, or create zip file I cannot open 'no permission' etc..)
Thanks in advance.

The key to making it work is the os.walk() function. Here is a script I assembled in the past that should work. Let me know if you get any exceptions.
import zipfile
import os
import sys
def zipfolder(foldername, target_dir):
zipobj = zipfile.ZipFile(foldername + '.zip', 'w', zipfile.ZIP_DEFLATED)
rootlen = len(target_dir) + 1
for base, dirs, files in os.walk(target_dir):
for file in files:
fn = os.path.join(base, file)
zipobj.write(fn, fn[rootlen:])
zipfolder('thenameofthezipfile', 'thedirectorytobezipped') #insert your variables here
sys.exit()

This answer is very helpful, but it took me a moment to fully understand what rootlen was for. This example uses some longer variable names to help teach what exactly is going on here. Also included is a block to validate the zip file is correct.
import os
import zipfile
src_path = os.path.join('/', 'some', 'src', 'path')
archive_name = 'myZipFile.zip'
archive_path = os.path.join('/', 'some', 'path', archive_name)
with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as archive_file:
for dirpath, dirnames, filenames in os.walk(src_path):
for filename in filenames:
file_path = os.path.join(dirpath, filename)
archive_file_path = os.path.relpath(file_path, src_path)
archive_file.write(file_path, archive_file_path)
with zipfile.ZipFile(archive_path, 'r') as archive_file:
bad_file = zipfile.ZipFile.testzip(archive_file)
if bad_file:
raise zipfile.BadZipFile(
'CRC check failed for {} with file {}'.format(archive_path, bad_file))

If someone wants some generic function to utilize in your project
def zip_compression_tree(root_path, zip_name):
with zipfile.ZipFile(zip_name, 'w') as z:
for root, dirs, files in os.walk(root):
for file in files:
z.write(os.path.join(root, file))
for directory in dirs:
z.write(os.path.join(root, directory))

Related

How can I create a zipped file in Python that when unzipped will give me the raw files rather than a folder? [duplicate]

I have two files in two different directories, one is '/home/test/first/first.pdf', the other is '/home/text/second/second.pdf'. I use following code to compress them:
import zipfile, StringIO
buffer = StringIO.StringIO()
first_path = '/home/test/first/first.pdf'
second_path = '/home/text/second/second.pdf'
zip = zipfile.ZipFile(buffer, 'w')
zip.write(first_path)
zip.write(second_path)
zip.close()
After I open the zip file that I created, I have a home folder in it, then there are two sub-folders in it, first and second, then the pdf files. I don't know how to include only two pdf files instead of having full path zipped into the zip archive. I hope I make my question clear, please help.
The zipfile write() method supports an extra argument (arcname) which is the archive name to be stored in the zip file, so you would only need to change your code with:
from os.path import basename
...
zip.write(first_path, basename(first_path))
zip.write(second_path, basename(second_path))
zip.close()
When you have some spare time reading the documentation for zipfile will be helpful.
I use this function to zip a directory without include absolute path
import zipfile
import os
def zipDir(dirPath, zipPath):
zipf = zipfile.ZipFile(zipPath , mode='w')
lenDirPath = len(dirPath)
for root, _ , files in os.walk(dirPath):
for file in files:
filePath = os.path.join(root, file)
zipf.write(filePath , filePath[lenDirPath :] )
zipf.close()
#end zipDir
I suspect there might be a more elegant solution, but this one should work:
def add_zip_flat(zip, filename):
dir, base_filename = os.path.split(filename)
os.chdir(dir)
zip.write(base_filename)
zip = zipfile.ZipFile(buffer, 'w')
add_zip_flat(zip, first_path)
add_zip_flat(zip, second_path)
zip.close()
You can override the filename in the archive with the arcname parameter:
with zipfile.ZipFile(file="sample.zip", mode="w", compression=zipfile.ZIP_DEFLATED) as out_zip:
for f in Path.home().glob("**/*.txt"):
out_zip.write(f, arcname=f.name)
Documentation reference: https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile.write
Can be done that way also (this allow for creating archives >2GB)
import os, zipfile
def zipdir(path, ziph):
"""zipper"""
for root, _, files in os.walk(path):
for file_found in files:
abs_path = root+'/'+file_found
ziph.write(abs_path, file_found)
zipf = zipfile.ZipFile(DEST_FILE.zip, 'w', zipfile.ZIP_DEFLATED, allowZip64=True)
zipdir(SOURCE_DIR, zipf)
zipf.close()
As João Pinto said, the arcname argument of ZipFile.write is what you need. Also, reading the documentation of pathlib is helpful. You can easily get the relative path to something also with pathlib.Path.relative_to, no need to switch to os.path.
import zipfile
from pathlib import Path
folder_to_compress = Path("/path/to/folder")
path_to_archive = Path("/path/to/archive.zip")
with zipfile.ZipFile(
path_to_archive,
mode="w",
compression=zipfile.ZIP_DEFLATED,
compresslevel=7,
) as zip:
for file in folder_to_compress.rglob("*"):
relative_path = file.relative_to(folder_to_compress)
print(f"Packing {file} as {relative_path}")
zip.write(file, arcname=relative_path)

How to eliminate absolute path in zip archive if absolute paths for files are provided?

I have two files in two different directories, one is '/home/test/first/first.pdf', the other is '/home/text/second/second.pdf'. I use following code to compress them:
import zipfile, StringIO
buffer = StringIO.StringIO()
first_path = '/home/test/first/first.pdf'
second_path = '/home/text/second/second.pdf'
zip = zipfile.ZipFile(buffer, 'w')
zip.write(first_path)
zip.write(second_path)
zip.close()
After I open the zip file that I created, I have a home folder in it, then there are two sub-folders in it, first and second, then the pdf files. I don't know how to include only two pdf files instead of having full path zipped into the zip archive. I hope I make my question clear, please help.
The zipfile write() method supports an extra argument (arcname) which is the archive name to be stored in the zip file, so you would only need to change your code with:
from os.path import basename
...
zip.write(first_path, basename(first_path))
zip.write(second_path, basename(second_path))
zip.close()
When you have some spare time reading the documentation for zipfile will be helpful.
I use this function to zip a directory without include absolute path
import zipfile
import os
def zipDir(dirPath, zipPath):
zipf = zipfile.ZipFile(zipPath , mode='w')
lenDirPath = len(dirPath)
for root, _ , files in os.walk(dirPath):
for file in files:
filePath = os.path.join(root, file)
zipf.write(filePath , filePath[lenDirPath :] )
zipf.close()
#end zipDir
I suspect there might be a more elegant solution, but this one should work:
def add_zip_flat(zip, filename):
dir, base_filename = os.path.split(filename)
os.chdir(dir)
zip.write(base_filename)
zip = zipfile.ZipFile(buffer, 'w')
add_zip_flat(zip, first_path)
add_zip_flat(zip, second_path)
zip.close()
You can override the filename in the archive with the arcname parameter:
with zipfile.ZipFile(file="sample.zip", mode="w", compression=zipfile.ZIP_DEFLATED) as out_zip:
for f in Path.home().glob("**/*.txt"):
out_zip.write(f, arcname=f.name)
Documentation reference: https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile.write
Can be done that way also (this allow for creating archives >2GB)
import os, zipfile
def zipdir(path, ziph):
"""zipper"""
for root, _, files in os.walk(path):
for file_found in files:
abs_path = root+'/'+file_found
ziph.write(abs_path, file_found)
zipf = zipfile.ZipFile(DEST_FILE.zip, 'w', zipfile.ZIP_DEFLATED, allowZip64=True)
zipdir(SOURCE_DIR, zipf)
zipf.close()
As João Pinto said, the arcname argument of ZipFile.write is what you need. Also, reading the documentation of pathlib is helpful. You can easily get the relative path to something also with pathlib.Path.relative_to, no need to switch to os.path.
import zipfile
from pathlib import Path
folder_to_compress = Path("/path/to/folder")
path_to_archive = Path("/path/to/archive.zip")
with zipfile.ZipFile(
path_to_archive,
mode="w",
compression=zipfile.ZIP_DEFLATED,
compresslevel=7,
) as zip:
for file in folder_to_compress.rglob("*"):
relative_path = file.relative_to(folder_to_compress)
print(f"Packing {file} as {relative_path}")
zip.write(file, arcname=relative_path)

Python - Need to loop through directories looking for TXT files

I am a total Python Newb
I need to loop through a directory looking for .txt files, and then read and process them individually. I would like to set this up so that whatever directory the script is in is treated as the root of this action. For example if the script is in /bsepath/workDir, then it would loop over all of the files in workDir and its children.
What I have so far is:
#!/usr/bin/env python
import os
scrptPth = os.path.realpath(__file__)
for file in os.listdir(scrptPth)
with open(file) as f:
head,sub,auth = [f.readline().strip() for i in range(3)]
data=f.read()
#data.encode('utf-8')
pth = os.getcwd()
print head,sub,auth,data,pth
This code is giving me an invalid syntax error and I suspect that is because os.listdir does not like file paths in standard string format. Also I dont think that I am doing the looped action right. How do I reference a specific file in the looped action? Is it packaged as a variable?
Any help is appriciated
import os, fnmatch
def findFiles (path, filter):
for root, dirs, files in os.walk(path):
for file in fnmatch.filter(files, filter):
yield os.path.join(root, file)
Use it like this, and it will find all text files somewhere within the given path (recursively):
for textFile in findFiles(r'C:\Users\poke\Documents', '*.txt'):
print(textFile)
os.listdir expects a directory as input. So, to get the directory in which the script resides use:
scrptPth = os.path.dirname(os.path.realpath(__file__))
Also, os.listdir returns just the filenames, not the full path.
So open(file) will not work unless the current working directory happens to be the directory where the script resides. To fix this, use os.path.join:
import os
scrptPth = os.path.dirname(os.path.realpath(__file__))
for file in os.listdir(scrptPth):
with open(os.path.join(scrptPth, file)) as f:
Finally, if you want to recurse through subdirectories, use os.walk:
import os
scrptPth = os.path.dirname(os.path.realpath(__file__))
for root, dirs, files in os.walk(scrptPth):
for filename in files:
filename = os.path.join(root, filename)
with open(filename, 'r') as f:
head,sub,auth = [f.readline().strip() for i in range(3)]
data=f.read()
#data.encode('utf-8')

Browse files and subfolders in Python

I'd like to browse through the current folder and all its subfolders and get all the files with .htm|.html extensions. I have found out that it is possible to find out whether an object is a dir or file like this:
import os
dirList = os.listdir("./") # current directory
for dir in dirList:
if os.path.isdir(dir) == True:
# I don't know how to get into this dir and do the same thing here
else:
# I got file and i can regexp if it is .htm|html
and in the end, I would like to have all the files and their paths in an array. Is something like that possible?
You can use os.walk() to recursively iterate through a directory and all its subdirectories:
for root, dirs, files in os.walk(path):
for name in files:
if name.endswith((".html", ".htm")):
# whatever
To build a list of these names, you can use a list comprehension:
htmlfiles = [os.path.join(root, name)
for root, dirs, files in os.walk(path)
for name in files
if name.endswith((".html", ".htm"))]
I had a similar thing to work on, and this is how I did it.
import os
rootdir = os.getcwd()
for subdir, dirs, files in os.walk(rootdir):
for file in files:
#print os.path.join(subdir, file)
filepath = subdir + os.sep + file
if filepath.endswith(".html"):
print (filepath)
Hope this helps.
In python 3 you can use os.scandir():
def dir_scan(path):
for i in os.scandir(path):
if i.is_file():
print('File: ' + i.path)
elif i.is_dir():
print('Folder: ' + i.path)
dir_scan(i.path)
Use newDirName = os.path.abspath(dir) to create a full directory path name for the subdirectory and then list its contents as you have done with the parent (i.e. newDirList = os.listDir(newDirName))
You can create a separate method of your code snippet and call it recursively through the subdirectory structure. The first parameter is the directory pathname. This will change for each subdirectory.
This answer is based on the 3.1.1 version documentation of the Python Library. There is a good model example of this in action on page 228 of the Python 3.1.1 Library Reference (Chapter 10 - File and Directory Access).
Good Luck!
Slightly altered version of Sven Marnach's solution..
import os
folder_location = 'C:\SomeFolderName'
file_list = create_file_list(folder_location)
def create_file_list(path):
return_list = []
for filenames in os.walk(path):
for file_list in filenames:
for file_name in file_list:
if file_name.endswith((".txt")):
return_list.append(file_name)
return return_list
There are two ways works for me.
1. Work with the `os` package and use `'__file__'` to replace the main
directory when the project locates
import os
script_dir = os.path.dirname(__file__)
path = 'subdirectory/test.txt'
file = os.path.join(script_dir, path)
fileread = open(file,'r')
2. By using '\\' to read or write the file in subfolder
fileread = open('subdirectory\\test.txt','r')
from tkinter import *
import os
root = Tk()
file = filedialog.askdirectory()
changed_dir = os.listdir(file)
print(changed_dir)
root.mainloop()

Python recursive folder read

I have a C++/Obj-C background and I am just discovering Python (been writing it for about an hour).
I am writing a script to recursively read the contents of text files in a folder structure.
The problem I have is the code I have written will only work for one folder deep. I can see why in the code (see #hardcoded path), I just don't know how I can move forward with Python since my experience with it is only brand new.
Python Code:
import os
import sys
rootdir = sys.argv[1]
for root, subFolders, files in os.walk(rootdir):
for folder in subFolders:
outfileName = rootdir + "/" + folder + "/py-outfile.txt" # hardcoded path
folderOut = open( outfileName, 'w' )
print "outfileName is " + outfileName
for file in files:
filePath = rootdir + '/' + file
f = open( filePath, 'r' )
toWrite = f.read()
print "Writing '" + toWrite + "' to" + filePath
folderOut.write( toWrite )
f.close()
folderOut.close()
Make sure you understand the three return values of os.walk:
for root, subdirs, files in os.walk(rootdir):
has the following meaning:
root: Current path which is "walked through"
subdirs: Files in root of type directory
files: Files in root (not in subdirs) of type other than directory
And please use os.path.join instead of concatenating with a slash! Your problem is filePath = rootdir + '/' + file - you must concatenate the currently "walked" folder instead of the topmost folder. So that must be filePath = os.path.join(root, file). BTW "file" is a builtin, so you don't normally use it as variable name.
Another problem are your loops, which should be like this, for example:
import os
import sys
walk_dir = sys.argv[1]
print('walk_dir = ' + walk_dir)
# If your current working directory may change during script execution, it's recommended to
# immediately convert program arguments to an absolute path. Then the variable root below will
# be an absolute path as well. Example:
# walk_dir = os.path.abspath(walk_dir)
print('walk_dir (absolute) = ' + os.path.abspath(walk_dir))
for root, subdirs, files in os.walk(walk_dir):
print('--\nroot = ' + root)
list_file_path = os.path.join(root, 'my-directory-list.txt')
print('list_file_path = ' + list_file_path)
with open(list_file_path, 'wb') as list_file:
for subdir in subdirs:
print('\t- subdirectory ' + subdir)
for filename in files:
file_path = os.path.join(root, filename)
print('\t- file %s (full path: %s)' % (filename, file_path))
with open(file_path, 'rb') as f:
f_content = f.read()
list_file.write(('The file %s contains:\n' % filename).encode('utf-8'))
list_file.write(f_content)
list_file.write(b'\n')
If you didn't know, the with statement for files is a shorthand:
with open('filename', 'rb') as f:
dosomething()
# is effectively the same as
f = open('filename', 'rb')
try:
dosomething()
finally:
f.close()
If you are using Python 3.5 or above, you can get this done in 1 line.
import glob
# root_dir needs a trailing slash (i.e. /root/dir/)
for filename in glob.iglob(root_dir + '**/*.txt', recursive=True):
print(filename)
As mentioned in the documentation
If recursive is true, the pattern '**' will match any files and zero or more directories and subdirectories.
If you want every file, you can use
import glob
for filename in glob.iglob(root_dir + '**/**', recursive=True):
print(filename)
Agree with Dave Webb, os.walk will yield an item for each directory in the tree. Fact is, you just don't have to care about subFolders.
Code like this should work:
import os
import sys
rootdir = sys.argv[1]
for folder, subs, files in os.walk(rootdir):
with open(os.path.join(folder, 'python-outfile.txt'), 'w') as dest:
for filename in files:
with open(os.path.join(folder, filename), 'r') as src:
dest.write(src.read())
TL;DR: This is the equivalent to find -type f to go over all files in all folders below and including the current one:
for currentpath, folders, files in os.walk('.'):
for file in files:
print(os.path.join(currentpath, file))
As already mentioned in other answers, os.walk() is the answer, but it could be explained better. It's quite simple! Let's walk through this tree:
docs/
└── doc1.odt
pics/
todo.txt
With this code:
for currentpath, folders, files in os.walk('.'):
print(currentpath)
The currentpath is the current folder it is looking at. This will output:
.
./docs
./pics
So it loops three times, because there are three folders: the current one, docs, and pics. In every loop, it fills the variables folders and files with all folders and files. Let's show them:
for currentpath, folders, files in os.walk('.'):
print(currentpath, folders, files)
This shows us:
# currentpath folders files
. ['pics', 'docs'] ['todo.txt']
./pics [] []
./docs [] ['doc1.odt']
So in the first line, we see that we are in folder ., that it contains two folders namely pics and docs, and that there is one file, namely todo.txt. You don't have to do anything to recurse into those folders, because as you see, it recurses automatically and just gives you the files in any subfolders. And any subfolders of that (though we don't have those in the example).
If you just want to loop through all files, the equivalent of find -type f, you can do this:
for currentpath, folders, files in os.walk('.'):
for file in files:
print(os.path.join(currentpath, file))
This outputs:
./todo.txt
./docs/doc1.odt
The pathlib library is really great for working with files. You can do a recursive glob on a Path object like so.
from pathlib import Path
for elem in Path('/path/to/my/files').rglob('*.*'):
print(elem)
import glob
import os
root_dir = <root_dir_here>
for filename in glob.iglob(root_dir + '**/**', recursive=True):
if os.path.isfile(filename):
with open(filename,'r') as file:
print(file.read())
**/** is used to get all files recursively including directory.
if os.path.isfile(filename) is used to check if filename variable is file or directory, if it is file then we can read that file.
Here I am printing file.
If you want a flat list of all paths under a given dir (like find . in the shell):
files = [
os.path.join(parent, name)
for (parent, subdirs, files) in os.walk(YOUR_DIRECTORY)
for name in files + subdirs
]
To only include full paths to files under the base dir, leave out + subdirs.
I've found the following to be the easiest
from glob import glob
import os
files = [f for f in glob('rootdir/**', recursive=True) if os.path.isfile(f)]
Using glob('some/path/**', recursive=True) gets all files, but also includes directory names. Adding the if os.path.isfile(f) condition filters this list to existing files only
For my taste os.walk() is a little too complicated and verbose. You can do the accepted answer cleaner by:
all_files = [str(f) for f in pathlib.Path(dir_path).glob("**/*") if f.is_file()]
with open(outfile, 'wb') as fout:
for f in all_files:
with open(f, 'rb') as fin:
fout.write(fin.read())
fout.write(b'\n')
use os.path.join() to construct your paths - It's neater:
import os
import sys
rootdir = sys.argv[1]
for root, subFolders, files in os.walk(rootdir):
for folder in subFolders:
outfileName = os.path.join(root,folder,"py-outfile.txt")
folderOut = open( outfileName, 'w' )
print "outfileName is " + outfileName
for file in files:
filePath = os.path.join(root,file)
toWrite = open( filePath).read()
print "Writing '" + toWrite + "' to" + filePath
folderOut.write( toWrite )
folderOut.close()
os.walk does recursive walk by default. For each dir, starting from root it yields a 3-tuple (dirpath, dirnames, filenames)
from os import walk
from os.path import splitext, join
def select_files(root, files):
"""
simple logic here to filter out interesting files
.py files in this example
"""
selected_files = []
for file in files:
#do concatenation here to get full path
full_path = join(root, file)
ext = splitext(file)[1]
if ext == ".py":
selected_files.append(full_path)
return selected_files
def build_recursive_dir_tree(path):
"""
path - where to begin folder scan
"""
selected_files = []
for root, dirs, files in walk(path):
selected_files += select_files(root, files)
return selected_files
I think the problem is that you're not processing the output of os.walk correctly.
Firstly, change:
filePath = rootdir + '/' + file
to:
filePath = root + '/' + file
rootdir is your fixed starting directory; root is a directory returned by os.walk.
Secondly, you don't need to indent your file processing loop, as it makes no sense to run this for each subdirectory. You'll get root set to each subdirectory. You don't need to process the subdirectories by hand unless you want to do something with the directories themselves.
Try this:
import os
import sys
for root, subdirs, files in os.walk(path):
for file in os.listdir(root):
filePath = os.path.join(root, file)
if os.path.isdir(filePath):
pass
else:
f = open (filePath, 'r')
# Do Stuff
If you prefer an (almost) Oneliner:
from pathlib import Path
lookuppath = '.' #use your path
filelist = [str(item) for item in Path(lookuppath).glob("**/*") if Path(item).is_file()]
In this case you will get a list with just the paths of all files located recursively under lookuppath.
Without str() you will get PosixPath() added to each path.
This worked for me:
import glob
root_dir = "C:\\Users\\Scott\\" # Don't forget trailing (last) slashes
for filename in glob.iglob(root_dir + '**/*.jpg', recursive=True):
print(filename)
# do stuff
If just the file names are not enough, it's easy to implement a Depth-first search on top of os.scandir():
stack = ['.']
files = []
total_size = 0
while stack:
dirname = stack.pop()
with os.scandir(dirname) as it:
for e in it:
if e.is_dir():
stack.append(e.path)
else:
size = e.stat().st_size
files.append((e.path, size))
total_size += size
The docs have this to say:
The scandir() function returns directory entries along with file attribute information, giving better performance for many common use cases.

Categories

Resources