How to convert tar.gz file to zip using Python only? - python

Does anybody have any code for converting a tar.gz file into a zip using only Python code? I have been facing many issues with tar.gz files, as mentioned in "How can I read tar.gz file using pandas read_csv with gzip compression option?".

You would have to use the tarfile module, with mode 'r|gz' for reading.
Then use zipfile for writing.
import tarfile, zipfile

# Stream the gzip-compressed tar ('r|gz' reads sequentially, without seeking)
# and copy every regular file into a deflate-compressed zip archive.
# mode='w' starts a fresh archive so that re-running the script does not
# append duplicate entries; the context managers close both files even if a
# member fails to convert.
with tarfile.open(name='mytar.tar.gz', mode='r|gz') as tarf, \
        zipfile.ZipFile(file='myzip.zip', mode='w',
                        compression=zipfile.ZIP_DEFLATED) as zipf:
    for m in tarf:
        # extractfile() returns None for directories and other non-file
        # members, so only regular files are copied.
        if not m.isfile():
            continue
        f = tarf.extractfile(m)
        zipf.writestr(m.name, f.read())
You can use is_tarfile() to check for a valid tar file.
Perhaps you could also use shutil, but I think it cannot work on memory.
PS: From the brief testing that I performed, you may have issues with members m which are directories.
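If so, check each member with m.isdir() (or m.isfile()) and skip anything that is not a regular file. Alternatively, first collect the info on every member with tarf.getmembers() and then reopen the tar.gz file to transfer the data to the zip: in stream mode ('r|gz') you cannot read the members after calling tarf.getmembers(), because the stream cannot seek backwards.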

This just fixes a couple of tiny issues from the above answer, makes sure the mtime is preserved and makes sure compression is happening on all the files. All credit to the above for the simple answer.
from datetime import datetime
import sys
import tarfile
from zipfile import ZipFile, ZIP_DEFLATED, ZipInfo

compresslevel = 9
compression = ZIP_DEFLATED

# The zip format cannot store timestamps earlier than 1980-01-01; ZipInfo
# raises ValueError for them, so older mtimes are clamped to this value.
_ZIP_EPOCH = (1980, 1, 1, 0, 0, 0)


def convert_tar_gz_to_zip(tar_path, zip_path):
    """Copy every regular file of a .tar.gz archive into a new zip archive.

    The tar is read as a stream ('r|gz'); directories and other non-file
    members are skipped. Each entry keeps the member's mtime (local time)
    and is deflate-compressed at ``compresslevel``.

    Args:
        tar_path: path of the gzip-compressed tar archive to read.
        zip_path: path of the zip archive to create (overwritten if present).
    """
    with tarfile.open(name=tar_path, mode='r|gz') as tarf, \
            ZipFile(file=zip_path, mode='w', compression=compression,
                    compresslevel=compresslevel) as zipf:
        for m in tarf:
            mtime = datetime.fromtimestamp(m.mtime)
            print(f'{mtime} - {m.name}')
            if not m.isfile():
                # for directories and other types; skipped before ZipInfo is
                # built so that their timestamps can never abort the run
                continue
            date_time = (mtime.year, mtime.month, mtime.day,
                         mtime.hour, mtime.minute, mtime.second)
            zinfo = ZipInfo(filename=m.name,
                            date_time=max(date_time, _ZIP_EPOCH))
            f = tarf.extractfile(m)
            zipf.writestr(zinfo, f.read(), compress_type=compression,
                          compresslevel=compresslevel)


if __name__ == '__main__':
    # usage: script.py INPUT.tar.gz OUTPUT.zip
    convert_tar_gz_to_zip(sys.argv[1], sys.argv[2])
    print('done.')

Related

How to extract a multi-part zip file in python?

Suppose that I have some files that I downloaded from a server, and they were zipped with 7zip in multiple parts; the names look like myfile.zip.001, myfile.zip.002, ..., myfile.zip.00n. Basically, I need to extract their content into the same folder where they are stored.
I tried using zipfile, patoolib and pyunpack without success, here is what I've done:
file_path = r"C:\Users\user\Documents\myfile.zip.001"  # I also tested with only .zip
# A raw string literal cannot end with a single backslash, so the trailing
# path separator is left off.
extract_path = r"C:\Users\user\Documents"

# Attempt 1: the standard-library zipfile module
import zipfile
with zipfile.ZipFile(file_path, "r") as zip_ref:
    zip_ref.extractall(extract_path)  # myfile.zip.001 file isn't zip file.

# Attempt 2: pyunpack
from pyunpack import Archive
Archive(file_path).extractall(extract_path)  # File is not a zip file

# Attempt 3: patool
import patoolib
patoolib.extract_archive(file_path, outdir=extract_path)  # unknown archive format for file `myfile.zip.001'
Another way (that works, but it's very ugly) is this one:
import os
import subprocess

path_7zip = r"C:\Program Files (x86)\7-Zip\7z.exe"
cmd = [path_7zip, 'x', 'myfile.zip.001']
# run() waits for 7-Zip to finish and drains the captured output, so the
# child can neither block on a full pipe nor still be extracting when the
# script moves on. check=True raises CalledProcessError on a non-zero exit.
sp = subprocess.run(cmd, stderr=subprocess.STDOUT, stdout=subprocess.PIPE, check=True)
But this makes the user install 7zip in his computer, which isn't a good approach of what I'm looking for.
So, the question is: is there a way to extract/unzip multi-part files named like x.zip.001 in Python?
You seem to be on the right track with zipfile, but you most likely have to concatenate the zip file before using extractall.
import glob
import os
import shutil
import zipfile

zip_prefix = "myfile.zip."
joined_zip = "myfile.zip"

# Collect the numbered parts (myfile.zip.001, myfile.zip.002, ...) and put
# them in numeric order; anything whose suffix is not a number is ignored.
parts = sorted(
    (p for p in glob.glob(glob.escape(zip_prefix) + '*')
     if p[len(zip_prefix):].isdigit()),
    key=lambda p: int(p[len(zip_prefix):]),
)

# Concatenate (copied in chunks, so a large part is never held in memory)
with open(joined_zip, "wb") as outfile:
    for filename in parts:
        with open(filename, "rb") as infile:
            shutil.copyfileobj(infile, outfile)

# Extract from the joined file, not from the first part.
# extract_path is the destination directory defined in the question's code.
with zipfile.ZipFile(joined_zip, "r") as zip_ref:
    zip_ref.extractall(extract_path)

Zip single file

I am trying to zip a single file in python. For whatever reason, I'm having a hard time getting down the syntax. What I am trying to do is keep the original file and create a new zipped file of the original (like what a Mac or Windows would do if you archive a file).
Here is what I have so far:
import zipfile
# Path of the file to archive and of the zip to create next to it.
myfilepath = '/tmp/%s' % self.file_name
myzippath = myfilepath.replace('.xml', '.zip')
# NOTE(review): ZipFile.write() takes the *name* of a file on disk, but this
# call passes the file's contents instead.
zipfile.ZipFile(myzippath, 'w').write(open(myfilepath).read()) # does not zip the file properly
The correct way to zip file is:
# The with-block closes the archive; without close() the essential zip
# records are not guaranteed to be written.
with zipfile.ZipFile('hello.zip', mode='w') as zipf:
    zipf.write("hello.csv")
# assume your xxx.py under the same dir with hello.csv
The python official doc says:
ZipFile.write(filename, arcname=None, compress_type=None)
Write the file named filename to the archive, giving it the archive name arcname
You pass open(filename).read() into write(). open(filename).read() is a single string that contains the whole content of file filename, it would throw FileNotFoundError because it is trying to find a file named with the string content.
If the file to be zipped (filename) is in a different directory called pathname, you should use the arcname parameter. Otherwise, it will recreate the full folder hierarchy to the file folder.
from zipfile import ZipFile
import os

# zip_file, pathname and filename are placeholders supplied by the caller.
# arcname stores the file under its bare name instead of recreating the
# directory hierarchy of pathname inside the archive.
with ZipFile(zip_file, 'w') as zipf:
    zipf.write(os.path.join(pathname, filename), arcname=filename)
Try calling zipfile.close() afterwards?
from zipfile import ZipFile, ZIP_DEFLATED

# ZIP_DEFLATED is imported by name: the module name `zipfile` itself is not
# bound by a `from zipfile import ...` statement.
with ZipFile("main.zip", "w", ZIP_DEFLATED) as zipf:
    zipf.write("main.json")
Since you also want to specify the directory try using os.chdir:
#!/usr/bin/python
from zipfile import ZipFile
import os

# Work inside the directory that holds the source file and will receive the
# archive, so the file is stored without any leading directories.
os.chdir('/path/of/target/and/destination')
# The with-block closes the archive so that it is completely written.
with ZipFile('archive.zip', 'w') as zipf:
    zipf.write('original_file.txt')
Python zipfile : Work with Zip archives
Python Miscellaneous operating system interfaces

How can I create a zipped file in Python that when unzipped will give me the raw files rather than a folder? [duplicate]

I have two files in two different directories, one is '/home/test/first/first.pdf', the other is '/home/text/second/second.pdf'. I use following code to compress them:
import zipfile, StringIO
# In-memory buffer that receives the zip archive.
buffer = StringIO.StringIO()
first_path = '/home/test/first/first.pdf'
second_path = '/home/text/second/second.pdf'
zip = zipfile.ZipFile(buffer, 'w')
# write() is called without an archive name, so each file is stored under
# its full directory path (home/.../first.pdf) inside the archive.
zip.write(first_path)
zip.write(second_path)
zip.close()
After I open the zip file that I created, I have a home folder in it, then there are two sub-folders in it, first and second, then the pdf files. I don't know how to include only two pdf files instead of having full path zipped into the zip archive. I hope I make my question clear, please help.
The zipfile write() method supports an extra argument (arcname) which is the archive name to be stored in the zip file, so you would only need to change your code with:
from os.path import basename
...
# The second argument is the name stored in the archive; basename() drops
# the directory part so only the bare file name is kept.
zip.write(first_path, basename(first_path))
zip.write(second_path, basename(second_path))
zip.close()
When you have some spare time reading the documentation for zipfile will be helpful.
I use this function to zip a directory without including the absolute path:
import zipfile
import os
def zipDir(dirPath, zipPath):
    """Zip every file below *dirPath* into a new archive at *zipPath*.

    Entries are stored under their path relative to *dirPath*, so the
    archive contains no absolute paths.

    Args:
        dirPath: directory whose files are archived (walked recursively).
        zipPath: path of the zip file to create (overwritten if present).
    """
    # The with-block closes the archive even if adding a file fails.
    with zipfile.ZipFile(zipPath, mode='w') as zipf:
        for root, _, files in os.walk(dirPath):
            for name in files:
                filePath = os.path.join(root, name)
                zipf.write(filePath, os.path.relpath(filePath, dirPath))
#end zipDir
I suspect there might be a more elegant solution, but this one should work:
def add_zip_flat(zip, filename):
    """Add *filename* to the open archive *zip* under its base name only.

    Args:
        zip: an open, writable zipfile.ZipFile.
        filename: path of the file to add; its directory part is not stored.
    """
    # arcname replaces the os.chdir() trick: the working directory is left
    # untouched, so later relative paths keep working, and a bare file name
    # (empty directory part) is accepted as well.
    zip.write(filename, arcname=os.path.basename(filename))
# Build the archive in `buffer` and add both files through add_zip_flat();
# buffer, first_path and second_path come from the question's code.
zip = zipfile.ZipFile(buffer, 'w')
add_zip_flat(zip, first_path)
add_zip_flat(zip, second_path)
zip.close()
You can override the filename in the archive with the arcname parameter:
import zipfile
from pathlib import Path

with zipfile.ZipFile(file="sample.zip", mode="w", compression=zipfile.ZIP_DEFLATED) as out_zip:
    for f in Path.home().glob("**/*.txt"):
        # arcname=f.name stores only the file name, without its directories.
        # Files that share a name in different folders therefore end up as
        # duplicate entries in the archive.
        out_zip.write(f, arcname=f.name)
Documentation reference: https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile.write
It can also be done this way (this allows creating archives larger than 2 GB):
import os, zipfile
def zipdir(path, ziph):
    """Add every file below *path* to the open archive *ziph*, flattened.

    Each file is stored under its bare file name, so the directory layout
    is not reproduced inside the archive.

    Args:
        path: directory to walk recursively.
        ziph: an open, writable zipfile.ZipFile.
    """
    for root, _, files in os.walk(path):
        for file_found in files:
            # os.path.join uses the platform's separator instead of a
            # hard-coded '/'.
            ziph.write(os.path.join(root, file_found), file_found)
# DEST_FILE (path of the archive to create) and SOURCE_DIR (directory to
# archive) are placeholders to be defined by the caller.
# allowZip64=True permits archives larger than 2 GB; the with-block closes
# the archive even if zipdir() fails part-way.
with zipfile.ZipFile(DEST_FILE, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zipf:
    zipdir(SOURCE_DIR, zipf)
As João Pinto said, the arcname argument of ZipFile.write is what you need. Also, reading the documentation of pathlib is helpful. You can easily get the relative path to something also with pathlib.Path.relative_to, no need to switch to os.path.
import zipfile
from pathlib import Path
folder_to_compress = Path("/path/to/folder")
path_to_archive = Path("/path/to/archive.zip")

# The archive handle is called `archive` so that the builtin zip() is not
# shadowed.
with zipfile.ZipFile(
    path_to_archive,
    mode="w",
    compression=zipfile.ZIP_DEFLATED,
    compresslevel=7,
) as archive:
    for file in folder_to_compress.rglob("*"):
        # Store each entry under its path relative to the compressed folder.
        relative_path = file.relative_to(folder_to_compress)
        print(f"Packing {file} as {relative_path}")
        archive.write(file, arcname=relative_path)

How to unpack 7-Zip.gz (.gz) with Python?

I have a .tar.gz file which I want to unpack (when I unpack with 7-Zip manually, I am getting a .tar file inside). I am able to unpack this .tar file easily then with Python tarfile module then.
When I right-click the .tar.gz file in Windows Explorer, I can see under Type of file: 7-Zip.gz (.gz). I have tried using the gzip module (gzip.open), however I am getting an exception 'Not a gzipped file'. So there should be some other way to go.
I have searched the Internet and seen that people use 7-Zip manually or some batch commands, however I cannot find a way to do this in Python. I am on Python 2.7.
The tarfile library is able to read gzipped tar files. You should look at the examples here:
http://docs.python.org/2/library/tarfile.html#examples
The first example might accomplish what you want. It extracts the content of the archive to the current working directory:
import tarfile

# The with-block closes the archive even if extraction fails.
# NOTE: extractall() writes to whatever paths the archive members name, so
# only use it on archives from a trusted source.
with tarfile.open("sample.tar.gz") as tar:
    tar.extractall()
import os
import tarfile
import zipfile
def extract_file(path, to_directory='.'):
    """Extract a .zip, .tar.gz/.tgz or .tar.bz2/.tbz archive.

    Args:
        path: path of the archive; its extension selects the extractor.
        to_directory: directory that receives the extracted files.

    Raises:
        ValueError: if the extension of *path* is not one of the above.
    """
    if path.endswith('.zip'):
        opener, mode = zipfile.ZipFile, 'r'
    elif path.endswith(('.tar.gz', '.tgz')):
        opener, mode = tarfile.open, 'r:gz'
    elif path.endswith(('.tar.bz2', '.tbz')):
        opener, mode = tarfile.open, 'r:bz2'
    else:
        raise ValueError("Could not extract `%s` as no appropriate extractor is found" % path)

    # The destination is handed to extractall() instead of os.chdir()-ing
    # into it: a relative `path` keeps resolving against the caller's working
    # directory, and the working directory is never changed.
    # NOTE: extractall() trusts the member paths stored in the archive; only
    # use this on archives from a trusted source.
    with opener(path, mode) as archive:
        archive.extractall(to_directory)
Found this here:
http://code.activestate.com/recipes/576714-extract-a-compressed-file/
This is the example from the python-docs and should work:
import gzip

# The with-block closes the file even if reading fails.
with gzip.open('file.txt.gz', 'rb') as f:
    file_content = f.read()

How to eliminate absolute path in zip archive if absolute paths for files are provided?

I have two files in two different directories, one is '/home/test/first/first.pdf', the other is '/home/text/second/second.pdf'. I use following code to compress them:
import zipfile, StringIO
# In-memory buffer that receives the zip archive.
buffer = StringIO.StringIO()
first_path = '/home/test/first/first.pdf'
second_path = '/home/text/second/second.pdf'
zip = zipfile.ZipFile(buffer, 'w')
# write() is called without an archive name, so each file is stored under
# its full directory path (home/.../first.pdf) inside the archive.
zip.write(first_path)
zip.write(second_path)
zip.close()
After I open the zip file that I created, I have a home folder in it, then there are two sub-folders in it, first and second, then the pdf files. I don't know how to include only two pdf files instead of having full path zipped into the zip archive. I hope I make my question clear, please help.
The zipfile write() method supports an extra argument (arcname) which is the archive name to be stored in the zip file, so you would only need to change your code with:
from os.path import basename
...
# The second argument is the name stored in the archive; basename() drops
# the directory part so only the bare file name is kept.
zip.write(first_path, basename(first_path))
zip.write(second_path, basename(second_path))
zip.close()
When you have some spare time reading the documentation for zipfile will be helpful.
I use this function to zip a directory without including the absolute path:
import zipfile
import os
def zipDir(dirPath, zipPath):
    """Zip every file below *dirPath* into a new archive at *zipPath*.

    Entries are stored under their path relative to *dirPath*, so the
    archive contains no absolute paths.

    Args:
        dirPath: directory whose files are archived (walked recursively).
        zipPath: path of the zip file to create (overwritten if present).
    """
    # The with-block closes the archive even if adding a file fails.
    with zipfile.ZipFile(zipPath, mode='w') as zipf:
        for root, _, files in os.walk(dirPath):
            for name in files:
                filePath = os.path.join(root, name)
                zipf.write(filePath, os.path.relpath(filePath, dirPath))
#end zipDir
I suspect there might be a more elegant solution, but this one should work:
def add_zip_flat(zip, filename):
    """Add *filename* to the open archive *zip* under its base name only.

    Args:
        zip: an open, writable zipfile.ZipFile.
        filename: path of the file to add; its directory part is not stored.
    """
    # arcname replaces the os.chdir() trick: the working directory is left
    # untouched, so later relative paths keep working, and a bare file name
    # (empty directory part) is accepted as well.
    zip.write(filename, arcname=os.path.basename(filename))
# Build the archive in `buffer` and add both files through add_zip_flat();
# buffer, first_path and second_path come from the question's code.
zip = zipfile.ZipFile(buffer, 'w')
add_zip_flat(zip, first_path)
add_zip_flat(zip, second_path)
zip.close()
You can override the filename in the archive with the arcname parameter:
import zipfile
from pathlib import Path

with zipfile.ZipFile(file="sample.zip", mode="w", compression=zipfile.ZIP_DEFLATED) as out_zip:
    for f in Path.home().glob("**/*.txt"):
        # arcname=f.name stores only the file name, without its directories.
        # Files that share a name in different folders therefore end up as
        # duplicate entries in the archive.
        out_zip.write(f, arcname=f.name)
Documentation reference: https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile.write
It can also be done this way (this allows creating archives larger than 2 GB):
import os, zipfile
def zipdir(path, ziph):
    """Add every file below *path* to the open archive *ziph*, flattened.

    Each file is stored under its bare file name, so the directory layout
    is not reproduced inside the archive.

    Args:
        path: directory to walk recursively.
        ziph: an open, writable zipfile.ZipFile.
    """
    for root, _, files in os.walk(path):
        for file_found in files:
            # os.path.join uses the platform's separator instead of a
            # hard-coded '/'.
            ziph.write(os.path.join(root, file_found), file_found)
# DEST_FILE (path of the archive to create) and SOURCE_DIR (directory to
# archive) are placeholders to be defined by the caller.
# allowZip64=True permits archives larger than 2 GB; the with-block closes
# the archive even if zipdir() fails part-way.
with zipfile.ZipFile(DEST_FILE, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zipf:
    zipdir(SOURCE_DIR, zipf)
As João Pinto said, the arcname argument of ZipFile.write is what you need. Also, reading the documentation of pathlib is helpful. You can easily get the relative path to something also with pathlib.Path.relative_to, no need to switch to os.path.
import zipfile
from pathlib import Path
folder_to_compress = Path("/path/to/folder")
path_to_archive = Path("/path/to/archive.zip")

# The archive handle is called `archive` so that the builtin zip() is not
# shadowed.
with zipfile.ZipFile(
    path_to_archive,
    mode="w",
    compression=zipfile.ZIP_DEFLATED,
    compresslevel=7,
) as archive:
    for file in folder_to_compress.rglob("*"):
        # Store each entry under its path relative to the compressed folder.
        relative_path = file.relative_to(folder_to_compress)
        print(f"Packing {file} as {relative_path}")
        archive.write(file, arcname=relative_path)

Categories

Resources