Create a zip file from a generator in Python? - python

I've got a large amount of data (a couple gigs) I need to write to a zip file in Python. I can't load it all into memory at once to pass to the .writestr method of ZipFile, and I really don't want to feed it all out to disk using temporary files and then read it back.
Is there a way to feed a generator or a file-like object to the ZipFile library? Or is there some reason this capability doesn't seem to be supported?
By zip file, I mean zip file. As supported in the Python zipfile package.

The only solution is to rewrite the method it uses for zipping files to read from a buffer. It would be trivial to add this to the standard libraries; I'm kind of amazed it hasn't been done yet. I gather there's a lot of agreement the entire interface needs to be overhauled, and that seems to be blocking any incremental improvements.
import zipfile, zlib, binascii, struct
class BufferedZipFile(zipfile.ZipFile):
    """ZipFile that can add a member by streaming it from a file-like object.

    The member's size and CRC are not known up front, so a placeholder local
    header is written first and patched once all data has been copied. The
    underlying archive file therefore has to be seekable.
    """

    def writebuffered(self, zipinfo, buffer, chunk_size=1024 * 8):
        """Add a member described by *zipinfo*, reading its data from *buffer*.

        zipinfo    -- ZipInfo for the new member; its size, CRC, flag and
                      offset fields are overwritten here.
        buffer     -- object with a ``read(size)`` method returning bytes;
                      it is read until it returns an empty value.
        chunk_size -- number of bytes requested per ``read`` call.

        Sizes are patched into the header with 32-bit fields, so members
        whose size does not fit in 32 bits make ``struct.pack`` fail.
        """
        zinfo = zipinfo
        zinfo.file_size = file_size = 0
        zinfo.flag_bits = 0x00
        zinfo.header_offset = self.fp.tell()
        self._writecheck(zinfo)
        self._didModify = True
        zinfo.CRC = CRC = 0
        zinfo.compress_size = compress_size = 0
        # Placeholder header: CRC and sizes are still zero at this point.
        self.fp.write(zinfo.FileHeader())

        if zinfo.compress_type == zipfile.ZIP_DEFLATED:
            # Negative wbits selects a raw deflate stream (no zlib header).
            cmpr = zlib.compressobj(
                zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, -15)
        else:
            cmpr = None

        while True:
            buf = buffer.read(chunk_size)
            if not buf:
                break
            file_size += len(buf)
            CRC = binascii.crc32(buf, CRC) & 0xffffffff
            if cmpr:
                buf = cmpr.compress(buf)
                compress_size += len(buf)
            self.fp.write(buf)

        if cmpr:
            buf = cmpr.flush()
            compress_size += len(buf)
            self.fp.write(buf)
            zinfo.compress_size = compress_size
        else:
            zinfo.compress_size = file_size
        zinfo.CRC = CRC
        zinfo.file_size = file_size

        # Go back and patch CRC / compressed size / size, which start 14
        # bytes into the local file header, then return to the end.
        position = self.fp.tell()
        self.fp.seek(zinfo.header_offset + 14, 0)
        self.fp.write(struct.pack(
            "<LLL", zinfo.CRC, zinfo.compress_size, zinfo.file_size))
        self.fp.seek(position, 0)
        # Python 3's ZipFile.close() writes the central directory at
        # self.start_dir; without this the directory would be written over
        # the member data. Setting the attribute is harmless on Python 2.
        self.start_dir = position

        self.filelist.append(zinfo)
        self.NameToInfo[zinfo.filename] = zinfo

Changed in Python 3.5 (from official docs): Added support for writing to unseekable streams.
This means that zipfile.ZipFile can now write to streams that do not keep the entire archive in memory. Such streams do not support seeking back and forth over the data that has already been written.
So here is a simple generator:
from zipfile import ZipFile, ZipInfo
def zipfile_generator(path, stream, chunk_size=16384):
    """Yield the bytes of a zip archive containing the file at *path*.

    path       -- path of the file to add to the archive.
    stream     -- writable, unseekable stream handed to ZipFile; it must
                  also provide ``get()``, returning (and clearing) the bytes
                  written to it since the previous call.
    chunk_size -- number of bytes read from the source file per iteration.

    The source file is read piecewise, so only about one chunk of it is
    held in memory at a time. Empty results from ``stream.get()`` are
    skipped, so consumers never receive a zero-length chunk.
    """
    with ZipFile(stream, mode='w') as zf:
        z_info = ZipInfo.from_file(path)
        with open(path, 'rb') as entry, zf.open(z_info, mode='w') as dest:
            for chunk in iter(lambda: entry.read(chunk_size), b''):
                dest.write(chunk)
                # Hand over whatever the archive writer has emitted so far.
                data = stream.get()
                if data:
                    yield data
    # ZipFile was closed: the remaining buffered bytes (including the
    # central directory) are now available.
    data = stream.get()
    if data:
        yield data
path is the path of the large file (or directory), given as a string or a path-like object.
stream is an instance of an unseekable stream class such as the following (designed according to the official docs):
from io import RawIOBase
class UnseekableStream(RawIOBase):
    """Write-only stream that collects written bytes until they are fetched.

    It defines no seek/tell of its own, so consumers treat it as unseekable.
    """

    def __init__(self):
        # Bytes written since the last get().
        self._buffer = bytes()

    def writable(self):
        """Report that this stream accepts writes."""
        return True

    def write(self, b):
        """Append *b* to the pending bytes and return its length.

        Raises ValueError when the stream has already been closed.
        """
        if not self.closed:
            self._buffer = self._buffer + b
            return len(b)
        raise ValueError('Stream was closed!')

    def get(self):
        """Return everything written since the previous call and reset."""
        drained, self._buffer = self._buffer, b''
        return drained
You can try this code online: https://repl.it/#IvanErgunov/zipfilegenerator
There is also another way to create a generator without ZipInfo and manually reading and dividing your large file. You can pass the queue.Queue() object to your UnseekableStream() object and write to this queue in another thread. Then in current thread you can simply read chunks from this queue in iterable way. See docs
P.S.
Python Zipstream by allanlei is an outdated and unreliable approach. It was an attempt to add support for unseekable streams before that support was added officially.

I took Chris B.'s answer and created a complete solution. Here it is in case anyone else is interested:
import os
import threading
from zipfile import *
import zlib, binascii, struct
class ZipEntryWriter(threading.Thread):
    """Background thread that copies a file object into one zip member.

    The local file header is written (with placeholder CRC and sizes) when
    the writer is constructed; ``run`` then streams the data and patches
    the header, so the archive's underlying file must be seekable.
    """

    def __init__(self, zf, zinfo, fileobj):
        """Prepare the member and write its placeholder header.

        zf      -- the ZipFile being written to.
        zinfo   -- ZipInfo for the new member; size/CRC/offset fields are
                   overwritten.
        fileobj -- object with ``read(size)`` and ``close()``; read to EOF.
        """
        super(ZipEntryWriter, self).__init__()
        self.zf = zf
        self.zinfo = zinfo
        self.fileobj = fileobj
        zinfo.file_size = 0
        zinfo.flag_bits = 0x00
        zinfo.header_offset = zf.fp.tell()
        zf._writecheck(zinfo)
        zf._didModify = True
        zinfo.CRC = 0
        zinfo.compress_size = 0
        zf.fp.write(zinfo.FileHeader())

    def run(self):
        """Copy the source into the archive, then patch the local header."""
        zinfo = self.zinfo
        zf = self.zf
        file_size = 0
        # Must be initialised here: it is a local of run(), not of __init__.
        compress_size = 0
        CRC = 0
        if zinfo.compress_type == ZIP_DEFLATED:
            # Negative wbits selects a raw deflate stream (no zlib header).
            cmpr = zlib.compressobj(
                zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, -15)
        else:
            cmpr = None
        try:
            while True:
                buf = self.fileobj.read(1024 * 8)
                if not buf:
                    break
                file_size += len(buf)
                # Mask so the value is unsigned regardless of what
                # crc32 returns on this interpreter.
                CRC = binascii.crc32(buf, CRC) & 0xffffffff
                if cmpr:
                    buf = cmpr.compress(buf)
                    compress_size += len(buf)
                zf.fp.write(buf)
        finally:
            # Close the source even if reading or writing failed.
            self.fileobj.close()
        if cmpr:
            buf = cmpr.flush()
            compress_size += len(buf)
            zf.fp.write(buf)
            zinfo.compress_size = compress_size
        else:
            zinfo.compress_size = file_size
        zinfo.CRC = CRC
        zinfo.file_size = file_size
        # CRC / compressed size / size start 14 bytes into the header.
        position = zf.fp.tell()
        zf.fp.seek(zinfo.header_offset + 14, 0)
        zf.fp.write(struct.pack(
            "<LLL", zinfo.CRC, zinfo.compress_size, zinfo.file_size))
        zf.fp.seek(position, 0)
        # Python 3's ZipFile.close() writes the central directory at
        # start_dir; keep it pointing past the data just written.
        zf.start_dir = position
        zf.filelist.append(zinfo)
        zf.NameToInfo[zinfo.filename] = zinfo
class EnhZipFile(ZipFile, object):
    """ZipFile whose entries can be written through a file-like object.

    ``start_entry`` returns a writable file object; a background
    ZipEntryWriter thread reads what is written to it and stores it in the
    archive.
    """

    def _current_writer(self):
        """Return the writer thread started last, or None if there is none."""
        return getattr(self, 'cur_writer', None) or None

    def assert_no_current_writer(self):
        """Raise ValueError if an entry is still being written."""
        cur_writer = self._current_writer()
        if cur_writer and cur_writer.is_alive():
            raise ValueError(
                'An entry is already started for name: %s'
                % cur_writer.zinfo.filename)

    def write(self, filename, arcname=None, compress_type=None):
        """Like ZipFile.write, but refuse to run while an entry is open."""
        self.assert_no_current_writer()
        super(EnhZipFile, self).write(filename, arcname, compress_type)

    def writestr(self, zinfo_or_arcname, bytes):
        """Like ZipFile.writestr, but refuse to run while an entry is open."""
        self.assert_no_current_writer()
        super(EnhZipFile, self).writestr(zinfo_or_arcname, bytes)

    def close(self):
        """Wait for any entry in progress, then close the archive."""
        self.finish_entry()
        super(EnhZipFile, self).close()

    def start_entry(self, zipinfo):
        """
        Start writing a new entry with the specified ZipInfo and return a
        file like object. Any data written to the file like object is
        read by a background thread and written directly to the zip file.
        The file object is opened in binary mode, so write bytes to it.
        Make sure to close the returned file object, before closing the
        zipfile, or the close() would end up hanging indefinitely.

        Only one entry can be open at any time. If multiple entries need to
        be written, make sure to call finish_entry() before calling any of
        these methods:
        - start_entry
        - write
        - writestr
        It is not necessary to explicitly call finish_entry() before closing
        zipfile.

        Example:
            zf = EnhZipFile('tmp.zip', 'w')
            w = zf.start_entry(ZipInfo('t.txt'))
            w.write(b"some text")
            w.close()
            zf.close()
        """
        self.assert_no_current_writer()
        r, w = os.pipe()
        # Binary mode on both ends: the writer thread checksums and
        # compresses raw bytes, and text mode could translate newlines.
        self.cur_writer = ZipEntryWriter(self, zipinfo, os.fdopen(r, 'rb'))
        self.cur_writer.start()
        return os.fdopen(w, 'wb')

    def finish_entry(self, timeout=None):
        """
        Ensure that the ZipEntry that is currently being written is finished.
        Joins on any background thread to exit. It is safe to call this method
        multiple times.
        """
        cur_writer = self._current_writer()
        if not cur_writer or not cur_writer.is_alive():
            return
        cur_writer.join(timeout)
if __name__ == "__main__":
    # Demo: stream two small entries into an archive, one after the other.
    import time

    zf = EnhZipFile('c:/tmp/t.zip', 'w')

    # Bytes literals: the entry data ends up checksummed and compressed as
    # raw bytes (on Python 2 these are ordinary str objects).
    w = zf.start_entry(ZipInfo('t.txt', time.localtime()[:6]))
    w.write(b"Line1\n")
    w.write(b"Line2\n")
    w.close()
    # The first entry must be finished before the next one is started.
    zf.finish_entry()

    w = zf.start_entry(ZipInfo('p.txt', time.localtime()[:6]))
    w.write(b"Some text\n")
    w.close()
    # close() waits for the entry in progress before finishing the archive.
    zf.close()

gzip.GzipFile writes the data in gzipped chunks, and you can control the size of those chunks by the number of lines you read from the source file before each write.
an example:
import gzip


def gzip_in_batches(source_path, target_path, lines_per_flush=1000):
    """Gzip *source_path* into *target_path*, a batch of lines at a time.

    source_path     -- file to read (opened in binary mode).
    target_path     -- gzip file to create.
    lines_per_flush -- number of lines collected before they are written
                       and the compressor is flushed.

    Only one batch of lines is held in memory at any time. The lines left
    over after the last full batch are written too.
    """
    with open(source_path, 'rb') as sourcefile, \
            gzip.GzipFile(target_path, 'wb') as gz:
        chunks = []
        for line in sourcefile:
            chunks.append(line)
            if len(chunks) >= lines_per_flush:
                gz.write(b"".join(chunks))
                gz.flush()
                chunks = []
        if chunks:
            # Final partial batch.
            gz.write(b"".join(chunks))


if __name__ == '__main__':
    gzip_in_batches('source', 'blah.gz')

The essential compression is done by zlib.compressobj. ZipFile (under Python 2.5 on MacOSX) appears to be compiled. The Python 2.3 version is as follows.
You can see that it builds the compressed file in 8k chunks. Taking out the source file information is complex because a lot of source file attributes (like uncompressed size) is recorded in the zip file header.
def write(self, filename, arcname=None, compress_type=None):
    """Put the bytes from filename into the archive under the name
    arcname.

    filename      -- path of the file to add; it is stat'ed and opened here.
    arcname       -- name to store in the archive; defaults to filename.
    compress_type -- compression method for this member; defaults to
                     self.compression.

    The file is read and written in 8 KiB pieces. Because CRC and sizes are
    only known afterwards, a header with zeroed fields is written first and
    then patched by seeking back.
    """
    st = os.stat(filename)
    mtime = time.localtime(st.st_mtime)
    date_time = mtime[0:6]
    # Create ZipInfo instance to store file information
    if arcname is None:
        zinfo = ZipInfo(filename, date_time)
    else:
        zinfo = ZipInfo(arcname, date_time)
    zinfo.external_attr = st[0] << 16L # Unix attributes
    if compress_type is None:
        zinfo.compress_type = self.compression
    else:
        zinfo.compress_type = compress_type
    self._writecheck(zinfo)
    fp = open(filename, "rb")
    zinfo.flag_bits = 0x00
    zinfo.header_offset = self.fp.tell() # Start of header bytes
    # Must overwrite CRC and sizes with correct data later
    zinfo.CRC = CRC = 0
    zinfo.compress_size = compress_size = 0
    zinfo.file_size = file_size = 0
    self.fp.write(zinfo.FileHeader())
    zinfo.file_offset = self.fp.tell() # Start of file bytes
    if zinfo.compress_type == ZIP_DEFLATED:
        # Negative wbits: raw deflate stream without a zlib header.
        cmpr = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
            zlib.DEFLATED, -15)
    else:
        cmpr = None
    # Copy loop: accumulate size and CRC of the uncompressed data while
    # writing the (optionally compressed) bytes to the archive.
    while 1:
        buf = fp.read(1024 * 8)
        if not buf:
            break
        file_size = file_size + len(buf)
        CRC = binascii.crc32(buf, CRC)
        if cmpr:
            buf = cmpr.compress(buf)
            compress_size = compress_size + len(buf)
        self.fp.write(buf)
    fp.close()
    if cmpr:
        # Emit whatever the compressor is still holding back.
        buf = cmpr.flush()
        compress_size = compress_size + len(buf)
        self.fp.write(buf)
        zinfo.compress_size = compress_size
    else:
        zinfo.compress_size = file_size
    zinfo.CRC = CRC
    zinfo.file_size = file_size
    # Seek backwards and write CRC and file sizes
    position = self.fp.tell() # Preserve current position in file
    # The three fields being patched start 14 bytes into the header.
    self.fp.seek(zinfo.header_offset + 14, 0)
    self.fp.write(struct.pack("<lLL", zinfo.CRC, zinfo.compress_size,
        zinfo.file_size))
    self.fp.seek(position, 0)
    self.filelist.append(zinfo)
    self.NameToInfo[zinfo.filename] = zinfo

Some (many? most?) compression algorithms are based on looking at redundancies across the entire file.
Some compression libraries will choose between several compression algorithms based on which works best on the file.
I believe the ZipFile module does this, so it wants to see the entire file, not just pieces at a time.
Hence, it won't work with generators or files too big to load in memory. That would explain the limitation of the Zipfile library.

In case anyone stumbles upon this question, which is still relevant in 2017 for Python 2.7, here's a working solution for a true streaming zip file, with no requirement for the output to be seekable as in the other cases. The secret is to set bit 3 of the general purpose bit flag (see https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT section 4.3.9.1).
Note that this implementation will always create a ZIP64-style file, allowing the streaming to work for arbitrarily large files. It includes an ugly hack to force the zip64 end of central directory record, so be aware it will cause all zipfiles written by your process to become ZIP64-style.
import io
import zipfile
import zlib
import binascii
import struct
class ByteStreamer(io.BytesIO):
    '''
    Variant on BytesIO which lets you write and consume data while
    keeping track of the total filesize written. When data is consumed
    it is removed from memory, keeping the memory requirements low.
    '''

    def __init__(self):
        super(ByteStreamer, self).__init__()
        # Total number of bytes ever written, including consumed ones.
        self._tellall = 0

    def tell(self):
        '''Return the total number of bytes written so far.'''
        return self._tellall

    def write(self, b):
        '''Write *b* and return the number of bytes written.'''
        written = super(ByteStreamer, self).write(b)
        self._tellall += written
        return written

    def consume(self):
        '''Return the buffered bytes and drop them from memory.'''
        data = self.getvalue()
        self.seek(0)
        self.truncate(0)
        return data
class BufferedZipFileWriter(zipfile.ZipFile):
    '''
    ZipFile writer with true streaming (input and output).
    Created zip files are always ZIP64-style because it is the only safe way to stream
    potentially large zip files without knowing the full size ahead of time.

    The archive is written into an in-memory ByteStreamer; streambuffer()
    and close() hand the produced bytes back to the caller piece by piece.

    Example usage:
    >>> def stream():
    >>>     bzfw = BufferedZipFileWriter()
    >>>     for arc_path, buffer in inputs:  # buffer is a file-like object which supports read(size)
    >>>         for chunk in bzfw.streambuffer(arc_path, buffer):
    >>>             yield chunk
    >>>     yield bzfw.close()
    '''

    def __init__(self, compression=zipfile.ZIP_DEFLATED):
        self._buffer = ByteStreamer()
        super(BufferedZipFileWriter, self).__init__(
            self._buffer, mode='w', compression=compression, allowZip64=True)

    def streambuffer(self, zinfo_or_arcname, buffer, chunksize=2**16):
        '''
        Add one member and yield the archive bytes produced while doing so.

        zinfo_or_arcname -- a ZipInfo, or the member name; for a name, a
                            ZipInfo with the current time, this writer's
                            compression and mode 0600 is created.
        buffer           -- object with read(size); read until it returns
                            an empty value.
        chunksize        -- number of bytes requested per read.
        '''
        # Local import: the surrounding snippet's import list has no `time`.
        import time

        if not isinstance(zinfo_or_arcname, zipfile.ZipInfo):
            zinfo = zipfile.ZipInfo(filename=zinfo_or_arcname,
                                    date_time=time.localtime(time.time())[:6])
            zinfo.compress_type = self.compression
            zinfo.external_attr = 0o600 << 16  # ?rw-------
        else:
            zinfo = zinfo_or_arcname
        zinfo.file_size = file_size = 0
        zinfo.flag_bits = 0x08  # Streaming mode: crc and size come after the data
        zinfo.header_offset = self.fp.tell()
        self._writecheck(zinfo)
        self._didModify = True
        zinfo.CRC = CRC = 0
        zinfo.compress_size = compress_size = 0
        self.fp.write(zinfo.FileHeader())
        if zinfo.compress_type == zipfile.ZIP_DEFLATED:
            cmpr = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, -15)
        else:
            cmpr = None
        while True:
            buf = buffer.read(chunksize)
            if not buf:
                break
            file_size += len(buf)
            CRC = binascii.crc32(buf, CRC) & 0xffffffff
            if cmpr:
                buf = cmpr.compress(buf)
                compress_size += len(buf)
            self.fp.write(buf)
            # Drain the in-memory buffer so it never grows beyond a chunk.
            compressed_bytes = self._buffer.consume()
            if compressed_bytes:
                yield compressed_bytes
        if cmpr:
            buf = cmpr.flush()
            compress_size += len(buf)
            self.fp.write(buf)
            zinfo.compress_size = compress_size
            compressed_bytes = self._buffer.consume()
            if compressed_bytes:
                yield compressed_bytes
        else:
            zinfo.compress_size = file_size
        zinfo.CRC = CRC
        zinfo.file_size = file_size
        # Write CRC and file sizes after the file data
        # Always write as zip64 -- only safe way to stream what might become a large zipfile
        fmt = '<LQQ'
        self.fp.write(struct.pack(fmt, zinfo.CRC, zinfo.compress_size, zinfo.file_size))
        self.fp.flush()
        self.filelist.append(zinfo)
        self.NameToInfo[zinfo.filename] = zinfo
        yield self._buffer.consume()

    # The close method needs to be patched to force writing a ZIP64 file
    # We'll hack ZIP_FILECOUNT_LIMIT to do the forcing
    def close(self):
        '''Finish the archive and return its remaining bytes.'''
        tmp = zipfile.ZIP_FILECOUNT_LIMIT
        zipfile.ZIP_FILECOUNT_LIMIT = 0
        try:
            super(BufferedZipFileWriter, self).close()
        finally:
            # Restore the module-wide limit even if closing fails.
            zipfile.ZIP_FILECOUNT_LIMIT = tmp
        return self._buffer.consume()

The gzip library will take a file-like object for compression.
class GzipFile([filename [,mode [,compresslevel [,fileobj]]]])
You still need to provide a nominal filename for inclusion in the zip file, but you can pass your data-source to the fileobj.
(This answer differs from that of Damnsweet, in that the focus should be on the data-source being incrementally read, not the compressed file being incrementally written.)
And I see now the original questioner won't accept Gzip :-(

Now with Python 2.7 you can add data to the zipfile directly instead of adding a file:
http://docs.python.org/2/library/zipfile#zipfile.ZipFile.writestr

This is 2017. If you are still looking to do this elegantly, use Python Zipstream by allanlei.
So far, it is probably the only well written library to accomplish that.

gzip.GzipFile writes the data in gzipped chunks, and you can control the size of those chunks by the number of lines you read from the source file before each write.
an example:
import gzip


def gzip_in_batches(source_path, target_path, lines_per_flush=1000):
    """Compress *source_path* to the gzip file *target_path* in batches.

    Lines are collected until *lines_per_flush* of them are available, then
    written and flushed in one go, so memory use is bounded by one batch.
    Whatever remains after the last full batch is written as well.
    """
    pending = []
    with gzip.GzipFile(target_path, 'wb') as gz:
        with open(source_path, 'rb') as sourcefile:
            for line in sourcefile:
                pending.append(line)
                if len(pending) >= lines_per_flush:
                    gz.write(b"".join(pending))
                    gz.flush()
                    pending = []
        if pending:
            # Final partial batch.
            gz.write(b"".join(pending))


if __name__ == '__main__':
    gzip_in_batches('source', 'blah.gz')

You can use stream-zip for this (full disclosure: written mostly by me).
Say you have generators of bytes you want to zip:
def file_data_1():
    """Yield the example byte chunks of the first member file."""
    yield from (b'Some bytes a', b'Some bytes b')
def file_data_2():
    """Yield the example byte chunks of the second member file."""
    yield from (b'Some bytes c', b'Some bytes d')
You can create a single iterable of the zipped bytes of these generators:
from datetime import datetime
from stream_zip import ZIP_64, stream_zip
def zip_member_files():
    """Yield one tuple per archive member, in the order they are stored.

    Each tuple is (name, modification time, permissions, zip method,
    iterable of the member's bytes).
    """
    modified_at = datetime.now()
    perms = 0o600
    yield 'my-file-1.txt', modified_at, perms, ZIP_64, file_data_1()
    yield 'my-file-2.txt', modified_at, perms, ZIP_64, file_data_2()


# Iterable of the archive's bytes (no trailing colon: this is a plain
# assignment, not a block header).
zipped_chunks = stream_zip(zip_member_files())
And then, for example, save this iterable to disk by:
# Write every chunk of the archive to disk, in order.
with open('my.zip', 'wb') as out_file:
    out_file.writelines(zipped_chunks)

Related

How to stream from ZipFile? How to zip "on the fly"?

I want to zip a stream and stream out the result. I'm doing it using AWS Lambda which matters in sense of available disk space and other restrictions.
I'm going to use the zipped stream to write an AWS S3 object using upload_fileobj() or put(), if it matters.
I can create an archive as a file until I have small objects:
import zipfile

# The context manager finishes and closes the archive even when reading
# the stream or writing the member fails.
with zipfile.ZipFile("/tmp/byte.zip", "w") as zf:
    zf.writestr(filename, my_stream.read())
For large amount of data I can create an object instead of file:
from io import BytesIO
...
byte = BytesIO()
zf = zipfile.ZipFile(byte, "w")
....
but how can I pass the zipped stream to the output? If I use zf.close() - the stream will be closed, if I don't use it - the archive will be incomplete.
Instead of using Python's built-in zipfile, you can use stream-zip (full disclosure: written by me).
If you have an iterable of bytes, my_data_iter say, you can get an iterable of a zip file using its stream_zip function:
from datetime import datetime
from stream_zip import stream_zip, ZIP_64
def files():
    """Yield the description of the single member to put in the archive."""
    member = ('my-file-1.txt', datetime.now(), 0o600, ZIP_64, my_data_iter)
    yield member
my_zip_iter = stream_zip(files())
If you need a file-like object, say to pass to boto3's upload_fileobj, you can convert from the iterable with a transformation function:
def to_file_like_obj(iterable):
    """Wrap an iterable of bytes in an object offering ``read(size)``.

    The iterable is consumed lazily: a new chunk is only requested once the
    bytes of the previous one have been handed out.
    """
    source = iter(iterable)
    pending = b''   # chunk currently being handed out
    cursor = 0      # number of bytes of `pending` already returned

    def take(limit):
        # Collect up to `limit` bytes (possibly infinite) from the source.
        nonlocal pending, cursor
        pieces = []
        remaining = limit
        while remaining:
            if cursor == len(pending):
                try:
                    pending = next(source)
                except StopIteration:
                    break
                cursor = 0
            step = min(remaining, len(pending) - cursor)
            start = cursor
            cursor = start + step
            remaining -= step
            pieces.append(pending[start:cursor])
        return b''.join(pieces)

    class FileLikeObj:
        def read(self, size=-1):
            # None or a negative size means "everything that is left".
            if size is None or size < 0:
                return take(float('inf'))
            return take(size)

    return FileLikeObj()
my_file_like_obj = to_file_like_obj(my_zip_iter)
You might like to try the zipstream version of zipfile. For example, to compress stdin to stdout as a zip file holding the data as a file named TheLogFile using iterators:
#!/usr/bin/python3
import sys, zipstream

# Zip whatever arrives on stdin into a member called TheLogFile and write
# the resulting archive to stdout, chunk by chunk.
emit = sys.stdout.buffer.write
with zipstream.ZipFile(mode='w', compression=zipstream.ZIP_DEFLATED) as archive:
    archive.write_iter('TheLogFile', sys.stdin.buffer)
    for piece in archive:
        emit(piece)

Empty chunks when spliting a large file

I am trying to split a large file into 50 MB chunks and save them in separate files. After running some read/write operations, some of my chunks were smaller than 50 MB (43 MB, 17 MB and so on). I wrote the same code in Java and it has the same problem. What is wrong? My code follows below.
By the way, what can we do to speed up this code so that it splits the file into chunks faster?
try:
f = open(self.__filename, 'rb')
except (OSError, IOError), e:
raise FileSplitterException, str(e)
bname = (os.path.split(self.__filename))[1]
fsize = os.path.getsize(self.__filename)
self.__chunksize = int(float(fsize)/float(self.__numchunks))
chunksz = self.__chunksize
total_bytes = 0
for x in range(self.__numchunks):
chunkfilename = bname + '-' + str(x+1) + self.__postfix
if x == self.__numchunks - 1:
chunksz = fsize - total_bytes
try:
print 'Writing file',chunkfilename
data = f.read(chunksz)
total_bytes += len(data)
chunkf = file(chunkfilename, 'wb')
chunkf.write(data)
chunkf.close()
except (OSError, IOError), e:
print e
continue
except EOFError, e:
print e
break
The code in the question seems to be focussed on producing a set number of chunks rather than files of 50MB in size.
This code produces 50MB files.
import os

# Script: copy 'big.txt' into consecutive files of at most `chunksz` bytes,
# named '<basename>-<n>.foo' with n starting at 1.
try:
    f = open('big.txt', 'rb')
except (OSError, IOError), e:
    # NOTE(review): FileSplitterException is not defined in this snippet;
    # presumably it comes from the code in the question - confirm.
    raise FileSplitterException, str(e)
bname = (os.path.split('big.txt'))[1]
chunksz = 50 * 1000 * 1000 # metric MB - use 1024 * 1024 for binary MB (MiB)
counter = 0
while True:
    chunkfilename = bname + '-' + str(counter+1) + '.foo'
    try:
        print 'Writing file',chunkfilename
        data = f.read(chunksz)
        if not data:
            # We have reached the end of the file, end the script.
            break
        chunkf = file(chunkfilename, 'wb')
        chunkf.write(data)
        chunkf.close()
    except (OSError, IOError), e:
        print e
        # NOTE(review): `continue` skips the counter increment below, so the
        # next pass reuses the same chunk file name.
        continue
    except EOFError, e:
        print e
        break
    counter += 1
# NOTE(review): the input handle `f` is never closed in this snippet.
Some aspects of the code are considered poor style in modern python - for example not using a context manager to open files - but I haven't changed these in case the OP is on an old python like 2.5.
Your question is unclear because you haven't included a Minimal, Complete, and Verifiable example—so I don't know exactly what's wrong with your code. However after creating / simulating my guess as to the missing parts, I was able to come up with something that does exactly what you want, I think.
import os
class FileSplitterException(Exception):
    """Raised when a file cannot be split as requested."""


class FileSplitter(object):
    """Split a file into consecutive chunk files of at most chunksize bytes.

    Chunk files are written next to the source file and are named
    '<source without extension>-<index>.chunk', with index starting at 0.
    """

    def __init__(self, filename, chunksize):
        """Remember what to split.

        filename  -- path of an existing regular file.
        chunksize -- maximum size of each chunk file in bytes; must be > 0.

        Raises FileSplitterException if the file does not exist or the
        chunk size is not positive.
        """
        if not os.path.isfile(filename):
            raise FileSplitterException(
                "File: {!r} does not exist".format(filename))
        if chunksize <= 0:
            raise FileSplitterException(
                "chunksize must be positive, got {!r}".format(chunksize))
        self._filename = filename
        self._postfix = 'chunk'
        self._chunksize = chunksize

    def split(self):
        """Write the chunk files.

        Raises FileSplitterException if the file turns out to be shorter
        than its size reported when splitting started.
        """
        bname = os.path.splitext(self._filename)[0]
        fsize = os.path.getsize(self._filename)
        chunks, partial = divmod(fsize, self._chunksize)
        if partial:
            chunks += 1
        with open(self._filename, 'rb') as infile:
            for i in range(chunks):
                # Read first, so no empty chunk file is left behind when
                # the data runs out early.
                data = infile.read(self._chunksize)
                if not data:
                    raise FileSplitterException('unexpected EOF encountered')
                chunk_filename = '{}-{}.{}'.format(bname, i, self._postfix)
                with open(chunk_filename, 'wb') as outfile:
                    outfile.write(data)
if __name__ == '__main__':
    # Demo: split big_file.txt into 1 MiB chunks and list what was written.
    import glob

    filename = 'big_file.txt'
    chunksize = 1 * 1024 * 1024 # 1 Mb
    print('splitting {} into {:,} sized chunks'.format(filename, chunksize))

    fs = FileSplitter(filename, chunksize)
    fs.split()

    print('chunk files written:')
    stem, _ext = os.path.splitext(filename)
    pattern = stem + '-*.' + fs._postfix
    written = sorted(glob.glob(pattern))
    for chunkname in written:
        print(' {}: size: {:,}'.format(chunkname, os.path.getsize(chunkname)))

divide a disk image into smaller parts using Python

I would like to write a program that takes a .dmg file that is 1.6 GB and split it into 100 MB chunks.
I would like to also write another program that later can put everything back together so that it can be mounted and used.
I am very new to Python (and any type of programming language in general) and cannot find anything on here about this specific thing. Let me know if I am using incorrect terminology too so that I can learn how to search more effectively.
Thanks!
Try this example:
split.py
import sys, os
kilobytes = 1024
megabytes = kilobytes * 1000
chunksize = int(1.4 * megabytes)


def split(fromfile, todir, chunksize=chunksize):
    """Split *fromfile* into part files inside *todir*.

    fromfile  -- path of the file to split.
    todir     -- directory for the parts. It is created if missing;
                 otherwise every file already in it is deleted first.
    chunksize -- maximum size of each part in bytes; must be positive.

    Parts are named part0001, part0002, ... Returns the number of parts
    written. Raises ValueError for a non-positive chunksize.
    """
    if chunksize <= 0:
        # read(0) would report EOF immediately and a negative size would
        # read the whole file at once; neither is a sensible split.
        raise ValueError('chunksize must be positive, got %r' % (chunksize,))
    if not os.path.exists(todir):
        os.mkdir(todir)
    else:
        # Remove leftovers so stale parts never get mixed with new ones.
        for fname in os.listdir(todir):
            os.remove(os.path.join(todir, fname))
    partnum = 0
    with open(fromfile, 'rb') as source:
        while True:
            chunk = source.read(chunksize)
            if not chunk:
                break
            partnum += 1
            filename = os.path.join(todir, 'part%04d' % partnum)
            with open(filename, 'wb') as part:
                part.write(chunk)
    # Part numbers are zero-padded to four digits; presumably more parts
    # would break the name ordering relied on when joining - confirm.
    assert partnum <= 9999
    return partnum
if __name__ == '__main__':
    try:
        parts = split('/Users/example/Desktop/SO/st/example.mp4', '/Users/example/Desktop/SO/st/new', 2000000) # 100000000 == 100 mb
    except Exception as err:
        # Report the cause instead of hiding it; Ctrl-C is no longer caught.
        print('Error during split: %s' % err)
for join:
join.py
import os, sys
readsize = 1024


def join(fromdir, tofile):
    """Concatenate every file in *fromdir*, in sorted name order, into *tofile*.

    fromdir -- directory holding the part files; all of its entries are
               treated as parts.
    tofile  -- path of the file to create (overwritten if it exists).

    The directory is listed before the output is created, so a missing
    directory does not leave an empty output file behind.
    """
    parts = sorted(os.listdir(fromdir))
    with open(tofile, 'wb') as output:
        for filename in parts:
            filepath = os.path.join(fromdir, filename)
            with open(filepath, 'rb') as fileobj:
                while True:
                    filebytes = fileobj.read(readsize)
                    if not filebytes:
                        break
                    output.write(filebytes)
if __name__ == '__main__':
    try:
        join('/Users/example/Desktop/SO/st/new', 'example_join.mp4')
    except Exception as err:
        # Report the cause instead of hiding it; Ctrl-C is no longer caught.
        print('Error joining files: %s' % err)
    else:
        print('Join complete!')

How do you append bytes onto a file inside of a zip file via Python 2.7?

I am currently working on a piece of a larger puzzle. For my piece, I have a file object and the destination of the zip file. There is never a time that I am aware of the size of the file object. I only know I have one. There for the zip has to support zip64.
My goal is to take that file object(pointer to the file) and write it to the zip file without loading the entire file into memory. I would like to do this chunk by chunk(especially if the file object is really big).
Any ideas on how I can go about doing this?
import zipfile

# Attempt to add a large file to a zip archive 1024 bytes at a time.
zip_path = "/tmp/file.zip"
file_to_zip_path = "/home/ryanb58/Desktop/movie.mp4"
with zipfile.ZipFile(zip_path, mode="w", allowZip64=True) as zip:
    f = open(file_to_zip_path, 'rb')
    while True:
        data = f.read(1024)
        # Each call adds a separate archive member named "file.mp4" rather
        # than appending to one member - this is the problem described in
        # the text that follows the snippet.
        zip.writestr("file.mp4", data)
        if not data:
            break
    # NOTE(review): `f` is never closed, and `zip` shadows the builtin.
My issue is that when I write the new bytes to the file inside the zip. Upon it finishing, I open the zip and it is just a huge list of small files with the same name, each about 1024bytes in size. My code above ^^ I am kind of stuck, so any ideas or solutions would be great.
Following the advice that #J.F.Sebastian gave in his comment, I was able to write my file to a zip without bringing the full file into memory.
Here is my solution for the override.
import zipfile
BUFFER_SIZE = 1024 * 10000 # 10 megabytes.
class Zip(zipfile.ZipFile):
    # ZipFile whose write() takes an open file object instead of a path and
    # copies it into the archive BUFFER_SIZE bytes at a time.
    # NOTE(review): the visible snippet imports only `zipfile`; `os`, `stat`,
    # `time`, `binascii` and ZIP_STORED are used below but not imported or
    # defined here - confirm they are available where this runs.
    def write(self, fileobj, arcname=None, compress_type=None):
        """Put the bytes from file into the archive under the name
        arcname.

        fileobj       -- open file object; its ``name`` is stat'ed for
                         metadata and its contents are read with read().
        arcname       -- member name; "/temp.zip" is used when omitted.
        compress_type -- compression method; defaults to self.compression.
        """
        """CONST"""
        ZIP64_LIMIT = (1 << 31) - 1
        ZIP_DEFLATED = 8
        try:
            import zlib # We may need its compression method
            crc32 = zlib.crc32
        except ImportError:
            zlib = None
            crc32 = binascii.crc32
        if not self.fp:
            raise RuntimeError(
                "Attempt to write to ZIP archive that was already closed")
        st = os.stat(fileobj.name)
        isdir = stat.S_ISDIR(st.st_mode)
        mtime = time.localtime(st.st_mtime)
        date_time = mtime[0:6]
        # Create ZipInfo instance to store file information
        if arcname is None:
            arcname = "/temp.zip"
        arcname = os.path.normpath(os.path.splitdrive(arcname)[1])
        # Strips any leading forward or back slashes for files.
        while arcname[0] in (os.sep, os.altsep):
            arcname = arcname[1:]
        if isdir:
            arcname += '/'
        # Create the zipinfo.
        zinfo = zipfile.ZipInfo(arcname, date_time)
        zinfo.external_attr = (st.st_mode & 0xFFFF) << 16L # Unix attributes
        if isdir:
            zinfo.compress_type = ZIP_STORED
        elif compress_type is None:
            zinfo.compress_type = self.compression
        else:
            zinfo.compress_type = compress_type
        zinfo.file_size = st.st_size
        zinfo.flag_bits = 0x00
        zinfo.header_offset = self.fp.tell() # Start of header bytes
        self._writecheck(zinfo)
        self._didModify = True
        # Directories get a header-only entry and no data.
        if isdir:
            zinfo.file_size = 0
            zinfo.compress_size = 0
            zinfo.CRC = 0
            zinfo.external_attr |= 0x10 # MS-DOS directory flag
            self.filelist.append(zinfo)
            self.NameToInfo[zinfo.filename] = zinfo
            self.fp.write(zinfo.FileHeader(False))
            return
        # Must overwrite CRC and sizes with correct data later
        zinfo.CRC = CRC = 0
        zinfo.compress_size = compress_size = 0
        # Compressed size can be larger than uncompressed size
        zip64 = self._allowZip64 and \
            zinfo.file_size * 1.05 > ZIP64_LIMIT
        # Placeholder header; rewritten in full once the data is copied.
        self.fp.write(zinfo.FileHeader())
        if zinfo.compress_type == ZIP_DEFLATED:
            cmpr = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
                zlib.DEFLATED, -15)
        else:
            cmpr = None
        file_size = 0
        # Copy loop: track size and CRC of the uncompressed data while
        # writing the (optionally compressed) bytes to the archive.
        while 1:
            buf = fileobj.read(BUFFER_SIZE)
            if not buf:
                break
            file_size = file_size + len(buf)
            CRC = crc32(buf, CRC) & 0xffffffff
            if cmpr:
                buf = cmpr.compress(buf)
                compress_size = compress_size + len(buf)
            self.fp.write(buf)
        if cmpr:
            buf = cmpr.flush()
            compress_size = compress_size + len(buf)
            self.fp.write(buf)
            zinfo.compress_size = compress_size
        else:
            zinfo.compress_size = file_size
        zinfo.CRC = CRC
        zinfo.file_size = file_size
        # The header was written assuming no ZIP64 fields; refuse sizes
        # that would have needed them.
        if not zip64 and self._allowZip64:
            if file_size > ZIP64_LIMIT:
                raise RuntimeError('File size has increased during compressing')
            if compress_size > ZIP64_LIMIT:
                raise RuntimeError('Compressed size larger than uncompressed size')
        # Seek backwards and write file header (which will now include
        # correct CRC and file sizes)
        position = self.fp.tell() # Preserve current position in file
        self.fp.seek(zinfo.header_offset, 0)
        self.fp.write(zinfo.FileHeader())
        self.fp.seek(position, 0)
        self.filelist.append(zinfo)
        self.NameToInfo[zinfo.filename] = zinfo
As you can see I can't pass in zip64 into the FileHeader methods because on the system the code runs, it only supports Python 2.7.2 Whereas to support the correct headers for zip64 files you will need Python 2.7.4 at the minimum.
https://github.com/python/cpython/blob/2e46376c8c10908afed56ace4c7f0f7c64e80c5e/Misc/NEWS#L189

How to download large file with binary mode in python?

I am coding a download function in Python. The file size is >1 GB. The server runs Linux and the HTTP server is Karrigell. The client is a browser, Firefox or IE. I have run into a big problem.
At first, I used sys.stdout to send the file content.
# First attempt: set the download headers, then push the file to the
# client in 10000-byte pieces through the framework's STDOUT.
# NOTE(review): `file` is opened here but never read or closed; the copy
# loop below uses the separate `handle`.
file = open(path, 'rb')
size = os.path.getsize(path)
RESPONSE['Pragma'] = 'public'
RESPONSE['Expires'] = '0'
RESPONSE['Cache-Control'] = 'must-revalidate, pre-check=0'
RESPONSE['Content-Disposition'] = 'attachment; filename="' + os.path.basename(path) + '"'
RESPONSE['Content-type'] = "application/octet-stream"
RESPONSE['Content-Transfer-Encoding'] = 'binary'
RESPONSE['Content-length'] = str(os.path.getsize(path))
sys.stdout.flush()
chunk_size = 10000
handle = open(path, "rb")
while True:
    buffer = handle.read(chunk_size)
    if buffer:
        STDOUT(buffer)
    else:
        # Empty read: end of file.
        break
sys.stdout.flush()
The problem is the server out of memory! I know, stdout write content to memory first, then memory send to socket.
So, I modify the function. Send content to socket directly. I use the py-sendfile module. http://code.google.com/p/py-sendfile/
# Second attempt: write the HTTP response head to the socket by hand, then
# let sendfile() copy the file to the socket 4096 bytes per call.
file = open(path, 'rb')
size = os.path.getsize(path)
sock = REQUEST_HANDLER.sock
# NOTE(review): the head announces "Content-Range: bytes 0-4096/<size>" on a
# 200 response although the loop below sends the whole file - confirm this
# header is intended.
sock.sendall("""HTTP/1.1 200 OK\r\nPragma: no-cache\r\nExpires: 0\r\nCache-Control: no-cache, no-store\r\nContent-Disposition: attachment; filename="%s"\r\nContent-Type: application/octet-stream\r\nContent-Length: %u\r\nContent-Range: bytes 0-4096/%u\r\nLocation: "%s"\r\n\r\n""" % (os.path.basename(path), size, size, os.path.basename(path)))
offset = 0
nbytes = 4096
while 1:
    try:
        sent = sendfile.sendfile(sock.fileno(), file.fileno(), offset, nbytes)
    except OSError, err:
        if err.errno in (errno.EAGAIN, errno.EBUSY): # retry
            continue
        raise
    else:
        if sent == 0:
            break # done
        # Advance by what was actually sent, which the loop tracks per call.
        offset += sent
This time, the server memory is OK, but browse die! The browse memory rise quickly! Not free
until the socket accept whole file content.
I don't know how to deal with these problems. I think the second idea is right, send content to socket directly. But why browse can't free memory while accept data?
You should try to download the file in chunks. This is an example that works for me using urllib2
import os
import urllib2
import math
def downloadChunks(url):
    """Helper to download large files
    the only arg is a url
    this file will go to a temp directory
    the file will also be downloaded
    in chunks and print out the percentage downloaded so far

    Returns the path of the downloaded file, or False if the URL could not
    be fetched. The percentage is only printed when the server reports a
    Content-Length.
    """
    baseFile = os.path.basename(url)
    #move the file to a more uniq path
    os.umask(0o002)
    temp_path = "/tmp/"
    try:
        file = os.path.join(temp_path, baseFile)
        req = urllib2.urlopen(url)
        try:
            length = req.info().getheader('Content-Length')
            # The header may be absent; 0 means "total size unknown".
            total_size = int(length.strip()) if length else 0
            downloaded = 0
            CHUNK = 256 * 10240
            with open(file, 'wb') as fp:
                while True:
                    chunk = req.read(CHUNK)
                    if not chunk:
                        break
                    fp.write(chunk)
                    downloaded += len(chunk)
                    if total_size:
                        # 100.0 forces float division; integer division
                        # would report 0 until the download is complete.
                        print(math.floor(downloaded * 100.0 / total_size))
        finally:
            req.close()
    except urllib2.HTTPError as e:
        print("HTTP Error: %s %s" % (e.code, url))
        return False
    except urllib2.URLError as e:
        print("URL Error: %s %s" % (e.reason, url))
        return False
    return file

Categories

Resources