Create a temporary compressed file - python

I need to create a temporary file to send it, I have tried :
# Create a temporary file --> I think it is ok (file not seen)
temporaryfile = NamedTemporaryFile(delete=False, dir=COMPRESSED_ROOT)
# The path to archive --> It's ok
root_dir = "something"
# Create a compressed file --> It bugs
data = open(f.write(make_archive(f.name, 'zip', root_dir))).read()
# Send the file --> Its ok
response = HttpResponse(data, mimetype='application/zip')
response['Content-Disposition'] = 'attachment; filename="%s"' % unicode(downloadedassignment.name + '.zip')
return response
I don't know at all if it is the good approach..

I actually just needed to do something similar and I wanted to avoid file I/O entirely, if possible. Here's what I came up with:
import tempfile
import zipfile
with tempfile.SpooledTemporaryFile() as tmp:
with zipfile.ZipFile(tmp, 'w', zipfile.ZIP_DEFLATED) as archive:
archive.writestr('something.txt', 'Some Content Here')
# Reset file pointer
tmp.seek(0)
# Write file data to response
return HttpResponse(tmp.read(), mimetype='application/x-zip-compressed')
It uses a SpooledTemporaryFile so it will remain in-memory, unless it exceeds the memory limits. Then, I set this tempory file as the stream for ZipFile to use. The filename passed to writestr is just the filename that the file will have inside the archive, it doesn't have anything to do with the server's filesystem. Then, I just need to rewind the file pointer (seek(0)) after ZipFile had done its thing and dump it to the response.

First of all, you don't need to create a NamedTemporaryFile to use make_archive; all you want is a unique filename for the make_archive file to create.
.write doesn't return a filename
To focus on that error: You are assuming that the return value of f.write is a filename you can open; just seek to the start of your file and read instead:
f.write(make_archive(f.name, 'zip', root_dir))
f.seek(0)
data = f.read()
Note that you'll also need to clean up the temporary file you created (you set delete=False):
import os
f.close()
os.unlink(f.name)
Alternatively, just omit the delete keyword to have it default to True again and only close your file afterwards, no need to unlink.
That just wrote the archive filename to a new file..
You are just writing the new archive name to your temporary file. You'd be better off just reading the archive directly:
data = open(make_archive(f.name, 'zip', root_dir), 'rb').read()
Note that now your temporary file isn't being written to at all.
Best way to do this
Avoid creating a NamedTemporaryFile altogether: Use tempfile.mkdtemp() instead, to generate a temporary directory in which to put your archive, then clean that up afterwards:
tmpdir = tempfile.mkdtemp()
try:
tmparchive = os.path.join(tmpdir, 'archive')
root_dir = "something"
data = open(make_archive(tmparchive, 'zip', root_dir), 'rb').read()
finally:
shutil.rmtree(tmpdir)

Related

Python ZipFile Created In-Memory Not Compressing as Expected

I am trying to use Python to create a ZipFile object in-memory, and write a single file, also created in-memory, into the ZipFile object, and then upload the file to Google Cloud Storage.
My file is not actually getting compressed. Any idea what I might be doing wrong?
I realize there may be a fancier way of getting the row data into the file object, but apart from that, I'm really just trying to figure out why the resulting zip file is not coming out compressed at all.
UPDATE: code sample now excludes any interaction with Google Cloud Services (GCS, etc.), and instead just writes the files to disk.
It seems that when I write the file to disk first, then create the ZipFile, the result is compressed as expected, but when I add the StringIO contents directly from memory to the ZipFile object, the contents are not compressed.
import random, io, argparse, os, string
from zipfile import ZipFile, ZipInfo, ZIP_DEFLATED
parser = argparse.ArgumentParser()
parser.add_argument("--row_limit", default=1000)
parser.add_argument("--file_name", default='file.txt', type=str)
parser.add_argument("--archive_name", default='file.zip', type=str)
parser.add_argument("--snapshot_millis", default=0, type=int)
args = parser.parse_args()
# imagine this has lots and lots of data in it, coming from a database query result
rows = [{
'seq_no': ''.join(random.choices(string.ascii_uppercase + string.digits, k=args.row_limit)),
'csv': ''.join(random.choices(string.ascii_uppercase + string.digits, k=args.row_limit))
}] * args.row_limit
archive = io.BytesIO()
# create zip archive in memory
with ZipFile(archive, 'w', compression=ZIP_DEFLATED, compresslevel=9) as zip_archive:
count = 0
file_contents = io.StringIO()
for row in rows:
if count > args.row_limit:
break
count += 1
file_contents.write(f"{row['seq_no']},{row['csv']}\n")
# write file to zip archive in memory
zip_file = ZipInfo(args.file_name)
zip_archive.writestr(zip_file, file_contents.getvalue())
# also write file to disk
with open(args.file_name, mode='w') as f:
print(file_contents.getvalue(), file=f)
print(f"StringIO Size: {file_contents.tell()}")
print(f"Text File Size On Disk: {os.path.getsize(args.file_name)}")
archive.seek(0)
with open(args.archive_name, 'wb') as outfile:
outfile.write(archive.getbuffer())
print(f"Zip File Created from File In Memory: {os.path.getsize(args.archive_name)}")
ZipFile(args.archive_name, mode='w', compression=ZIP_DEFLATED, compresslevel=9).write(args.file_name)
print(f"Zip File Created from File On Disk: {os.path.getsize(args.archive_name)}")
The problem is here:
zip_file = ZipInfo(args.file_name)
zip_archive.writestr(zip_file, file_contents.getvalue())
From the ZipFile.writestr docs:
When passing a ZipInfo instance as the zinfo_or_arcname parameter, the
compression method used will be that specified in the compress_type
member of the given ZipInfo instance. By default, the ZipInfo
constructor sets this member to ZIP_STORED [i.e. uncompressed].
The easiest way to correct the issue is not to use a complete ZipInfo, but just the file name. This will also set the current date/time as the creation time for the file inside the archive (ZipInfo defaults to year 1980):
# zip_file = ZipInfo(args.file_name)
zip_archive.writestr(args.file_name, file_contents.getvalue())

How to read contents of zip file in memory on a file upload in python?

I have a zip file that I receive when the user uploads a file. The zip essentially contains a json file which I want to read and process without having to create the zip file first, then unzipping it and then reading the content of the inner file.
Currently I only the longer process which is something like below
import json
import zipfile
#csrf_exempt
def get_zip(request):
try:
if request.method == "POST":
try:
client_file = request.FILES['file']
file_path = "/some/path/"
# first dump the zip file to a directory
with open(file_path + '%s' % client_file.name, 'wb+') as dest:
for chunk in client_file.chunks():
dest.write(chunk)
# unzip the zip file to the same directory
with zipfile.ZipFile(file_path + client_file.name, 'r') as zip_ref:
zip_ref.extractall(file_path)
# at this point we get a json file from the zip say `test.json`
# read the json file content
with open(file_path + "test.json", "r") as fo:
json_content = json.load(fo)
doSomething(json_content)
return HttpResponse(0)
except Exception as e:
return HttpResponse(1)
As you can see, this involves 3 steps to finally get the content from the zip file into memory. What I want is get the content of the zip file and load directly into memory.
I did find some similar questions in stack overflow like this one https://stackoverflow.com/a/2463819 . But I am not sure at what point do I invoke this operation mentioned in the post
How can I achieve this?
Note: I am using django in backend.
There will always be one json file in the zip.
From what I understand, what #jason is trying to say here is to first open a zipFile just like you have done here with zipfile.ZipFile(file_path + client_file.name, 'r') as zip_ref:.
class zipfile.ZipFile(file[, mode[, compression[, allowZip64]]])
Open a ZIP file, where file can be either a path to a file (a string) or a file-like object.
And then use BytesIO read in the bytes of a file-like object. But from above you are reading in r mode and not rb mode. So change it as follows.
with open(filename, 'rb') as file_data:
bytes_content = file_data.read()
file_like_object = io.BytesIO(bytes_content)
zipfile_ob = zipfile.ZipFile(file_like_object)
Now zipfile_ob can be accessed from memory.
The first argument to zipfile.ZipFile() can be a file object rather than a pathname. I think the Django UploadedFile object supports this use, so you can read directly from that rather than having to copy into a file.
You can also open the file directly from the zip archive rather than extracting that into a file.
import json
import zipfile
#csrf_exempt
def get_zip(request):
try:
if request.method == "POST":
try:
client_file = request.FILES['file']
# unzip the zip file to the same directory
with zipfile.ZipFile(client_file, 'r') as zip_ref:
first = zip_ref.infolist()[0]
with zip_ref.open(first, "r") as fo:
json_content = json.load(fo)
doSomething(json_content)
return HttpResponse(0)
except Exception as e:
return HttpResponse(1)

Sending multiple .CSV files to .ZIP without storing to disk in Python

I'm working on a reporting application for my Django powered website. I want to run several reports and have each report generate a .csv file in memory that can be downloaded in batch as a .zip. I would like to do this without storing any files to disk. So far, to generate a single .csv file, I am following the common operation:
mem_file = StringIO.StringIO()
writer = csv.writer(mem_file)
writer.writerow(["My content", my_value])
mem_file.seek(0)
response = HttpResponse(mem_file, content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename=my_file.csv'
This works fine, but only for a single, unzipped .csv. If I had, for example, a list of .csv files created with a StringIO stream:
firstFile = StringIO.StringIO()
# write some data to the file
secondFile = StringIO.StringIO()
# write some data to the file
thirdFile = StringIO.StringIO()
# write some data to the file
myFiles = [firstFile, secondFile, thirdFile]
How could I return a compressed file that contains all objects in myFiles and can be properly unzipped to reveal three .csv files?
zipfile is a standard library module that does exactly what you're looking for. For your use-case, the meat and potatoes is a method called "writestr" that takes a name of a file and the data contained within it that you'd like to zip.
In the code below, I've used a sequential naming scheme for the files when they're unzipped, but this can be switched to whatever you'd like.
import zipfile
import StringIO
zipped_file = StringIO.StringIO()
with zipfile.ZipFile(zipped_file, 'w') as zip:
for i, file in enumerate(files):
file.seek(0)
zip.writestr("{}.csv".format(i), file.read())
zipped_file.seek(0)
If you want to future-proof your code (hint hint Python 3 hint hint), you might want to switch over to using io.BytesIO instead of StringIO, since Python 3 is all about the bytes. Another bonus is that explicit seeks are not necessary with io.BytesIO before reads (I haven't tested this behavior with Django's HttpResponse, so I've left that final seek in there just in case).
import io
import zipfile
zipped_file = io.BytesIO()
with zipfile.ZipFile(zipped_file, 'w') as f:
for i, file in enumerate(files):
f.writestr("{}.csv".format(i), file.getvalue())
zipped_file.seek(0)
The stdlib comes with the module zipfile, and the main class, ZipFile, accepts a file or file-like object:
from zipfile import ZipFile
temp_file = StringIO.StringIO()
zipped = ZipFile(temp_file, 'w')
# create temp csv_files = [(name1, data1), (name2, data2), ... ]
for name, data in csv_files:
data.seek(0)
zipped.writestr(name, data.read())
zipped.close()
temp_file.seek(0)
# etc. etc.
I'm not a user of StringIO so I may have the seek and read out of place, but hopefully you get the idea.
def zipFiles(files):
outfile = StringIO() # io.BytesIO() for python 3
with zipfile.ZipFile(outfile, 'w') as zf:
for n, f in enumarate(files):
zf.writestr("{}.csv".format(n), f.getvalue())
return outfile.getvalue()
zipped_file = zip_files(myfiles)
response = HttpResponse(zipped_file, content_type='application/octet-stream')
response['Content-Disposition'] = 'attachment; filename=my_file.zip'
StringIO has getvalue method which return the entire contents. You can compress the zipfile
by zipfile.ZipFile(outfile, 'w', zipfile.ZIP_DEFLATED). Default value of compression is ZIP_STORED which will create zip file without compressing.

Python zipfile, bizarre limit to number of files: "folder is invalid"

The computer is toying with me, I know it!
I am creating a zip folder in Python. The individual files are generated in memory and then the whole thing is zipped and saved to a file. I am allowed to add 9 files to the zip. I am allowed to add 11 files to the zip. But 10, no, not 10 files. The zip file IS saved to my computer, but I'm not allowed to open it; Windows says that the compressed zipped folder is invalid.
I use the code below, which I got from another stackoverflow question. It appends 10 files and saves the zipped folder. When I click on the folder, I cannot extract it. BUT, remove one of the appends() and it's fine. Or, add another append and it works!
What am I missing here? How can I make this work every time?
imz = InMemoryZip()
imz.append("1a.txt", "a").append("2a.txt", "a").append("3a.txt", "a").append("4a.txt", "a").append("5a.txt", "a").append("6a.txt", "a").append("7a.txt", "a").append("8a.txt", "a").append("9a.txt", "a").append("10a.txt", "a")
imz.writetofile("C:/path/test.zip")
import zipfile
import StringIO
class InMemoryZip(object):
def __init__(self):
# Create the in-memory file-like object
self.in_memory_zip = StringIO.StringIO()
def append(self, filename_in_zip, file_contents):
'''Appends a file with name filename_in_zip and contents of
file_contents to the in-memory zip.'''
# Get a handle to the in-memory zip in append mode
zf = zipfile.ZipFile(self.in_memory_zip, "a", zipfile.ZIP_DEFLATED, False)
# Write the file to the in-memory zip
zf.writestr(filename_in_zip, file_contents)
# Mark the files as having been created on Windows so that
# Unix permissions are not inferred as 0000
for zfile in zf.filelist:
zfile.create_system = 0
return self
def read(self):
'''Returns a string with the contents of the in-memory zip.'''
self.in_memory_zip.seek(0)
return self.in_memory_zip.read()
def writetofile(self, filename):
'''Writes the in-memory zip to a file.'''
f = file(filename, "w")
f.write(self.read())
f.close()
You should use the 'wb' mode when creating the file you are saving to the file system. This will ensure that the file is written in binary.
Otherwise, any time a newline (\n) character happens to be encountered in the zip file python will replace it to match the windows line ending (\r\n). The reason 10 files is a problem is that 10 happens to be the code for \n.
So your write function should look like this:
def writetofile(self, filename):
'''Writes the in-memory zip to a file.'''
f = file(filename, 'wb')
f.write(self.read())
f.close()
This should fix your problem and work for the files in your example. Although, in your case you might find it easier to write the zip file directly to the file system like this code which includes some of the comments from above:
import StringIO
import zipfile
class ZipCreator:
buffer = None
def __init__(self, fileName=None):
if fileName:
self.zipFile = zipfile.ZipFile(fileName, 'w', zipfile.ZIP_DEFLATED, False)
return
self.buffer = StringIO.StringIO()
self.zipFile = zipfile.ZipFile(self.buffer, 'w', zipfile.ZIP_DEFLATED, False)
def addToZipFromFileSystem(self, filePath, filenameInZip):
self.zipFile.write(filePath, filenameInZip)
def addToZipFromMemory(self, filenameInZip, fileContents):
self.zipFile.writestr(filenameInZip, fileContents)
for zipFile in self.zipFile.filelist:
zipFile.create_system = 0
def write(self, fileName):
if not self.buffer: # If the buffer was not initialized the file is written by the ZipFile
self.zipFile.close()
return
f = file(fileName, 'wb')
f.write(self.buffer.getvalue())
f.close()
# Use File Handle
zipCreator = ZipCreator('C:/path/test.zip')
# Use Memory Buffer
# zipCreator = ZipCreator()
for i in range(1, 10):
zipCreator.addToZipFromMemory('test/%sa.txt' % i, 'a')
zipCreator.write('C:/path/test.zip')
Ideally, you would probably use separate classes for an in-memory zip and a zip that is tied to the file system from the beginning. I have also seem some issues with the in-memory zip when folders are added which are difficult to recreate and which I am still trying to track down.

Reading the same file multiple times in Python

I need to download a zip archive of text files, dispatch each text file in the archive to other handlers for processing, and finally write the unzipped text file to disk.
I have the following code. It uses multiple open/close on the same file, which does not seem elegant. How do I make it more elegant and efficient?
zipped = urllib.urlopen('www.abc.com/xyz.zip')
buf = cStringIO.StringIO(zipped.read())
zipped.close()
unzipped = zipfile.ZipFile(buf, 'r')
for f_info in unzipped.infolist():
logfile = unzipped.open(f_info)
handler1(logfile)
logfile.close() ## Cannot seek(0). The file like obj does not support seek()
logfile = unzipped.open(f_info)
handler2(logfile)
logfile.close()
unzipped.extract(f_info)
Your answer is in your example code. Just use StringIO to buffer the logfile:
zipped = urllib.urlopen('www.abc.com/xyz.zip')
buf = cStringIO.StringIO(zipped.read())
zipped.close()
unzipped = zipfile.ZipFile(buf, 'r')
for f_info in unzipped.infolist():
logfile = unzipped.open(f_info)
# Here's where we buffer:
logbuffer = cStringIO.StringIO(logfile.read())
logfile.close()
for handler in [handler1, handler2]:
handler(logbuffer)
# StringIO objects support seek():
logbuffer.seek(0)
unzipped.extract(f_info)
You could say something like:
handler_dispatch(logfile)
and
def handler_dispatch(file):
for line in file:
handler1(line)
handler2(line)
or even make it more dynamic by constructing a Handler class with multiple handlerN functions, and applying each of them inside handler_dispatch. Like
class Handler:
def __init__(self:)
self.handlers = []
def add_handler(handler):
self.handlers.append(handler)
def handler_dispatch(self, file):
for line in file:
for handler in self.handlers:
handler.handle(line)
Open the zip file once, loop through all the names, extract the file for each name and process it, then write it to disk.
Like so:
for f_info in unzipped.info_list():
file = unzipped.open(f_info)
data = file.read()
# If you need a file like object, wrap it in a cStringIO
fobj = cStringIO.StringIO(data)
handler1(fobj)
handler2(fobj)
with open(filename,"w") as fp:
fp.write(data)
You get the idea

Categories

Resources