I want to get the data of a tar.gz file I created.
In this example I create the tar.gz file, and then read the content
import tarfile

# Write the archive to disk first...
with tarfile.open('/tmp/test.tar.gz', 'w:gz') as archive:
    archive.add("/home/chris/.zshrc")

# ...then read the finished file back as raw bytes.
with open('/tmp/test.tar.gz', 'rb') as fh:
    data = fh.read()
Is there any short and clean way? I don't need the tar.gz file, only the data.
Use an in-memory buffer by specifying to tarfile the fileobj argument that is an io.BytesIO instance:
import tarfile
from io import BytesIO

# Write the gzipped tar straight into an in-memory buffer -- no file on
# disk is involved, so no path needs to be passed to tarfile.open().
buf = BytesIO()
with tarfile.open(mode='w:gz', fileobj=buf) as f:
    f.add("/home/chris/.zshrc")
data = buf.getvalue()
print(len(data))
Or you can do:
import tarfile
from io import BytesIO

buf = BytesIO()
with tarfile.open('/tmp/test.tar.gz', 'w:gz', fileobj=buf) as tar_out:
    tar_out.add("/home/chris/.zshrc")

buf.seek(0)  # rewind so the same buffer can be read back as an archive
with tarfile.open('/tmp/test.tar.gz', 'r:gz', fileobj=buf) as tar_in:
    print(tar_in.getmembers())
Related
When I run this glue job function I get the folders but the actual csv after extracting is empty with 0 bytes, but the compressed tar.gz shows 108 bytes. What is wrong?
import sys
import os
import tarfile
import boto3
import io
from awsglue.utils import getResolvedOptions
# Resolve the parameters passed to this Glue job run.
args = getResolvedOptions(sys.argv, ['INPUT_FILE_PATH','PROCESS_DATE','TARGET_BUCKET','TARGET_PATH','S3_RAW_BUCKET'])
s3 = boto3.resource('s3')  # shared S3 resource handle used below
def createTarFileStream(s3RawBucket, srcBucketFilePath, targetBucketName, targetBucketFilePath, processDate, fileExt='csv'):
    """Tar+gzip every S3 object under srcBucketFilePath (in memory) and
    upload the archive to the target bucket/path.

    Fix for the 0-byte members: TarInfo.size must be set and addfile()
    must be given a *file object*, not raw bytes -- addfile() copies
    exactly `info.size` bytes from `fileobj`, and size defaults to 0.
    NOTE: fileExt is currently unused; kept for interface compatibility.
    """
    bucket = s3.Bucket(s3RawBucket)
    filesCollection = bucket.objects.filter(Prefix=srcBucketFilePath).all()
    archive = io.BytesIO()
    with tarfile.open(name=None, mode='w|gz', fileobj=archive) as tar:
        for file in filesCollection:
            payload = file.get()['Body'].read()
            info = tarfile.TarInfo(name=os.path.basename(file.key))
            info.size = len(payload)  # without this the member is stored empty
            tar.addfile(tarinfo=info, fileobj=io.BytesIO(payload))
    archive.seek(0)  # rewind before handing the buffer to the uploader
    s3.Object(targetBucketName, f"{targetBucketFilePath}/process_date={processDate}.tar.gz").upload_fileobj(archive)
    archive.close()
    return

createTarFileStream(args["S3_RAW_BUCKET"], args["INPUT_FILE_PATH"], args["TARGET_BUCKET"], args["TARGET_PATH"], args["PROCESS_DATE"])
UPDATE I have found the solution. I had to update the TarInfo object with the size of the content in order to keep the tar from being empty. Here is the final working code, also note the headers required to upload gzip to S3:
# Each member's TarInfo.size must be populated, otherwise addfile()
# stores a 0-byte entry.
with tarfile.open(tfile, "w:gz") as archive:
    for obj in filesCollection:
        payload = obj.get()['Body'].read()
        info = tarfile.TarInfo(os.path.basename(obj.key))
        info.size = len(payload)  # same value as the seek-to-end/tell dance
        archive.addfile(info, io.BytesIO(payload))

# Upload with headers that mark the object as gzip-encoded.
with open(tfile, 'rb') as t:
    s3.Object(targetBucketName, f"{targetBucketFilePath}/{tfile}.tar.gz").upload_fileobj(t, {'ContentType': 'text/plain', 'ContentEncoding': 'gzip'})
Right now my final output is in Excel format. I want to compress my Excel file using gzip. Is there a way to do it?
import pandas as pd
import gzip
import re
def renaming_ad_unit():
    """Strip parenthesised groups from every 'Ad unit' value and rewrite the workbook."""
    with gzip.open('weekly_direct_house.xlsx.gz') as f:
        df = pd.read_excel(f)
    ad_units = df['Ad unit'].to_list()
    for row, raw_name in enumerate(ad_units):
        # Remove any "(...)" group, e.g. "Homepage (old)" -> "Homepage ".
        df.at[row, 'Ad unit'] = re.sub(r"\([^()]*\)", "", raw_name)
    return df.to_excel('weekly_direct_house.xlsx', index=False)
Yes, this is possible.
To create a gzip file, you can open the file like this:
with gzip.open('filename.xlsx.gz', 'wb') as f:
...
Unfortunately, when I tried this, I found that I get the error OSError: Negative seek in write mode. This is because the Pandas excel writer moves backwards in the file when writing, and uses multiple passes to write the file. This is not allowed by the gzip module.
To fix this, I created a temporary file, and wrote the excel file there. Then, I read the file back, and write it to the compressed archive.
I wrote a short program to demonstrate this. It reads an excel file from a gzip archive, prints it out, and writes it back to another gzip file.
import pandas as pd
import gzip
import tempfile
def main():
    """Read an xlsx from a gzip archive, print it, and write it back gzipped."""
    with gzip.open('apportionment-2020-table02.xlsx.gz') as f:
        df = pd.read_excel(f)
    print(df)
    # to_excel() seeks backwards while writing, which gzip streams cannot
    # do, so stage the workbook in a temp file and compress it afterwards.
    with tempfile.TemporaryFile() as staging:
        df.to_excel(staging, index=False)
        staging.seek(0)
        workbook_bytes = staging.read()
    with gzip.open('output.xlsx.gz', 'wb') as gz_out:
        gz_out.write(workbook_bytes)

if __name__ == '__main__':
    main()
Here's the file I'm using to demonstrate this: Link
You could also use io.BytesIO to create file in memory and write excel in this file and next write this file as gzip on disk.
I used link to excel file from Nick ODell answer.
import pandas as pd
import gzip
import io

df = pd.read_excel('https://www2.census.gov/programs-surveys/decennial/2020/data/apportionment/apportionment-2020-table02.xlsx')

# Render the workbook into an in-memory buffer (gzip cannot handle the
# backwards seeks to_excel performs), then compress the finished bytes.
payload = io.BytesIO()
df.to_excel(payload)
with gzip.open('output.xlsx.gz', 'wb') as gz_out:
    gz_out.write(payload.getvalue())
Similar to Nick ODell answer.
import pandas as pd
import gzip
import io

df = pd.read_excel('https://www2.census.gov/programs-surveys/decennial/2020/data/apportionment/apportionment-2020-table02.xlsx')

# Same idea, but with the buffer managed by a context manager.
with io.BytesIO() as workbook_buf:
    df.to_excel(workbook_buf)
    workbook_buf.seek(0)  # rewind before reading the finished bytes
    with gzip.open('output.xlsx.gz', 'wb') as gz_out:
        gz_out.write(workbook_buf.read())
Tested on Linux
Using python zipfile module I created a zip file as follows:
s = StringIO()  # in-memory buffer standing in for a real file (Python 2 StringIO)
zip_file = zipfile.ZipFile(s, "w")
zip_file.write('/local/my_files/my_file.txt')
# NOTE(review): the archive is never closed, so the zip central directory
# has not been written when the buffer is rewound here -- the bytes in
# `s` do not yet form a valid zip file.
s.seek(0)
Generally, to save normal files I use the following flow:
# Copy the input into the destination file, line by line.
with open(dest_file, 'w') as out_file:
    out_file.writelines(in_file)
But I think I can't achieve saving a zip file with this. Can anyone please help me get this done?
# Opening ZipFile with a path writes straight to disk; the context
# manager closes it (equivalent to the explicit close()).
with zipfile.ZipFile("/local/my_files/my_file.zip", "w") as zip_file:
    zip_file.write('/local/my_files/my_file.txt')
The first argument of the ZipFile object initialization is the path to which you want to save the zip file.
If you need to use StringIO, just try this code:
from StringIO import StringIO
import zipfile

# Build the archive in memory, then dump the buffer's bytes to disk.
buf = StringIO()
with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:
    zf.write('/local/my_files/my_file.txt')
with open('/local/my_files/my_file.zip', 'wb') as f_out:
    f_out.write(buf.getvalue())
Or you can do it in a simpler way:
import zipfile

# Simpler: let ZipFile write the compressed archive directly to its path.
with zipfile.ZipFile("/local/my_files/my_file.zip", "w", compression=zipfile.ZIP_DEFLATED) as archive:
    archive.write("/local/my_files/my_file.txt")
Is there a possibility to create a file directly in a tar archive?
Context: I have a method which creates content of some kind as String. I want to save this content as a file in the tar archive. Do I have to create a tmpfile or is there a possibility to create a file directly in the tar archive.
def save_files_to_tar(tarname):
    """Create tar archive `tarname` and add generated string content to it.

    Question stub: the placeholder below marks where each generated string
    should be added to the archive without a temporary file on disk.
    """
    archive = tarfile.open(tarname, mode='w')
    for _ in range(some_number):  # some_number / get_content() defined elsewhere
        content = get_content()
        # HERE add content to tar
I think you should use StringIO to create a file-like in-memory object, and use TarInfo to describe a fake file, like so:
import StringIO
import tarfile

archive = tarfile.open(tarname, mode='w')
for _ in range(some_number):
    content = get_content()
    s = StringIO.StringIO()
    s.write(content)
    s.seek(0)
    tarinfo = tarfile.TarInfo(name="my filename")
    # Fixed: len(s.buf) is unreliable -- Python 2's StringIO keeps pending
    # writes in a separate buflist until getvalue() merges them, so .buf
    # can still be empty here. getvalue() always returns the full content.
    tarinfo.size = len(s.getvalue())
    archive.addfile(tarinfo=tarinfo, fileobj=s)
archive.close()
Hope this helps.
A solution that is Python 2 & 3 compatible using context managers:
from __future__ import unicode_literals
import tarfile
import time
from contextlib import closing
from io import BytesIO
message = 'Lorem ipsum dolor sit amet.'
filename = 'test.txt'
with tarfile.open('test.tar', 'w') as tf:
with closing(BytesIO(message.encode())) as fobj:
tarinfo = tarfile.TarInfo(filename)
tarinfo.size = len(fobj.getvalue())
tarinfo.mtime = time.time()
tf.addfile(tarinfo, fileobj=fobj)
I'm working on a reporting application for my Django powered website. I want to run several reports and have each report generate a .csv file in memory that can be downloaded in batch as a .zip. I would like to do this without storing any files to disk. So far, to generate a single .csv file, I am following the common operation:
# Build the csv in memory, rewind, and stream it out as a download.
mem_file = StringIO.StringIO()
csv.writer(mem_file).writerow(["My content", my_value])
mem_file.seek(0)
response = HttpResponse(mem_file, content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename=my_file.csv'
This works fine, but only for a single, unzipped .csv. If I had, for example, a list of .csv files created with a StringIO stream:
# One in-memory stream per report; write some data to each file.
firstFile = StringIO.StringIO()
secondFile = StringIO.StringIO()
thirdFile = StringIO.StringIO()

myFiles = [firstFile, secondFile, thirdFile]
How could I return a compressed file that contains all objects in myFiles and can be properly unzipped to reveal three .csv files?
zipfile is a standard library module that does exactly what you're looking for. For your use-case, the meat and potatoes is a method called "writestr" that takes a name of a file and the data contained within it that you'd like to zip.
In the code below, I've used a sequential naming scheme for the files when they're unzipped, but this can be switched to whatever you'd like.
import zipfile
import StringIO

# Zip every csv buffer under a sequential name, entirely in memory.
zipped_file = StringIO.StringIO()
with zipfile.ZipFile(zipped_file, 'w') as archive:
    for index, csv_buffer in enumerate(files):
        csv_buffer.seek(0)
        archive.writestr("{}.csv".format(index), csv_buffer.read())
zipped_file.seek(0)
If you want to future-proof your code (hint hint Python 3 hint hint), you might want to switch over to using io.BytesIO instead of StringIO, since Python 3 is all about the bytes. Another bonus is that explicit seeks are not necessary with io.BytesIO before reads (I haven't tested this behavior with Django's HttpResponse, so I've left that final seek in there just in case).
import io
import zipfile

# Python 3 friendly variant: BytesIO plus getvalue() (no seek needed).
zipped_file = io.BytesIO()
with zipfile.ZipFile(zipped_file, 'w') as archive:
    for index, csv_buffer in enumerate(files):
        archive.writestr("{}.csv".format(index), csv_buffer.getvalue())
zipped_file.seek(0)
The stdlib comes with the module zipfile, and the main class, ZipFile, accepts a file or file-like object:
from zipfile import ZipFile

temp_file = StringIO.StringIO()
# create temp csv_files = [(name1, data1), (name2, data2), ... ]
with ZipFile(temp_file, 'w') as zipped:
    for name, data in csv_files:
        data.seek(0)
        zipped.writestr(name, data.read())
temp_file.seek(0)
# etc. etc.
I'm not a user of StringIO so I may have the seek and read out of place, but hopefully you get the idea.
def zipFiles(files):
    """Pack each in-memory csv buffer into a zip archive and return its raw contents."""
    outfile = StringIO()  # io.BytesIO() for python 3
    with zipfile.ZipFile(outfile, 'w') as zf:
        # Fixed: `enumarate` was a NameError; the builtin is `enumerate`.
        for n, f in enumerate(files):
            zf.writestr("{}.csv".format(n), f.getvalue())
    return outfile.getvalue()
# Fixed name mismatches: the function is zipFiles and the list is myFiles
# (the original called undefined zip_files / myfiles).
zipped_file = zipFiles(myFiles)
response = HttpResponse(zipped_file, content_type='application/octet-stream')
response['Content-Disposition'] = 'attachment; filename=my_file.zip'
StringIO has a getvalue method which returns the entire contents. You can enable compression for the zip file
with zipfile.ZipFile(outfile, 'w', zipfile.ZIP_DEFLATED). The default compression value is ZIP_STORED, which creates the zip file without compressing.