gzip a file quicker using Python?

I am attempting to gzip a file using Python faster, as some of my files are as small as 30 MB and as large as 4 GB.
Is there a more efficient way of creating a gzip file than the following? Is there a way to optimize it so that, if a file is small enough to fit in memory, it simply reads the whole file at once rather than going line by line?
import gzip

with open(j, 'rb') as f_in:
    with gzip.open(j + ".gz", 'wb') as f_out:
        f_out.writelines(f_in)

Copy the file in bigger chunks using the shutil.copyfileobj() function. In this example, I'm using 16 MiB blocks, which is pretty reasonable.
import gzip
import shutil

MEG = 2**20  # 1 MiB

with open(j, 'rb') as f_in:
    with gzip.open(j + ".gz", 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out, length=16*MEG)
You may find that calling out to gzip is faster for large files, especially if you plan to zip multiple files in parallel.
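As a rough illustration of shelling out, here is a minimal sketch that calls the external gzip binary via subprocess (gzip_external is a hypothetical helper name; it assumes a gzip executable on the PATH that supports -k/--keep). To compress several files in parallel, you could start one such call per file, e.g. from a process or thread pool.
import subprocess

def gzip_external(path):
    # Hypothetical helper: compress one file by invoking the system gzip.
    # -k keeps the original file, -f overwrites an existing .gz output.
    subprocess.run(["gzip", "-kf", path], check=True)

gzip_external(j)  # produces j + ".gz" next to the original file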

Instead of reading the file line by line, you can read it all at once.
Example:
import gzip

with open(j, 'rb') as f_in:
    content = f_in.read()

f = gzip.open(j + '.gz', 'wb')
f.write(content)
f.close()

Below are two almost identical methods for reading gzip files:
A.) Load everything into memory --> can be a bad choice for very big files (several GB), because you can run out of memory
B.) Don't load everything into memory; read line by line --> good for BIG files
adapted from
https://codebright.wordpress.com/2011/03/25/139/
and
https://www.reddit.com/r/Python/comments/2olhrf/fast_gzip_in_python/
http://pastebin.com/dcEJRs1i
import subprocess
import sys

if sys.version.startswith("3"):
    import io
    io_method = io.BytesIO
else:
    import cStringIO
    io_method = cStringIO.StringIO
A.)
def yield_line_gz_file(fn):
    """
    :param fn: String (absolute path)
    :return: GeneratorFunction (yields String)
    """
    ph = subprocess.Popen(["gzcat", fn], stdout=subprocess.PIPE)
    fh = io_method(ph.communicate()[0])
    for line in fh:
        yield line
B.)
def yield_line_gz_file(fn):
    """
    :param fn: String (absolute path)
    :return: GeneratorFunction (yields String)
    """
    ph = subprocess.Popen(["gzcat", fn], stdout=subprocess.PIPE)
    for line in ph.stdout:
        yield line
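A minimal usage sketch for either variant (the file path and process_line are placeholders, not part of the original answer):
# Iterate over decompressed lines without keeping the whole file in memory (variant B).
for line in yield_line_gz_file("/path/to/file.gz"):
    process_line(line)  # hypothetical per-line handler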

Related

Download bz2, read compressed files in memory (avoid memory overflow)

As the title says, I'm downloading a bz2 file which has a folder with a lot of text files inside...
My first version decompressed everything in memory, but although the archive is only 90 MB, the uncompressed contents are 60 files of 750 MB each... my computer obviously can't handle something like 40 GB of RAM.
So the problem is that the files are too big to keep in memory all at the same time, so I'm using the following code, which works but is too slow:
import os
import tarfile
import requests

response = requests.get('https://fooweb.com/barfile.bz2')
# save file into disk:
compress_filepath = '{0}/files/sources/{1}'.format(zsets.BASE_DIR, check_time)
with open(compress_filepath, 'wb') as local_file:
    local_file.write(response.content)
# We extract the files into a folder:
extract_folder = compress_filepath + '_ext'
with tarfile.open(compress_filepath, "r:bz2") as tar:
    tar.extractall(extract_folder)
# We process one file at a time:
for filename in os.listdir(extract_folder):
    filepath = '{0}/{1}'.format(extract_folder, filename)
    file = open(filepath, 'r').readlines()
    for line in file:
        some_processing(line)
Is there a way I could do this without dumping it to disk, decompressing and reading only one file from the .bz2 at a time?
Thank you very much for your time in advance, I hope somebody knows how to help me with this...
#!/usr/bin/python3
import sys
import requests
import tarfile

got = requests.get(sys.argv[1], stream=True)

with tarfile.open(fileobj=got.raw, mode='r|*') as tar:
    for info in tar:
        if info.isreg():
            ent = tar.extractfile(info)
            # now process ent as a file, however you like
            print(info.name, len(ent.read()))
I did it this way:
import io
import tarfile
import requests

response = requests.get(my_url_to_file)
memfile = io.BytesIO(response.content)
# We extract files in memory, one by one:
tar = tarfile.open(fileobj=memfile, mode="r:bz2")
filecount = 0
for member_name in tar.getnames():
    filecount += 1
    # extractfile() already returns a file-like object (or None for non-file members),
    # so there is no need to call open() on it again:
    member_file = tar.extractfile(member_name)
    if member_file is not None:
        for line in member_file:
            process_line(line)

Python file downloading fails

New to Python and attempting to download the NIST NVD JSON files. I have tried several methods, but it only writes a file of about 324 bytes. If I download just a single file it does in fact work, but there are several files to download here.
I did try to adjust the chunk_size but still can't get a 1 to 6 MB zip file to download.
from requests import get

def download(url, filename):
    response = get(url, stream=True)
    with open(filename, "wb") as file:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                file.write(chunk)
    print('Downloaded! ', filename)

with open('NVD_JSON_SOURCE_URLS.txt') as f:
    for line in f:
        filename = line.split('/')[-1]
        url = line
        download(url, filename)
The input works and it starts the downloads, just never completes them. Clearly I am missing something frustratingly simple here but after 2 days I am not getting any closer. Thanks.
I find chunking to be painful for instances like this in Python. This approach has worked for me frequently:
import requests
import shutil

def download_file(url, filename):
    r = requests.get(url, stream=True)
    with open(filename, 'wb') as f:
        shutil.copyfileobj(r.raw, f)
This streams the response into the file in chunks via shutil.copyfileobj, so it does not need to hold the whole file in memory; and since you are only talking about a couple of MB per file, it should work fine.
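A sketch of how this helper could slot into the loop from the question, assuming the same NVD_JSON_SOURCE_URLS.txt input and the strip() fix suggested in the answers below:
with open('NVD_JSON_SOURCE_URLS.txt') as f:
    for line in f:
        url = line.strip()               # drop the trailing newline
        filename = url.split('/')[-1]
        download_file(url, filename)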
I think line has some whitespace characters, so if you remove the whitespace characters from line using strip(), the code should work.
for line in f:
    line = line.strip()
    ...
I tested it and it works for me.
There is a line break at the end of each line when you read the data from the .txt file, so you should strip the line break first.
from requests import get

def download(url, filename):
    response = get(url, stream=True)
    with open(filename, "wb") as file:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                file.write(chunk)
    print('Downloaded! ', filename)

with open('NVD_JSON_SOURCE_URLS.txt') as f:
    for line in f:
        line = line.strip()
        filename = line.split('/')[-1]
        url = line
        download(url, filename)

Python "gzip" module acting weirdly if compressed extension is not ".gz"

I need to compress a file using the gzip module, but the output file extension may not be .gz.
Look at this simple code:
import gzip
import shutil

input_path = "test.txt"
output_path = input_path + ".gz"

with open(input_path, 'w') as file:
    file.write("abc" * 10)

with gzip.open(output_path, 'wb') as f_out:
    with open(input_path, 'rb') as f_in:
        shutil.copyfileobj(f_in, f_out)
It works fine. But if I replace ".gz" with ".gzip", for example, then I am not able to open the compressed file correctly.
I tried with 7-Zip and WinRar, the result is the same, and the bug persists even if I rename the file.
Does anyone know where the problem comes from, please?
I tried with bz2 and lzma compression; they seem to work properly no matter what the extension is.
You actually have two versions of the file created this way:
First, .gz file:
with gzip.open("test.txt.gz", 'wb') as f_out:
    with open("test.txt", 'rb') as f_in:
        shutil.copyfileobj(f_in, f_out)
Second, .gzip file:
with gzip.open("test.txt.gzip", 'wb') as f_out:
    with open("test.txt", 'rb') as f_in:
        shutil.copyfileobj(f_in, f_out)
Both create a gzip archive with your test.txt in it. The only difference is that in the second case, the file inside is named test.txt.gzip instead of test.txt.
The problem is that the argument to gzip.open actually has two purposes: the filename of the gzip archive and the filename of the file inside (bad design, imho).
So, if you do gzip.open("abcd", 'wb') and write to it, it will create gzip archive named abcd with a file named abcd inside.
But then there comes magic: if the filename ends with .gz, it behaves differently, e.g. gzip.open("bla.gz", 'wb') creates a gzip archive named bla.gz with a file named bla inside.
So, with .gz you activated the (undocumented, as far as I can see!) magic, whereas with .gzip you did not.
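A small sketch to make the difference visible, assuming the two archives created above exist. It reads the FNAME field that gzip records in the file header (RFC 1952), which is the name archivers typically show for the contained file:
import struct

def stored_filename(path):
    """Return the original filename recorded in a gzip header, or None."""
    with open(path, "rb") as f:
        header = f.read(10)                      # fixed 10-byte gzip header
        magic, _method, flags = struct.unpack("<HBB", header[:4])
        if magic != 0x8b1f:
            raise ValueError("not a gzip file")
        if not flags & 0x08:                     # FNAME flag not set
            return None
        # FNAME directly follows the header (Python's gzip writes no FEXTRA field)
        # and is NUL-terminated.
        name = bytearray()
        while True:
            b = f.read(1)
            if b in (b"", b"\x00"):
                break
            name.extend(b)
        return name.decode("latin-1")

print(stored_filename("test.txt.gz"))    # expected: test.txt
print(stored_filename("test.txt.gzip"))  # expected: test.txt.gzip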
The filename inside the archive can be controlled by using the gzip.GzipFile constructor instead of gzip.open. gzip.GzipFile then needs a separate open() call before it.
with open(output_path, 'wb') as f_out_gz:
    with gzip.GzipFile(fileobj=f_out_gz, filename=input_path, mode='wb') as f_out:
        ...
        f_out.flush()
Note also the added f_out.flush() - in my experience, without this line GzipFile may in some cases not flush the data before the file is closed, resulting in a corrupt archive.
Or as a complete example:
import gzip
import shutil

input_path = "test.txt"
output_path = input_path + ".gz"

with open(input_path, 'w') as file:
    file.write("abc" * 10)

with open(output_path, 'wb') as f_out_gz:
    with gzip.GzipFile(fileobj=f_out_gz, filename=input_path, mode='wb') as f_out:
        with open(input_path, 'rb') as f_in:
            shutil.copyfileobj(f_in, f_out)
        f_out.flush()

How do I automatically handle decompression when reading a file in Python?

I am writing some Python code that loops through a number of files and processes the first few hundred lines of each file. I would like to extend this code so that if any of the files in the list are compressed, it will automatically decompress while reading them, so that my code always receives the decompressed lines. Essentially my code currently looks like:
for f in files:
    handle = open(f)
    process_file_contents(handle)
Is there any function that can replace open in the above code so that if f is either plain text or gzip-compressed text (or bzip2, etc.), the function will always return a file handle to the decompressed contents of the file? (No seeking required, just sequential access.)
I had the same problem: I'd like my code to accept filenames and return a filehandle to be used with with, handling decompression automatically, etc.
In my case, I'm willing to trust the filename extensions and I only need to deal with gzip and maybe bzip files.
import gzip
import bz2

def open_by_suffix(filename):
    if filename.endswith('.gz'):
        return gzip.open(filename, 'rb')
    elif filename.endswith('.bz2'):
        return bz2.BZ2File(filename, 'r')
    else:
        return open(filename, 'r')
If we don't trust the filenames, we can compare the initial bytes of the file for magic strings (modified from https://stackoverflow.com/a/13044946/117714):
import gzip
import bz2

magic_dict = {
    b"\x1f\x8b\x08": (gzip.open, 'rb'),
    b"\x42\x5a\x68": (bz2.BZ2File, 'r'),
}

max_len = max(len(x) for x in magic_dict)

def open_by_magic(filename):
    # read the first few bytes in binary mode and compare against known magic numbers
    with open(filename, 'rb') as f:
        file_start = f.read(max_len)
    for magic, (fn, flag) in magic_dict.items():
        if file_start.startswith(magic):
            return fn(filename, flag)
    return open(filename, 'r')
Usage:
# cat
for filename in filenames:
    with open_by_suffix(filename) as f:
        for line in f:
            print(line)
Your use-case would look like:
for f in files:
    with open_by_suffix(f) as handle:
        process_file_contents(handle)

Replace and overwrite instead of appending

I have the following code:
import re
#open the xml file for reading:
file = open('path/test.xml','r+')
#convert to string:
data = file.read()
file.write(re.sub(r"<string>ABC</string>(\s+)<string>(.*)</string>",r"<xyz>ABC</xyz>\1<xyz>\2</xyz>",data))
file.close()
where I'd like to replace the old content in the file with the new content. However, when I execute my code, the file "test.xml" is appended to, i.e. I have the old content followed by the new "replaced" content. What can I do in order to delete the old stuff and only keep the new?
You need to seek to the beginning of the file before writing and then use file.truncate() if you want to do an in-place replace:
import re

myfile = "path/test.xml"

with open(myfile, "r+") as f:
    data = f.read()
    f.seek(0)
    f.write(re.sub(r"<string>ABC</string>(\s+)<string>(.*)</string>", r"<xyz>ABC</xyz>\1<xyz>\2</xyz>", data))
    f.truncate()
The other way is to read the file then open it again with open(myfile, 'w'):
with open(myfile, "r") as f:
    data = f.read()

with open(myfile, "w") as f:
    f.write(re.sub(r"<string>ABC</string>(\s+)<string>(.*)</string>", r"<xyz>ABC</xyz>\1<xyz>\2</xyz>", data))
Neither truncate nor open(..., 'w') will change the inode number of the file (I tested twice, once with Ubuntu 12.04 NFS and once with ext4).
By the way, this is not really related to Python. The interpreter calls the corresponding low level API. The method truncate() works the same in the C programming language: See http://man7.org/linux/man-pages/man2/truncate.2.html
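A quick way to verify this yourself is to compare os.stat().st_ino before and after the rewrite; here is a minimal sketch, using a throwaway file path that is not from the original question:
import os

path = "/tmp/inode_test.txt"  # throwaway test file

with open(path, "w") as f:
    f.write("old content")
inode_before = os.stat(path).st_ino

# Rewriting with open(..., 'w') truncates the same file in place.
with open(path, "w") as f:
    f.write("new content")
inode_after = os.stat(path).st_ino

print(inode_before == inode_after)  # expected: True on typical filesystems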
file = 'path/test.xml'

with open(file, 'w') as filetowrite:
    filetowrite.write('new content')
Open the file in 'w' mode; you will be able to replace its current text and save the file with the new contents.
Using truncate(), the solution could be
import re

#open the xml file for reading:
with open('path/test.xml', 'r+') as f:
    #convert to string:
    data = f.read()
    f.seek(0)
    f.write(re.sub(r"<string>ABC</string>(\s+)<string>(.*)</string>", r"<xyz>ABC</xyz>\1<xyz>\2</xyz>", data))
    f.truncate()
import os  # must import this library

if os.path.exists('TwitterDB.csv'):
    os.remove('TwitterDB.csv')  # this deletes the file
else:
    print("The file does not exist")  # add this to prevent errors
I had a similar problem, and instead of overwriting my existing file using the different 'modes', I just deleted the file before using it again, so that it would be as if I was appending to a new file on each run of my code.
See How to Replace String in File; it works in a simple way and is an answer that uses replace:
fin = open("data.txt", "rt")
fout = open("out.txt", "wt")

for line in fin:
    fout.write(line.replace('pyton', 'python'))

fin.close()
fout.close()
In my case, the following code did the trick:
with open("output.json", "w+") as outfile: #using w+ mode to create file if it not exists. and overwrite the existing content
json.dump(result_plot, outfile)
Using the Python 3 pathlib library:
import re
from pathlib import Path
import shutil
shutil.copy2("/tmp/test.xml", "/tmp/test.xml.bak") # create backup
filepath = Path("/tmp/test.xml")
content = filepath.read_text()
filepath.write_text(re.sub(r"<string>ABC</string>(\s+)<string>(.*)</string>",r"<xyz>ABC</xyz>\1<xyz>\2</xyz>", content))
Similar method using a different approach to backups:
import re
from pathlib import Path

filepath = Path("/tmp/test.xml")
backup_path = filepath.with_suffix('.bak')
filepath.rename(backup_path)  # different approach to backups
# read from the backup, since the original path no longer exists after the rename
content = backup_path.read_text()
filepath.write_text(re.sub(r"<string>ABC</string>(\s+)<string>(.*)</string>", r"<xyz>ABC</xyz>\1<xyz>\2</xyz>", content))
