divide a disk image into smaller parts using Python - python

I would like to write a program that takes a .dmg file that is 1.6 GB and split it into 100 MB chunks.
I would like to also write another program that later can put everything back together so that it can be mounted and used.
I am very new to Python (and any type of programming language in general) and cannot find anything on here about this specific thing. Let me know if I am using incorrect terminology too so that I can learn how to search more effectively.
Thanks!

Try this example:
split.py
import sys, os
kilobytes = 1024
megabytes = kilobytes * 1000
chunksize = int(1.4 * megabytes)
def split(fromfile, todir, chunksize=chunksize):
if not os.path.exists(todir):
os.mkdir(todir)
else:
for fname in os.listdir(todir):
os.remove(os.path.join(todir, fname))
partnum = 0
input = open(fromfile, 'rb')
while 1:
chunk = input.read(chunksize)
if not chunk: break
partnum = partnum+1
filename = os.path.join(todir, ('part%04d' % partnum))
fileobj = open(filename, 'wb')
fileobj.write(chunk)
fileobj.close()
input.close( )
assert partnum <= 9999
return partnum
if __name__ == '__main__':
try:
parts = split('/Users/example/Desktop/SO/st/example.mp4', '/Users/example/Desktop/SO/st/new', 2000000) # 100000000 == 100 mb
except:
print('Error during split')
for join:
join.py
import os, sys
readsize = 1024
def join(fromdir, tofile):
output = open(tofile, 'wb')
parts = os.listdir(fromdir)
parts.sort( )
for filename in parts:
filepath = os.path.join(fromdir, filename)
fileobj = open(filepath, 'rb')
while 1:
filebytes = fileobj.read(readsize)
if not filebytes: break
output.write(filebytes)
fileobj.close( )
output.close( )
if __name__ == '__main__':
try:
join('/Users/example/Desktop/SO/st/new', 'example_join.mp4')
except:
print('Error joining files:')
else:
print('Join complete!')

Related

Merge 2 PDF's in 16 page segments

I have 2 PDF's resultant from splitting a 2-up document composed by 32 pages signatures. Meaning one PDF has pages 1-16, 33-48, 65-80.... and the other has pages 17-32, 49-64, 81-96....
How can I merge both, iterating through 16-page segments of each, using Python? To get a final composed PDF with 1-16, 17-32, 33-48, 49-64.....
I can iterate them page by page and I can combine one full PDF after the other, etc. But can't seem to get the correct way merging by segments.
The first operations are done with external software (Xerox Freeflow Core) and I get to a point where I have 4 files with the 16-page sequences divided in even/odd pages, I join them iterating with:
import itertools as itt
import sys
import PyPDF2 as PDF
def main():
fbase = sys.argv[1]
pdf_out = PDF.PdfFileWriter()
with open(fbase + "_odd.pdf", 'rb') as f_odd:
with open(fbase + "_even.pdf", 'rb') as f_even:
pdf_odd = PDF.PdfFileReader(f_odd)
pdf_even = PDF.PdfFileReader(f_even)
for p in itt.chain.from_iterable(
itt.zip_longest(
pdf_odd.pages,
(pdf_even.pages),
)
):
if p:
pdf_out.addPage(p)
with open(fbase + ".pdf", 'wb') as f_out:
pdf_out.write(f_out)
return 0
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Wrong number of arguments!")
sys.exit(1)
sys.exit(main())
Afterwards I get the 2 files mentioned above. The code above would work for me if I could iterate through 16p segments instead of page by page.
Any clues, please?
Thanks
Got it!
Just in case anyone needs something similar, here's what worked for me:
import sys
import PyPDF2 as PDF
def main():
fbase = sys.argv[1]
all_pages = []
with open(fbase + "_odd.pdf", 'rb') as f_odd:
with open(fbase + "_even.pdf", 'rb') as f_even:
pdf_odd = PDF.PdfFileReader(f_odd)
pdf_even = PDF.PdfFileReader(f_even)
size_odd = len(pdf_odd.pages)
size_even = len(pdf_even.pages)
slice_idx = list(range(0,size_odd,16))
zip_pdfs = list(zip(pdf_odd.pages, pdf_even.pages))
for slice16_odd, slice16_even in [(pdf_odd.pages[el:el+16],
pdf_even.pages[el:el+16])
for el in slice_idx]:
all_pages.extend(slice16_odd)
all_pages.extend(slice16_even)
if size_odd > slice_idx[-1]:
all_pages.extend(slice16_odd[slice_idx[-1]:])
if size_even > slice_idx[-1]:
all_pages.extend(slice16_even[slice_idx[-1]:])
if any(all_pages):
pdf_out = PDF.PdfFileWriter()
for page in all_pages:
pdf_out.addPage(page)
with open(fbase + ".pdf", 'wb') as f_out:
pdf_out.write(f_out)
return 0
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Wrong number of arguments!")
sys.exit(1)
sys.exit(main())
Thanks anyway...
BR

Empty chunks when spliting a large file

I am trying to split a large files into 50Mb chunks and save them in another files. After running some read/write operations, some of my chunks were smaller than 50Mb (43Mb,17Mb and so on). Although, I wrote the same code in Java and It has the same problem. What is wrong? my codes are following bellow:
By the way, What we can do to speed up this code to split into chunks faster?
try:
f = open(self.__filename, 'rb')
except (OSError, IOError), e:
raise FileSplitterException, str(e)
bname = (os.path.split(self.__filename))[1]
fsize = os.path.getsize(self.__filename)
self.__chunksize = int(float(fsize)/float(self.__numchunks))
chunksz = self.__chunksize
total_bytes = 0
for x in range(self.__numchunks):
chunkfilename = bname + '-' + str(x+1) + self.__postfix
if x == self.__numchunks - 1:
chunksz = fsize - total_bytes
try:
print 'Writing file',chunkfilename
data = f.read(chunksz)
total_bytes += len(data)
chunkf = file(chunkfilename, 'wb')
chunkf.write(data)
chunkf.close()
except (OSError, IOError), e:
print e
continue
except EOFError, e:
print e
break
The code in the question seems to be focussed on producing a set number of chunks rather than files of 50MB in size.
This code produces 50MB files.
import os
try:
f = open('big.txt', 'rb')
except (OSError, IOError), e:
raise FileSplitterException, str(e)
bname = (os.path.split('big.txt'))[1]
chunksz = 50 * 1000 * 1000 # metric MB - use 1024 * 1024 for binary MB (MiB)
counter = 0
while True:
chunkfilename = bname + '-' + str(counter+1) + '.foo'
try:
print 'Writing file',chunkfilename
data = f.read(chunksz)
if not data:
# We have reached the end of the file, end the script.
break
chunkf = file(chunkfilename, 'wb')
chunkf.write(data)
chunkf.close()
except (OSError, IOError), e:
print e
continue
except EOFError, e:
print e
break
counter += 1
Some aspects of the code are considered poor style in modern python - for example not using a context manager to open files - but I haven't changed these in case the OP is on an old python like 2.5.
Your question is unclear because you haven't included a Minimal, Complete, and Verifiable example—so I don't know exactly what's wrong with your code. However after creating / simulating my guess as to the missing parts, I was able to come up with something that does exactly what you want, I think.
import os
class FileSplitterException(Exception): pass
class FileSplitter(object):
def __init__(self, filename, chunksize):
if not os.path.isfile(filename):
raise FileSplitterException(
"File: {!r} does not exist".format(filename))
self._filename = filename
self._postfix = 'chunk'
self._chunksize = chunksize
def split(self):
bname = os.path.splitext(self._filename)[0]
fsize = os.path.getsize(self._filename)
chunks, partial = divmod(fsize, self._chunksize)
if partial:
chunks += 1
with open(self._filename, 'rb') as infile:
for i in range(chunks):
chunk_filename = os.path.join('{}-{}.{}'.format(
bname, i, self._postfix))
with open(chunk_filename, 'wb') as outfile:
data = infile.read(self._chunksize)
if data:
outfile.write(data)
else:
FileSplitterException('unexpected EOF encountered')
if __name__ == '__main__':
import glob
filename = 'big_file.txt'
chunksize = 1 * 1024 * 1024 # 1 Mb
print('splitting {} into {:,} sized chunks'.format(filename, chunksize))
fs = FileSplitter(filename, chunksize)
fs.split()
print('chunk files written:')
bname = os.path.splitext(filename)[0]
for chunkname in sorted(glob.glob(bname + '-*.' + fs._postfix)):
fsize = os.path.getsize(chunkname)
print(' {}: size: {:,}'.format(chunkname, fsize))

pytee can not produce proper output in python3

I have a piece of code which runs well in Python 2.7.5 but doesn't work with Python 3.
The major problem is tee.write, which can not write to the file.
This piece of code suppose to write 20 letters a into the file /tmp/tee-test-1 and /tmp/tee-test-2 but it does not, the two files are empty…
Could any one give me some advice?
import sys
import os
import subprocess
#from netsa.util.shell import *
from string import Template
__author__ = 'Brandon Sandrowicz <brandon#sandrowicz.org>'
__version__ = '0.1'
valid_modes = ['a','w']
def create_tee(files, mode, buffer_size=128):
if mode not in valid_modes:
raise IOError("Only valid modes to create_tee() are: %s" % ', '.join(valid_modes))
tee_list = []
for file in files:
if type(file) == str:
fp = open(file, mode)
tee_list.append(fp)
else:
tee_list.append(file)
pipe_read, pipe_write = os.pipe()
pid = os.fork()
if pid == 0:
# Child -- Read bytes from the pipe and write them to the specified
# files.
try:
# Close parent's end of the pipe
os.close(pipe_write)
bytes = os.read(pipe_read, buffer_size)
print (bytes)
while(bytes):
for file in tee_list:
file.write(bytes)
file.flush()
# TODO maybe add in fsync() here if the fileno() method
# exists on file
bytes = os.read(pipe_read, buffer_size)
except:
pass
finally:
os._exit(255)
else:
# Parent -- Return a file object wrapper around the pipe to the
# child.
return os.fdopen(pipe_write,'w')
if __name__ == '__main__':
files = [ '/tmp/tee-test-1', '/tmp/tee-test-2' ]
num_chars = 100000
print("Writing %d chars to files (using create_tee):" % num_chars)
for file in files:
print(" %s" % file)
print()
tee = create_tee(files,mode='a')
#print("a" * num_chars, end=' ', file=tee)
tee.write("a" * 20)
tee.close()
os.wait()
for filename in files:
with open(filename, 'r') as fh:
chars = len(fh.read())
print("File '%s' has %d chars" % (filename, chars))
ok, I found that problem interesting and challenging, and finally found out what's wrong, it's said in that document:
One common problem is that the file is opened in the wrong mode. Make sure you open text files with the 't' flag and binary files with the 'b' flag and you have solved many problems.
so as you're writing data as b"" datatype, I tried the following:
for file in files:
if type(file) == str:
fp = open(file, mode+'b')
tee_list.append(fp)
else:
tee_list.append(file)
and it works well:
File '/tmp/tee-test-1' has 20 chars
File '/tmp/tee-test-2' has 20 chars

Python Filling Up Disk

I need to setup some test conditions to simulate a filled up disk. I created the following to simply write garbage to the disk:
#!/usr/bin/python
import os
import sys
import mmap
def freespace(p):
"""
Returns the number of free bytes on the drive that ``p`` is on
"""
s = os.statvfs(p)
return s.f_bsize * s.f_bavail
if __name__ == '__main__':
drive_path = sys.argv[1]
output_path = sys.argv[2]
output_file = open(output_path, 'w')
while freespace(drive_path) > 0:
output_file.write("!")
print freespace(drive_path)
output_file.flush()
output_file.close()
As far as I can tell by looking at the return value from freespace, the write method does not write the file to until it is closed, thereby making the while condition invalid.
Is there a way I can write the data directly to the file? Or another solution perhaps?
This is untested but I imagine something along these lines will be the quickest way to fill the disk easily
import sys
import errno
write_str = "!"*1024*1024*5 # 5MB
output_path = sys.argv[1]
with open(output_path, "w") as f:
while True:
try:
f.write(write_str)
f.flush()
except IOError as err:
if err.errno == errno.ENOSPC:
write_str_len = len(write_str)
if write_str_len > 1:
write_str = write_str[:write_str_len/2]
else:
break
else:
raise
You could try/catch a disk full exception on write.

How to download large file with binary mode in python?

I am code a download function in python. The file size >1GB. The server is linux, HTTP server is Karrigell. Client is browse, Firefox or IE. I meet a big trouble.
At first, I use sys.stdout() to send file content.
file = open(path, 'rb')
size = os.path.getsize(path)
RESPONSE['Pragma'] = 'public'
RESPONSE['Expires'] = '0'
RESPONSE['Cache-Control'] = 'must-revalidate, pre-check=0'
RESPONSE['Content-Disposition'] = 'attachment; filename="' + os.path.basename(path) + '"'
RESPONSE['Content-type'] = "application/octet-stream"
RESPONSE['Content-Transfer-Encoding'] = 'binary'
RESPONSE['Content-length'] = str(os.path.getsize(path))
sys.stdout.flush()
chunk_size = 10000
handle = open(path, "rb")
while True:
buffer = handle.read(chunk_size)
if buffer:
STDOUT(buffer)
else:
break
sys.stdout.flush()
The problem is the server out of memory! I know, stdout write content to memory first, then memory send to socket.
So, I modify the function. Send content to socket directly. I use the py-sendfile module. http://code.google.com/p/py-sendfile/
file = open(path, 'rb')
size = os.path.getsize(path)
sock = REQUEST_HANDLER.sock
sock.sendall("""HTTP/1.1 200 OK\r\nPragma: no-cache\r\nExpires: 0\r\nCache-Control: no-cache, no-store\r\nContent-Disposition: attachment; filename="%s"\r\nContent-Type: application/octet-stream\r\nContent-Length: %u\r\nContent-Range: bytes 0-4096/%u\r\nLocation: "%s"\r\n\r\n""" % (os.path.basename(path), size, size, os.path.basename(path)))
offset = 0
nbytes = 4096
while 1:
try:
sent = sendfile.sendfile(sock.fileno(), file.fileno(), offset, nbytes)
except OSError, err:
if err.errno in (errno.EAGAIN, errno.EBUSY): # retry
continue
raise
else:
if sent == 0:
break # done
offset += sent
This time, the server memory is OK, but browse die! The browse memory rise quickly! Not free
until the socket accept whole file content.
I don't know how to deal with these problems. I think the second idea is right, send content to socket directly. But why browse can't free memory while accept data?
You should try to download the file in chunks. This is an example that works for me using urllib2
import os
import urllib2
import math
def downloadChunks(url):
"""Helper to download large files
the only arg is a url
this file will go to a temp directory
the file will also be downloaded
in chunks and print out how much remains
"""
baseFile = os.path.basename(url)
#move the file to a more uniq path
os.umask(0002)
temp_path = "/tmp/"
try:
file = os.path.join(temp_path,baseFile)
req = urllib2.urlopen(url)
total_size = int(req.info().getheader('Content-Length').strip())
downloaded = 0
CHUNK = 256 * 10240
with open(file, 'wb') as fp:
while True:
chunk = req.read(CHUNK)
downloaded += len(chunk)
print math.floor( (downloaded / total_size) * 100 )
if not chunk: break
fp.write(chunk)
except urllib2.HTTPError, e:
print "HTTP Error:",e.code , url
return False
except urllib2.URLError, e:
print "URL Error:",e.reason , url
return False
return file

Categories

Resources