I have a piece of code which runs well in Python 2.7.5 but doesn't work with Python 3.
The major problem is tee.write, which can not write to the file.
This piece of code is supposed to write 20 letter 'a' characters into the files /tmp/tee-test-1 and /tmp/tee-test-2, but it does not — the two files are empty…
Could anyone give me some advice?
import sys
import os
import subprocess
#from netsa.util.shell import *
from string import Template
__author__ = 'Brandon Sandrowicz <brandon#sandrowicz.org>'
__version__ = '0.1'
valid_modes = ['a', 'w']

def create_tee(files, mode, buffer_size=128):
    """Fork a child that tees everything written to the returned file
    object into each entry of *files*.

    :param files: list of path strings and/or already-open file objects.
    :param mode: 'a' or 'w' -- how path entries are opened.
    :param buffer_size: pipe read size (bytes) used by the child.
    :return: a text-mode file object wrapping the write end of the pipe.
    :raises IOError: if mode is not one of valid_modes.
    """
    if mode not in valid_modes:
        raise IOError("Only valid modes to create_tee() are: %s" % ', '.join(valid_modes))
    tee_list = []
    for f in files:
        if isinstance(f, str):
            # Open paths in *binary* mode: the child receives raw bytes
            # from os.read(), and writing bytes to a text-mode file fails
            # on Python 3 -- this is why the tee files stayed empty.
            tee_list.append(open(f, mode + 'b'))
        else:
            tee_list.append(f)
    pipe_read, pipe_write = os.pipe()
    pid = os.fork()
    if pid == 0:
        # Child -- copy bytes from the pipe into every tee target.
        try:
            # Close parent's end of the pipe so we see EOF when it closes.
            os.close(pipe_write)
            data = os.read(pipe_read, buffer_size)
            while data:
                for f in tee_list:
                    f.write(data)
                    f.flush()
                # TODO maybe add in fsync() here if the fileno() method
                # exists on file
                data = os.read(pipe_read, buffer_size)
        except Exception:
            # Best-effort child: swallow errors so it always exits below.
            pass
        finally:
            os._exit(255)
    else:
        # Parent -- text-mode wrapper so callers can write str; the
        # encoded bytes cross the pipe to the child.
        return os.fdopen(pipe_write, 'w')
if __name__ == '__main__':
    # Demo: tee 20 characters into two files, then report their sizes.
    files = ['/tmp/tee-test-1', '/tmp/tee-test-2']
    num_chars = 100000
    print("Writing %d chars to files (using create_tee):" % num_chars)
    for path in files:
        print(" %s" % path)
    print()
    tee = create_tee(files, mode='a')
    # alternative: print("a" * num_chars, end=' ', file=tee)
    tee.write("a" * 20)
    tee.close()
    os.wait()  # reap the tee child before inspecting the output files
    for path in files:
        with open(path, 'r') as handle:
            size = len(handle.read())
        print("File '%s' has %d chars" % (path, size))
OK, I found that problem interesting and challenging, and I finally figured out what was wrong; it is stated in the documentation:
One common problem is that the file is opened in the wrong mode. Make sure you open text files with the 't' flag and binary files with the 'b' flag and you have solved many problems.
so as you're writing data as b"" datatype, I tried the following:
# Fix for Python 3: open the tee targets in *binary* mode (mode + 'b'),
# because the child process writes the raw bytes it gets from os.read().
for file in files:
if type(file) == str:
# 'b' added to the caller-supplied 'a'/'w' mode
fp = open(file, mode+'b')
tee_list.append(fp)
else:
# already a file object -- used as-is
tee_list.append(file)
and it works well:
File '/tmp/tee-test-1' has 20 chars
File '/tmp/tee-test-2' has 20 chars
Related
I would like to write a program that takes a .dmg file that is 1.6 GB and split it into 100 MB chunks.
I would like to also write another program that later can put everything back together so that it can be mounted and used.
I am very new to Python (and any type of programming language in general) and cannot find anything on here about this specific thing. Let me know if I am using incorrect terminology too so that I can learn how to search more effectively.
Thanks!
Try this example:
split.py
import sys, os
kilobytes = 1024
megabytes = kilobytes * 1000
chunksize = int(1.4 * megabytes)  # default: roughly one floppy-sized chunk

def split(fromfile, todir, chunksize=chunksize):
    """Split *fromfile* into numbered chunk files under *todir*.

    Creates *todir* if needed, otherwise empties it first so stale parts
    never mix with the new ones.

    :param fromfile: path of the file to split.
    :param todir: output directory for the partNNNN files.
    :param chunksize: maximum number of bytes per chunk.
    :return: the number of part files written.
    """
    if not os.path.exists(todir):
        os.mkdir(todir)
    else:
        for fname in os.listdir(todir):
            os.remove(os.path.join(todir, fname))
    partnum = 0
    # 'with' closes the handles even on error (the original leaked them),
    # and avoids shadowing the builtin 'input'.
    with open(fromfile, 'rb') as infile:
        while True:
            chunk = infile.read(chunksize)
            if not chunk:
                break
            partnum += 1
            filename = os.path.join(todir, ('part%04d' % partnum))
            with open(filename, 'wb') as fileobj:
                fileobj.write(chunk)
    # join() relies on lexical sort, so part numbers must stay 4 digits.
    assert partnum <= 9999
    return partnum
if __name__ == '__main__':
    # Script entry point: split the sample file into ~2 MB parts.
    try:
        parts = split('/Users/example/Desktop/SO/st/example.mp4', '/Users/example/Desktop/SO/st/new', 2000000) # 100000000 == 100 mb
    except Exception as exc:
        # Report the actual failure instead of swallowing it silently.
        print('Error during split: %s' % exc)
for join:
join.py
import os, sys
readsize = 1024  # bytes copied per read while re-assembling

def join(fromdir, tofile):
    """Concatenate every file in *fromdir* (lexical order) into *tofile*.

    Part files written by split() are named part0001, part0002, ... so a
    plain sort restores the original order.

    :param fromdir: directory containing the part files.
    :param tofile: path of the re-assembled output file.
    """
    # 'with' guarantees the handles close even if a read/write fails
    # (the original leaked them on error).
    with open(tofile, 'wb') as output:
        parts = sorted(os.listdir(fromdir))
        for filename in parts:
            filepath = os.path.join(fromdir, filename)
            with open(filepath, 'rb') as fileobj:
                while True:
                    filebytes = fileobj.read(readsize)
                    if not filebytes:
                        break
                    output.write(filebytes)
if __name__ == '__main__':
    # Script entry point: re-assemble the parts written by split.py.
    try:
        join('/Users/example/Desktop/SO/st/new', 'example_join.mp4')
    except Exception as exc:
        print('Error joining files:')
        print(exc)  # surface the cause instead of hiding it
    else:
        print('Join complete!')
I have a folder with huge text files. Each one is gzipped and weighs several Giga byte.
I wrote a piece of code to split the content of each gzip file: each gzip file is open with gzip, then every specified chunk of line is read and written to a new gzip file.
Here is the code, in file file_compression.py:
import sys, os, file_manipulation as fm
import gzip
def splitGzipFile(fileName, dest=None, chunkPerSplit=100, linePerChunk=4, file_field_separator="_", zfill=3
, verbose=False, file_permission=None, execute=True):
"""
Splits a gz file into chunk files.
:param fileName:
:param chunkPerSplit:
:param linePerChunk:
:return:
"""
absPath = os.path.abspath(fileName)
baseName = os.path.basename(absPath)
dirName = os.path.dirname(absPath)
destFolder = dirName if dest is None else dest
## Compute file fields
rawBaseName, extensions = baseName.split(os.extsep, 1)
if not str(extensions).startswith("."):
extensions = "." + extensions
file_fields = str(rawBaseName).split(file_field_separator)
first_fields = file_fields[:-1] if file_fields.__len__() > 1 else file_fields
first_file_part = file_field_separator.join(first_fields)
last_file_field = file_fields[-1] if file_fields.__len__() > 1 else ""
current_chunk = getCurrentChunkNumber(last_file_field)
if current_chunk is None or current_chunk < 0:
first_file_part = rawBaseName
## Initialize chunk variables
linePerSplit = chunkPerSplit * linePerChunk
# chunkCounter = 0
chunkCounter = 0 if current_chunk is None else current_chunk-1
for chunk in getFileChunks(fileName, linePerSplit):
print "writing " + str(str(chunk).__len__()) + " ..."
chunkCounter += 1
oFile = fm.buildPath(destFolder) + first_file_part + file_field_separator + str(chunkCounter).zfill(zfill) + extensions
if execute:
writeGzipFile(oFile, chunk, file_permission)
if verbose:
print "Splitting: created file ", oFile
def getCurrentChunkNumber(chunk_field):
    """
    Tries to guess an integer from a string.

    :param chunk_field: candidate chunk-number string.
    :return: the parsed integer, or None when it is not a number.
    """
    try:
        number = int(chunk_field)
    except ValueError:
        return None
    return number
def getFileChunks(fileName, linePerSplit):
with gzip.open(fileName, 'rb') as f:
print "gzip open"
lineCounter = 0
currentChunk = ""
for line in f:
currentChunk += line
lineCounter += 1
if lineCounter >= linePerSplit:
yield currentChunk
currentChunk = ""
lineCounter = 0
if not currentChunk == '':
yield currentChunk
def writeGzipFile(file_name, content, file_permission=None):
    """Write *content* to *file_name* as gzip, optionally chmod-ing it.

    An empty content still creates the (empty) gzip file.

    :param file_name: output path.
    :param content: data to compress; the write is skipped when it equals ''.
    :param file_permission: optional integer mode passed to os.chmod.
    """
    import gzip  # local import kept so this helper stays self-contained
    with gzip.open(file_name, 'wb') as f:
        if not content == '':
            f.write(content)
    # isinstance is the idiomatic type check (was: type(...) == int)
    if file_permission is not None and isinstance(file_permission, int):
        os.chmod(file_name, file_permission)
This task is multiprocess: a process is created for each file before it is split. Each file is opened and split only once before being erased; I made sure of that by recording the processed files in a list:
from tools.file_utils import file_compression as fc, file_manipulation as fm
import multiprocessing
from multiprocessing import Process, Queue, Manager
manager = Manager()
split_seen = manager.list()  # shared across workers: files already split

files = [...] # list is full of gzip files.
processList = []
sampleDir = "sample/dir/"

# NOTE(review): processFile must already be defined when this loop runs,
# otherwise Process(target=processFile) raises NameError.
for file in files:
    # was 'fielPath = ...' -- the typo left filePath undefined below
    filePath = sampleDir + str(file)
    p = Process(target=processFile, args=(filePath, sampleDir, True))
    p.start()
    processList.append(p)

## Join the processes
for p in processList:
    p.join()
def processFile(filePath, destFolder, verbose=True):
global split_seen
if filePath in split_seen:
print "Duplicate file processed: " + str(filePath)
time.sleep(3)
print "adding", filePath, split_seen.__len__()
split_seen.append(filePath)
fc.splitGzipFile(filePath, dest=destFolder, chunkPerSplit=4000000\
, linePerChunk=4
, verbose=True
, file_permission=0770
, zfill=3
)
os.remove(filePath)
So far the code has always run fine. But today I had an issue with gzip files' CRC corruption:
Process Process-3:72:
Traceback (most recent call last):
...
File "/.../tools/file_utils/file_compression.py", line 43, in splitGzipFile
for chunk in getFileChunks(fileName, linePerSplit):
File "/.../tools/file_utils/file_compression.py", line 70, in getFileChunks
for line in f:
File "/.../python2.7/lib/python2.7/gzip.py", line 450, in readline
c = self.read(readsize)
File "/.../python2.7/lib/python2.7/gzip.py", line 256, in read
self._read(readsize)
File "/.../python2.7/lib/python2.7/gzip.py", line 320, in _read
self._read_eof()
File "/.../python2.7/lib/python2.7/gzip.py", line 342, in _read_eof
hex(self.crc)))
IOError: CRC check failed 0xddbb6045 != 0x34fd5580L
What could be the origins for this issue? I have to state again that so far it has
always worked, folders and files are always of the same structure. The difference in this instance perhaps is that my script is processing more gzip files than usual, maybe twice as much.
Could it be a matter of the same files being accessed at the same time? But that I seriously doubt, I made sure it is not the case by registering each file accessed in my split_seen list.
I would take any hint, as I have no more clues to where to look.
EDIT 1
Maybe some open files were accessed by someone else, or by another program? I cannot ask for and rely on testimonials. So as a start, if I were to use a multiprocessing.Lock, would it prevent any other thread, process, program, user, etc. from modifying the file? Or is it limited to Python only? I cannot find any documentation on that.
I got the exact same error on code that has been running for months. Turns out that the file source was corrupted for that particular file. I went back to an old file and it worked fine and I used a newer file and it also worked fine.
I had the same issue. I just deleted the old file re-ran the code.
rm -rf /tmp/imagenet/
HTH
I have the following script:
import sys, os
import re  # was missing: re.match below raised NameError

pid = sys.argv[1]

maps_file = open("/proc/%s/maps" % pid, 'r')
# Binary mode: /proc/<pid>/mem holds raw bytes, not decodable text.
mem_file = open("/proc/%s/mem" % pid, 'rb')
# Open the dump once, before the loop: re-opening with 'wb' for every
# region truncated the file so only the last region ever survived.
mem_dump = open(pid + ".bin", "wb")
for line in maps_file.readlines():  # for each mapped region
    m = re.match(r'([0-9A-Fa-f]+)-([0-9A-Fa-f]+) ([-r])', line)
    if m and m.group(3) == 'r':  # readable region (guard non-matching lines)
        start = int(m.group(1), 16)
        end = int(m.group(2), 16)
        try:
            mem_file.seek(start)  # seek to region start
            chunk = mem_file.read(end - start)  # read region contents
            mem_dump.write(chunk)
        except (IOError, OSError):
            # Some regions (e.g. vsyscall) cannot be read via /proc/<pid>/mem.
            pass
mem_dump.close()
maps_file.close()
mem_file.close()
All works well (dumping the process's memory) so far, but I can't save the data to a file. What am I doing wrong?
Could it be that the files are getting written to somewhere you don't expect (looks like they will be written to the current directory)?
I need to setup some test conditions to simulate a filled up disk. I created the following to simply write garbage to the disk:
#!/usr/bin/python
import os
import sys
import mmap
def freespace(p):
    """
    Returns the number of free bytes on the drive that ``p`` is on
    """
    stats = os.statvfs(p)
    # free bytes = block size * blocks available to non-root users
    return stats.f_bavail * stats.f_bsize
# Fill the drive at drive_path by writing '!' bytes to output_path
# until freespace() reports zero.
if __name__ == '__main__':
drive_path = sys.argv[1]
output_path = sys.argv[2]
output_file = open(output_path, 'w')
# NOTE(review): one byte per iteration plus a statvfs call is very slow,
# and freespace() lags until buffered data is flushed to disk.
while freespace(drive_path) > 0:
output_file.write("!")
print freespace(drive_path)
output_file.flush()
output_file.close()
As far as I can tell by looking at the return value from freespace, the write method does not actually write to the file until it is closed, which invalidates the while condition.
Is there a way I can write the data directly to the file? Or is there another solution?
This is untested but I imagine something along these lines will be the quickest way to fill the disk easily
import sys
import errno

# Fill the disk fast: write big blocks and halve the block size whenever
# ENOSPC is hit, until even a single byte no longer fits.
write_str = "!"*1024*1024*5 # 5MB
output_path = sys.argv[1]
with open(output_path, "w") as f:
    while True:
        try:
            f.write(write_str)
            f.flush()
        except IOError as err:
            if err.errno == errno.ENOSPC:
                write_str_len = len(write_str)
                if write_str_len > 1:
                    # Integer division: '/' yields a float on Python 3
                    # and would break the slice below.
                    write_str = write_str[:write_str_len // 2]
                else:
                    break
            else:
                raise
You could try/catch a disk full exception on write.
When I use QFTP's put command to upload a file it only uploads around 40 bytes of the specified file. I'm catching the dataProgress signal and I'm getting the progress but the total size of the file is only read to be around 40 bytes. Is there anything wrong with my code, or is it a problem on the FTP server's side?
Here is my upload function:
# Upload the user-picked local file to the FTP location selected in the tree.
def upload(self):
filename = QFileDialog.getOpenFileName(self, 'Upload File', '.')
# NOTE(review): QIODevice is an abstract base class -- constructing it
# from a path string does not open or read the file, which is likely why
# only ~40 bytes are uploaded; an opened QFile (or its bytes) is needed.
fname = QIODevice(filename[0])
dataname = filename[0]
# remote name = local base name
data = os.path.basename(dataname)
#data = data[data.find("/") + 1:]
print data
print fname
# Pick the remote destination from the current tree selection.
if not self.fileTree.currentItem():
self.qftp.put(fname, data)
elif "." in self.fileTree.currentItem().text(0):
self.qftp.put(fname, self.fileTree.currentItem().parent().text(0) + data)
elif self.fileTree.currentItem().text(0) == "/":
self.qftp.put(fname, data)
else:
return
Alright, figured out what I needed to do. I needed to create a QFile and read all of the bytes from that file and then pass that to the put command.
# Fixed version: read the whole file into a QByteArray and pass the raw
# bytes to QFtp.put instead of a (never-opened) QIODevice.
def upload(self):
filename = QFileDialog.getOpenFileName(self, 'Upload File', '.')
data = QFile(filename[0])
# presumably 1 == QIODevice.ReadOnly -- TODO confirm against Qt docs
data.open(1)
qdata = QByteArray(data.readAll())
# remote name = local base name
file = os.path.basename(filename[0])
print data
# Pick the remote destination from the current tree selection.
if not self.fileTree.currentItem():
self.qftp.put(qdata, file, self.qftp.TransferType())
elif "." in self.fileTree.currentItem().text(0):
self.qftp.put(qdata, self.fileTree.currentItem().parent().text(0) + file)
elif self.fileTree.currentItem().text(0) == "/":
self.qftp.put(qdata, file)
else:
return
I'm guessing that data = os.path.basename(dataname) means data is always a string containing the name of the file. Try changing this to be an open fileobj by using data = open(os.path.basename(dataname), 'rb')
edit
Looking at PySide.QtNetwork.QFtp.put(data, file[, type=Binary]) and PySide.QtNetwork.QFtp.put(dev, file[, type=Binary]) - the order of arguments is data/dev then file - so it's the wrong way around in your code...