I have a folder with huge text files. Each one is gzipped and weighs several gigabytes.
I wrote a piece of code to split the content of each gzip file: each gzip file is opened with gzip, then every specified chunk of lines is read and written to a new gzip file.
Here is the code, in file file_compression.py:
import sys, os, file_manipulation as fm
import gzip

def splitGzipFile(fileName, dest=None, chunkPerSplit=100, linePerChunk=4,
                  file_field_separator="_", zfill=3,
                  verbose=False, file_permission=None, execute=True):
    """
    Splits a gz file into chunk files.
    :param fileName: path of the gzip file to split
    :param chunkPerSplit: number of chunks written to each output file
    :param linePerChunk: number of lines per chunk
    :return: None
    """
    absPath = os.path.abspath(fileName)
    baseName = os.path.basename(absPath)
    dirName = os.path.dirname(absPath)
    destFolder = dirName if dest is None else dest

    ## Compute file fields
    rawBaseName, extensions = baseName.split(os.extsep, 1)
    if not str(extensions).startswith("."):
        extensions = "." + extensions
    file_fields = str(rawBaseName).split(file_field_separator)
    first_fields = file_fields[:-1] if len(file_fields) > 1 else file_fields
    first_file_part = file_field_separator.join(first_fields)
    last_file_field = file_fields[-1] if len(file_fields) > 1 else ""
    current_chunk = getCurrentChunkNumber(last_file_field)
    if current_chunk is None or current_chunk < 0:
        first_file_part = rawBaseName

    ## Initialize chunk variables
    linePerSplit = chunkPerSplit * linePerChunk
    chunkCounter = 0 if current_chunk is None else current_chunk - 1
    for chunk in getFileChunks(fileName, linePerSplit):
        print "writing " + str(len(chunk)) + " ..."
        chunkCounter += 1
        oFile = fm.buildPath(destFolder) + first_file_part + file_field_separator \
                + str(chunkCounter).zfill(zfill) + extensions
        if execute:
            writeGzipFile(oFile, chunk, file_permission)
            if verbose:
                print "Splitting: created file ", oFile

def getCurrentChunkNumber(chunk_field):
    """
    Tries to guess an integer from a string.
    :param chunk_field: string that may hold a chunk number
    :return: an integer, or None on failure.
    """
    try:
        return int(chunk_field)
    except ValueError:
        return None

def getFileChunks(fileName, linePerSplit):
    with gzip.open(fileName, 'rb') as f:
        print "gzip open"
        lineCounter = 0
        currentChunk = ""
        for line in f:
            currentChunk += line
            lineCounter += 1
            if lineCounter >= linePerSplit:
                yield currentChunk
                currentChunk = ""
                lineCounter = 0
        if currentChunk != '':
            yield currentChunk

def writeGzipFile(file_name, content, file_permission=None):
    with gzip.open(file_name, 'wb') as f:
        if content != '':
            f.write(content)
    if file_permission is not None and type(file_permission) == int:
        os.chmod(file_name, file_permission)
This task is multiprocess: a process is created for each file to be split. Each file is opened and split only once before being erased; I made sure of that by recording each file in a list:
from tools.file_utils import file_compression as fc, file_manipulation as fm
import multiprocessing
from multiprocessing import Process, Queue, Manager
import os, time

manager = Manager()
split_seen = manager.list()

def processFile(filePath, destFolder, verbose=True):
    global split_seen
    if filePath in split_seen:
        print "Duplicate file processed: " + str(filePath)
        time.sleep(3)
    print "adding", filePath, len(split_seen)
    split_seen.append(filePath)
    fc.splitGzipFile(filePath, dest=destFolder,
                     chunkPerSplit=4000000,
                     linePerChunk=4,
                     verbose=True,
                     file_permission=0770,
                     zfill=3)
    os.remove(filePath)

files = [...]  # list is full of gzip files.
processList = []
sampleDir = "sample/dir/"
for file in files:
    filePath = sampleDir + str(file)
    p = Process(target=processFile, args=(filePath, sampleDir, True))
    p.start()
    processList.append(p)

## Join the processes
for p in processList:
    p.join()
So far the code has always run fine. But today I had an issue with a gzip CRC check failure:
Process Process-3:72:
Traceback (most recent call last):
...
File "/.../tools/file_utils/file_compression.py", line 43, in splitGzipFile
for chunk in getFileChunks(fileName, linePerSplit):
File "/.../tools/file_utils/file_compression.py", line 70, in getFileChunks
for line in f:
File "/.../python2.7/lib/python2.7/gzip.py", line 450, in readline
c = self.read(readsize)
File "/.../python2.7/lib/python2.7/gzip.py", line 256, in read
self._read(readsize)
File "/.../python2.7/lib/python2.7/gzip.py", line 320, in _read
self._read_eof()
File "/.../python2.7/lib/python2.7/gzip.py", line 342, in _read_eof
hex(self.crc)))
IOError: CRC check failed 0xddbb6045 != 0x34fd5580L
What could be the origin of this issue? I have to state again that so far it has always worked; folders and files always have the same structure. The difference in this instance is perhaps that my script is processing more gzip files than usual, maybe twice as many.
Could it be a matter of the same files being accessed at the same time? I seriously doubt that; I made sure it is not the case by registering each accessed file in my split_seen list.
I would take any hint, as I have no more clues as to where to look.
EDIT 1
Maybe some open files were accessed by someone else, or by another program? I cannot ask for and rely on testimonials. So as a start, if I were to put in a multiprocessing.Lock, would it prevent any other thread, process, program, user, etc. from modifying the file? Or is it only limited to Python? I cannot find any doc on that.
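For reference, here is a minimal sketch of what I have in mind; processFile is my function from above, and my understanding is that such a lock only synchronizes the processes that share this Lock object, not other programs or users:
from multiprocessing import Lock, Process

file_lock = Lock()  # created before the workers are forked, so they all inherit it

def processFileLocked(filePath, destFolder):
    # Only one of my worker processes can hold the lock at a time
    file_lock.acquire()
    try:
        processFile(filePath, destFolder, True)
    finally:
        file_lock.release()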
I got the exact same error on code that had been running for months. It turned out that the source for that particular file was corrupted. I went back to an old file and it worked fine, and I then used a newer file and it also worked fine.
I had the same issue. I just deleted the old file and re-ran the code.
rm -rf /tmp/imagenet/
HTH
So my prof. gave me this code as the solution to my homework, but when I run it, it gives me an error. Can you please help me out? I guess I didn't specify the location of the file, but I'm not sure if that's the case. The objective of this exercise is to generate and read files that contain a list of random numbers.
import random
import os
import time

def fillFile(fileSize, fileName):
    # Delete file if it exists
    if os.path.exists(fileName):
        os.remove(fileName)
    # Open file
    FILE = open(fileName, "w")
    # Write to file
    for i in range(fileSize):
        r = random.randint(0, fileSize + 1000)
        FILE.write(str(r) + "\n")
    FILE.close()

def readFile(fileName):
    # Open file
    if os.path.exists(fileName):
        FILE = open(fileName, "r")
    else:
        print(fileName + " does not exist!")
        exit()
    # Read file
    alist = []
    for line in FILE:
        alist.append(int(line))
    FILE.close()
    return alist

def mainForFiles():
    # Create the files
    fileSizes = [1000, 5000, 10000, 25000, 50000, 100000, 200000]
    dirName = ".\\filesForAssignment1\\"
    # Delete fileStats.txt file if it exists
    statFileName = "fileStats.txt"
    if os.path.exists(statFileName):
        os.remove(statFileName)
    # Open stat file
    statFile = open(statFileName, "w")
    statFile.write("fillFile")
    print("WRITING TO FILES")
    for i in fileSizes:
        start = time.time()
        fillFile(i, dirName + "file" + str(i))
        finish = time.time()
        statFile.write(" " + str(finish - start))
        print("File Size = " + str(i) + " Write Time = " + str(finish - start))
    statFile.write("\n")
    print("READING FILES")
    statFile.write("readFile")
    for i in fileSizes:
        fileName = dirName + "file" + str(i)
        # Read the file
        finish = time.time()
        alist = readFile(fileName)
        start = time.time()
        statFile.write(" " + str(finish - start))
        print("File Size = " + str(i) + " File Read Time = " + str(finish - start))
    statFile.write("\n")
    statFile.close()

mainForFiles()
File "C:/Users/emrea/PycharmProjects/helloworld/hello2.py", line 84, in
<module>
mainForFiles()
File "C:/Users/emrea/PycharmProjects/helloworld/hello2.py", line 57, in mainForFiles
fillFile(i, dirName+"file"+str(i))
File "C:/Users/emrea/PycharmProjects/helloworld/hello2.py", line 12, in fillFile
FILE = open(fileName, "w")
FileNotFoundError: [Errno 2] No such file or directory: '.\\filesForAssignment1\\file1000'
The w mode causes the file to be created if it doesn't exist (and truncated if it does, so the os.remove is not actually useful there); however, it does expect the intermediate directories to exist.
This means you should ensure the path to the file ('.\\filesForAssignment1') exists before trying to create the file.
os.makedirs(os.path.dirname(fileName), exist_ok=True)
should do the trick, or
pathlib.Path(fileName).parent.mkdir(parents=True, exist_ok=True)
for a somewhat more modern take on it.
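For instance, folding the makedirs call into fillFile from the question could look like this (a sketch; it assumes fileName always includes a directory component, as it does here):
import os
import random

def fillFile(fileSize, fileName):
    # Create intermediate directories first; exist_ok=True avoids an
    # error when the directory already exists
    os.makedirs(os.path.dirname(fileName), exist_ok=True)
    with open(fileName, "w", encoding="utf-8") as f:
        for i in range(fileSize):
            f.write(str(random.randint(0, fileSize + 1000)) + "\n")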
There's a bunch of other minor issues in the script:
the main function should generally be "gated" so the module can be imported without running it
explicitly closing files has fallen out of favor, as it's unreliable (the close is skipped if an exception occurs first); with blocks are preferred
when opening files in "text" mode (the default) you should always provide an encoding
pathlib is fun, and that way you don't have to deal with path separators and all that crap
unless it's required to handle that case, I'd just let open(fname, 'r') error out if the file doesn't exist
Here's a version I think should be slightly improved:
import pathlib
import random
import time

def fillFile(fileSize, fileName):
    with fileName.open('w', encoding='utf-8') as f:
        for i in range(fileSize):
            r = random.randint(0, fileSize + 1000)
            f.write(f"{r}\n")

def readFile(fileName):
    with fileName.open(encoding='utf-8') as f:
        return [int(line) for line in f]

OUT_DIR = pathlib.Path.cwd().joinpath("filesForAssignment1")
FILE_SIZES = [1000, 5000, 10000, 25000, 50000, 100000, 200000]

def mainForFiles():
    # Create the files
    OUT_DIR.mkdir(parents=True, exist_ok=True)  # make sure the directory exists
    statFilePath = pathlib.Path("fileStats.txt")
    with statFilePath.open('w', encoding='utf-8') as statFile:
        statFile.write("fillFile")
        print("WRITING TO FILES")
        for i in FILE_SIZES:
            start = time.time()
            fillFile(i, OUT_DIR.joinpath(f'file{i}'))
            finish = time.time()
            statFile.write(f" {finish-start}")
            print(f"File Size = {i} Write Time = {finish-start}")
        statFile.write("\n")
        print("READING FILES")
        statFile.write("readFile")
        for i in FILE_SIZES:
            f = OUT_DIR.joinpath(f'file{i}')
            # Read the file
            start = time.time()
            alist = readFile(f)
            finish = time.time()
            statFile.write(f" {finish-start}")
            print(f"File Size = {i} File Read Time = {finish-start}")
        statFile.write("\n")

if __name__ == '__main__':
    mainForFiles()
exit() is not ideal there (it's really meant for interactive sessions); just returning from the function is cleaner:
def readFile(fileName):
    # Open file
    if os.path.exists(fileName):
        FILE = open(fileName, "r")
    else:
        print(fileName + " does not exist!")
        return
    # Read file
    alist = []
    for line in FILE:
        alist.append(int(line))
    FILE.close()
    return alist
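With return instead of exit(), the caller receives None for a missing file and should check for it, e.g. (hypothetical file name):
alist = readFile("filesForAssignment1/file1000")
if alist is None:
    print("file was missing, skipping")
else:
    print("read " + str(len(alist)) + " numbers")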
So I'm writing a script to take large CSV files and divide them into chunks. These files each have lines formatted as follows:
01/07/2003,1545,12.47,12.48,12.43,12.44,137423
The first field is the date, and the next field is a time value; the data points are at minute granularity. My goal is to fill files with 8 days' worth of data, so I want to write all the lines covering each 8-day span into a new file.
Right now, I'm only seeing the program write one line per "chunk," rather than all the lines. The code is shown below.
For reference, in the day-8 output file the only stored time is 1559, meaning it kept just the last line before the mod condition became true. So I'm thinking everything is getting overwritten somehow, since only the last values are being stored.
import os
import time

CWD = os.getcwd()
WRITEDIR = CWD + "/Divided Data/"
if not os.path.exists(WRITEDIR):
    os.makedirs(WRITEDIR)
FILEDIR = CWD + "/SP500"
os.chdir(FILEDIR)

valid_files = []
filelist = open("filelist.txt", 'r')
for file in filelist:
    cur_file = open(file.rstrip() + ".csv", 'r')
    cur_file.readline()  # skip first line
    prev_day = ""
    count = 0
    chunk_count = 1
    for line in cur_file:
        day = line[3:5]
        WDIR = WRITEDIR + "Chunk"
        cur_dir = os.getcwd()
        path = WDIR + " " + str(chunk_count)
        if not os.path.exists(path):
            os.makedirs(path)
        if day != prev_day:
            # print(day)
            prev_day = day
            count += 1
            # Create new directory
            if count % 8 == 0:
                chunk_count += 1
                PATH = WDIR + " " + str(chunk_count)
                if not os.path.exists(PATH):
                    os.makedirs(PATH)
                print("Chunk count: " + str(chunk_count))
                print("Global count: " + str(count))
        temp_path = WDIR + " " + str(chunk_count)
        os.chdir(temp_path)
        fname = file.rstrip() + str(chunk_count) + ".csv"
        with open(fname, 'w') as f:
            try:
                f.write(line + '\n')
            except:
                print("Could not write to file. \n")
        os.chdir(cur_dir)
        if chunk_count >= 406:
            continue
    cur_file.close()
    # count += 1
The answer is in the comments, but let me give it here so that your question is answered.
You're opening your file in 'w' mode, which overwrites all the previously written content. You need to open it in 'a' (append) mode:
fname = file.rstrip()+str(chunk_count)+".csv"
with open(fname, 'a') as f:
See more on the open function and modes in the Python documentation. It specifically notes that the 'w' and 'w+' modes truncate the file.
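If it helps, here is a minimal demonstration of the difference (demo.txt is just an illustrative name):
with open("demo.txt", "w") as f:
    f.write("first\n")
with open("demo.txt", "w") as f:  # 'w' truncates: "first" is lost
    f.write("second\n")
with open("demo.txt", "a") as f:  # 'a' appends: "second" is kept
    f.write("third\n")
# demo.txt now contains "second" and "third", but not "first"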
I have a piece of code which runs well in Python 2.7.5 but doesn't work with Python 3.
The major problem is tee.write, which cannot write to the file.
This piece of code is supposed to write 20 letter 'a's into the files /tmp/tee-test-1 and /tmp/tee-test-2, but it does not; the two files are empty…
Could anyone give me some advice?
import sys
import os
import subprocess
#from netsa.util.shell import *
from string import Template

__author__ = 'Brandon Sandrowicz <brandon#sandrowicz.org>'
__version__ = '0.1'

valid_modes = ['a', 'w']

def create_tee(files, mode, buffer_size=128):
    if mode not in valid_modes:
        raise IOError("Only valid modes to create_tee() are: %s" % ', '.join(valid_modes))
    tee_list = []
    for file in files:
        if type(file) == str:
            fp = open(file, mode)
            tee_list.append(fp)
        else:
            tee_list.append(file)
    pipe_read, pipe_write = os.pipe()
    pid = os.fork()
    if pid == 0:
        # Child -- Read bytes from the pipe and write them to the specified
        # files.
        try:
            # Close parent's end of the pipe
            os.close(pipe_write)
            bytes = os.read(pipe_read, buffer_size)
            print(bytes)
            while bytes:
                for file in tee_list:
                    file.write(bytes)
                    file.flush()
                    # TODO maybe add in fsync() here if the fileno() method
                    # exists on file
                bytes = os.read(pipe_read, buffer_size)
        except:
            pass
        finally:
            os._exit(255)
    else:
        # Parent -- Return a file object wrapper around the pipe to the
        # child.
        return os.fdopen(pipe_write, 'w')

if __name__ == '__main__':
    files = ['/tmp/tee-test-1', '/tmp/tee-test-2']
    num_chars = 100000
    print("Writing %d chars to files (using create_tee):" % num_chars)
    for file in files:
        print("  %s" % file)
    print()
    tee = create_tee(files, mode='a')
    #print("a" * num_chars, end=' ', file=tee)
    tee.write("a" * 20)
    tee.close()
    os.wait()
    for filename in files:
        with open(filename, 'r') as fh:
            chars = len(fh.read())
        print("File '%s' has %d chars" % (filename, chars))
OK, I found that problem interesting and challenging, and I finally found out what's wrong; it's said in the documentation:
One common problem is that the file is opened in the wrong mode. Make sure you open text files with the 't' flag and binary files with the 'b' flag and you have solved many problems.
So, as you're writing data of the b"" (bytes) type, I tried the following:
for file in files:
    if type(file) == str:
        fp = open(file, mode + 'b')
        tee_list.append(fp)
    else:
        tee_list.append(file)
and it works well:
File '/tmp/tee-test-1' has 20 chars
File '/tmp/tee-test-2' has 20 chars
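For completeness, my understanding of why the files stayed empty rather than failing loudly: in Python 3, writing bytes to a text-mode file raises a TypeError, and the child's bare except swallowed it before os._exit(255). A minimal reproduction (illustrative path):
with open("/tmp/mode-demo.txt", "w") as f:  # text mode
    try:
        f.write(b"abc")  # bytes written to a text-mode file
    except TypeError as e:
        print(e)  # write() argument must be str, not bytes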
I am trying to create a Python script to archive and compress one-year-old data into date-wise tar files. The script also generates a log file of the archived files. I am using Python 2.6 on Linux.
Here is my code:
for search_date in dd_list:
    tar_file = "/files/yearly_archive/nas_archive_" + search_date + ".tgz"
    mytar = tarfile.open(tar_file, "w:gz")
    log_file = "/files/yearly_archive/archive_log_" + search_date
    fcount = 0
    #print tar_file
    #print log_file
    f = open(log_file, 'ab+')
    for f_name, d_date in date_file_dict.iteritems():
        if d_date == search_date:
            #print f_name
            fcount += 1
            mytar.add(f_name)
            f.write(f_name + '\n')
    date_occur_dict[search_date] = fcount
    mytar.close()
    f.close()
Here the log file gets appended to if it exists, but the tar file gets overwritten every time I run the script. Is there a way I can make sure the tar file gets appended to if it exists, and otherwise gets created?
Edit:
I tried to add code for un-gzipping and appending, but it is not working.
for search_date in dd_list:
    tar_file = "/files/yearly_archive/nas_archive_" + search_date + ".tgz"
    zip = 1
    try:
        with open(tar_file, 'ab+'):
            import gzip
            d_tar = gzip.open(tar_file, 'wb')
            zip = 0
    except IOError:
        print "Creating new tar file"
    if zip == 1:
        mytar = tarfile.open(tar_file, "w:gz")
    else:
        mytar = tarfile.open(d_tar, "w")
    log_file = "/files/yearly_archive/archive_log_" + search_date
    fcount = 0
    #print tar_file
    #print log_file
    f = open(log_file, 'ab+')
    for f_name, d_date in date_file_dict.iteritems():
        if d_date == search_date:
            #print f_name
            fcount += 1
            mytar.add(f_name)
            f.write(f_name + '\n')
    date_occur_dict[search_date] = fcount
    mytar.close()
    f.close()
I am getting the following error:
Traceback (most recent call last):
File "sort_archive.py", line 63, in <module>
mytar = tarfile.open(d_tar,"w")
File "/usr/lib64/python2.6/tarfile.py", line 1687, in open
return cls.taropen(name, mode, fileobj, **kwargs)
File "/usr/lib64/python2.6/tarfile.py", line 1697, in taropen
return cls(name, mode, fileobj, **kwargs)
File "/usr/lib64/python2.6/tarfile.py", line 1518, in __init__
fileobj = bltn_open(name, self._mode)
TypeError: coercing to Unicode: need string or buffer, instance found
You cannot use tarfile to append to compressed tarballs. Either perform the decompress/compress steps separately, or don't use compression in the first place.
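For example, here is a minimal sketch of the decompress/append/recompress approach; append_to_tgz is a hypothetical helper (not part of tarfile), written without with-blocks on gzip objects so it stays compatible with Python 2.6:
import gzip
import os
import shutil
import tarfile

def append_to_tgz(tgz_path, files_to_add):
    tar_path = tgz_path[:-len(".tgz")] + ".tar"
    if os.path.exists(tgz_path):
        # Decompress the existing .tgz into a plain .tar
        f_in = gzip.open(tgz_path, 'rb')
        f_out = open(tar_path, 'wb')
        shutil.copyfileobj(f_in, f_out)
        f_in.close()
        f_out.close()
        mode = "a"  # plain (uncompressed) tars do support appending
    else:
        mode = "w"  # no archive yet, create one
    mytar = tarfile.open(tar_path, mode)
    for f_name in files_to_add:
        mytar.add(f_name)
    mytar.close()
    # Recompress the plain .tar back into the .tgz
    f_in = open(tar_path, 'rb')
    f_out = gzip.open(tgz_path, 'wb')
    shutil.copyfileobj(f_in, f_out)
    f_in.close()
    f_out.close()
    os.remove(tar_path)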
(An earlier suggestion, struck out because tarfile cannot append to compressed archives:)
Did you try to change the mode? I see w, which clearly overwrites the file. Try with a, or w+.
mytar = tarfile.open(tar_file, "w+:gz")
or
mytar = tarfile.open(tar_file, "a:gz")
I have the following script:
import sys, os, re

pid = sys.argv[1]
maps_file = open("/proc/%s/maps" % pid, 'r')
mem_file = open("/proc/%s/mem" % pid, 'r')
for line in maps_file.readlines():  # for each mapped region
    m = re.match(r'([0-9A-Fa-f]+)-([0-9A-Fa-f]+) ([-r])', line)
    if m.group(3) == 'r':  # if this is a readable region
        start = int(m.group(1), 16)
        end = int(m.group(2), 16)
        mem_file.seek(start)  # seek to region start
        chunk = mem_file.read(end - start)  # read region contents
        #print chunk,  # dump contents to standard output
        mem_dump = open(pid + ".bin", "wb")
        mem_dump.write(chunk)
        mem_dump.close()
maps_file.close()
mem_file.close()
All works well so far (dumping the process's memory), but I can't save the data to a file. What am I doing wrong?
Could it be that the files are getting written somewhere you don't expect? It looks like they will be written to the current working directory.
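A quick way to check is to print the absolute path before writing, or to build an explicit destination yourself (/tmp here is just an assumed location):
import os

out_path = os.path.join("/tmp", pid + ".bin")
print "writing dump to", os.path.abspath(out_path)
mem_dump = open(out_path, "wb")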