I am trying to create a Python script to archive and compress one-year-old data into date-wise tar files. The script also generates a log file of the archived files. I am using Python 2.6 on Linux.
Here is my code:
for search_date in dd_list:
    tar_file = "/files/yearly_archive/nas_archive_" + search_date + ".tgz"
    mytar = tarfile.open(tar_file,"w:gz")
    log_file = "/files/yearly_archive/archive_log_" + search_date
    fcount = 0
    #print tar_file
    #print log_file
    f = open(log_file,'ab+')
    for f_name, d_date in date_file_dict.iteritems():
        if d_date == search_date:
            #print f_name
            fcount += 1
            mytar.add(f_name)
            f.write(f_name + '\n')
    date_occur_dict[search_date] = fcount
    mytar.close()
    f.close()
Here the log file gets appended to if it exists, but the tar file gets overwritten every time I run the script. Is there a way I can make sure the tar file gets appended to if it exists, and otherwise gets created?
Edit :
I tried to add code for un-gzipping and appending, but it is not working.
for search_date in dd_list:
    tar_file = "/files/yearly_archive/nas_archive_" + search_date + ".tgz"
    zip = 1
    try:
        with open(tar_file,'ab+'):
            import gzip
            d_tar = gzip.open(tar_file,'wb')
            zip = 0
    except IOError:
        print "Creating new tar file"
    if zip == 1:
        mytar = tarfile.open(tar_file,"w:gz")
    else:
        mytar = tarfile.open(d_tar,"w")
    log_file = "/files/yearly_archive/archive_log_" + search_date
    fcount = 0
    #print tar_file
    #print log_file
    f = open(log_file,'ab+')
    for f_name, d_date in date_file_dict.iteritems():
        if d_date == search_date:
            #print f_name
            fcount += 1
            mytar.add(f_name)
            f.write(f_name + '\n')
    date_occur_dict[search_date] = fcount
    mytar.close()
    f.close()
I am getting the following error:
Traceback (most recent call last):
File "sort_archive.py", line 63, in <module>
mytar = tarfile.open(d_tar,"w")
File "/usr/lib64/python2.6/tarfile.py", line 1687, in open
return cls.taropen(name, mode, fileobj, **kwargs)
File "/usr/lib64/python2.6/tarfile.py", line 1697, in taropen
return cls(name, mode, fileobj, **kwargs)
File "/usr/lib64/python2.6/tarfile.py", line 1518, in __init__
fileobj = bltn_open(name, self._mode)
TypeError: coercing to Unicode: need string or buffer, instance found
You cannot use tarfile to append to compressed tarballs. Either perform the decompress/compress steps separately, or don't use compression in the first place.
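For example, here is a minimal sketch along those lines (reusing the dd_list and date_file_dict variables from your script): append to a plain, uncompressed .tar with mode "a", and gzip it in a separate step once all the appends for a date are done. On later runs you would first decompress the existing .tgz back to a plain .tar before appending to it.
import os
import shutil
import gzip
import tarfile

for search_date in dd_list:
    tar_file = "/files/yearly_archive/nas_archive_" + search_date + ".tar"
    tgz_file = "/files/yearly_archive/nas_archive_" + search_date + ".tgz"
    # Mode "a" appends if the archive exists and creates it otherwise,
    # but it only works for uncompressed tar files.
    mytar = tarfile.open(tar_file, "a")
    for f_name, d_date in date_file_dict.iteritems():
        if d_date == search_date:
            mytar.add(f_name)
    mytar.close()

    # Compress as a separate step (gzip files are not context managers on 2.6).
    f_in = open(tar_file, 'rb')
    f_out = gzip.open(tgz_file, 'wb')
    shutil.copyfileobj(f_in, f_out)
    f_out.close()
    f_in.close()
    os.remove(tar_file)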
(Earlier suggestion, retracted:)
Did you try changing the mode? I see w, which clearly overwrites the file. Try a, or w+.
mytar = tarfile.open(tar_file,"w+:gz")
or
mytar = tarfile.open(tar_file,"a:gz")
(End of retracted suggestion.)
Related
The code below makes an md5/metadata fingerprint, but crashes on files with unknown corruption (e.g., files that can be copied, and mostly even opened, but that cannot be hashed or zipped up [to disguise their corruption]).
Question: How does one make this code skip or ignore any and all problem files and just process the rest? Imagine 1 million files on 8 TB. Otherwise I leave it running with no real-time monitoring of progress, and 2 days later I find out that nothing got hashed because a couple of problem files made the code hang.
Part of the code (see full code below):
def createBasicInfoListFromDisk():
    global diskCompareListDetails, onlyFileNameOnDisk, driveLetter,walk_dir
    walk_dir = os.path.abspath(walk_dir)
    for root, subdirs, files in os.walk(walk_dir, topdown=True, onerror=None, followlinks=True ):
        for filename in files:
            file_path = os.path.join(root, filename)
            temp = file_path.split(":")
            driveLetter = temp[0]
            filePathWithoutDriveLetter = temp[1]
            fileSize = os.path.getsize(file_path)
            mod_on = get_last_write_time(file_path)
            print('\t- file %s (full path: %s)' % (filename, file_path))
            print('FileName : {filename} is of size {size} and was modified on{mdt}'.format(filename=file_path,size=fileSize,mdt=mod_on ))
            diskCompareListDetails.append("\"" + filePathWithoutDriveLetter+"\",\""+str(fileSize) + "\",\"" + mod_on +'"')
            onlyFileNameOnDisk.append("\""+filePathWithoutDriveLetter+"\"")
    return
Error:
FileName : T:\problemtest\problemfile.doc is of size 27136 and was modified on2010-10-10 13:58:32
Traceback (most recent call last):
File "t:\scripts\test.py", line 196, in <module>
createBasicInfoListFromDisk()
File "t:\scripts\test.py", line 76, in createBasicInfoListFromDisk
mod_on = get_last_write_time(file_path)
File "t:\scripts\test.py", line 61, in get_last_write_time
convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
OSError: [Errno 22] Invalid argument
Full code:
import os
import sys
import time
import datetime
import difflib
import decimal
import hashlib
from pip._vendor.distlib.compat import raw_input
csvListDetails = list()
csvCompareListDetails = list()
diskCompareListDetails = list()
onlyFileNameOnDisk = list()
addedFiles = list()
removedFiles = list()
driveLetter =""
finalFilesToChange=list()
finalFilesToDelete=list()
changedFiles=list()
csvfilewithPath="md5.csv"
import shutil
walk_dir=""
def findAndReadCSVFile(fileName):
    global csvListDetails
    global csvCompareListDetails
    haveIgnoredLine = 0
    foundFile=0
    try :
        inputFileHandler = open(fileName,"rt",encoding='utf-8')
        update_time = get_last_write_time(fileName)
        print("\n Found md5.csv, last updated on: %s" % update_time)
        foundFile=1
    except (OSError, IOError, FileNotFoundError):
        print("\n md5.csv not found. Will create a new one.")
        return foundFile
    for line in inputFileHandler:
        if (haveIgnoredLine==0):
            haveIgnoredLine=1
            continue
        rowItem = line.replace("\n","").split('","')
        csvCompareListDetails.append('"' + rowItem[3]+',"'+rowItem[2]+'","' +rowItem[1]+'"')
        lineDetails = list()
        for detailNum in range (0,len(rowItem)):
            lineDetails.append('"' + (rowItem[detailNum].replace('"','')) + '"')
        csvListDetails.append(lineDetails)
    inputFileHandler.close()
    return foundFile

def get_last_write_time(filename):
    st = os.stat(filename)
    convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
    return convert_time_to_human_readable

def createBasicInfoListFromDisk():
    global diskCompareListDetails, onlyFileNameOnDisk, driveLetter,walk_dir
    walk_dir = os.path.abspath(walk_dir)
    for root, subdirs, files in os.walk(walk_dir, topdown=True, onerror=None, followlinks=True ):
        for filename in files:
            file_path = os.path.join(root, filename)
            temp = file_path.split(":")
            driveLetter = temp[0]
            filePathWithoutDriveLetter = temp[1]
            fileSize = os.path.getsize(file_path)
            mod_on = get_last_write_time(file_path)
            print('\t- file %s (full path: %s)' % (filename, file_path))
            print('FileName : {filename} is of size {size} and was modified on{mdt}'.format(filename=file_path,size=fileSize,mdt=mod_on ))
            diskCompareListDetails.append("\"" + filePathWithoutDriveLetter+"\",\""+str(fileSize) + "\",\"" + mod_on +'"')
            onlyFileNameOnDisk.append("\""+filePathWithoutDriveLetter+"\"")
    return
def compareLogAndDiskLists():
    global addedFiles, removedFiles
    diff = difflib.unified_diff(csvCompareListDetails, diskCompareListDetails, fromfile='file1', tofile='file2', lineterm='', n=0)
    lines = list(diff)[2:]
    addedFiles = [line[1:] for line in lines if line[0] == '+']
    removedFiles = [line[1:] for line in lines if line[0] == '-']
    return

def displayInfoForUserInput():
    global finalFilesToChange, finalFilesToDelete
    changedOrNewFileCount = 0
    noLongerExistingFilesCount = 0
    totalSizeOfChange = 0
    for line in addedFiles:
        if line not in removedFiles:
            changedOrNewFileCount = changedOrNewFileCount +1
            elements = line.replace("\n","").split('","')
            sizeOfFile= int(elements[1].replace('"',''))
            totalSizeOfChange = totalSizeOfChange + sizeOfFile
            finalFilesToChange.append(elements[0] +'"')
    for line in removedFiles:
        elements = line.split('","')
        if elements[0]+'"' not in onlyFileNameOnDisk:
            noLongerExistingFilesCount = noLongerExistingFilesCount + 1
            finalFilesToDelete.append(elements[0]+'"')
    GBModSz= decimal.Decimal(totalSizeOfChange) / decimal.Decimal('1073741824')
    print("\n New or modified files on drive: {} (need to hash)".format(changedOrNewFileCount))
    print (" Obsolete lines in md5.csv (files modified or not on drive): {} (lines to delete)".format(noLongerExistingFilesCount))
    print (" {} files ({:.2f} GB) needs to be hashed.".format(changedOrNewFileCount,GBModSz))
    userInput = raw_input("\n Proceed with hash? (Y/N, Yes/No) ")
    if (userInput.strip().upper() == "Y" or userInput.strip().upper() == "YES"):
        print("Continuing Processing...")
    else:
        print("You opted not to continue, Exiting")
        sys.exit()
    return
def processFiles(foundFile):
    if (foundFile==1):
        oldFileName = walk_dir+"/md5.csv"
        shutil.copy( oldFileName, getTargetFileName(oldFileName))
    BLOCKSIZE = 1048576*4
    global changedFiles
    for fileToHash in finalFilesToChange:
        hasher = hashlib.new('md5')
        fileToUse=driveLetter+":"+fileToHash.replace('"','')
        with open(fileToUse, 'rb') as afile:
            buf = afile.read(BLOCKSIZE)
            while len(buf) > 0:
                hasher.update(buf)
                buf = afile.read(BLOCKSIZE)
        fileDetails = list()
        fileDetails.append(hasher.hexdigest())
        fileDetails.append(get_last_write_time(fileToUse))
        fileDetails.append(os.path.getsize(fileToUse))
        fileDetails.append(fileToHash)
        changedFiles.append(fileDetails)
    return

def getTargetFileName(oldFileName):
    targetFileName= walk_dir+"/generated_on_" + get_last_write_time(oldFileName).replace(" ","_").replace("-","").replace(":","")
    targetFileName = targetFileName + "__archived_on_" + datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    targetFileName = targetFileName + "__md5.csv"
    return targetFileName

def writeCSVFile(fileName):
    try :
        outputFileHandler=open(fileName,"wt",encoding='utf-8')
        outputFileHandler.write("\"md5Hash\",\"LastWriteTime\",\"Length\",\"FullName\"\n")
        for details in csvListDetails:
            if details[3] in finalFilesToDelete:
                continue
            if details[3] in finalFilesToChange:
                continue
            outputFileHandler.write("{},{},{},{}\n".format(details[0],details[1],details[2],details[3]))
        for details in changedFiles:
            outputFileHandler.write("\"{}\",\"{}\",\"{}\",{}\n".format(details[0],details[1],details[2],details[3]))
        outputFileHandler.close()
    except (OSError, IOError, FileNotFoundError) as e:
        print("ERROR :")
        print("File {} is either not writable or some other error: {}".format(fileName,e))
    return

if __name__ == '__main__':
    walk_dir = raw_input("\n Enter drive or directory to scan: ")
    csvfilewithPath=walk_dir+"/md5.csv"
    print("\n Drive to scan: " + walk_dir)
    foundFile = 0
    foundFile=findAndReadCSVFile(csvfilewithPath)
    createBasicInfoListFromDisk()
    compareLogAndDiskLists()
    displayInfoForUserInput()
    processFiles(foundFile)
    writeCSVFile(csvfilewithPath)
Trying this fix, no luck:
def get_last_write_time(filename):
    try:
        st = os.stat(filename)
        convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
        return convert_time_to_human_readable
    except OSError:
        pass
    return "ERROR"
def createBasicInfoListFromDisk():
I agree with IMCoins and I'm very curious as to why except isn't catching the error.
So the first thing I would do is go to the source where the OSError is being raised and try to catch it explicitly.
def get_last_write_time(filename):
    try:
        st = os.stat(filename)
        convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S",
                                                       time.localtime(st.st_mtime))
        return convert_time_to_human_readable
    except OSError:
        pass
    return "ERROR"  # or whatever string you want to add
Updated answer, for updated post.
As stated earlier, an except statement with the exception type specified catches every occurrence of that error, but only inside its own try block. So, in order to do what you want... I'm afraid the possible answers are either:
Make a method that identifies corrupted files and handles them properly.
Wrap every part of your code where there could be an error in a try/except statement.
Let me warn you about the second solution though: sometimes there are system errors that you do not want to silence. I believe you should print the exception that you catch, in order to identify further problems you may encounter.
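As a rough sketch of that second option (illustrative only, reusing the walk loop from createBasicInfoListFromDisk and assuming get_last_write_time is left without its own try/except so the OSError propagates; the skipped_files list is something you would add yourself), wrap the per-file work, print the exception, and move on:
import os

skipped_files = []  # hypothetical list to keep track of problem files

for root, subdirs, files in os.walk(walk_dir, topdown=True, onerror=None, followlinks=True):
    for filename in files:
        file_path = os.path.join(root, filename)
        try:
            fileSize = os.path.getsize(file_path)
            mod_on = get_last_write_time(file_path)
        except OSError as e:
            # Print the error so problem files can be investigated later,
            # then continue with the next file instead of crashing.
            print("Skipping {}: {}".format(file_path, e))
            skipped_files.append(file_path)
            continue
        # ... append to diskCompareListDetails / onlyFileNameOnDisk as before ...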
Just so you know, in case you don't: your error is not inside a try/except statement. Your error is (if I copied and pasted properly into my editor) at line 196, createBasicInfoListFromDisk(), and then at line 76, mod_on = get_last_write_time(file_path).
As you also mentioned you are using Python 3.x, I suggest looking into the suppress function (https://docs.python.org/3/library/contextlib.html#contextlib.suppress).
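For completeness, a tiny sketch with suppress (Python 3.4+), reusing file_path and get_last_write_time from your code; note that suppress silently discards the error, so you lose the chance to log which file failed:
from contextlib import suppress

mod_on = "ERROR"            # fallback value if the stat/strftime call fails
with suppress(OSError):
    mod_on = get_last_write_time(file_path)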
I hope it helped you.
I am using Windows 10 x64, with Python 3.6.1 x86.
I have this script from a few months ago which was working fine, but right now it gives me a weird error. The script is a simple one that extracts URLs from tweets saved in .csv files.
This is the script:
import datetime
from urlextract import URLExtract

twitter_files_list = ['File1.csv', 'File2.csv', 'File3.csv']
input_path = my_path

# Find domain of URL
def find_domain(url):
    return url.split("//")[-1].split("/")[0]

# Clean domain from useless chars
def clean_domain(domain):
    domain = domain.replace("[", "")
    domain = domain.replace("]", "")
    domain = domain.replace("\'", "")
    return domain

# Extract URLs from Tweets
def url_extract(filename):
    print('\n' + filename + ':')
    url_counter = 0
    url_file = open('extracted_urls/urls_' + filename, 'a')
    # Open file
    f = open(input_path + filename, "r", encoding="utf8")
    lines = f.readlines()
    # Search for contents of column "text"
    text = []
    for x in lines:
        text.append(x.split('\t')[4])
    # Close file
    f.close()
    extractor = URLExtract()
    for i in range(len(text)):
        try:
            if extractor.find_urls(text[i]): # Check if URL exists
                url = extractor.find_urls(text[i])
                domain = find_domain(str(url))
                if not " " in domain:
                    url_file.write(str(clean_domain(domain)) + "\n")
                    url_counter += 1
        except 'Not Found':
            continue
    url_file.close()

# Main
if __name__ == '__main__':
    print('\nURL Characterization:\n')
    # Start timer
    start = datetime.datetime.now()
    # Find the unique usernames for every file
    for twitter_file in twitter_files_list:
        print('Searching ' + str(twitter_file) + '...')
        url_extract(twitter_file)
    # End timer
    end = datetime.datetime.now()
    # Print results
    print("\nProcess finished")
    print("Total time: " + str(end - start))
This gives me the following error:
Traceback (most recent call last):
File "C:/Users/Aventinus/url_analysis/url_extractor.py", line 77, in <module>
url_extract(twitter_file)
File "C:/Users/Aventinus/url_analysis/url_extractor.py", line 50, in url_extract
extractor = URLExtract()
File "C:\Program Files (x86)\Python36-32\lib\site-packages\urlextract.py", line 65, in __init__
if not self._download_tlds_list():
File "C:\Program Files (x86)\Python36-32\lib\site-packages\urlextract.py", line 114, in _download_tlds_list
with open(self._tld_list_path, 'w') as ftld:
PermissionError: [Errno 13] Permission denied: 'C:\\Program Files (x86)\\Python36-32\\lib\\site-packages\\.tlds'
I have no idea how to interpret this.
You can try running the script as administrator.
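The traceback shows why: urlextract is trying to write its .tlds cache file into site-packages under C:\Program Files (x86), which a regular user cannot write to, hence the PermissionError. Besides running elevated once so the cache gets created, newer urlextract releases reportedly accept a cache_dir argument to point the cache at a writable location; treat the parameter below as an assumption to check against your installed version:
from urlextract import URLExtract

# Assumption: this urlextract version supports cache_dir (verify in its docs);
# point it at any directory your user can write to.
extractor = URLExtract(cache_dir=r"C:\Users\Aventinus\url_analysis")
Alternatively, reinstalling the package per-user (pip install --user urlextract) places it in a per-user site-packages directory that is writable without admin rights.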
I am trying to read all fasta files from a test folder and put the name of the file into all headers of each individual file. The code works for the first file but doesn't proceed to the second file and returns an error. Could you help me find the bug in my code or edit it? Thanks
import sys, glob, os, string

header = ''
check = 0
path = "./test/"
dirs = os.listdir(path)
for file in dirs:
    fp = open(file, "r")
    fpx = open('%s_output.txt' % file, 'w')
    for line in fp:
        if line.startswith('>'):
            line = line.rstrip()
            check = check + 1
            if check >= 1:
                header = line
                fpx.write(header + '_' + file + '\n')
        else:
            line = line.rstrip()
            fpx.write(line + '\n')
It would be good to provide the error message you are getting! I think this must fail with "File not found", because you try to open the file by its name instead of its path. Try fp = open(os.path.join(path, file), "r"):
import sys, glob, os, string

header = ''
check = 0
path = "./test/"
dirs = os.listdir(path)
for file in dirs:
    fp = open(os.path.join(path, file), "r")
    fpx = open('%s_output.txt' % file, 'w')
    for line in fp:
        if line.startswith('>'):
            line = line.rstrip()
            check = check + 1
            if check >= 1:
                header = line
                fpx.write(header + '_' + file + '\n')
        else:
            line = line.rstrip()
            fpx.write(line + '\n')
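As a small follow-up (not needed for the fix, just a tidier pattern on Python 2.7+/3), letting with close both files means a failure on one file will not leave handles open for the rest of the loop:
import os

path = "./test/"
for file in os.listdir(path):
    with open(os.path.join(path, file), "r") as fp, open('%s_output.txt' % file, 'w') as fpx:
        for line in fp:
            line = line.rstrip()
            if line.startswith('>'):
                # header line: append the source file name
                fpx.write(line + '_' + file + '\n')
            else:
                fpx.write(line + '\n')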
I have a folder with huge text files. Each one is gzipped and weighs several gigabytes.
I wrote a piece of code to split the content of each gzip file: each gzip file is opened with gzip, then every specified chunk of lines is read and written to a new gzip file.
Here is the code, in file file_compression.py:
import sys, os, file_manipulation as fm
import gzip

def splitGzipFile(fileName, dest=None, chunkPerSplit=100, linePerChunk=4, file_field_separator="_", zfill=3
                  , verbose=False, file_permission=None, execute=True):
    """
    Splits a gz file into chunk files.
    :param fileName:
    :param chunkPerSplit:
    :param linePerChunk:
    :return:
    """
    absPath = os.path.abspath(fileName)
    baseName = os.path.basename(absPath)
    dirName = os.path.dirname(absPath)
    destFolder = dirName if dest is None else dest

    ## Compute file fields
    rawBaseName, extensions = baseName.split(os.extsep, 1)
    if not str(extensions).startswith("."):
        extensions = "." + extensions
    file_fields = str(rawBaseName).split(file_field_separator)
    first_fields = file_fields[:-1] if file_fields.__len__() > 1 else file_fields
    first_file_part = file_field_separator.join(first_fields)
    last_file_field = file_fields[-1] if file_fields.__len__() > 1 else ""
    current_chunk = getCurrentChunkNumber(last_file_field)
    if current_chunk is None or current_chunk < 0:
        first_file_part = rawBaseName

    ## Initialize chunk variables
    linePerSplit = chunkPerSplit * linePerChunk
    # chunkCounter = 0
    chunkCounter = 0 if current_chunk is None else current_chunk-1
    for chunk in getFileChunks(fileName, linePerSplit):
        print "writing " + str(str(chunk).__len__()) + " ..."
        chunkCounter += 1
        oFile = fm.buildPath(destFolder) + first_file_part + file_field_separator + str(chunkCounter).zfill(zfill) + extensions
        if execute:
            writeGzipFile(oFile, chunk, file_permission)
        if verbose:
            print "Splitting: created file ", oFile

def getCurrentChunkNumber(chunk_field):
    """
    Tries to guess an integer from a string.
    :param chunk_field:
    :return: an integer, None if failure.
    """
    try:
        return int(chunk_field)
    except ValueError:
        return None

def getFileChunks(fileName, linePerSplit):
    with gzip.open(fileName, 'rb') as f:
        print "gzip open"
        lineCounter = 0
        currentChunk = ""
        for line in f:
            currentChunk += line
            lineCounter += 1
            if lineCounter >= linePerSplit:
                yield currentChunk
                currentChunk = ""
                lineCounter = 0
        if not currentChunk == '':
            yield currentChunk

def writeGzipFile(file_name, content, file_permission=None):
    import gzip
    with gzip.open(file_name, 'wb') as f:
        if not content == '':
            f.write(content)
    if file_permission is not None and type(file_permission) == int:
        os.chmod(file_name, file_permission)
This task is multiprocessed: a process is created for each file before it is split. Each file is opened and split only once before being erased; I made sure of that by recording the files in a list:
import os
import time
from tools.file_utils import file_compression as fc, file_manipulation as fm
import multiprocessing
from multiprocessing import Process, Queue, Manager

manager = Manager()
split_seen = manager.list()

files = [...] # list is full of gzip files.
processList = []
sampleDir = "sample/dir/"
for file in files:
    filePath = sampleDir + str(file)
    p = Process(target=processFile, args=(filePath, sampleDir, True))
    p.start()
    processList.append(p)

## Join the processes
for p in processList:
    p.join()

def processFile(filePath, destFolder, verbose=True):
    global split_seen
    if filePath in split_seen:
        print "Duplicate file processed: " + str(filePath)
        time.sleep(3)
    print "adding", filePath, split_seen.__len__()
    split_seen.append(filePath)
    fc.splitGzipFile(filePath, dest=destFolder, chunkPerSplit=4000000\
                     , linePerChunk=4
                     , verbose=True
                     , file_permission=0770
                     , zfill=3
                     )
    os.remove(filePath)
So far the code has always run fine. But today I had an issue with gzip files' CRC corruption:
Process Process-3:72:
Traceback (most recent call last):
...
File "/.../tools/file_utils/file_compression.py", line 43, in splitGzipFile
for chunk in getFileChunks(fileName, linePerSplit):
File "/.../tools/file_utils/file_compression.py", line 70, in getFileChunks
for line in f:
File "/.../python2.7/lib/python2.7/gzip.py", line 450, in readline
c = self.read(readsize)
File "/.../python2.7/lib/python2.7/gzip.py", line 256, in read
self._read(readsize)
File "/.../python2.7/lib/python2.7/gzip.py", line 320, in _read
self._read_eof()
File "/.../python2.7/lib/python2.7/gzip.py", line 342, in _read_eof
hex(self.crc)))
IOError: CRC check failed 0xddbb6045 != 0x34fd5580L
What could be the origin of this issue? I have to state again that so far it has always worked; folders and files are always of the same structure. The difference in this instance perhaps is that my script is processing more gzip files than usual, maybe twice as many.
Could it be a matter of the same files being accessed at the same time? I seriously doubt that; I made sure it is not the case by registering each file accessed in my split_seen list.
I would take any hint, as I have no more clues as to where to look.
EDIT 1
Maybe some open files were accessed by someone else, or by another program? I cannot ask for and rely on testimonials. So as a start, if I were to put in a multiprocessing.Lock, would it prevent any other thread, process, program, user, etc. from modifying the file? Or is it limited to Python only? I cannot find any doc on that.
I got the exact same error on code that has been running for months. Turns out that the file source was corrupted for that particular file. I went back to an old file and it worked fine and I used a newer file and it also worked fine.
I had the same issue. I just deleted the old file and re-ran the code.
rm -rf /tmp/imagenet/
HTH
So I'm writing a script to take large CSV files and divide them into chunks. These files each have lines formatted as follows:
01/07/2003,1545,12.47,12.48,12.43,12.44,137423
Where the first field is the date. The next field to the right is a time value. These data points are at minute granularity. My goal is to fill files with 8 days' worth of data, so I want to write all the lines from a file covering 8 days into a new file.
Right now, I'm only seeing the program write one line per "chunk," rather than all the lines. Code shown below, and screenshots included showing how the chunk directories are made and the file as well as its contents.
For reference, day 8 is shown, and 1559 means it stored the last line right before the mod operator became true. So I'm thinking that everything is getting overwritten somehow, since only the last values are being stored.
import os
import time

CWD = os.getcwd()
WRITEDIR = CWD+"/Divided Data/"
if not os.path.exists(WRITEDIR):
    os.makedirs(WRITEDIR)
FILEDIR = CWD+"/SP500"
os.chdir(FILEDIR)

valid_files = []
filelist = open("filelist.txt", 'r')
for file in filelist:
    cur_file = open(file.rstrip()+".csv", 'r')
    cur_file.readline() #skip first line
    prev_day = ""
    count = 0
    chunk_count = 1
    for line in cur_file:
        day = line[3:5]
        WDIR = WRITEDIR + "Chunk"
        cur_dir = os.getcwd()
        path = WDIR + " "+ str(chunk_count)
        if not os.path.exists(path):
            os.makedirs(path)
        if(day != prev_day):
            # print(day)
            prev_day = day
            count += 1
            #Create new directory
            if(count % 8 == 0):
                chunk_count += 1
                PATH = WDIR + " " + str(chunk_count)
                if not os.path.exists(PATH):
                    os.makedirs(PATH)
                print("Chunk count: " + str(chunk_count))
                print("Global count: " + str(count))
        temp_path = WDIR +" "+str(chunk_count)
        os.chdir(temp_path)
        fname = file.rstrip()+str(chunk_count)+".csv"
        with open(fname, 'w') as f:
            try:
                f.write(line + '\n')
            except:
                print("Could not write to file. \n")
        os.chdir(cur_dir)
        if(chunk_count >= 406):
            continue
    cur_file.close()
    # count += 1
The answer is in the comment but let me give it here so that your question is answered.
You're opening your file in 'w' mode which overwrites all the previously written content. You need to open it in the 'a' (append) mode:
fname = file.rstrip()+str(chunk_count)+".csv"
with open(fname, 'a') as f:
See more on the open function and its modes in the Python documentation. It specifically notes that 'w' truncates the file (and so does 'w+').
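As a quick self-contained illustration of the difference between the two modes (toy data, not your CSVs):
with open("demo.csv", "w") as f:
    f.write("first line\n")

with open("demo.csv", "w") as f:   # 'w' truncates: "first line" is gone
    f.write("second line\n")

with open("demo.csv", "a") as f:   # 'a' appends: "second line" survives
    f.write("third line\n")

with open("demo.csv") as f:
    print(f.read())                # -> second line / third line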