Issue with Python garbage collector?

I have a simple program which reads a large file containing a few million rows, parses each row (numpy array), converts the values into an array of doubles (python array), and later writes them into an hdf5 file. I repeat this loop for multiple days. After reading each file, I delete all the objects and call the garbage collector. When I run the program, the first day is parsed without any error, but on the second day I get a MemoryError. I monitored the memory usage of my program: during the first day of parsing, memory usage is around 1.5 GB. When the first day's parsing is finished, memory usage goes down to 50 MB. Now, when the 2nd day starts and I try to read the lines from the file, I get a MemoryError. Following is the output of the program.
source file extracted at C:\rfadump\au\2012.08.07.txt
parsing started
current time: 2012-09-16 22:40:16.829000
500000 lines parsed
1000000 lines parsed
1500000 lines parsed
2000000 lines parsed
2500000 lines parsed
3000000 lines parsed
3500000 lines parsed
4000000 lines parsed
4500000 lines parsed
5000000 lines parsed
parsing done.
end time is 2012-09-16 23:34:19.931000
total time elapsed 0:54:03.102000
repacking file
done
> s:\users\aaj\projects\pythonhf\rfadumptohdf.py(132)generateFiles()
-> while single_date <= self.end_date:
(Pdb) c
*** 2012-08-08 ***
source file extracted at C:\rfadump\au\2012.08.08.txt
caught an exception while generating file for day 2012-08-08.
Traceback (most recent call last):
File "rfaDumpToHDF.py", line 175, in generateFile
lines = self.rawfile.read().split('|\n')
MemoryError
I am very sure that the Windows task manager shows the memory usage for this process as 50 MB. It looks like the garbage collector or the memory manager for Python is not calculating the free memory correctly. There should be a lot of free memory, but it thinks there is not enough.
Any idea?
EDIT
Adding my code here. I will post parts of it; I am new to Python, so please pardon my coding style.
Module 1
def generateFile(self, current_date):
    try:
        print "*** %s ***" % current_date.strftime("%Y-%m-%d")
        weekday = current_date.weekday()
        if weekday >= 5:
            print "skipping weekend"
            return
        self.taqdb = taqDB(self.index, self.offset)
        cache_filename = os.path.join(self.cache_dir, current_date.strftime("%Y.%m.%d.h5"))
        outputFile = config.hdf5.filePath(self.index, date=current_date)
        print "cache file: ", cache_filename
        print "output file: ", outputFile
        tempdir = "C:\\rfadump\\" + self.region + "\\"
        input_filename = tempdir + filename
        print "source file extracted at %s " % input_filename
        ## universe
        reader = rfaTextToTAQ.rfaTextToTAQ(self.tickobj)  ## PARSER
        count = 0
        self.rawfile = open(input_filename, 'r')
        lines = self.rawfile.read().split('|\n')
        total_lines = len(lines)
        self.rawfile.close()
        del self.rawfile
        print "parsing started"
        start_time = dt.datetime.now()
        print "current time: %s" % start_time
        #while(len(lines) > 0):
        while(count < total_lines):
            #line = lines.pop(0)  ## This slows down processing
            result = reader.parseline(lines[count] + "|")
            count += 1
            if(count % 500000 == 0):
                print "%d lines parsed" % (count)
            if(result == None):
                continue
            ric, timestamp, quotes, trades, levelsUpdated, tradeupdate = result
            if(len(levelsUpdated) == 0 and tradeupdate == False):
                continue
            self.taqdb.insert(result)
        ## write to hdf5 TODO
        writer = h5Writer.h5Writer(cache_filename, self.tickobj)
        writer.write(self.taqdb.groups)
        writer.close()
        del lines
        del self.taqdb, self.tickobj
        ##########################################################
        print "parsing done."
        end_time = dt.datetime.now()
        print "end time is %s" % end_time
        print "total time elapsed %s" % (end_time - start_time)
        defragger = hdf.HDF5Defragmenter()
        defragger.Defrag(cache_filename, outputFile)
        del defragger
        print "done"
        gc.collect(2)
    except:
        print "caught an exception while generating file for day %s." % current_date.strftime("%Y-%m-%d")
        tb = traceback.format_exc()
        print tb
Module 2 - taqdb (stores parsed data in an array)
class taqDB:
    def __init__(self, index, offset):
        self.index = index
        self.tickcfg = config.hdf5.getTickConfig(index)
        self.offset = offset
        self.groups = {}

    def getGroup(self, ric):
        if (self.groups.has_key(ric) == False):
            self.groups[ric] = {}
        return self.groups[ric]

    def getOrderbookArray(self, ric, group):
        datasetname = orderBookName
        prodtype = self.tickcfg.getProdType(ric)
        if(prodtype == ProdType.INDEX):
            return
        orderbookArrayShape = self.tickcfg.getOrderBookArrayShape(prodtype)
        if(group.has_key(datasetname) == False):
            group[datasetname] = array.array("d")
            orderbookArray = self.tickcfg.getOrderBookArray(prodtype)
            return orderbookArray
        else:
            orderbookArray = group[datasetname]
            if(len(orderbookArray) == 0):
                return self.tickcfg.getOrderBookArray(prodtype)
            lastOrderbook = orderbookArray[-orderbookArrayShape[1]:]
            return np.array([lastOrderbook])

    def addToDataset(self, group, datasetname, timestamp, arr):
        if(group.has_key(datasetname) == False):
            group[datasetname] = array.array("d")
        arr[0, 0] = timestamp
        a1 = group[datasetname]
        a1.extend(arr[0])

    def addToOrderBook(self, group, timestamp, arr):
        self.addToDataset(group, orderBookName, timestamp, arr)  # fixed: `self` was passed as an extra first argument

    def insert(self, data):
        ric, timestamp, quotes, trades, levelsUpdated, tradeupdate = data
        delta = dt.timedelta(hours=timestamp.hour, minutes=timestamp.minute, seconds=timestamp.second, microseconds=(timestamp.microsecond / 1000))
        timestamp = float(str(delta.seconds) + '.' + str(delta.microseconds)) + self.offset
        ## write to array
        group = self.getGroup(ric)
        orderbookUpdate = False
        orderbookArray = self.getOrderbookArray(ric, group)
        nonzero = quotes.nonzero()
        orderbookArray[nonzero] = quotes[nonzero]
        if(np.any(nonzero)):
            self.addToDataset(group, orderBookName, timestamp, orderbookArray)
        if(tradeupdate == True):
            self.addToDataset(group, tradeName, timestamp, trades)
Module 3 - Parser
class rfaTextToTAQ:
    """RFA raw dump file reader. Reads a single line (record) and returns an array of fid/value pairs."""
    def __init__(self, tickconfig):
        self.tickconfig = tickconfig
        self.token = ''
        self.state = ReadState.SEQ_NUM
        self.fvstate = fvstate.FID
        self.quotes = np.array([])  # read from tickconfig
        self.trades = np.array([])  # read from tickconfig
        self.prodtype = ProdType.STOCK
        self.allquotes = {}
        self.alltrades = {}
        self.acvol = 0
        self.levelsUpdated = []
        self.quoteUpdate = False
        self.tradeUpdate = False
        self.depth = 0

    def updateLevel(self, index):
        if(self.levelsUpdated.__contains__(index) == False):
            self.levelsUpdated.append(index)

    def updateQuote(self, fidindex, field):
        self.value = float(self.value)
        if(self.depth == 1):
            index = fidindex[0] + (len(self.tickconfig.stkQuotes) * (self.depth - 1))
            self.quotes[index[0]][fidindex[1][0]] = self.value
            self.updateLevel(index[0])
        else:
            self.quotes[fidindex] = self.value
            self.updateLevel(fidindex[0][0])
        self.quoteUpdate = True

    def updateTrade(self, fidindex, field):
        #self.value = float(self.value)
        if(self.tickconfig.tradeUpdate(self.depth) == False):
            return
        newacvol = float(self.value)
        if(field == acvol):
            if(self.value > self.acvol):
                tradesize = newacvol - self.acvol
                self.acvol = newacvol
                self.trades[fidindex] = tradesize
                if(self.trades.__contains__(0) == False):
                    self.tradeUpdate = True
        else:
            self.trades[fidindex] = self.value
            if(not (self.trades[0, 1] == 0 or self.trades[0, 2] == 0)):
                self.tradeUpdate = True

    def updateResult(self):
        field = ''
        valid, field = field_dict.FIDToField(int(self.fid), field)
        if(valid == False):
            return
        if(self.value == '0'):
            return
        if(self.prodtype == ProdType.STOCK):
            fidindex = np.where(self.tickconfig.stkQuotes == field)
            if(len(fidindex[0]) == 0):
                fidindex = np.where(self.tickconfig.stkTrades == field)
                if(len(fidindex[0]) == 0):
                    return
                else:
                    self.updateTrade(fidindex, field)
            else:
                self.updateQuote(fidindex, field)
        else:
            fidindex = np.where(self.tickconfig.futQuotes == field)
            if(len(fidindex[0]) == 0):
                fidindex = np.where(self.tickconfig.futTrades == field)
                if(len(fidindex[0]) == 0):
                    return
                else:
                    self.updateTrade(fidindex, field)
            else:
                self.updateQuote(fidindex, field)

    def getOrderBookTrade(self):
        if (self.allquotes.has_key(self.ric) == False):
            acvol = 0
            self.allquotes[self.ric] = self.tickconfig.getOrderBookArray(self.prodtype)
            trades = self.tickconfig.getTradesArray()
            self.alltrades[self.ric] = [trades, acvol]
        return self.allquotes[self.ric], self.alltrades[self.ric]

    def parseline(self, line):
        self.tradeUpdate = False
        self.levelsUpdated = []
        pos = 0
        length = len(line)
        self.state = ReadState.SEQ_NUM
        self.fvstate = fvstate.FID
        self.token = ''
        ch = ''
        while(pos < length):
            prevChar = ch
            ch = line[pos]
            pos += 1
            # SEQ_NUM
            if(self.state == ReadState.SEQ_NUM):
                if(ch != ','):
                    self.token += ch
                else:
                    self.seq_num = int(self.token)
                    self.state = ReadState.TIMESTAMP
                    self.token = ''
            # TIMESTAMP
            elif(self.state == ReadState.TIMESTAMP):
                if(ch == ' '):
                    self.token = ''
                elif(ch != ','):
                    self.token += ch
                else:
                    if(len(self.token) != 12):
                        print "Invalid timestamp format. %s. skipping line.\n" % self.token
                        self.state = ReadState.SKIPLINE
                    else:
                        self.timestamp = datetime.strptime(self.token, '%H:%M:%S.%f')
                        self.state = ReadState.RIC
                    self.token = ''
            # RIC
            elif(self.state == ReadState.RIC):
                if(ch != ','):
                    self.token += ch
                else:
                    self.ric = self.token
                    self.token = ''
                    self.ric, self.depth = self.tickconfig.replaceRic(self.ric)
                    self.prodtype = self.tickconfig.getProdType(self.ric)
                    if(self.tickconfig.subscribed(self.ric)):
                        self.state = ReadState.UPDATE_TYPE
                        self.quotes, trades = self.getOrderBookTrade()
                        self.trades = trades[0]
                        self.acvol = trades[1]
                    else:
                        self.state = ReadState.SKIPLINE
            # UPDATE_TYPE
            elif(self.state == ReadState.UPDATE_TYPE):
                if(ch != '|'):
                    self.token += ch
                else:
                    self.update_type = self.token
                    self.token = ''
                    self.state = ReadState.FVPAIRS
            # SKIPLINE
            elif(self.state == ReadState.SKIPLINE):
                return None
            # FV PAIRS
            elif(self.state == ReadState.FVPAIRS):
                # FID
                if(self.fvstate == fvstate.FID):
                    if(ch != ','):
                        if(ch.isdigit() == False):
                            self.token = self.value + ch
                            self.fvstate = fvstate.FIDVALUE
                            self.state = ReadState.FVPAIRS
                        else:
                            self.token += ch
                    else:
                        self.fid = self.token
                        self.token = ''
                        self.fvstate = fvstate.FIDVALUE
                        self.state = ReadState.FVPAIRS
                # FIDVALUE
                elif(self.fvstate == fvstate.FIDVALUE):
                    if(ch != '|'):
                        self.token += ch
                    else:
                        self.value = self.token
                        self.token = ''
                        self.state = ReadState.FVPAIRS
                        self.fvstate = fvstate.FID
                        # TODO set value
                        self.updateResult()
        return self.ric, self.timestamp, self.quotes, self.trades, self.levelsUpdated, self.tradeUpdate
Thanks.

The only reliable way to free memory is to terminate the process.
So, if your main program spawns a worker process to do most of the work (the stuff that is done in one day), then when that worker process completes, the memory it used will be freed:
import multiprocessing as mp

def work(date):
    # Do most of the memory-intensive work here
    ...

while single_date <= self.end_date:
    proc = mp.Process(target=work, args=(single_date,))
    proc.start()
    proc.join()  # when the worker exits, all of its memory goes back to the OS
    single_date = single_date + dt.timedelta(days=1)  # advance to the next day, as in the question's loop
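If spawning one process per day feels heavy, the standard library's multiprocessing.Pool also accepts a maxtasksperchild parameter that retires each worker after a fixed number of tasks, releasing its memory the same way. A minimal sketch, assuming dates holds the list of days to process:

import multiprocessing as mp

def work(date):
    # Do the memory-intensive work for one day here, as above
    pass

if __name__ == '__main__':
    # Each worker is recycled after a single task, so its memory is
    # returned to the OS before the next date is processed.
    pool = mp.Pool(processes=1, maxtasksperchild=1)
    pool.map(work, dates)
    pool.close()
    pool.join()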

Related

Why does it say "the action cannot be completed because the file is open" in Python?

def main_loop():
    global errname, errtime, error_detail, conclusion
    error_detail = ""
    facts_all = {}
    facts = []
    buffer = 0
    current_time = datetime.now()
    while os.path.exists("C:\Winusr"):
        print(paths["wintrace"])
        try:
            start_point = 0
            old_size = os.path.getsize(paths["wintrace"])
            while os.path.getsize(paths["wintrace"]) >= old_size:
                #fo = open(paths["wintrace"], "rb")
                #fo.seek(start_point,1)
                shutil.copyfile(paths["wintrace"], "C:\Winusr\wintrace1.log")
                fo = open("C:\Winusr\wintrace1.log", "rb")
                fo.seek(start_point, 1)
                errtime = datetime(1900, 1, 1)
                old_size = os.path.getsize(paths["wintrace"])
                # start from here
                for line in fo.readlines():
                    line = str(line.decode('ISO-8859-1'))
                    print(line)
                    if fnmatch.fnmatch(line, "*START DUMP LOG BUFFER*"):
                        buffer = 1
                    if fnmatch.fnmatch(line, "*END DUMP LOG BUFFER*"):
                        buffer = 0
                    if buffer == 1:
                        continue
                    facts_all = collect_facts(line, facts_all, key_string, key_value_dic)
                    for pattern in error_detect:
                        if fnmatch.fnmatch(line, pattern):
                            try:
                                err_type = df[df["Error Detect Keyword"] == pattern]["Err Type"].to_string(index=False).lstrip()
                                errname = df[df["Err Type"] == err_type]["Error Name"].tolist()[0].lstrip()
                                errtime = datetime.strptime(
                                    datetime.fromtimestamp(os.path.getmtime(paths["wintrace"])).strftime("%Y-%m-%d") + " " + line[:8],
                                    "%Y-%m-%d %H:%M:%S")  # "%d-%b-%Y %H:%M:%S"
                                #errtime = datetime.fromtimestamp(os.path.getmtime(paths["wintrace"])).strftime("%Y-%m-%d") + " " + line[:8]
                                #errtime = errtime.strftime('%Y-%m-%d %H:%M:%S')
                                product = re.findall(r"[/](.+?)[.]", paths["cur"])
                                product = product[0].split("/")[-1]
                                tester = tester_name(paths["cur"])
                                if len(facts_all) != 0:
                                    facts.append(errname)
                                    #idex = 9999
                                    for fact, line in facts_all.items():
                                        if fact in dic1[errname]:
                                            error_detail = error_detail + line + '\n'
                                            facts.append(fact)
                                    print("err_detail1", error_detail)
                                if len(facts) != 1:
                                    facts = list(set(facts))
                                    conclusion = inference_engine(facts)
                                    print("errtime", errtime)
                                    print("current_time", current_time)
                                    if conclusion != "cannot find solution for this error" and errtime > current_time:
                                        solutions = sop2(errlist, errname, conclusion)
                                        row = recording(tester, product, errname, errtime, error_detail, conclusion)
                                        print("gg pop out GUI!!!")
                                        #send_email(errname, errtime, tester, error_detail)
                                        GUI(errname, errtime, error_detail, conclusion, solutions, row)
                                        current_time = datetime.now()
                                        workbook = xlrd.open_workbook(r"G:\expert system data\Machine Database.xls")
                                        workbook1 = copy(workbook)
                                        ws1 = workbook1.get_sheet(0)
                                        style = xlwt.XFStyle()
                                        style.num_format_str = 'yyyy-mm-dd hh:mm:ss'
                                        ws1.write(row, 8, current_time, style)
                                        workbook1.save(r"G:\expert system data\Machine Database.xls")
                                        error_detail = ""
                                        facts_all = {}
                                        facts = []
                                error_detail = ""
                                facts_all = {}
                                facts = []
                            except:
                                continue
                start_point = fo.tell()
                fo.close()
        except:
            continue
    else:
        main_loop()
paths["wintrace"] is "C:\Winusr\Wintrace.log". I don't want it to be held open, because sometimes I need to rename or delete it, so I copy the file and open the copy instead; but Windows still reports that the original is open. Can you help me check where it gets opened? Besides, I use "filepath = tkinter.filedialog.askopenfilename()", but I don't think that opens the wintrace file. (See the error screenshot.)
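One observation, hedged rather than a confirmed diagnosis: in the loop above, fo.close() is only reached at the end of the inner while, and the bare except: continue clauses can skip it entirely, leaking an open handle on the copied log. A context manager guarantees the handle is released even when an exception is raised. A minimal sketch (read_new_lines is a hypothetical helper, not part of the original code):

import shutil

def read_new_lines(src, copy_path, start_point):
    # Copy the live log, then read the copy from the saved offset.
    # The `with` block closes the handle even if parsing raises.
    shutil.copyfile(src, copy_path)
    with open(copy_path, "rb") as fo:
        fo.seek(start_point)
        lines = [raw.decode("ISO-8859-1") for raw in fo.readlines()]
        return lines, fo.tell()

# Hypothetical usage mirroring the loop above:
# lines, start_point = read_new_lines(paths["wintrace"], r"C:\Winusr\wintrace1.log", start_point)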

Fastest way to check a .rar file password - other than rarfile extractall

So I have written a little .rar password "cracker" based on tutorials, using the code underneath. It works fine, but it is very slow when the file size is big. The best explanation I could find is that every time you try a wrong password, the whole file is extracted before the password is refused. With small files that is not a big problem, but with big files it slows the process down a lot.
Is there a way to just check a hashed version of the password against an iterated hash?
import itertools
import rarfile
import time

rarfile.UNRAR_TOOL = "path"
rar = rarfile.RarFile("path")
done = False
n = 0
inputList = ["A", "B", "1", "2"]

class h():
    startword = ""
    rep = 1
    start = 0
    itrTot = 0

f = open("save.txt")
for x, each in enumerate(f):
    if x == 0:
        h.rep = int(each)
    else:
        h.start = int(each) - 3
f.close()
if h.start < 0:
    h.start = 0
h.itrTot = len(inputList)**h.rep

def pw_guess():
    res = itertools.product(inputList, repeat=h.rep)
    for guess in res:
        yield guess

start_time = time.time()
while True:
    guess_generator = pw_guess()
    for guess in guess_generator:
        n += 1
        if h.startword == "":
            h.startword = guess
        else:
            if guess == h.startword:
                h.rep += 1
                n = 0
                h.itrTot = len(inputList)**h.rep
                h.start = 0
                print("next rotation, itr rep: " + str(h.rep))
                h.startword = ""
                break
        if n < h.start:
            continue
        txt = f"({n}/{h.itrTot}, {round((100/h.itrTot)*n,2)}%) - {h.rep}: {''.join(guess)}"
        print(txt)
        try:
            rar.extractall(path="path", members=None, pwd=''.join(guess))
            print("Pass found!")
            print(str(n) + " - " + str(h.rep) + ": " + str(''.join(guess)))
            done = True
            txt2 = f"({n}/{h.itrTot}, {round((100/h.itrTot)*n,2)}%) - {h.rep}: {''.join(guess)}\n"
            f = open("pass.txt", "a")
            f.write(txt2)
            f.close()
            break
        except:
            f = open("save.txt", "w")
            f.write(str(h.rep) + "\n" + str(n))
            f.close()
    if done:
        end_time = time.time()
        break
print("End time: " + str(end_time - start_time))
John the Ripper is the answer: 20k+ passwords checked in 2 minutes. But using parts of the script for wordlist generation is still very fast and functional.
The wordlist generator I used:
import itertools

inputList = ["A", "B", "C", "D", "1", "2", "3", "4", "5"]
itr = 7
WL_path = "path"
f = open(WL_path, "w")
f.write("")
f.close()

class h():
    startword = ""
    rep = 1
    itrTot = 0
    txt = ""

h.itrTot = len(inputList)**itr
print("Wordlist length: " + str(h.itrTot))

def pw_guess():
    res = itertools.product(inputList, repeat=h.rep)
    for guess in res:
        yield guess

while True:
    guess_generator = pw_guess()
    for guess in guess_generator:
        if h.startword == "":
            h.startword = guess
        else:
            if guess == h.startword:
                h.rep += 1
                print("next rotation, itr rep: " + str(h.rep))
                h.startword = ""
                break
        h.txt = ''.join(guess)
        f = open(WL_path, "a")
        f.write(h.txt + "\n")
        f.close()
    if h.rep > itr:
        break
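On the original question of avoiding a full extractall() per guess: rarfile can also read a single member, so checking the smallest file in the archive should reject a wrong password without unpacking everything. A minimal sketch under that assumption:

import rarfile

rar = rarfile.RarFile("path")  # same archive as above
# Pick the smallest member so each attempt decrypts as little data as possible.
smallest = min(rar.infolist(), key=lambda info: info.file_size)

def try_password(pwd):
    try:
        with rar.open(smallest, pwd=pwd) as member:
            member.read()
        return True   # read succeeded, so the password is very likely correct
    except rarfile.Error:
        return False  # wrong password (or a damaged archive)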

Python to Quickly Change 300K Lines in an 8KK-Line File

I am a chip test engineer, and I have one big text file of about 8KK lines, where most lines include '='. I also have a log file of about 300K lines, each line recording a test failure. I need to change the corresponding 300K lines of the original file.
Currently it takes about 15 hours to finish the job.
I have an existing solution, but it is too slow.
In the code, parse_log processes the log file to work out each modification to be made, and stil_parse does the following:
reads the file into a list in memory;
iterates over the file, modifying each line in the list if it appears in the log file;
writes the list back to disk.
class MaskStil:
    def __init__(self):
        self.log_signal_file = ''
        self.pattern = r"^([^:]+)(:)(\d+)(\s+)(\d+)(\s+)(\d+)(\s+)(\d+)(\s)([.LH]+)$"
        self.log_signal = {}
        self.log_lines = []
        self.mask_dict = {}
        self.stil_name_new = ''
        self.stil_name = ''
        self.signal_all = {}
        self.signal_group = []
        self.offset = 0
        self.mask_mode = -1  # mask_mode 0: revert between L/H; mask_mode 1: mask L/H to Z
        self.convert_value = [{"L": "H", "H": "L"}, {"L": "Z", "H": "Z"}]
        for i in range(100):
            self.log_signal[i] = ''

    def digest(self, log_signal, stil_file, signal_group, offset, mask_mode=1):
        self.log_signal_file = log_signal
        self.stil_name = stil_file
        self.stil_name_new = stil_file[:-5] + '_mask.stil'
        self.signal_group = signal_group.replace('=', '+').strip().split('+')
        self.offset = offset
        self.mask_mode = mask_mode
        for i in range(1, len(self.signal_group)):
            self.signal_all[self.signal_group[i]] = (i - 1) / 10 + i
        print(self.signal_all)
        self.parse_log()
        self.stil_parse()

    def parse_log(self):
        with open(self.log_signal_file) as infile:
            line_num = 0
            blank_line = 0
            for line in infile:
                line_num += 1
                if line_num == 1:
                    blank_line = line.count(' ')
                if "------------------" in line:
                    break
                for i in range(blank_line, len(line)):
                    self.log_signal[i - blank_line] += line[i]
        for (key, value) in self.log_signal.items():
            self.log_signal[key] = value.rstrip()
        print(self.log_signal)
        with open(self.log_signal_file) as log_in:
            self.log_lines = log_in.read().splitlines()
        for line in self.log_lines:
            if re.match(self.pattern, line):
                match = re.match(self.pattern, line)
                cycle = int(match.group(9))
                signals = match.group(11)
                # print cycle, signals
                self.mask_dict[cycle] = {}
                for i in range(len(signals)):
                    if signals[i] != '.':
                        self.mask_dict[cycle][i] = signals[i]

    def stil_parse(self):
        cycle_keys = []
        vector_num = 0
        for i in self.mask_dict.keys():
            cycle_keys.append(i)
        with open(self.stil_name, 'r') as stil_in:
            stil_in_list = stil_in.read().splitlines()
        total_len = len(stil_in_list)
        vector_cycle_dict = {}
        with tqdm(total=total_len, ncols=100, desc=" Stil Scanning in RAM Progress") as pbar:
            for i_iter in range(total_len):
                line = stil_in_list[i_iter]
                pbar.update(1)
                if "=" in line:
                    vector_num += 1
                    if (vector_num in cycle_keys):
                        vector_cycle_dict[vector_num] = i_iter
                    status = line[line.find("=") + 1:line.find(";")]
                    # if cycle + self.offset in cycle_keys:
                    if vector_num in cycle_keys:
                        match = 1
                        for (i, j) in self.mask_dict[vector_num].iteritems():
                            mask_point = i
                            mask_signal = self.log_signal[i]
                            mask_value = j
                            test_point = self.signal_all[mask_signal]
                            test_value = status[test_point]
                            if test_value != mask_value:
                                print("data did not match for cycle: ", test_value, " VS ", line, j, vector_num, mask_point, mask_signal, test_point, test_value)
                                match = 0
                                raise NameError
                            else:
                                status = status[:test_point] + self.convert_value[self.mask_mode][test_value] + status[test_point + 1:]
                        if match == 1:
                            replace_line = line[:line.find("=") + 1] + status + line[line.find(";"):]
                            print("data change from :", line)
                            print(" to:", replace_line)
                            stil_in_list[i_iter] = replace_line
                        else:
                            print("No matching for %d with %s" % (vector_num, line))
                            raise NameError
        with tqdm(total=len(stil_in_list), ncols=100, desc=" Masked-stil to disk Progress") as pbar:
            with open(self.stil_name_new, 'w') as stil_out:
                for new_line in range(len(stil_in_list)):
                    pbar.update(1)
                    stil_out.write(stil_in_list[new_line] + "\n")  # write the line content, not the loop index
I was expecting a solution that could finish in about 1 or 2 hours.
As I mentioned in the comments, you can get some speedup by refactoring your code to be multithreaded or multiprocess.
I imagine you're also running into memory swapping issues here. If that's the case, this should help:
with open(self.log_signal_file) as log_in:
    line = log_in.readline()  # First line. Need logic to handle empty logs
    while line:  # Will return false at EOF
        if re.match(self.pattern, line):
            match = re.match(self.pattern, line)
            cycle = int(match.group(9))
            signals = match.group(11)
            # print cycle, signals
            self.mask_dict[cycle] = {}
            for i in range(len(signals)):
                if signals[i] != '.':
                    self.mask_dict[cycle][i] = signals[i]
        line = log_in.readline()
Here we only read in one line at a time, so you don't have to hold 8KK lines in memory.
*In case anyone else didn't know: KK means million, apparently.
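On the multithreaded/multiprocess suggestion, a minimal sketch with multiprocessing.Pool, assuming each line can be transformed independently once the modifications have been worked out (transform_line is a hypothetical stand-in for the per-line masking, and the file names are placeholders):

import multiprocessing as mp

def transform_line(line):
    # Hypothetical per-line transform: apply the precomputed masking
    # rules to one line and return the (possibly modified) line.
    return line

if __name__ == "__main__":
    with open("big_input.stil") as fin:
        lines = fin.read().splitlines()
    with mp.Pool() as pool:
        # A large chunksize keeps IPC overhead low on multi-million-line inputs.
        masked = pool.map(transform_line, lines, chunksize=10000)
    with open("big_output.stil", "w") as fout:
        fout.write("\n".join(masked))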
I managed to optimize the solution, and the time consumed dropped tremendously, to about 1 minute.
The optimization is mainly in the following areas:
instead of repeatedly checking if (vector_num in cycle_keys):, I use an ordered list and always compare against the entry at index_to_mask;
the results of line.find("=") and line.find(";") are kept in the variables line_find_equal and line_find_coma for reuse.
class MaskStil:
    def __init__(self):
        self.log_signal_file = ''
        self.pattern = r"^([^:]+)(:)(\d+)(\s+)(\d+)(\s+)(\d+)(\s+)(\d+)(\s)([.LH]+)$"
        self.log_signal = {}
        self.log_lines = []
        self.mask_dict = {}
        self.stil_name_new = ''
        self.stil_name = ''
        self.signal_all = {}
        self.signal_group = []
        self.offset = 0
        self.mask_mode = -1  # mask_mode 0: revert between L/H; mask_mode 1: mask L/H to Z
        self.convert_value = [{"L": "H", "H": "L"}, {"L": "Z", "H": "Z"}]
        for i in range(100):
            self.log_signal[i] = ''

    def digest(self, log_signal, stil_file, signal_group, offset, mask_mode=1):
        self.log_signal_file = log_signal
        self.stil_name = stil_file
        self.stil_name_new = stil_file[:-5] + '_mask.stil'
        self.signal_group = signal_group.replace('=', '+').strip().split('+')
        self.offset = offset
        self.mask_mode = mask_mode
        for i in range(1, len(self.signal_group)):
            self.signal_all[self.signal_group[i]] = int(math.floor((i - 1) / 10) + i)
        print(self.signal_all)
        self.parse_log()
        self.stil_parse()

    def parse_log(self):
        with open(self.log_signal_file) as infile:
            line_num = 0
            blank_line = 0
            for line in infile:
                line_num += 1
                if line_num == 1:
                    blank_line = line.count(' ')
                if "------------------" in line:
                    break
                for i in range(blank_line, len(line)):
                    self.log_signal[i - blank_line] += line[i]
        for (key, value) in self.log_signal.items():
            self.log_signal[key] = value.rstrip()
        print(self.log_signal)
        with open(self.log_signal_file) as log_in:
            self.log_lines = log_in.read().splitlines()
        for line in self.log_lines:
            if re.match(self.pattern, line):
                match = re.match(self.pattern, line)
                cycle = int(match.group(9))
                signals = match.group(11)
                # print cycle, signals
                self.mask_dict[cycle] = {}
                for i in range(len(signals)):
                    if signals[i] != '.':
                        self.mask_dict[cycle][i] = signals[i]

    def stil_parse(self):
        cycle_keys = []
        vector_num = 0
        for i in self.mask_dict.keys():
            cycle_keys.append(i)
        with open(self.stil_name, 'r') as stil_in:
            stil_in_list = stil_in.read().splitlines()
        total_len = len(stil_in_list)
        index_to_mask = 0
        with tqdm(total=total_len, ncols=100, desc=" Stil Scanning in RAM Progress") as pbar:
            for i_iter in range(total_len):
                line = stil_in_list[i_iter]
                pbar.update(1)
                if "=" in line:
                    vector_num += 1
                    if (vector_num <= cycle_keys[-1]):
                        if (vector_num == cycle_keys[index_to_mask]):
                            line_find_equal = line.find("=")
                            line_find_coma = line.find(";")
                            status = line[line_find_equal + 1:line_find_coma]
                            # if cycle + self.offset in cycle_keys:
                            try:
                                match = 1
                                for (i, j) in self.mask_dict[vector_num].items():
                                    mask_point = i
                                    mask_signal = self.log_signal[i]
                                    mask_value = j
                                    test_point = self.signal_all[mask_signal]
                                    test_value = status[test_point]
                                    if test_value != mask_value:
                                        print("data did not match for cycle: ", test_value, " VS ", line, j, vector_num, mask_point, mask_signal, test_point, test_value)
                                        match = 0
                                        raise NameError
                                    else:
                                        status = status[:test_point] + self.convert_value[self.mask_mode][test_value] + status[test_point + 1:]
                                stil_in_list[i_iter] = line[:line_find_equal + 1] + status + line[line_find_coma:]
                                # print("data change from :", line)
                                # print(" to:", stil_in_list[i_iter])
                                index_to_mask = index_to_mask + 1
                            except (Exception) as e:
                                print("No matching for %d with %s" % (vector_num, line))
                                raise NameError
        with tqdm(total=len(stil_in_list), ncols=100, desc=" Masked-stil to disk Progress") as pbar:
            with open(self.stil_name_new, 'w') as stil_out:
                for i_iter in range(len(stil_in_list)):
                    pbar.update(1)
                    stil_out.write(stil_in_list[i_iter] + "\n")
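The core trick in isolation, as a sketch: because the vectors are visited in increasing cycle order, one moving index over a sorted key list replaces the O(n) membership test (mask_cycles, vectors, and apply_mask are hypothetical placeholders):

cycle_keys = sorted(mask_cycles)  # cycles that need masking, e.g. [3, 17, 240]
index_to_mask = 0

for vector_num, line in enumerate(vectors, start=1):
    if index_to_mask >= len(cycle_keys):
        break  # nothing left to mask
    if vector_num == cycle_keys[index_to_mask]:
        apply_mask(line)    # per-line masking step
        index_to_mask += 1  # O(1), instead of `vector_num in cycle_keys`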

Python - Comparing WindowName with a List

I am currently blocked on one point of a program in Python.
I want to compare the WindowName of an event against a list, in order to launch directives.
Example:
import win32api
import pyHook

liste = ["Google", "Task"]
if event.WindowName == liste:
    Screenshot()
    return True
else:
    return False
The complete code, which works:
def OnMouseEvent(event):
    global interval
    data = '\n[' + str(time.ctime().split(' ')[3]) + ']' \
        + ' WindowName : ' + str(event.WindowName)
    data += '\n\tButton:' + str(event.MessageName)
    data += '\n\tClicked in (Position):' + str(event.Position)
    data += '\n===================='
    global t, start_time, pics_names
    """
    Code Edit
    """
    t = t + data
    if len(t) > 300:
        ScreenShot()
    """
    Finish
    """
    if len(t) > 500:
        f = open('Logfile.txt', 'a')
        f.write(t)
        f.close()
        t = ''
    if int(time.time() - start_time) == int(interval):
        Mail_it(t, pics_names)
        start_time = time.time()
        t = ''
        return True
    else:
        return False
When I edit the code between the """ markers, it doesn't work:
t = t + data
liste = ["Google", "Task"]
if event.WindowName == liste:
    ScreenShot()
It returns:
File "C:\Python26\lib\site-packages\pyHook\HookManager.py", line 324, in MouseSwitch
    func = self.mouse_funcs.get(msg)
TypeError: an integer is required
I tried this:
HookManager: func = self.keyboard_funcs.get(msg) to: func = self.keyboard_funcs.get(int(str(msg)))
But it doesn't work; I think I have noted the whole problem.
Thanks for your help in advance :)
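For what it's worth, a hedged guess at the fix rather than a confirmed one: event.WindowName == liste compares a string against a whole list, so it is always False, and a pyHook callback that falls through without a return statement returns None, which triggers exactly this "an integer is required" TypeError inside MouseSwitch. A sketch of the edit with a membership test and an unconditional return:

liste = ["Google", "Task"]

def OnMouseEvent(event):
    window_name = str(event.WindowName)
    # Substring membership test instead of comparing a string to a list.
    if any(name in window_name for name in liste):
        ScreenShot()
    return True  # always hand the event back to the hook chain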

Trouble with recursive text splitting

I am trying to split text at boundary markers defined in the text itself, using recursion, and to build a list of lists and strings containing all of the organized parts of the original text file.
The split isn't happening.
Here is the short version - the real problem script:
def separate(text, boundary=None):
    if boundary == None:
        m = re.findall(r'(?<=boundary=).*', text)
    i = 0
    while i < len(m):  # have all levels of Boundary/headers named
        boundary = m[i]
        textList = recursiveSplit(text, boundary)
        i += 1
    pdb.set_trace()
    return textList

def recursiveSplit(chunk, boundary):
    if type(chunk) is types.StringType:
        ar = re.split(r'(?P<boundary>)(?!--)', chunk)
        return ar
    if type(chunk) is types.ListType:
        i = 0
        while i < len(chunk):
            chunk[i] = recursiveSplit(chunk[i], boundary)
            i += 1
        return obj
I've posted this script before, and people wanted me to post it in its entirety, so I'll do that:
# Textbasics email parser
# based on a "show original" file converted into text
from sys import argv
import re, os, pdb, types

script, filename = argv
text = open(filename).read()
type = "text only"  # Set the default type of email

# cut the email up by sections
# --A section is defined as any time there are two line breaks in a row
textList = re.split(r"\n\n", text)
header = textList[0]
if re.search(r'MIME-Version', header):
    type = "MIME"

# If mail has no attachments, parse as a text-only email
class Parser(object):
    def __init__(self, textList):
        a = 1
        self.body = ""
        self.textList = textList
        self.header = textList[0]
        while a < len(textList):
            self.body = self.body + textList[a] + '\n\n'
            a += 1
        m = re.search(r'(?<=Subject: ).*', self.header)
        self.subject = m.group(0)
        m = re.search(r'(?<=From: ).*', self.header)
        self.fromVar = m.group(0)
        m = re.search(r'(?<=To: ).*', self.header)
        self.toVar = m.group(0)
        m = re.search(r'(?<=Date: )\w+\s\w+\s\w+', self.header)
        self.date = m.group(0)

    def returnParsed(self, descriptor="all"):
        if descriptor == "all":
            retv = "Subject: " + self.subject + "\n" + "From: " + self.fromVar + "\n" + "To: " + self.toVar + "\n" + "Date: " + self.date + "\n" + "\n" + self.body
            return retv
        if descriptor == "subject":
            return self.subject
        if descriptor == "fromVar":
            return self.fromVar
        if descriptor == "toVar":
            return self.toVar
        if descriptor == "date":
            return self.date
        if descriptor == "body":
            return self.body

class MIMEParser(Parser):
    class MIMEDataDecoder(object):
        def __init__(self, decodeString, type):
            pass

    def __init__(self, textList):
        self.textList = textList
        self.nestedItems = []
        newItem = NestedItem(self)
        newItem.setContentType("Header")
        newItem.setValue(self.textList[0])
        self.nestedItems.append(newItem)
        if re.search(r'(boundary=)', newItem.value):
            helperItem = NestedItem(self)
            helperItem.value = (self.textList[0])
            m = re.search(r'(?<=Content-Type: ).+(?=;)', newItem.value)
            helperItem.setContentType(m.group(0))
            self.nestedItems.append(helperItem)
        self.organizeData()
        """i = 0
        while i < len(self.textList):
            newItem = NestedItem(self)
            ct = self.nextContentType
            newItem.setContentType(ct)
            newItem.setValue(self.textList[i])
            self.nestedItems.append(newItem)
            m = re.search(r'(?<=Content-Type: ).+(?=;)',self.textList[i])
            if m:
                self.nextContentType = m.group(0)
            i += 1
        """

    def nestItem(self, item):
        self.nestedItems.append(item)

    def organizeData(self):
        self.nestLevel = 1
        self.currentSuper = self
        m = re.search(r'(?<=boundary=).*', self.textList[0])
        self.currentBoundary = m.group(0)
        self.currentList = self.textList
        self.currentList.remove(self.textList[0])
        self.formerObjectDatabase = {}
        pdb.set_trace()
        while self.nestLevel > 0:
            i = 0
            while i < len(self.currentList):
                boundary = self.currentBoundary
                # If block is a "normal block", containing a current boundary identifier
                p = re.search(r'--(?P<boundary>)(?!--)', text)
                if p:
                    newItem = NestedItem(self.currentSuper)
                    newItem.setValue(self.currentList[i])
                    r = re.search(r'(?<=Content-Type: ).+(?=;)', newItem.value)
                    if r:
                        newItem.setContentType(r.group(0))
                    self.currentObject = newItem
                    self.currentSuper.nestItem(self.currentObject)
                # If the block contains a new block boundary
                m = re.search(r'(?<=boundary=).*', self.currentList[i])
                if m:
                    # begin new layer of recursive commands
                    newFormerObject = self.FormerCurrentObject(self.currentList, self.currentSuper, self.currentBoundary)
                    self.formerObjectDatabase[self.nestLevel] = newFormerObject
                    self.currentSuper = self.currentObject
                    self.nestLevel += 1
                    self.currentBoundary = m.group(0)
                    boundary = self.currentBoundary
                    #self.currentList = re.split(r'--(?P<boundary>)(?!--)', self.currentList[i])
                    boundary = self.currentBoundary
                # If block contains an "end of boundary" marker
                q = re.search(r'(?P<boundary>)--', text)
                if q:
                    self.nestLevel -= 1
                    currentObject = self.formerObjectDatabase[self.nestLevel]
                    self.currentList = currentObject.formerList
                    self.currentSuper = currentObject.formerSuper
                    self.currentBoundary = currentObject.formerBoundary
                i += 1

    class FormerCurrentObject:
        def __init__(self, formerList, formerSuper, formerBoundary):
            self.formerList = formerList
            self.formerSuper = formerSuper
            self.formerBoundary = formerBoundary

    def printAll(self):
        print "printing all: %d" % len(self.nestedItems)
        i = 0
        while i < len(self.nestedItems):
            print "printing out item %d" % i
            self.nestedItems[i].printOut()
            i += 1

class NestedItem(object):
    def __init__(self, superObject, contentType=" ", value=" "):
        self.superObject = superObject
        self.contentType = contentType
        self.value = value
        self.nestedItems = []

    def nestItem(self, item):
        self.nestedItems.append(item)

    def printOut(self, printBuffer=""):
        print printBuffer + '++%s' % self.contentType
        print printBuffer + self.value
        a = 0
        printBuffer = printBuffer + " "
        while a < len(self.nestedItems):
            self.nestedItems[a].printOut(printBuffer)
            a += 1

    def setContentType(self, contentType):
        self.contentType = contentType

    def setValue(self, value):
        self.value = value

if type == "text only":
    p = Parser(textList)
    print p.returnParsed()

# ---PROBLEM CODE STARTS HERE---
def separate(text, boundary=None):
    pdb.set_trace()
    if boundary == None:
        m = re.findall(r'(?<=boundary=).*', text)
        i = 0
        textList = [text]
        while i < len(m):  # have all levels of Boundary/headers named
            boundary = m[i]
            textList = recursiveSplit(textList, boundary)
            i += 1
    return textList

def recursiveSplit(chunk, boundary):
    if type(chunk) is types.ListType:  # <<--error occurs here
        for obj in chunk:
            recursiveSplit(obj, boundary)
    if type(chunk) is types.StringType:
        list = re.split(r'(?P<boundary>)(?!--)', chunk)
        return list
    return None
# ---PROBLEM CODE ENDS(?) HERE---

if type == "MIME":
    # separate the text file instead by its boundary identifier
    p = MIMEParser(separate(text))
    p.printAll()
You can use any MIME-type email to run this. Here's the one I've been using, for convenience:
MIME-Version: 1.0
Received: by 10.112.170.40 with HTTP; Fri, 3 May 2013 05:08:21 -0700 (PDT)
Date: Fri, 3 May 2013 08:08:21 -0400
Delivered-To: MYEMAIL#gmail.com
Message-ID: <#mail.gmail.com>
Subject: MiB 5/3/13 7:43AM (EST)
From: ME<MYEMAIL#gmail.com>
To: SOMEONE <SOMEONE#aol.com>
Content-Type: multipart/mixed; boundary=BNDRY1
--BNDRY1
Content-Type: multipart/alternative; boundary=BNDRY2
--BNDRY2
Content-Type: text/plain; charset=ISO-8859-1
-changed signature methods to conform more to working clinic header
methods(please test/not testable in simulator)
-confirmed that signature image is showing up in simulator. Awaiting
further tests
-Modified findings spacing/buffer. See if you like it
--BNDRY2
Content-Type: text/html; charset=ISO-8859-1
<div dir="ltr">-changed signature methods to conform more to working clinic header methods(please test/not testable in simulator)<div style>-confirmed that signature image is showing up in simulator. Awaiting further tests</div>
<div style>-Modified findings spacing/buffer. See if you like it</div></div>
--BNDRY2--
--BNDRY1
Content-Type: application/zip; name="Make it Brief.ipa.zip"
Content-Disposition: attachment; filename="Make it Brief.ipa.zip"
Content-Transfer-Encoding: base64
X-Attachment-Id: f_hg9biuno0
<<FILE DATA>>
--BNDRY1--
The issue was in the regex. There may be a cooler way to do it, but I just created a search-string literal from the variables:
def recursiveSplit(chunk, boundary):
    if type(chunk) is types.StringType:
        #ar = re.split(r'(?P<boundary>)(?!--)',chunk)
        searchString = "--%s" % boundary
        print searchString
        ar = re.split(searchString, chunk)
        return ar
    if type(chunk) is types.ListType:
        i = 0
        while i < len(chunk):
            chunk[i] = recursiveSplit(chunk[i], boundary)
            i += 1
        return chunk  # fixed: the original returned the undefined name `obj`
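One caveat to this approach: if a boundary ever contains regex metacharacters, the interpolated pattern will misbehave; re.escape() guards against that:

searchString = "--%s" % re.escape(boundary)  # escape any regex metacharacters in the boundary
ar = re.split(searchString, chunk)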
