Python - WindowName for Compare with List - python

I am currently stuck on one part of a Python program.
I want to compare the event's WindowName against the entries of a list and run some actions when it matches.
Example:
import win32api
import pyHook
# Window titles that should trigger a screenshot.
liste = ["Google", "Task"]
# NOTE(review): `==` compares WindowName with the whole list object, so this
# branch is only taken if WindowName is itself an equal list. A membership
# test (`event.WindowName in liste`) is presumably what was intended.
if event.WindowName == liste:
Screenshot ()
return True
else:
return False
Complete code, which works:
# Mouse-event callback: appends a text description of the event to the global
# buffer `t`, and depending on the buffer size / elapsed time calls
# ScreenShot(), appends the buffer to Logfile.txt, or passes it to Mail_it().
# NOTE(review): indentation was lost in this listing, so the nesting of the
# statements below (in particular of the final return/else) cannot be
# confirmed from here.
def OnMouseEvent(event):
global interval
# NOTE(review): split(' ')[3] is the HH:MM:SS field of time.ctime() only for
# two-digit days; ctime() pads single-digit days with an extra space, which
# shifts the fields by one.
data = '\n[' + str(time.ctime().split(' ')[3]) + ']' \
+ ' WindowName : ' + str(event.WindowName)
data += '\n\tButton:' + str(event.MessageName)
data += '\n\tClicked in (Position):' + str(event.Position)
data += '\n===================='
global t, start_time, pics_names
# The two bare string literals below are no-op statements used as markers
# for the region the question is about.
"""
Code Edit
"""
t = t + data
if len(t) > 300:
ScreenShot()
"""
Finish
"""
# Flush the buffer to the log file once it exceeds 500 characters.
if len(t) > 500:
f = open('Logfile.txt', 'a')
f.write(t)
f.close()
t = ''
# NOTE(review): this only fires when an event arrives in the exact second at
# which the elapsed time equals `interval`.
if int(time.time() - start_time) == int(interval):
Mail_it(t, pics_names)
start_time = time.time()
t = ''
return True
else:
return False
When I edit the code between the """ markers as follows, it doesn't work:
t = t + data
liste = ["Google", "Task"]
# NOTE(review): a window title compared with `==` against a list is never
# equal to it; `event.WindowName in liste` tests membership instead.
if event.WindowName == liste:
ScreenShot()
It returns:
File "C:\Python26\lib\site-packages\pyHook\HookManager.py", line 324, in MouseSwitch func = self.mouse_funcs.get(msg) TypeError: an integer is required
I tried this change:
In HookManager, I changed func = self.keyboard_funcs.get(msg) to func = self.keyboard_funcs.get( int(str(msg)) )
But it doesn't work. I think I have described the whole problem.
Thanks in advance for your help :)

Related

How to fix KeyError in python --> KeyError: 'message'?

This error did not use to appear, but recently it shows up whenever I run this code, and I have no clue how to fix it.
Please, can anyone help me fix this error in the code below?
import requests
import json
from time import sleep
global OFFSET
OFFSET = 0
botToken = ""
global requestURL
global sendURL
requestURL = "http://api.telegram.org/bot" + botToken + "/getUpdates"
sendURL = "http://api.telegram.org/bot" + botToken + "/sendMessage"
def update(url):
    """Fetch the next pending update from the getUpdates endpoint ``url``.

    The request carries the current global ``OFFSET``; when an update is
    returned, ``OFFSET`` is advanced to ``update_id + 1`` so the same update
    is not requested again.

    Returns the first update dict of the response, or ``False`` when there is
    no new update or the connection failed.
    """
    global OFFSET
    try:
        update_raw = requests.get(url + "?offset=" + str(OFFSET))
    except requests.exceptions.ConnectionError:
        # Return False explicitly: falling through would return None, which
        # slips past a caller's `!= False` check.
        return False
    result = extract_result(update_raw.json())
    if result is False:
        return False
    OFFSET = result['update_id'] + 1
    return result
def extract_result(dict):
    """Return the first entry of ``dict['result']``, or ``False`` if there is none.

    ``dict`` is the decoded JSON body of a getUpdates response. A body without
    a ``'result'`` key is treated like an empty result list instead of raising
    ``KeyError``.
    """
    # The parameter name shadows the builtin; it is kept so that existing
    # callers (including keyword callers) keep working.
    result_array = dict.get('result')
    if not result_array:
        return False
    return result_array[0]
def is_callback(dict):
    """Return ``True`` if the update contains a ``'callback_query'`` key, else ``False``."""
    # Always return a real bool (the previous version returned None for
    # non-callback updates).
    return 'callback_query' in dict
def send_message(chatId, message):
    """POST ``message`` as the text for chat ``chatId`` to ``sendURL``."""
    # Let requests build the query string so that text containing '&', '#',
    # '+', spaces or non-ASCII characters is percent-encoded correctly.
    requests.post(sendURL, params={"chat_id": chatId, "text": message})
def send_message_button(chatId, message, buttonJSON):
    """POST ``message`` with the JSON-encoded keyboard ``buttonJSON`` to ``sendURL``."""
    # The JSON markup contains quotes, braces and possibly newlines; passing
    # it via ``params`` makes requests percent-encode it instead of pasting
    # it raw into the URL.
    requests.post(
        sendURL,
        params={"chat_id": chatId, "reply_markup": buttonJSON, "text": message},
    )
# Poll for updates forever and answer each one.
while True:
    newmessage = update(requestURL)
    # A falsy result (False, or None after a swallowed connection error)
    # means there is nothing to process in this round.
    if newmessage:
        if is_callback(newmessage):
            callback = newmessage['callback_query']
            chat = callback['message']['chat']
            userchatid = chat['id']
            username = chat.get('first_name', '')
            callback_data = callback['data']
            send_message(userchatid, "Callback from " + callback_data + ", pressed by " + username)
        elif 'message' in newmessage:
            message = newmessage['message']
            userchatid = message['chat']['id']
            # Not every message carries text or a first name, so read both
            # with a default instead of indexing.
            usertext = message.get('text', '')
            username = message['chat'].get('first_name', '')
            if usertext.lower() == "button":
                buttonDict1 = {"text": "Knopf\n" + "hitest", "callback_data": "Knopf"}
                buttonDict2 = {"text": "Knopf2", "callback_data": "Knopf2"}
                buttonArr = {"inline_keyboard": [[buttonDict1, buttonDict2]]}
                send_message_button(userchatid, "Hi " + username, json.dumps(buttonArr))
            else:
                send_message(userchatid, "You said: " + usertext)
        # Updates with neither 'callback_query' nor 'message' are skipped;
        # indexing ['message'] on them is what raised KeyError: 'message'.
    sleep(1)
This is the error that appears after I run this bot:
Line: 67
userchatid = newmessage['message']['chat']['id']
KeyError: 'message'
You catch requests.exceptions.ConnectionError but don't handle it (in the update function), so in that case update does not return False — it returns nothing at all (None), which passes your `!= False` check and causes havoc.
Try to deal with the exception, or at least put a print in there to see whether it's the one causing your issue. Good luck!

Why does it say "the action cannot be completed because the file is open in python"?

# Repeatedly copies the wintrace log, scans the copy line by line for the
# patterns in `error_detect`, and for a matching error collects facts, runs
# the inference engine, records the result and shows the GUI.
# NOTE(review): indentation was lost in this listing; comments below describe
# individual statements and hedge wherever nesting matters.
def main_loop():
global errname, errtime, error_detail, conclusion
error_detail = ""
facts_all = {}
facts = []
buffer = 0
current_time = datetime.now()
# NOTE(review): "\W" and "\w" are not recognised escape sequences, so these
# non-raw Windows paths happen to work; raw strings (r"C:\Winusr") would not
# depend on that.
while os.path.exists("C:\Winusr"):
print(paths["wintrace"])
try:
start_point = 0
old_size = os.path.getsize(paths["wintrace"])
while os.path.getsize(paths["wintrace"])>= old_size:
#fo = open(paths["wintrace"], "rb")
#fo.seek(start_point,1)
# Work on a copy so the original trace file is not kept open by this script.
shutil.copyfile(paths["wintrace"], "C:\Winusr\wintrace1.log")
# NOTE(review): `fo` is only closed by the fo.close() further down. An
# exception raised before that point is swallowed by the bare `except` at the
# end and leaves this handle open; a `with` block would close it in all cases.
fo = open("C:\Winusr\wintrace1.log", "rb")
# whence=1 seeks relative to the current position, which is 0 right after open.
fo.seek(start_point, 1)
errtime = datetime(1900, 1, 1)
old_size = os.path.getsize(paths["wintrace"])
#start from here
for line in fo.readlines():
line = str(line.decode('ISO-8859-1'))
print(line)
# Lines between the START/END DUMP LOG BUFFER markers are skipped.
if fnmatch.fnmatch(line, "*START DUMP LOG BUFFER*"):
buffer = 1
if fnmatch.fnmatch(line, "*END DUMP LOG BUFFER*"):
buffer = 0
if buffer == 1:
continue
facts_all = collect_facts(line,facts_all,key_string,key_value_dic)
for pattern in error_detect:
if fnmatch.fnmatch(line, pattern):
try:
err_type = df[df["Error Detect Keyword"] == pattern]["Err Type"].to_string(index=False).lstrip()
errname = df[df["Err Type"] == err_type]["Error Name"].tolist()[0].lstrip()
# Error time = modification date of the trace file + the first 8 characters
# of the line, parsed as HH:MM:SS.
errtime = datetime.strptime(
datetime.fromtimestamp(os.path.getmtime(paths["wintrace"])).strftime("%Y-%m-%d") + " " + line[:8], "%Y-%m-%d %H:%M:%S") #"%d-%b-%Y %H:%M:%S"
#errtime = datetime.fromtimestamp(os.path.getmtime(paths["wintrace"])).strftime("%Y-%m-%d") + " " + line[:8]
#errtime = errtime.strftime('%Y-%m-%d %H:%M:%S')
product = re.findall(r"[/](.+?)[.]", paths["cur"])
product = product[0].split("/")[-1]
tester = tester_name(paths["cur"])
if len(facts_all) != 0:
facts.append(errname)
#idex = 9999
# NOTE(review): this loop rebinds `line`, the variable of the enclosing
# `for line in fo.readlines()` loop.
for fact, line in facts_all.items():
if fact in dic1[errname]:
error_detail = error_detail + line + '\n'
facts.append(fact)
print("err_detail1", error_detail)
if len(facts) != 1:
facts = list(set(facts))
conclusion = inference_engine(facts)
print("errtime", errtime)
print("current_time", current_time)
# Only errors newer than the last handled one are reported.
if conclusion != "cannot find solution for this error" and errtime > current_time:
solutions = sop2(errlist, errname, conclusion)
row = recording(tester, product, errname, errtime, error_detail, conclusion)
print("gg pop out GUI!!!")
#send_email(errname, errtime, tester, error_detail)
GUI(errname, errtime, error_detail, conclusion, solutions, row)
current_time = datetime.now()
# Write the handling time into column 8 of the recorded row of the workbook.
workbook = xlrd.open_workbook(r"G:\expert system data\Machine Database.xls")
workbook1 = copy(workbook)
ws1 = workbook1.get_sheet(0)
style = xlwt.XFStyle()
style.num_format_str = 'yyyy-mm-dd hh:mm:ss'
ws1.write(row, 8, current_time, style)
workbook1.save(r"G:\expert system data\Machine Database.xls")
error_detail = ""
facts_all = {}
facts = []
error_detail = ""
facts_all = {}
facts = []
# NOTE(review): bare `except` hides every error raised while handling a match.
except:
continue
start_point = fo.tell()
fo.close()
# NOTE(review): bare `except` again; any failure above silently restarts the loop.
except:
continue
# NOTE(review): presumably the `else` of the outer `while`; it calls
# main_loop() recursively — confirm the intended nesting.
else:
main_loop()
paths["wintrace"] is "C:\Winusr\Wintrace.log". I don't want this file to be held open, because it sometimes needs to be renamed or deleted, so I copy it and open the copy instead — but it is still reported as open. Can you help me find where it is being opened? Besides that, I use "filepath = tkinter.filedialog.askopenfilename()", but I don't think that opens the wintrace file. (Error screenshot attached.)

fastest way to check a .rar file password - other than rarfile extractall

So I have written a little .rar password "cracker" based on tutorials, using the code underneath. It works fine, but it is very slow when the file is big. The best explanation I could find is that, every so often, when you put in a wrong password, it extracts the whole file before rejecting the password. With small files that is not a big problem, but with big files it slows the process down a lot.
Is there a way to just check a hashed version of the password against an iterated hash?
import itertools
import rarfile
import time
rarfile.UNRAR_TOOL = "path"
rar = rarfile.RarFile("path")
done = False
n = 0
inputList = ["A","B","1","2"]
# Plain namespace for the mutable state of the run (never instantiated).
class h():
# First guess of the current pass; "" until a pass has started.
startword = ""
# Length of the passwords generated in the current pass.
rep = 1
# Guess counter value below which guesses are skipped (resume point).
start = 0
# Number of combinations for the current length: len(inputList) ** rep.
itrTot = 0
# Restore the progress written to save.txt by an earlier run: the first line
# is the password length, the following line the guess counter.
# The `with` block closes the file even if a line is not a valid integer.
with open("save.txt") as f:
    for x, each in enumerate(f):
        if x == 0:
            h.rep = int(each)
        else:
            # Resume three guesses before the saved counter.
            h.start = int(each) - 3
if h.start < 0:
    h.start = 0
h.itrTot = len(inputList) ** h.rep
def pw_guess():
    """Yield every tuple of ``h.rep`` symbols drawn from ``inputList``."""
    yield from itertools.product(inputList, repeat=h.rep)
start_time = time.time()
# Outer loop: one generator per pass; inner loop: one guess per iteration.
while True:
guess_generator = pw_guess()
for guess in guess_generator:
n += 1
# The first guess of a pass is remembered. Meeting it again as the first
# guess of the next generator means the current length is exhausted, so the
# length is increased and the counters are reset.
if h.startword == "":
h.startword = guess
else:
if guess == h.startword:
h.rep += 1
n = 0
h.itrTot = len(inputList)**h.rep
h.start = 0
print("next rotation, itr rep: "+str(h.rep))
h.startword = ""
break
# Skip guesses already covered by an earlier run.
if n < h.start:
continue
txt = f"({n}/{h.itrTot}, {round((100/h.itrTot)*n,2)}%) - {h.rep}: {''.join(guess)}"
print(txt)
try:
# The guess counts as correct when extractall() raises nothing.
rar.extractall(path="path",members=None,pwd=''.join(guess))
print("Pass found!")
print(str(n) + " - " + str(h.rep) + ": " + str(''.join(guess)))
done = True
txt2 = f"({n}/{h.itrTot}, {round((100/h.itrTot)*n,2)}%) - {h.rep}: {''.join(guess)}\n"
f = open("pass.txt", "a")
f.write(txt2)
f.close()
break
# NOTE(review): the bare `except` treats every exception (I/O errors,
# KeyboardInterrupt, ...) as a wrong password, and save.txt is rewritten
# after every failed guess.
except:
f = open("save.txt", "w")
f.write(str(h.rep) + "\n" + str(n))
f.close()
if done:
end_time = time.time()
break
print("End time: " + str(end_time-start_time))
John the Ripper is the answer: 20k+ passwords checked in 2 minutes. Using parts of the script for wordlist generation is still very fast and functional, though.
The wordlist generator I used:
import itertools
inputList = ["A","B","C","D","1","2","3","4","5"]
itr = 7
WL_path = "path"
f = open(WL_path,"w")
f.write("")
f.close()
# Plain namespace for the generator state (never instantiated).
class h():
# First word of the current pass; "" until a pass has started.
startword = ""
# Length of the words generated in the current pass.
rep = 1
# Set below to len(inputList) ** itr and printed as the wordlist length.
itrTot = 0
# Most recently generated word.
txt = ""
h.itrTot = len(inputList)**itr
print("Wordlist length: " + str(h.itrTot))
def pw_guess():
    """Yield every tuple of ``h.rep`` symbols drawn from ``inputList``."""
    yield from itertools.product(inputList, repeat=h.rep)
# Write every word of length h.rep .. itr to the wordlist, one per line.
# The file is opened once for the whole run instead of once per word.
with open(WL_path, "a") as f:
    while h.rep <= itr:
        for guess in pw_guess():
            h.txt = ''.join(guess)
            f.write(h.txt + "\n")
        # All words of the current length are written; move to the next one.
        h.rep += 1
        print("next rotation, itr rep: " + str(h.rep))

Parsing Event Information Table files

My dreambox compatible video recorder stores event information table ".eit" files with every recording. I'd like to work with this information to rearrange my recordings.
A similar question came up in http://www.i-have-a-dreambox.com/wbb2/thread.php?threadid=186234&sid=3b36acb1ba62e4724cb47216ce08a564
The format seems to be a binary format as outlined in:
https://de.wikipedia.org/wiki/Event_Information_Table
and
http://www.etsi.org/deliver/etsi_en/300400_300499/300468/01.14.01_60/en_300468v011401p.pdf
I am now looking for a parser for such files. Where could I find one that works with files and does not assume a broadcast stream as input?
What did i try so far?
I searched the web and found the following links and pointers:
There seems to be a java library
https://docs.oracle.com/javame/config/cdc/opt-pkgs/api/jsr927/javax/tv/service/guide/ProgramEvent.html
which is part of the JSR 927 https://jcp.org/en/jsr/detail?id=927
specification.
As it looks, this library is only available for Java ME; see https://en.wikipedia.org/wiki/Java_TV
I found some DVB-related EIT code snippets, e.g.
https://github.com/jinfeng-geeya/3202C/blob/master/SRC/lib/libdvb/libepg/eit_parser.c
or
http://linuxtv.org/docs/libdvbv5/eit_8h.html
as part of the Kamaelia DVB Tools Project http://www.kamaelia.org/Developers/Projects/DVBTools.html there seems to be a python solution:
http://pydoc.net/Python/Kamaelia/0.6.0/Kamaelia.Device.DVB.Parse.ParseEventInformationTable/
The closest I found so far was from a hint at http://forums.openpli.org/topic/29141-eit-file-format/ which points to:
https://github.com/betonme/e2openplugin-EnhancedMovieCenter/blob/master/src/EitSupport.py
Currently I am working from this open-source Python code.
This is a Python script that seems to be a valid start.
It's available as opensource at https://github.com/WolfgangFahl/eitparser where you'll find the latest python3 compatible version and documentation.
When you call it with
python EitParser.py SomeEitFile
it will print out the name and description of the eit file.
Add your language codes as needed, e.g. from https://github.com/libo/Enigma2/blob/master/lib/python/Tools/ISO639.py
#!/usr/bin/python
# encoding: utf-8
#
# EitSupport
# Copyright (C) 2011 betonme
# Copyright (C) 2016 Wolfgang Fahl
#
# This EITParser is based on:
# https://github.com/betonme/e2openplugin-EnhancedMovieCenter/blob/master/src/EitSupport.py
#
# In case of reuse of this source code please do not remove this copyright.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# For more information on the GNU General Public License see:
# <http://www.gnu.org/licenses/>.
#
import os
import struct
import time
from datetime import datetime
#from Components.config import config
#from Components.Language import language
#from EMCTasker import emcDebugOut
#from IsoFileSupport import IsoSupport
#from MetaSupport import getInfoFile
#def crc32(data):
# poly = 0x4c11db7
# crc = 0xffffffffL
# for byte in data:
# byte = ord(byte)
# for bit in range(7,-1,-1): # MSB to LSB
# z32 = crc>>31 # top bit
# crc = crc << 1
# if ((byte>>bit)&1) ^ z32:
# crc = crc ^ poly
# crc = crc & 0xffffffffL
# return crc
decoding_charSpecHR = {u'Ć': u'\u0106', u'æ': u'\u0107', u'®': u'\u017D', u'¾': u'\u017E', u'©': u'\u0160', u'¹': u'\u0161', u'Č': u'\u010C', u'è': u'\u010D', u'ð': u'\u0111'}
decoding_charSpecCZSK = {u'Ï'+u'C': u'Č',u'Ï'+u'E': u'Ě',u'Ï'+u'L': u'Ľ',u'Ï'+u'N': u'Ň',u'Ï'+u'R': u'Ř',u'Ï'+u'S': u'Š',u'Ï'+u'T': u'Ť',u'Ï'+u'Z': u'Ž',u'Ï'+u'c': u'č',u'Ï'+u'd': u'ď',u'Ï'+u'e': u'ě',u'Ï'+u'l': u'ľ', u'Ï'+u'n': u'ň',
u'Ï'+u'r': u'ř',u'Ï'+u's': u'š',u'Ï'+u't': u'ť',u'Ï'+u'z': u'ž',u'Ï'+u'D': u'Ď',u'Â'+u'A': u'Á',u'Â'+u'E': u'É',u'Â'+u'I': u'Í',u'Â'+u'O': u'Ó',u'Â'+u'U': u'Ú',u'Â'+u'a': u'á',u'Â'+u'e': u'é',u'Â'+u'i': u'í',u'Â'+u'o': u'ó',
u'Â'+u'u': u'ú',u'Â'+u'y': u'ý',u'Ã'+u'o': u'ô',u'Ã'+u'O': u'Ô',u'Ê'+u'u': u'ů',u'Ê'+u'U': u'Ů',u'È'+u'A': u'Ä',u'È'+u'E': u'Ë',u'È'+u'I': u'Ï',u'È'+u'O': u'Ö',u'È'+u'U': u'Ü',u'È'+u'Y': u'Ÿ',u'È'+u'a': u'ä',u'È'+u'e': u'ë',
u'È'+u'i': u'ï',u'È'+u'o': u'ö',u'È'+u'u': u'ü',u'È'+u'y': u'ÿ'}
def convertCharSpecHR(text):
    """Return ``text`` with every key of ``decoding_charSpecHR`` replaced by its mapped character."""
    # dict.items() exists on Python 2 and 3; dict.iteritems() only on Python 2.
    for i, j in decoding_charSpecHR.items():
        text = text.replace(i, j)
    return text
def convertCharSpecCZSK(text):
    """Return ``text`` with every key of ``decoding_charSpecCZSK`` replaced by its mapped character."""
    # dict.items() exists on Python 2 and 3; dict.iteritems() only on Python 2.
    for i, j in decoding_charSpecCZSK.items():
        text = text.replace(i, j)
    return text
def parseMJD(MJD):
    """Convert a 16-bit Modified Julian Date (DVB-SI) to ``(year, month, day)``."""
    year_offset = int((MJD - 15078.2) / 365.25)
    month_raw = int((MJD - 14956.1 - int(year_offset * 365.25)) / 30.6001)
    day = MJD - 14956 - int(year_offset * 365.25) - int(month_raw * 30.6001)
    # Raw months 14 and 15 roll over into months 1 and 2 of the next year.
    if month_raw == 14 or month_raw == 15:
        carry = 1
    else:
        carry = 0
    year = 1900 + year_offset + carry
    month = month_raw - 1 - carry * 12
    return year, month, day
def unBCD(byte):
    """Decode one packed-BCD byte: high nibble is the tens digit, low nibble the units."""
    tens = byte >> 4
    units = byte & 0xf
    return tens * 10 + units
#from Tools.ISO639 import LanguageCodes
# -*- coding: iso-8859-2 -*-
LanguageCodes = { }
LanguageCodes["deu"] = LanguageCodes["ger"] = LanguageCodes["de"] = ("German", "Germanic")
LanguageCodes["fra"] = LanguageCodes["fre"] = LanguageCodes["fr"] = ("French", "Romance")
def language_iso639_2to3(alpha2):
    """Return a three-letter code registered for the same language as ``alpha2``.

    Looks ``alpha2`` up in ``LanguageCodes`` and returns the first
    three-character key that maps to the same entry. If ``alpha2`` is unknown
    or no such key exists, ``alpha2`` itself is returned.
    """
    if alpha2 not in LanguageCodes:
        return alpha2
    language = LanguageCodes[alpha2]
    for alpha, name in LanguageCodes.items():
        if name == language and len(alpha) == 3:
            return alpha
    return alpha2
#TEST
#print LanguageCodes["sv"]
#print language_iso639_2to3("sv")
# Eit File support class
# Description
# http://de.wikipedia.org/wiki/Event_Information_Table
class EitList():
EIT_SHORT_EVENT_DESCRIPTOR = 0x4d
EIT_EXTENDED_EVENT_DESCRIPOR = 0x4e
# Create the parser for the EIT file at `path`; the file is read immediately.
def __init__(self, path=None):
self.eit_file = None
#TODO
# The dictionary implementation could be very slow
# Parsed fields, keyed by name ('startdate', 'name', 'description', ...).
self.eit = {}
self.iso = None
self.__newPath(path)
self.__readEitFile()
def __newPath(self, path):
    """Store ``path`` as the EIT file to read; a falsy ``path`` leaves the current one unchanged."""
    if path and self.eit_file != path:
        self.eit_file = path
def __mk_int(self, s):
    """Return ``int(s)``, or 0 when ``s`` is falsy (empty string, None, 0, ...)."""
    if not s:
        return 0
    return int(s)
def __toDate(self, d, t):
    """Build a ``datetime`` from ``d`` (year, month, day) and ``t`` (hour, minute, ...).

    Returns ``None`` when either argument is empty or the values do not form
    a valid date/time.
    """
    if not (d and t):
        return None
    #TODO Is there another fast and safe way to get the datetime
    try:
        year, month, day = int(d[0]), int(d[1]), int(d[2])
        hour, minute = int(t[0]), int(t[1])
        return datetime(year, month, day, hour, minute)
    except ValueError:
        return None
##############################################################################
## Get Functions
# Each getter returns the stored value, or "" when the key is absent.
# NOTE(review): nothing in this listing stores 'service' or 'transportstream',
# so the first two getters always return "" here.
def getEitsid(self):
return self.eit.get('service', "") #TODO
def getEitTsId(self):
return self.eit.get('transportstream', "") #TODO
# "NEXT" or "NOW", derived from the running status of the event.
def getEitWhen(self):
return self.eit.get('when', "")
# (year, month, day) tuple.
def getEitStartDate(self):
return self.eit.get('startdate', "")
# (hour, minute, second) tuple.
def getEitStartTime(self):
return self.eit.get('starttime', "")
# (hours, minutes, seconds) tuple.
def getEitDuration(self):
return self.eit.get('duration', "")
# Event name (short event descriptor), stripped of surrounding whitespace.
def getEitName(self):
return self.eit.get('name', "").strip()
# Event description (extended event descriptor), stripped of surrounding whitespace.
def getEitDescription(self):
return self.eit.get('description', "").strip()
# Wrapper
def getEitShortDescription(self):
return self.getEitName()
def getEitExtendedDescription(self):
return self.getEitDescription()
def getEitLengthInSeconds(self):
    """Return the stored duration converted to seconds.

    A three-part duration is read as (hours, minutes, seconds); a two-part
    duration is combined as ``first * 60 + second``; anything shorter is
    handed to ``__mk_int`` unchanged.
    """
    parts = self.eit.get('duration', "")
    #TODO Is there another fast and safe way to get the length
    count = len(parts)
    if count > 2:
        total = (parts[0] * 60 + parts[1]) * 60 + parts[2]
    elif count > 1:
        total = parts[0] * 60 + parts[1]
    else:
        total = parts
    return self.__mk_int(total)
# Start date and start time combined by __toDate (a datetime, or None).
def getEitDate(self):
return self.__toDate(self.getEitStartDate(), self.getEitStartTime())
##############################################################################
## File IO Functions
def __readEitFile(self):
    """Read ``self.eit_file`` and fill ``self.eit`` from the event it contains.

    Stores 'startdate', 'starttime', 'duration', 'name' and 'description',
    plus 'when' if the running status is 1-4. If the file is missing,
    unreadable or shorter than the 12-byte event header, ``self.eit`` is
    reset to an empty dict.

    NOTE: byte handling (``ord(data[i])``, ``str.decode``) relies on
    Python 2 string semantics, like the rest of this module.
    """
    data = ""
    path = self.eit_file
    lang = language_iso639_2to3("de")

    def to_utf8(raw):
        # Very bad but there can be both encodings: text that is not valid
        # UTF-8 is assumed to be cp1252 (user files). If that fails too, it
        # is left untouched, otherwise cyrillic would not display properly.
        if raw:
            try:
                raw.decode('utf-8')
            except UnicodeDecodeError:
                try:
                    raw = raw.decode("cp1252").encode("utf-8")
                except UnicodeDecodeError:
                    pass
            if (lang == "cs") or (lang == "sk"):
                raw = str(convertCharSpecCZSK(raw))
            if lang == "hr":
                raw = str(convertCharSpecHR(raw))
        return raw

    if path and os.path.exists(path):
        try:
            with open(path, 'rb') as f:
                data = f.read()
        except Exception as e:
            # emcDebugOut is not imported in this module, so report via print.
            print("[META] Exception in readEitFile: " + str(e))

    if not data or len(data) < 12:
        # No data: clear all
        self.eit = {}
        return

    # Fixed 12-byte event header: event id, MJD start date, BCD start time,
    # BCD duration, then running status / CA mode / descriptor length.
    header = struct.unpack(">HHBBBBBBH", data[0:12])
    start_date = parseMJD(header[1])  # Y, M, D
    start_time = unBCD(header[2]), unBCD(header[3]), unBCD(header[4])  # HH, MM, SS
    duration = unBCD(header[5]), unBCD(header[6]), unBCD(header[7])  # HH, MM, SS
    running_status = (header[8] & 0xe000) >> 13
    if running_status in [1, 2]:
        self.eit['when'] = "NEXT"
    elif running_status in [3, 4]:
        self.eit['when'] = "NOW"
    self.eit['startdate'] = start_date
    self.eit['starttime'] = start_time
    self.eit['duration'] = duration

    names = []       # event names in the preferred language
    names_any = []   # event names in any language (fallback)
    texts = []       # descriptions in the preferred language
    texts_any = []   # descriptions in any language (fallback)

    pos = 12
    endpos = len(data) - 1
    while pos < endpos:
        rec = ord(data[pos])
        length = ord(data[pos + 1]) + 2
        if rec == 0x4D:
            # Short event descriptor:
            # tag(1) length(1) language(3) name_length(1) name(...)
            language_code = str(data[pos + 2:pos + 5])
            name_length = ord(data[pos + 5])
            name = data[pos + 6:pos + 6 + name_length]
            if language_code == lang:
                names.append(name)
            names_any.append(name)
        elif rec == 0x4E:
            # Extended event descriptor:
            # tag(1) length(1) numbers(1) language(3) items_length(1)
            # text_length(1) text(...) -- assumes an empty item list.
            language_code = str(data[pos + 3:pos + 6])
            chars = []
            # Clamp to the buffer so a truncated file cannot raise IndexError.
            for i in range(pos + 8, min(pos + length, len(data))):
                ch = data[i]
                if ord(ch) == 138:
                    # 0x8A is translated to a line break.
                    chars.append('\n')
                elif ch not in ('\x10', '\x00', '\x02'):
                    chars.append(ch)
            text = "".join(chars)
            if language_code == lang:
                texts.append(text)
            texts_any.append(text)
        # All other descriptors (component, content, linkage, parental
        # rating, ...) are skipped.
        pos += length

    self.eit['name'] = to_utf8("".join(names if names else names_any))
    self.eit['description'] = to_utf8("".join(texts if texts else texts_any))
"""Module docstring.
Read Eit File and show the information.
"""
import sys
import getopt
def readeit(eitfile):
    """Print the name, start date and description stored in the EIT file ``eitfile``."""
    eitlist = EitList(eitfile)
    # print(x) with a single argument gives the same output on Python 2 and 3.
    print(eitlist.getEitName())
    print(eitlist.getEitStartDate())
    print(eitlist.getEitDescription())
def main():
    """Parse the command line and print the EIT information of every file argument.

    ``-h`` / ``--help`` prints the module docstring and exits with status 0;
    an unknown option prints the error and exits with status 2.
    """
    # parse command line options
    try:
        opts, args = getopt.getopt(sys.argv[1:], "h", ["help"])
    except getopt.error as msg:
        print(msg)
        print("for help use --help")
        sys.exit(2)
    # process options
    for o, _ in opts:
        if o in ("-h", "--help"):
            print(__doc__)
            sys.exit(0)
    # process arguments
    for arg in args:
        readeit(arg)
if __name__ == "__main__":
main()

Issue with Python garbage collector?

I have a simple program which reads a large file containing a few million rows, parses each row (numpy array), converts it into an array of doubles (Python array) and later writes it into an HDF5 file. I repeat this loop for multiple days. After reading each file, I delete all the objects and call the garbage collector. When I run the program, the first day is parsed without any error, but on the second day I get a MemoryError. I monitored the memory usage of my program: during the first day of parsing, memory usage is around 1.5 GB. When the first day's parsing is finished, memory usage goes down to 50 MB. Now, when the second day starts and I try to read the lines from the file, I get a MemoryError. The following is the output of the program.
source file extracted at C:\rfadump\au\2012.08.07.txt
parsing started
current time: 2012-09-16 22:40:16.829000
500000 lines parsed
1000000 lines parsed
1500000 lines parsed
2000000 lines parsed
2500000 lines parsed
3000000 lines parsed
3500000 lines parsed
4000000 lines parsed
4500000 lines parsed
5000000 lines parsed
parsing done.
end time is 2012-09-16 23:34:19.931000
total time elapsed 0:54:03.102000
repacking file
done
> s:\users\aaj\projects\pythonhf\rfadumptohdf.py(132)generateFiles()
-> while single_date <= self.end_date:
(Pdb) c
*** 2012-08-08 ***
source file extracted at C:\rfadump\au\2012.08.08.txt
cought an exception while generating file for day 2012-08-08.
Traceback (most recent call last):
File "rfaDumpToHDF.py", line 175, in generateFile
lines = self.rawfile.read().split('|\n')
MemoryError
I am very sure that the Windows Task Manager shows the memory usage as 50 MB for this process. It looks like the garbage collector or memory manager for Python is not calculating the free memory correctly. There should be a lot of free memory, but it thinks there is not enough.
Any idea?
EDIT
Adding my code here
I will put parts of my code. I am new to python, please pardon my python coding style.
module 1
# Convert the raw dump of one day into an HDF5 file: read the source file,
# parse every record, collect the results in a taqDB, write them to a cache
# file and defragment that into the output file. Weekends are skipped.
# NOTE(review): indentation was lost in this listing; nesting is inferred.
def generateFile(self, current_date):
try:
print "*** %s ***" % current_date.strftime("%Y-%m-%d")
weekday=current_date.weekday()
# weekday() is 5 for Saturday and 6 for Sunday.
if weekday >= 5:
print "skipping weekend"
return
self.taqdb = taqDB(self.index, self.offset)
cache_filename = os.path.join(self.cache_dir,current_date.strftime("%Y.%m.%d.h5"))
outputFile = config.hdf5.filePath(self.index, date=current_date)
print "cache file: ", cache_filename
print "output file: ", outputFile
tempdir = "C:\\rfadump\\"+self.region+"\\"
# NOTE(review): `filename` is not defined anywhere in this listing.
input_filename = tempdir + filename
print "source file extracted at %s " % input_filename
## universe
reader = rfaTextToTAQ.rfaTextToTAQ(self.tickobj) ## PARSER
count = 0
self.rawfile = open(input_filename, 'r')
# read() loads the whole file as one string and split() then builds a list of
# all records from it, so both exist in memory at the same time. This is the
# statement the MemoryError traceback points to.
lines = self.rawfile.read().split('|\n')
total_lines = len(lines)
self.rawfile.close()
del self.rawfile
print "parsing started"
start_time = dt.datetime.now()
print "current time: %s" % start_time
#while(len(lines) > 0):
while(count < total_lines):
#line = lines.pop(0) ## This slows down processing
# split() removed the '|' terminator of each record; put it back for the parser.
result = reader.parseline(lines[count]+"|")
count += 1
if(count % 500000 == 0):
print "%d lines parsed" %(count)
if(result == None):
continue
ric, timestamp, quotes, trades, levelsUpdated, tradeupdate = result
# Records that changed neither a quote level nor a trade are not stored.
if(len(levelsUpdated) == 0 and tradeupdate == False):
continue
self.taqdb.insert(result)
## write to hdf5 TODO
writer = h5Writer.h5Writer(cache_filename, self.tickobj)
writer.write(self.taqdb.groups)
writer.close()
del lines
# NOTE(review): this removes self.tickobj, which the next call of this method
# reads again when it creates the parser and the writer.
del self.taqdb, self.tickobj
##########################################################
print "parsing done."
end_time = dt.datetime.now()
print "end time is %s" % end_time
print "total time elapsed %s" % (end_time - start_time)
defragger = hdf.HDF5Defragmenter()
defragger.Defrag(cache_filename,outputFile)
del defragger
print "done"
gc.collect(2)
# NOTE(review): bare `except` — every error, including MemoryError, is only
# printed, and the caller continues with the next day.
except:
print "cought an exception while generating file for day %s." % current_date.strftime("%Y-%m-%d")
tb = traceback.format_exc()
print tb
module 2 - taqdb - to store parsed data in an array
class taqDB:
# Set up an empty store for the given index.
def __init__(self, index, offset):
self.index = index
self.tickcfg = config.hdf5.getTickConfig(index)
# Added to every timestamp stored by insert().
self.offset = offset
# ric -> {dataset name -> array.array("d") of appended rows}
self.groups = {}
def getGroup(self, ric):
    """Return the dataset dict for instrument ``ric``, creating an empty one on first use."""
    # setdefault replaces the has_key() check, which only exists on Python 2.
    return self.groups.setdefault(ric, {})
def getOrderbookArray(self, ric, group):
    """Return the order book to update for ``ric``.

    Returns ``None`` for index products. Otherwise returns a fresh order-book
    array from the tick configuration while nothing has been stored for the
    instrument yet, or the most recently stored row wrapped in a 2-D numpy
    array.
    """
    datasetname = orderBookName
    prodtype = self.tickcfg.getProdType(ric)
    if prodtype == ProdType.INDEX:
        return
    orderbookArrayShape = self.tickcfg.getOrderBookArrayShape(prodtype)
    # `not in` replaces has_key(), which only exists on Python 2.
    if datasetname not in group:
        group[datasetname] = array.array("d")
        return self.tickcfg.getOrderBookArray(prodtype)
    orderbookArray = group[datasetname]
    if len(orderbookArray) == 0:
        return self.tickcfg.getOrderBookArray(prodtype)
    # The flat array holds rows back to back; the last row is the last
    # orderbookArrayShape[1] values.
    lastOrderbook = orderbookArray[-orderbookArrayShape[1]:]
    return np.array([lastOrderbook])
def addToDataset(self, group, datasetname, timestamp, arr):
    """Stamp row 0 of ``arr`` with ``timestamp`` and append it to ``group[datasetname]``.

    The dataset is created as an empty ``array.array("d")`` on first use.
    ``arr`` is modified in place: ``arr[0, 0]`` is overwritten with
    ``timestamp``.
    """
    # `not in` replaces has_key(), which only exists on Python 2.
    if datasetname not in group:
        group[datasetname] = array.array("d")
    arr[0, 0] = timestamp
    group[datasetname].extend(arr[0])
def addToOrderBook(self, group, timestamp, arr):
    """Append ``arr`` to the order-book dataset of ``group``."""
    # A bound-method call already supplies `self`; passing it again shifted
    # every argument by one and raised TypeError.
    self.addToDataset(group, orderBookName, timestamp, arr)
def insert(self, data):
    """Store one parsed record.

    ``data`` is the tuple returned by the parser:
    ``(ric, timestamp, quotes, trades, levelsUpdated, tradeupdate)``.
    The time of day is converted to seconds since midnight (millisecond
    resolution) plus ``self.offset``. Non-zero quote fields are merged into
    the instrument's order book, and the order book and/or the trade row is
    appended to the instrument's datasets.
    """
    ric, timestamp, quotes, trades, levelsUpdated, tradeupdate = data
    # Build the value arithmetically. Concatenating "<seconds>.<millis>" as
    # text dropped leading zeros, so 45 ms became .45 s instead of .045 s.
    millis = timestamp.microsecond // 1000
    seconds = timestamp.hour * 3600 + timestamp.minute * 60 + timestamp.second
    timestamp = seconds + millis / 1000.0 + self.offset
    ## write to array
    group = self.getGroup(ric)
    # NOTE(review): getOrderbookArray() returns None for index products,
    # which would fail on the item assignment below -- confirm that index
    # records never reach this method.
    orderbookArray = self.getOrderbookArray(ric, group)
    nonzero = quotes.nonzero()
    orderbookArray[nonzero] = quotes[nonzero]
    # NOTE(review): np.any() is applied to the index arrays, not to the
    # values, so a single non-zero quote at position (0, 0) counts as
    # "nothing changed" -- confirm the intent.
    if np.any(nonzero):
        self.addToDataset(group, orderBookName, timestamp, orderbookArray)
    if tradeupdate == True:
        self.addToDataset(group, tradeName, timestamp, trades)
Module 3- Parser
class rfaTextToTAQ:
"""RFA Raw dump file reader. Readers single line (record) and returns an array or array of fid value pairs."""
# Create a parser bound to the tick configuration `tickconfig`.
def __init__(self,tickconfig):
self.tickconfig = tickconfig
# Characters collected for the field currently being read.
self.token = ''
self.state = ReadState.SEQ_NUM
self.fvstate = fvstate.FID
self.quotes = np.array([]) # read from tickconfig
self.trades = np.array([]) # read from tickconfig
self.prodtype = ProdType.STOCK
# Per-instrument caches, keyed by RIC and filled by getOrderBookTrade().
self.allquotes = {}
self.alltrades = {}
self.acvol = 0
self.levelsUpdated = []
self.quoteUpdate = False
self.tradeUpdate = False
self.depth = 0
def updateLevel(self, index):
    """Add ``index`` to ``self.levelsUpdated`` unless it is already listed."""
    if index in self.levelsUpdated:
        return
    self.levelsUpdated.append(index)
# Write the current value (converted to float) into the quotes array at the
# position given by `fidindex`, and record the updated level.
# `fidindex` is the tuple of index arrays returned by np.where() in
# updateResult(); `field` is not used here.
def updateQuote(self, fidindex, field):
self.value = float(self.value)
if(self.depth == 1):
# NOTE(review): in this branch (self.depth - 1) is 0, so the offset term
# vanishes and `index` equals fidindex[0] — confirm the intended condition.
index = fidindex[0]+(len(self.tickconfig.stkQuotes)*(self.depth - 1))
self.quotes[index[0]][fidindex[1][0]] = self.value
self.updateLevel(index[0])
else:
self.quotes[fidindex] = self.value
self.updateLevel(fidindex[0][0])
self.quoteUpdate = True
def updateTrade(self, fidindex, field):
    """Apply the current field value to the trades array.

    Does nothing if the tick configuration reports no trade updates for the
    current depth. For the accumulated-volume field, the increase over the
    previous accumulated volume is stored as the trade size; any other field
    is stored as-is. ``self.tradeUpdate`` is set once the trade row is
    complete.
    """
    if self.tickconfig.tradeUpdate(self.depth) == False:
        return
    newacvol = float(self.value)
    if field == acvol:
        # Compare the numeric value: self.value is the raw string token, and
        # comparing a string with a float does not compare magnitudes.
        if newacvol > self.acvol:
            tradesize = newacvol - self.acvol
            self.acvol = newacvol
            self.trades[fidindex] = tradesize
            # The row counts as complete once no entry is zero any more.
            if 0 not in self.trades:
                self.tradeUpdate = True
    else:
        self.trades[fidindex] = self.value
        if not (self.trades[0, 1] == 0 or self.trades[0, 2] == 0):
            self.tradeUpdate = True
def updateResult(self):
    """Route the current FID/value pair to the quote or trade update.

    The FID is translated into a field name. Unknown FIDs and the literal
    value ``'0'`` are ignored. The field is looked up first among the quote
    fields and then among the trade fields of the product type (stock fields
    for stocks, future fields otherwise); a field found in neither is ignored.
    """
    field = ''
    valid, field = field_dict.FIDToField(int(self.fid), field)
    if valid == False:
        return
    if self.value == '0':
        return
    is_stock = self.prodtype == ProdType.STOCK
    if is_stock:
        quote_fields = self.tickconfig.stkQuotes
    else:
        quote_fields = self.tickconfig.futQuotes
    fidindex = np.where(quote_fields == field)
    if len(fidindex[0]) != 0:
        self.updateQuote(fidindex, field)
        return
    # Not a quote field: try the trade fields (looked up only when needed).
    if is_stock:
        trade_fields = self.tickconfig.stkTrades
    else:
        trade_fields = self.tickconfig.futTrades
    fidindex = np.where(trade_fields == field)
    if len(fidindex[0]) == 0:
        return
    self.updateTrade(fidindex, field)
def getOrderBookTrade(self):
    """Return ``(order book, [trades, acvol])`` for the current RIC.

    On the first request for a RIC, both entries are created from the tick
    configuration with the accumulated volume set to 0; later requests return
    the same cached objects.
    """
    # `not in` replaces has_key(), which only exists on Python 2.
    if self.ric not in self.allquotes:
        acvol = 0
        self.allquotes[self.ric] = self.tickconfig.getOrderBookArray(self.prodtype)
        trades = self.tickconfig.getTradesArray()
        self.alltrades[self.ric] = [trades, acvol]
    return self.allquotes[self.ric], self.alltrades[self.ric]
# Parse one raw record character by character with a small state machine
# (SEQ_NUM -> TIMESTAMP -> RIC -> UPDATE_TYPE -> FVPAIRS).
# Returns (ric, timestamp, quotes, trades, levelsUpdated, tradeUpdate), or
# None when the line is skipped.
# NOTE(review): indentation was lost in this listing; nesting is inferred.
def parseline(self, line):
self.tradeUpdate = False
self.levelsUpdated = []
pos = 0
length = len(line)
self.state = ReadState.SEQ_NUM
self.fvstate = fvstate.FID
self.token = ''
ch = ''
while(pos < length):
prevChar = ch
ch = line[pos]
pos += 1
#SEQ_NUM
if(self.state == ReadState.SEQ_NUM):
if(ch != ','):
self.token += ch
else:
self.seq_num = int(self.token)
self.state = ReadState.TIMESTAMP
self.token = ''
# TIMESTAMP
elif(self.state == ReadState.TIMESTAMP):
# A space discards what was collected so far, so only the part after the
# last space is kept.
if(ch == ' '):
self.token = ''
elif(ch != ','):
self.token += ch
else:
# The expected token is 12 characters long and is parsed as HH:MM:SS.ffffff.
if(len(self.token) != 12):
# NOTE(review): the comma prints the template and the token side by side;
# the `%` operator was presumably intended.
print "Invalid timestamp format. %s. skipping line.\n", self.token
self.state = ReadState.SKIPLINE
else:
self.timestamp = datetime.strptime(self.token,'%H:%M:%S.%f')
self.state = ReadState.RIC
self.token = ''
# RIC
elif(self.state == ReadState.RIC):
if(ch != ','):
self.token += ch
else:
self.ric = self.token
self.token = ''
self.ric, self.depth = self.tickconfig.replaceRic(self.ric)
self.prodtype = self.tickconfig.getProdType(self.ric)
# Only subscribed instruments are parsed further; their cached order book and
# trade state become the working arrays for this line.
if(self.tickconfig.subscribed(self.ric)):
self.state = ReadState.UPDATE_TYPE
self.quotes, trades = self.getOrderBookTrade()
self.trades = trades[0]
self.acvol = trades[1]
else:
self.state = ReadState.SKIPLINE
# UPDATE_TYPE
elif(self.state == ReadState.UPDATE_TYPE):
if(ch != '|'):
self.token += ch
else:
self.update_type = self.token
self.token = ''
self.state = ReadState.FVPAIRS
#SKIPLINE
# NOTE(review): the skip takes effect on the character after the one that set
# SKIPLINE; if that was the last character, the loop ends and the final
# return statement runs instead.
elif(self.state == ReadState.SKIPLINE):
return None
# FV PAIRS
elif(self.state == ReadState.FVPAIRS):
# FID
if(self.fvstate == fvstate.FID):
if(ch != ','):
# NOTE(review): a non-digit while reading a FID switches back to value
# mode and restarts the token from the previously stored value plus this
# character — presumably to continue a value that contained '|'; confirm.
if(ch.isdigit() == False):
self.token = self.value+ch
self.fvstate = fvstate.FIDVALUE
self.state = ReadState.FVPAIRS
else:
self.token += ch
else:
self.fid = self.token
self.token = ''
self.fvstate = fvstate.FIDVALUE
self.state = ReadState.FVPAIRS
# FIDVALUE
elif(self.fvstate == fvstate.FIDVALUE):
if(ch != '|'):
self.token += ch
else:
# '|' ends the value: store it and apply the FID/value pair.
self.value = self.token
self.token = ''
self.state = ReadState.FVPAIRS
self.fvstate = fvstate.FID
# TODO set value
self.updateResult()
return self.ric, self.timestamp, self.quotes, self.trades, self.levelsUpdated, self.tradeUpdate
Thanks.
The only reliable way to free memory is to terminate the process.
So, if your main program spawns a worker process to do most of the work (the stuff that is done in one day) then when that worker process completes, the memory used will be freed:
import multiprocessing as mp
def work(date):
# Do most of the memory-intensive work here
...
while single_date <= self.end_date:
proc = mp.Process(target = work, args = (single_date,))
proc.start()
proc.join()

Categories

Resources