Trouble with recursive text splitting - python

I'm trying to split text on boundary markers defined inside the text itself, using recursion, and build a nested structure of lists and strings containing all of the organized parts of the original text file.
The split isn't happening.
Here is the short version, the actual problem script:
def separate(text, boundary=None):
    if boundary == None:
        m = re.findall(r'(?<=boundary=).*', text)
        i = 0
        while i < len(m):  # have all levels of Boundary/headers named
            boundary = m[i]
            textList = recursiveSplit(text, boundary)
            i += 1
    pdb.set_trace()
    return textList

def recursiveSplit(chunk, boundary):
    if type(chunk) is types.StringType:
        ar = re.split(r'(?P<boundary>)(?!--)', chunk)
        return ar
    if type(chunk) is types.ListType:
        i = 0
        while i < len(chunk):
            chunk[i] = recursiveSplit(chunk[i], boundary)
            i += 1
        return obj
I've posted this script before and people wanted me to post it in its entirety, so here it is:
#Textbasics email parser
#based on a "show original" file converted into text
from sys import argv
import re, os, pdb, types
script, filename = argv
text = open(filename).read()
type = "text only" #Set the default type of email
#cut the email up by sections
#--A section is defined as any time there are two line breaks in a row
textList = re.split(r"\n\n", text)
header = textList[0]
if re.search(r'MIME-Version', header):
    type = "MIME"
# If mail has no attachments, parse as a text-only email
class Parser(object):
    def __init__(self, textList):
        a = 1
        self.body = ""
        self.textList = textList
        self.header = textList[0]
        while a < len(textList):
            self.body = self.body + textList[a] + '\n\n'
            a += 1
        m = re.search(r'(?<=Subject: ).*', self.header)
        self.subject = m.group(0)
        m = re.search(r'(?<=From: ).*', self.header)
        self.fromVar = m.group(0)
        m = re.search(r'(?<=To: ).*', self.header)
        self.toVar = m.group(0)
        m = re.search(r'(?<=Date: )\w+\s\w+\s\w+', self.header)
        self.date = m.group(0)

    def returnParsed(self, descriptor="all"):
        if descriptor == "all":
            retv = "Subject: " + self.subject + "\n" + "From: " + self.fromVar + "\n" + "To: " + self.toVar + "\n" + "Date: " + self.date + "\n" + "\n" + self.body
            return retv
        if descriptor == "subject":
            return self.subject
        if descriptor == "fromVar":
            return self.fromVar
        if descriptor == "toVar":
            return self.toVar
        if descriptor == "date":
            return self.date
        if descriptor == "body":
            return self.body
class MIMEParser(Parser):
    class MIMEDataDecoder(object):
        def __init__(self, decodeString, type):
            pass

    def __init__(self, textList):
        self.textList = textList
        self.nestedItems = []
        newItem = NestedItem(self)
        newItem.setContentType("Header")
        newItem.setValue(self.textList[0])
        self.nestedItems.append(newItem)
        if re.search(r'(boundary=)', newItem.value):
            helperItem = NestedItem(self)
            helperItem.value = (self.textList[0])
            m = re.search(r'(?<=Content-Type: ).+(?=;)', newItem.value)
            helperItem.setContentType(m.group(0))
            self.nestedItems.append(helperItem)
            self.organizeData()
        """i = 0
        while i < len(self.textList):
            newItem = NestedItem(self)
            ct = self.nextContentType
            newItem.setContentType(ct)
            newItem.setValue(self.textList[i])
            self.nestedItems.append(newItem)
            m = re.search(r'(?<=Content-Type: ).+(?=;)',self.textList[i])
            if m:
                self.nextContentType = m.group(0)
            i += 1
        """

    def nestItem(self, item):
        self.nestedItems.append(item)

    def organizeData(self):
        self.nestLevel = 1
        self.currentSuper = self
        m = re.search(r'(?<=boundary=).*', self.textList[0])
        self.currentBoundary = m.group(0)
        self.currentList = self.textList
        self.currentList.remove(self.textList[0])
        self.formerObjectDatabase = {}
        pdb.set_trace()
        while self.nestLevel > 0:
            i = 0
            while i < len(self.currentList):
                boundary = self.currentBoundary
                # If block is a "normal block", containing a current boundary identifier
                p = re.search(r'--(?P<boundary>)(?!--)', text)
                if p:
                    newItem = NestedItem(self.currentSuper)
                    newItem.setValue(self.currentList[i])
                    r = re.search(r'(?<=Content-Type: ).+(?=;)', newItem.value)
                    if r:
                        newItem.setContentType(r.group(0))
                    self.currentObject = newItem
                    self.currentSuper.nestItem(self.currentObject)
                # If the block contains a new block boundary
                m = re.search(r'(?<=boundary=).*', self.currentList[i])
                if m:
                    # begin new layer of recursive commands
                    newFormerObject = self.FormerCurrentObject(self.currentList, self.currentSuper, self.currentBoundary)
                    self.formerObjectDatabase[self.nestLevel] = newFormerObject
                    self.currentSuper = self.currentObject
                    self.nestLevel += 1
                    self.currentBoundary = m.group(0)
                    boundary = self.currentBoundary
                    #self.currentList = re.split(r'--(?P<boundary>)(?!--)', self.currentList[i])
                    boundary = self.currentBoundary
                # If block contains an "end of boundary" marker
                q = re.search(r'(?P<boundary>)--', text)
                if q:
                    self.nestLevel -= 1
                    currentObject = self.formerObjectDatabase[self.nestLevel]
                    self.currentList = currentObject.formerList
                    self.currentSuper = currentObject.formerSuper
                    self.currentBoundary = currentObject.formerBoundary
                i += 1

    class FormerCurrentObject:
        def __init__(self, formerList, formerSuper, formerBoundary):
            self.formerList = formerList
            self.formerSuper = formerSuper
            self.formerBoundary = formerBoundary

    def printAll(self):
        print "printing all: %d" % len(self.nestedItems)
        i = 0
        while i < len(self.nestedItems):
            print "printing out item %d" % i
            self.nestedItems[i].printOut()
            i += 1

class NestedItem(object):
    def __init__(self, superObject, contentType=" ", value=" "):
        self.superObject = superObject
        self.contentType = contentType
        self.value = value
        self.nestedItems = []

    def nestItem(self, item):
        self.nestedItems.append(item)

    def printOut(self, printBuffer=""):
        print printBuffer + '++%s' % self.contentType
        print printBuffer + self.value
        a = 0
        printBuffer = printBuffer + " "
        while a < len(self.nestedItems):
            self.nestedItems[a].printOut(printBuffer)

    def setContentType(self, contentType):
        self.contentType = contentType

    def setValue(self, value):
        self.value = value
if type == "text only":
p = Parser(textList)
print p.returnParsed()
# ---PROBLEM CODE STARTS HERE---
def separate(text, boundary=None):
    pdb.set_trace()
    if boundary == None:
        m = re.findall(r'(?<=boundary=).*', text)
        i = 0
        textList = [text]
        while i < len(m):  # have all levels of Boundary/headers named
            boundary = m[i]
            textList = recursiveSplit(textList, boundary)
            i += 1
    return textList

def recursiveSplit(chunk, boundary):
    if type(chunk) is types.ListType:  # <<-- error occurs here
        for obj in chunk:
            recursiveSplit(obj, boundary)
    if type(chunk) is types.StringType:
        list = re.split(r'(?P<boundary>)(?!--)', chunk)
        return list
    return None
#---PROBLEM CODE ENDS(?) HERE---
if type == "MIME":
    # separate the text file instead by its boundary identifier
    p = MIMEParser(separate(text))
    p.printAll()
You can use any MIME-type email to run this. Here's the one I've been using, for convenience:
MIME-Version: 1.0
Received: by 10.112.170.40 with HTTP; Fri, 3 May 2013 05:08:21 -0700 (PDT)
Date: Fri, 3 May 2013 08:08:21 -0400
Delivered-To: MYEMAIL#gmail.com
Message-ID: <#mail.gmail.com>
Subject: MiB 5/3/13 7:43AM (EST)
From: ME<MYEMAIL#gmail.com>
To: SOMEONE <SOMEONE#aol.com>
Content-Type: multipart/mixed; boundary=BNDRY1
--BNDRY1
Content-Type: multipart/alternative; boundary=BNDRY2
--BNDRY2
Content-Type: text/plain; charset=ISO-8859-1
-changed signature methods to conform more to working clinic header
methods(please test/not testable in simulator)
-confirmed that signature image is showing up in simulator. Awaiting
further tests
-Modified findings spacing/buffer. See if you like it
--BNDRY2
Content-Type: text/html; charset=ISO-8859-1
<div dir="ltr">-changed signature methods to conform more to working clinic header methods(please test/not testable in simulator)<div style>-confirmed that signature image is showing up in simulator. Awaiting further tests</div>
<div style>-Modified findings spacing/buffer. See if you like it</div></div>
--BNDRY2--
--BNDRY1
Content-Type: application/zip; name="Make it Brief.ipa.zip"
Content-Disposition: attachment; filename="Make it Brief.ipa.zip"
Content-Transfer-Encoding: base64
X-Attachment-Id: f_hg9biuno0
<<FILE DATA>>
--BNDRY1--

The issue was in the regex. There may be a cooler way to do it, but I just created a search string based on the variable:
def recursiveSplit(chunk, boundary):
    if type(chunk) is types.StringType:
        #ar = re.split(r'(?P<boundary>)(?!--)',chunk)
        searchString = "--%s" % boundary
        print searchString
        ar = re.split(searchString, chunk)
        return ar
    if type(chunk) is types.ListType:
        i = 0
        while i < len(chunk):
            chunk[i] = recursiveSplit(chunk[i], boundary)
            i += 1
        return chunk  # return the rebuilt list
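One caveat worth noting (an aside, not part of the original fix): the boundary string is passed to re.split as a regex pattern, so a boundary containing metacharacters such as + or . would misbehave. re.escape guards against that:

    searchString = "--%s" % re.escape(boundary)
    ar = re.split(searchString, chunk)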

Find and replace in string re.insensitive

I have this code to find matches in a string. I'm using it in a search to mark the words that match my search, and I need it to be case-insensitive. The issue is that it replaces the matched word with the one we searched for.
val_to_searchp = "this Text string has alot of teXt"
word = "TEXT"
pal2rep = str(":::")+word+str(":::")
val_to_search = re.sub(re.escape(word), pal2rep, val_to_searchp, flags=re.IGNORECASE)
This will return:
"this :::TEXT::: string has alot of :::TEXT:::"
I need it to return:
"this :::Text::: string has alot of :::teXt:::"
I also tried this, but it's not working very well :(
f = 0
s = 0
val_to_search = val_to_searchp
for m in re.finditer(str(word), str(val_to_searchp)):
    inicio = int(m.start()+s)
    fim = int(m.end()+f)
    val_to_search = val_to_search[:inicio] \
        + str(":::") \
        + val_to_search[inicio:fim] \
        + str(":::") \
        + val_to_search[fim:].strip()
    f = f+2
    s = s+1
This is my actual code:
def findtext():
    if len(str(findtext_inp.get('1.0', END))) > 1:
        val_to_searchp = str(respon_txt.get(1.0, END).replace(html.unescape('⛔'), "").strip())
        respon_txt.delete(1.0, END)
        word = str(findtext_inp.get('1.0', END).strip())
        pal2rep = str(str(html.unescape('⛔'))+word+str(html.unescape('⛔')))
        val_to_search = re.sub(re.escape(word), pal2rep, val_to_searchp, flags=re.IGNORECASE)
        """
        f = 0
        s = 0
        for m in re.finditer(str(word), str(val_to_search)):
            inicio = int(m.start()+s)
            fim = int(m.end()+f)
            val_to_search = val_to_search[:inicio] \
                + str(html.unescape('⛔')) \
                + val_to_search[inicio:fim] \
                + str(html.unescape('⛔')) \
                + val_to_search[fim:].strip()
            f = f+2
            s = s+1
        """
        respon_txt.insert(1.0, val_to_search)#val_to_search.replace(findtext_inp.get('1.0', END).strip() , str(html.unescape('⛔')+findtext_inp.get('1.0', END).strip())+html.unescape('⛔')))
I'm sure there's a way to do this with re, but it's really trivial without the aid of that module.
val_to_searchp = "this Text string has alot of teXt\nThis also has a lot of text"
text = 'TEXT'
def func(s, txt):
txt = txt.lower()
result = []
for line in s.split('\n'):
for i, e in enumerate(t := line.split()):
if e.lower() == txt:
t[i] = f':::{e}:::'
result.append(' '.join(t))
return '\n'.join(result)
print(func(val_to_searchp, text))
Output:
this :::Text::: string has alot of :::teXt:::
This also has a lot of :::text:::
This is a rewrite of my original answer. In the comments for that answer you will see that the OP has changed his mind about how this needs to work. This now (hopefully) complies with the altered specification:
val_to_searchp = '''{\"configurationKey\":[{\"key\":\"GetMaxKeys\",\"readonly\":true,\"value\":\"20\"}'''
text = 'GetMaxKeys'

def func(s, txt):
    result = []
    sl = s.lower()
    txt = txt.lower()
    lt = len(txt)
    offset = 0
    while (i := sl[offset:].find(txt)) >= 0:
        result.append(s[offset:i+offset])
        offset += i
        result.append(f':::{s[offset:offset+lt]}:::')
        offset += lt
    result.append(s[offset:])
    return ''.join(result)

print(func(val_to_searchp, text))
Output:
{"configurationKey":[{"key":":::GetMaxKeys:::","readonly":true,"value":"20"}

Out of memory issue python inverted index creation

This code is intended to create an inverted index, but when working with a Wikipedia XML dump (~80 GB) it runs out of memory. I haven't been able to find out where the memory leak is happening, and I have explicitly deleted most of the data after using it. The XML dump is parsed using the SAX parser; I've attached the ContentHandler and a cleaner class for reference.
Content handler:
class Handler(sx.ContentHandler):
    def __init__(self, index_dir):
        self.title = []
        self.body = []
        self.current = ''
        self.id = None
        self.cleaner = CleanerChunker()
        self.pages = 0
        self.index_dir = index_dir
        self.titles = []
        self.keys = ['t', 'b', 'i', 'c', 'r', 'l']
        self.inv_index = {}

    # This function is called whenever a page end tag is received
    # It adds the words to the current inverted index which is
    # written to a file every 1000 pages
    def add_page(self, page=None, force_write=False):
        if page:
            c = 0
            ind = {}
            words = set()
            for key in page.keys():
                temp = {}
                has = {}
                for word in page[key]:
                    flag = False
                    for letter in word:
                        if letter not in has.keys():
                            has[letter] = 0
                        has[letter] += 1
                    for key in has.keys():
                        if has[key] > 5:
                            flag = True
                    has.clear()
                    if flag:
                        continue
                    if word not in temp.keys():
                        temp[word] = 0
                    temp[word] += 1
                    words.add(word)
                ind[self.keys[c]] = temp
                c += 1
            del temp
            del has
            for word in words:
                encoding = str(self.pages)
                for key in ind.keys():
                    if word in ind[key].keys():
                        encoding += key + str(ind[key][word])
                if word not in self.inv_index.keys():
                    self.inv_index[word] = []
                self.inv_index[word].append(encoding)
                del encoding
            del ind
            del words
        if self.pages % 1000 == 0 or force_write:
            f = open(f'{self.index_dir}/index{int((self.pages+9999)/10000)}.txt', "w")
            for key in sorted(self.inv_index.keys()):
                data = key + ' ' + ' '.join(self.inv_index[key]) + '\n'
                f.write(data)
            self.inv_index.clear()
            f.close()
        # if self.pages % 10000 == 0 or force_write:
        #     f = open(f'{self.index_dir}/titles{int((self.pages+9999)/10000)}.txt', 'w')
        #     f.write(' '.join(self.titles))
        #     del self.titles
        #     self.titles = []
        #     f.close()
        if force_write:
            f = open(f'{self.index_dir}/numdocs.txt', 'w')
            f.write(str(self.pages))
            f.close()

    # Function called when parser receives an opening tag
    def startElement(self, tag, attributes):
        self.current = tag

    # Function called whenever parser receives a closing tag
    def endElement(self, tag):
        if tag == 'page':
            self.body = ' '.join(self.body)
            self.title = ' '.join(self.title)
            #self.titles.append(self.title.lower())
            body, infobox, cat, ref, links = self.cleaner.chunk(self.body)
            title = self.cleaner.clean(self.title)
            page = {"title": title, "body": body, "infobox": infobox,
                    "categories": cat, "references": ref, "links": links}
            self.pages += 1
            self.add_page(page=page)
            self.title = []
            self.body = []
            self.id = None
            if self.pages % 1000 == 0:
                print(f"Successfully parsed {self.pages} pages", flush=True)
        if tag == 'mediawiki':
            self.add_page(force_write=True)

    # Function called whenever content is read
    def characters(self, content):
        if self.current == 'id' and not self.id:
            self.id = content
        elif self.current == 'text':
            self.body.append(content)
        elif self.current == 'title':
            self.title.append(content)

    def get_file_count(self):
        return int((self.pages+9999)/10000)
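(For context, a minimal driver wiring this handler into the SAX parser might look like the sketch below; the dump path and index directory are placeholders.)

import xml.sax as sx

parser = sx.make_parser()
parser.setContentHandler(Handler('index_dir'))  # placeholder index directory
parser.parse('enwiki-dump.xml')                 # placeholder path to the XML dump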
Cleaner class:
from Stemmer import Stemmer
from nltk.corpus import stopwords
import re

class CleanerChunker:
    def __init__(self):
        self.stemmer = Stemmer('english')
        self.stopwords = set(stopwords.words('english'))
        extra_stops = set(['cite', 'https', 'http', 'com', 'url', 'categori',
                           'ref', 'reflist', 'title', 'name', 'author',
                           'data', 'also', 'link', 'org', 'publish', 'websit',
                           'caption', 'imag', 'infobox', 'wiki'])
        self.stopwords = set.union(self.stopwords, extra_stops)

    # Removes whitespace, non alphanumeric characters and stop words
    def clean(self, text):
        text = text.lower()
        text = re.sub(r'http[^ ]*\ ', r' ', text)
        text = re.sub(r'&lt|&gt|&amp|&quot|&apos|&nbsp', r' ', text)
        text = re.sub(r'[^a-z0-9 ]', r' ', text)
        tokens = text.split()
        tokens_nostop = [word for word in tokens if word not in self.stopwords]
        ret = self.stemmer.stemWords(tokens_nostop)
        del tokens
        del tokens_nostop
        return ret

    # Parses the wikipedia body from the entire page
    def get_body(self, text):
        body = []
        prev = 0
        for info in re.finditer(r'\{\{\ *infobox', text):
            body.append(text[prev:info.start()])
            i = info.start()+2
            bracks = 2
            while bracks != 0 and i < len(text):
                if text[i] == '{':
                    bracks += 1
                elif text[i] == '}':
                    bracks -= 1
                i += 1
            prev = i
        body.append(text[prev:])
        del prev
        return self.clean(' '.join(body))

    # Parses the infobox from the entire wikipedia body
    def get_infobox(self, text):
        infoboxes = []
        for info in re.finditer(r'\{\{\ *infobox', text):
            i = info.start()+2
            bracks = 2
            while bracks != 0 and i < len(text):
                if text[i] == '{':
                    bracks += 1
                elif text[i] == '}':
                    bracks -= 1
                i += 1
            infoboxes.append(text[info.start():i])
        return self.clean(' '.join(infoboxes))

    # Parses the references from the wikipedia body
    def get_references(self, text):
        res = []
        for ref in re.finditer(r'==\ *references\ *==', text):
            next_debar = re.search(r'==\ *[a-z]*\ *==|\[\[category', text[ref.end():])
            if next_debar:
                res.append(text[ref.end():ref.end()+next_debar.start()])
            else:
                res.append(text[ref.end():])
        return self.clean(' '.join(res))

    # Parses categories from the wiki body
    def get_categories(self, text):
        ret = re.findall(r'\[\[category:(.*)', text)
        return self.clean(' '.join(ret))

    # Parses links from the wiki body
    def get_links(self, text):
        res = []
        for ref in re.finditer(r'==\ *external links\ *==', text):
            next_debar = re.search(r'\[\[category', text[ref.end():])
            if next_debar:
                res.append(text[ref.end():ref.end()+next_debar.start()])
            else:
                res.append(text[ref.end():])
        return self.clean(' '.join(res))

    # Takes the wikipedia body as a string and returns separate
    # strings for each part of the wikipedia article
    def chunk(self, text):
        text = text.lower()
        chunks = (text, "")
        res = re.search(r'==\ *references\ *==', text)
        if res:
            chunks = (text[:res.start()], text[res.start():])
        return self.get_body(chunks[0]), \
               self.get_infobox(chunks[0]), \
               self.get_categories(chunks[1]), \
               self.get_references(chunks[1]), \
               self.get_links(chunks[1])
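Since locating the growth is the open question here, one suggestion (not part of the original code) is the standard library's tracemalloc, which compares snapshots and reports which source lines accumulated memory between two checkpoints. A minimal sketch:

import tracemalloc

tracemalloc.start()
before = tracemalloc.take_snapshot()
# ... parse a batch of pages here ...
after = tracemalloc.take_snapshot()
for stat in after.compare_to(before, 'lineno')[:10]:
    print(stat)  # top 10 source lines by net allocated memory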

Python tnsnames.ora parser

I need a dict containing all database connections from a tnsnames.ora file.
I need to go from this:
(DESCRIPTION=(ADDRESS_LIST=(ADDRESS=(PROTOCOL=TCP)(HOST=mydbserver.mydomain.com)(PORT=1521)))(CONNECT_DATA=(SID=CATAL)(SERVER=DEDICATED)(SERVICE_NAME=mydb.mydomain.com)))
to this:
{'DESCRIPTION': [{'ADDRESS_LIST': [{'ADDRESS': [{'PROTOCOL': 'TCP'},
{'HOST': 'mydbserver.mydomain.com'},
{'PORT': '1521'}
]
}]
},
{'CONNECT_DATA': [{'SID': 'CATAL'},
{'SERVER': 'DEDICATED'},
{'SERVICE_NAME': 'mydb.mydomain.com'}
]
}
]
}
So far, my code is:
def get_param(param_string):
    print("get_param input:", param_string)
    if param_string.count("(") != param_string.count(")"):
        raise Exception("Number of '(' is not equal to number of ')': " + str(param_string.count("(")) + " and " + str(param_string.count(")")))
    else:
        param_string = param_string[1:-1]
        splitted = param_string.split("=")
        keywork = splitted[0]
        if len(splitted) == 2:
            return {keywork: splitted[1]}
        else:
            splitted.remove(keywork)
            values = "=".join(splitted)
            return {keywork: get_value_list(values)}

def get_value_list(value_string):
    print("get_value_list input:", value_string)
    to_return = list()
    if "=" not in value_string and "(" not in value_string and ")" not in value_string:
        to_return.append(value_string)
    elif value_string[0] != "(":
        raise Exception("[ERROR] Format error: '(' is not the first char: " + repr(value_string))
    else:
        parenth_count = 0
        strlen = len(value_string)
        current_value = ""
        for i in range(0, strlen):
            current_char = value_string[i]
            current_value += current_char
            if current_char == "(":
                parenth_count += 1
            elif current_char == ")":
                parenth_count += -1
                if parenth_count == 0:
                    to_return.append(get_param(current_value))
                    if i != (strlen - 1):
                        if value_string[i+1] == "(":
                            to_return += get_value_list(value_string[i+1:])
                        else:
                            raise Exception("Format error - Next char should be a '('. value_string[i+1]:" + repr(value_string[i+1]))
                    break
    print("get_value_list return:", to_return)
    if len(to_return) == 0:
        to_return = ""
    elif len(to_return) == 1:
        to_return = to_return[0]
    return to_return

connection_infos = "(DESCRIPTION=(ADDRESS_LIST=(ADDRESS=(PROTOCOL=TCP)(HOST=mydbserver.mydomain.com)(PORT=1521)))(CONNECT_DATA=(SID=CATAL)(SERVER=DEDICATED)(SERVICE_NAME=mydb.mydomain.com)))"
current_connection = get_param(connection_infos)
print("current_connection:", current_connection)
pprint(current_connection)
And I got this:
{'DESCRIPTION': [{'ADDRESS_LIST': {'ADDRESS': [{'PROTOCOL': 'TCP'},
{'HOST': 'mydbserver.mydomain.com'},
'PORT']
}
},
'CONNECT_DATA'
]
}
So I'm doing something wrong, and I feel I'm doing something too complicated. Would anyone point out a mistake I made, or help me find a simpler way to do this?
I have working code now, but I'm not really satisfied with it. It's too long, inflexible, and will not work with some other possible tnsnames.ora formats:
class Tnsnames():
    def __init__(self, file_path, file_name='tnsnames.ora'):
        self.file_path = file_path
        self.file_name = file_name
        self.load_file()

    def load_file(self):
        try:
            fhd = open(os.path.join(self.file_path, self.file_name), 'rt', encoding='utf-8')
        except:
            raise
        else:
            # Oracle doc: https://docs.oracle.com/cd/B28359_01/network.111/b28317/tnsnames.htm#NETRF007
            file_content = list()
            for l in fhd:
                l = " ".join(l.split()).strip(" \n")
                if len(l) > 0:
                    if "#" not in l:
                        file_content.append(l)
            fhd.close()
            file_content = " ".join(file_content)
            connections_list = dict()
            current_depth = 0
            current_word = ""
            current_keyword = ""
            name_to_register = ""
            is_in_add_list = False
            current_addr = dict()
            connections_aliases = dict()
            stop_registering = False
            connections_duplicates = list()
            for c in file_content:
                if c == " ":
                    pass
                elif c == "=":
                    current_keyword = str(current_word)
                    current_word = ""
                    if current_keyword == "ADDRESS_LIST":
                        is_in_add_list = True
                elif c == "(":
                    if current_depth == 0:
                        current_keyword = current_keyword.upper()
                        names_list = current_keyword.replace(" ", "").split(",")
                        if len(names_list) == 1:
                            name_to_register = names_list[0]
                        else:
                            name_to_register = None
                            # We use either the first name with at least
                            # a dot in it, or the longest one.
                            for n in names_list:
                                if "." in n:
                                    name_to_register = n
                                    break
                            else:
                                name_to_register = max(names_list, key=len)
                            names_list.remove(name_to_register)
                            for n in names_list:
                                if n in connections_aliases.keys():
                                    print("[ERROR] already registered alias:", n,
                                          ". Registered to:", connections_aliases[n],
                                          ". New:", name_to_register,
                                          ". This possible duplicate will not be registered.")
                                    connections_duplicates.append(n)
                                    stop_registering = True
                                else:
                                    connections_aliases[n] = name_to_register
                        if not stop_registering:
                            connections_list[name_to_register] = {"ADDRESS_LIST": list(),
                                                                  "CONNECT_DATA": dict(),
                                                                  "LAST_TEST_TS": None}
                        current_depth += 1
                    elif current_depth in [1, 2, 3]:
                        current_depth += 1
                    else:
                        print("[ERROR] Incorrect depth:", repr(current_depth), ". Current connection will not be registered")
                        del connections_list[name_to_register]
                        stop_registering = True
                elif c == ")":
                    if current_depth == 1:
                        if stop_registering:
                            stop_registering = False
                        else:
                            # Before moving to the next connection,
                            # we check that the current connection
                            # has at least a HOST, and a SID or
                            # SERVICE_NAME
                            connection_is_valid = True
                            if isinstance(connections_list[name_to_register]["ADDRESS_LIST"], dict):
                                if "HOST" not in connections_list[name_to_register]["ADDRESS_LIST"].keys():
                                    print("[ERROR] Only one address defined, and no HOST defined. Current connection will not be registered:", name_to_register)
                                    connection_is_valid = False
                            elif isinstance(connections_list[name_to_register]["ADDRESS_LIST"], list):
                                for current_address in connections_list[name_to_register]["ADDRESS_LIST"]:
                                    if "HOST" in current_address.keys():
                                        break
                                else:
                                    print("[ERROR] Multiple addresses but none with HOST. Current connection will not be registered:", name_to_register)
                                    connection_is_valid = False
                            else:
                                print("[ERROR] Incorrect address format:", connections_list[name_to_register]["ADDRESS_LIST"], " Connection:", name_to_register)
                                connection_is_valid = False
                            if not connection_is_valid:
                                del connections_list[name_to_register]
                            else:
                                if "SERVICE_NAME" not in connections_list[name_to_register]["CONNECT_DATA"].keys() and \
                                   "SID" not in connections_list[name_to_register]["CONNECT_DATA"].keys():
                                    print("[ERROR] Missing SERVICE_NAME / SID for connection:", name_to_register)
                                    del connections_list[name_to_register]
                    elif current_depth == 2:
                        if is_in_add_list:
                            is_in_add_list = False
                            if not stop_registering:
                                if len(connections_list[name_to_register]["ADDRESS_LIST"]) == 1:
                                    connections_list[name_to_register]["ADDRESS_LIST"] = connections_list[name_to_register]["ADDRESS_LIST"][0]
                    elif current_depth == 3:
                        if is_in_add_list:
                            if not stop_registering:
                                connections_list[name_to_register]["ADDRESS_LIST"].append(current_addr)
                                current_addr = dict()
                        elif current_keyword.upper() in ["SID", "SERVER", "SERVICE_NAME"]:
                            if not stop_registering:
                                connections_list[name_to_register]["CONNECT_DATA"][current_keyword.upper()] = current_word.upper()
                    elif current_depth == 4:
                        if is_in_add_list:
                            if not stop_registering:
                                current_addr[current_keyword.upper()] = current_word.upper()
                    current_keyword = ""
                    current_word = ""
                    current_depth += -1
                else:
                    current_word += c
            self.connections = connections_list
            self.aliases = connections_aliases
            self.duplicates = connections_duplicates
Test tnsnames.ora:
########################################
# This is a sample tnsnames.ora #
########################################
###################################################
# PRODDB
###################################################
proddb.mydbs.domain.com, PRODDB =
  (DESCRIPTION =
    (ADDRESS_LIST =
      (ADDRESS = (PROTOCOL = TCP)(HOST = proddb1.mydbs.domain.com)(PORT = 1522))
      (ADDRESS = (PROTOCOL = TCP)(HOST = proddb2.mydbs.domain.com)(PORT = 1522))
      (ADDRESS = (PROTOCOL = TCP)(HOST = proddb3.mydbs.domain.com)(PORT = 1522))
      (ADDRESS = (PROTOCOL = TCP)(HOST = proddb4.mydbs.domain.com)(PORT = 1522))
    )
    (CONNECT_DATA =
      (SID = PRODDB)
      (SERVER = DEDICATED)
      (SERVICE_NAME = proddb.mydbs.domain.com)
    )
  )
###################################################
# DEVDBA : Test database for DBA usage
###################################################
devdba.mydbs.domain.com, DEVDBA =
  (DESCRIPTION =
    (ADDRESS_LIST =
      (ADDRESS = (PROTOCOL = TCP)(HOST = devdba.mydbs.domain.com)(PORT = 1521))
    )
    (CONNECT_DATA =
      (SID = DEVDBA)
    )
  )
Test code:
from pprint import pprint
from lib_database import Tnsnames
tnsnnames = Tnsnames('/usr/lib/oracle/12.2/client64/network/admin')
print('Connexions:')
pprint(tnsnnames.connections)
print('Aliases:')
pprint(tnsnnames.aliases)
print('Duplicates:')
pprint(tnsnnames.duplicates)
Output:
Connexions:
{'DEVDBA.MYDBS.DOMAIN.COM': {'ADDRESS_LIST': {'HOST': 'DEVDBA.MYDBS.DOMAIN.COM',
'PORT': '1521',
'PROTOCOL': 'TCP'},
'CONNECT_DATA': {'SID': 'DEVDBA'},
'PRODDB.MYDBS.DOMAIN.COM': {'ADDRESS_LIST': [{'HOST': 'PRODDB1.MYDBS.DOMAIN.COM',
'PORT': '1522',
'PROTOCOL': 'TCP'},
{'HOST': 'PRODDB2.MYDBS.DOMAIN.COM',
'PORT': '1522',
'PROTOCOL': 'TCP'},
{'HOST': 'PRODDB3.MYDBS.DOMAIN.COM',
'PORT': '1522',
'PROTOCOL': 'TCP'},
{'HOST': 'PRODDB4.MYDBS.DOMAIN.COM',
'PORT': '1522',
'PROTOCOL': 'TCP'}],
'CONNECT_DATA': {'SERVER': 'DEDICATED',
'SERVICE_NAME': 'PRODDB.MYDBS.DOMAIN.COM',
'SID': 'PRODDB'}}
Aliases:
{'DEVDBA': 'DEVDBA.MYDBS.DOMAIN.COM', 'PRODDB': 'PRODDB.MYDBS.DOMAIN.COM'}
Duplicates:
[]
I could not find another Python parser for tnsnames.ora files. If you know of one, please point me to it.
You can do this with pyparsing:
import pyparsing as pp
# 1. Literals
VAR = pp.Word(pp.alphas + "_", pp.alphanums + "_").setName('variable')
SPACE = pp.Suppress(pp.Optional(pp.White()))
EQUALS = SPACE + pp.Suppress('=') + SPACE
OPEN = pp.Suppress('(') + SPACE
CLOSE = pp.Suppress(')') + SPACE
INTEGER = pp.Optional('-') + pp.Word(pp.nums) + ~pp.Char(".")
INTEGER.setParseAction(lambda t: int(t[0]))
FLOAT = pp.Optional('-') + pp.Word(pp.nums) + pp.Char('.') + pp.Optional(pp.Word(pp.nums))
FLOAT.setParseAction(lambda t: float(t[0]))
STRING = pp.Word(pp.alphanums + r'_.-')
# 2. Literal assignment expressions: (IDENTIFIER = VALUE)
INTEGER_ASSIGNMENT = pp.Group(OPEN + VAR + EQUALS + INTEGER + CLOSE)
FLOAT_ASSIGNMENT = pp.Group(OPEN + VAR + EQUALS + FLOAT + CLOSE)
STRING_ASSIGNMENT = pp.Group(OPEN + VAR + EQUALS + STRING + CLOSE)
# 3. Nested object assignment
ASSIGNMENT = pp.Forward()
NESTED_ASSIGNMENT = pp.Group(OPEN + VAR + EQUALS + ASSIGNMENT + CLOSE)
ASSIGNMENT << pp.OneOrMore(INTEGER_ASSIGNMENT |
                           FLOAT_ASSIGNMENT |
                           STRING_ASSIGNMENT |
                           NESTED_ASSIGNMENT)
# 4. Net service name(s): NAME(.DOMAIN)[, NAME(.DOMAIN)...]
NET_SERVICE_NAME = pp.OneOrMore(pp.Word(pp.alphas + '_' + '.', pp.alphanums + '_' + '.')
                                + pp.Optional(pp.Suppress(',')))
# 5. Full TNS entry
TNS_ENTRY = NET_SERVICE_NAME + EQUALS + ASSIGNMENT
Here is some example data:
TNS_NAMES_ORA = """
MYDB =
  (DESCRIPTION =
    (ADDRESS_LIST =
      (ADDRESS = (PROTOCOL = TCP)(HOST = server01)(PORT = 25881))
    )
    (CONNECT_DATA =
      (SID = MYDB01)
    )
  )
OTHERDB.DOMAIN, ALIAS_FOR_OTHERDB.DOMAIN =
  (DESCRIPTION_LIST =
    (DESCRIPTION =
      (ADDRESS_LIST = (ADDRESS = (PROTOCOL = TCP)
                                 (HOST = server02)
                                 (PORT = 25881)
      ))
      (CONNECT_DATA = (SID = MYDB02))
    )
  )
"""
This part is a little one-off-ish, but here's an example of putting some additional parsing on top of that in order to extract all the data:
def _parse_addresses(tns_entry, addresses):
    """
    Parse ADDRESS keywords from a TNS entry
    :param tns_entry: Unparsed part of the TNS entry
    :param addresses: List of addresses parsed
    """
    keyword = tns_entry[0]
    # Base Case: We found an ADDRESS, so extract the data
    # and do not recurse into it
    if keyword.upper() == 'ADDRESS':
        port = None
        host = None
        for k, v in tns_entry[1:]:
            if k == 'PORT':
                port = v
            elif k == 'HOST':
                host = v
        if port is None:
            print('WARNING: Ignoring ADDRESS due to missing PORT')
        elif host is None:
            print('WARNING: Ignoring ADDRESS due to missing HOST')
        else:
            addresses.append({'host': host, 'port': port})
    # Else recursively descend through the definition
    else:
        for d in tns_entry[1:]:
            # Only parse sub-lists, not literals
            if isinstance(d, list):
                _parse_addresses(d, addresses)

def _parse_connect_data(tns_entry, sids):
    """
    Parse CONNECT_DATA keywords from a TNS entry
    :param tns_entry: Unparsed part of the TNS entry
    :param sids: List of Oracle SIDs
    """
    keyword = tns_entry[0]
    # Base Case: We found a CONNECT_DATA, so extract the data
    # and do not recurse into it
    if keyword.upper() == 'CONNECT_DATA':
        sid = None
        for k, v in tns_entry[1:]:
            if k == 'SID':
                sid = v
        if sid is None:
            print('WARNING: Ignoring CONNECT_DATA due to missing SID')
        else:
            sids.append(sid)
    else:
        for d in tns_entry[1:]:
            # Only parse sub-lists, not literals
            if isinstance(d, list):
                _parse_connect_data(d, sids)

def get_connection_info(net_service_name: str, tns_string: str):
    """
    Generator over all simple connections inferred from a TNS entry
    :param net_service_name: Net service name to return connection info for
    :param tns_string: tnsnames.ora file contents
    """
    # Parse the TNS entries and keep the requested definition
    definition = None
    for tokens, _start, _end in TNS_ENTRY.scanString(tns_string):
        if net_service_name in tokens.asList()[0]:
            definition = tokens.asList()[1]
            break
    # Check if we found a definition
    if definition is None:
        raise KeyError(f'No net service named {net_service_name}')
    # Look for all the ADDRESS keywords
    addresses = []
    _parse_addresses(definition, addresses)
    # Look for all CONNECT_DATA keywords
    sids = []
    _parse_connect_data(definition, sids)
    # Emit all combinations
    for address in addresses:
        for sid in sids:
            yield {'sid': sid, **address}

# Try it out!
for connection_info in get_connection_info('MYDB', TNS_NAMES_ORA):
    print(connection_info)
I wrote a blog post about it here for "fun":
https://unparameterized.blogspot.com/2021/02/parsing-oracle-tns-files-in-python.html
import re

def find_match(tns_regex, y):
    x1 = re.match(tns_regex, y, re.M + re.I + re.S)
    if x1 is not None:
        x1 = x1.groups(1)[0]  # Only first match is returned
        x1 = x1.strip('\n')
    return(x1)

# Removing commented text
with open("C:\\Oracle\\product\\11.2.0\\client_1\\network\\admin\\tnsnames.ora") as tns_file:
    with open("test_tns.ora", 'w+') as output:
        lines = tns_file.readlines()
        for line in lines:
            if not line.startswith('#'):
                output.write(line)

with open('test_tns.ora') as tns_file:
    tnsnames = tns_file.read()
tnsnames1 = re.split(r"\){3,}\n\n", tnsnames)

# Regex matches
tns_name = '^(.+?)\s?\=\s+\(DESCRIPTION.*'
tns_host = '.*?HOST\s?=\s?(.+?)\)'
tns_port = '.*?PORT\s?=\s?(\d+?)\)'
tns_sname = '.*?SERVICE_NAME\s?=\s?(.+?)\)'
tns_sid = '.*?SID\s?=\s?(.+?)\)'

easy_connects = []
for y in tnsnames1:
    y = '%s))' % y
    l = [find_match(x, y) for x in [tns_name, tns_host, tns_port, tns_sname, tns_sid]]
    d = {
        'name': l[0],
        'host': l[1],
        'port': l[2],
        'service_name': l[3],
        'sid': l[4]
    }
    easy_connects.append(d)
print(easy_connects)
I have written this small piece of code. It parses tnsnames.ora, it is fast, and it works great.
import re

def parse_ora_tns_file(fpath, tnskey=None, return_all_keys=False, view_file=False, logger=None) -> str:
    """
    This function parses an Oracle TNS file.
    param: fpath: full file path, like fpath=filepath\tnsnames.ora
    param: tnskey: find the TNS entry for a given key, like tnskey='ABC.WORLD'
    param: return_all_keys: if True, return all TNS key names
    param: view_file: if True, return tnsnames.ora as str
    """
    clean_tns_file = ''
    if logger:
        logger.info('Reading tnsnames ora file at {} ...'.format(fpath))
    with open(fpath, mode='r') as tns_file:
        lines = tns_file.readlines()
        for line in lines:
            if not line.startswith('#'):
                clean_tns_file = clean_tns_file + line
    # clean file
    clean_str = clean_tns_file.replace('\n', '')
    clean_str = clean_str.replace('\t', '')
    # replace = with ' ' so later I can split with ' '
    # with the below it goes from ABC.WORLD = (DESCRIPTION) to ABC.WORLD ' ' (DESCRIPTION)
    clean_str = clean_str.replace('=', ' ')
    # The below outputs something like ['ABC.WORLD', '', ' (DESCRIPTION)']
    lstresult = clean_str.split(" ")
    # The below removes extra whitespace from the list items, giving ['ABC.WORLD', '', '(DESCRIPTION)']
    lstresult = [txt.strip() for txt in lstresult]
    # The below strips the () chars from the list items, giving ['ABC.WORLD', '', 'DESCRIPTION']
    removetable = str.maketrans('', '', '()')
    out_list = [s.translate(removetable) for s in lstresult]
    # The below removes any empty list items, giving ['ABC.WORLD', 'DESCRIPTION']
    out_list = list(filter(None, out_list))
    # find the indices of the DESCRIPTION words
    indices = [i for i, x in enumerate(out_list) if x.upper() == "DESCRIPTION"]
    tns_keys = ""
    for i in indices:
        # use the index of DESCRIPTION to collect the TNS keys, which are required for the regex pattern below
        tns_keys = tns_keys + out_list[i-1] + "|"
    if return_all_keys:
        return tns_keys.replace('|', ',')[:-1]
    if logger:
        logger.info('Keys found in tnsnames ora: {}'.format(tns_keys))
    regex = r"\s+(?!^({}))".format(tns_keys)
    result = re.sub(regex, '', clean_tns_file, 0, re.MULTILINE)
    if view_file:
        return result
    if result:
        for match in re.finditer(r'^((?:{}))(.*)'.format(tns_keys), result, re.MULTILINE):
            if match.group(1) == tnskey:
                # removing = sign from start of entry
                if logger:
                    logger.info('Found tns entry: {} {}'.format(match.group(1), match.group(2)))
                return match.group(2)[1:]
    if logger:
        logger.info('No tns entry found for {}'.format(tnskey))
    return None
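A usage sketch for the function above (the file path and key here are hypothetical, taken from the docstring's examples):

path = r'C:\oracle\network\admin\tnsnames.ora'   # hypothetical location
print(parse_ora_tns_file(path, return_all_keys=True))   # list all keys
print(parse_ora_tns_file(path, tnskey='ABC.WORLD'))     # one entry's body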

Issue with Python garbage collector?

I have a simple program which reads a large file containing a few million rows, parses each row (numpy array), converts it into an array of doubles (python array), and later writes it into an hdf5 file. I repeat this loop for multiple days. After reading each file, I delete all the objects and call the garbage collector. When I run the program, the first day is parsed without any error, but on the second day I get a MemoryError. I monitored the memory usage of my program: during the first day of parsing, memory usage is around 1.5 GB. When the first day's parsing is finished, memory usage goes down to 50 MB. Then when the 2nd day starts and I try to read the lines from the file, I get a MemoryError. The following is the output of the program.
source file extracted at C:\rfadump\au\2012.08.07.txt
parsing started
current time: 2012-09-16 22:40:16.829000
500000 lines parsed
1000000 lines parsed
1500000 lines parsed
2000000 lines parsed
2500000 lines parsed
3000000 lines parsed
3500000 lines parsed
4000000 lines parsed
4500000 lines parsed
5000000 lines parsed
parsing done.
end time is 2012-09-16 23:34:19.931000
total time elapsed 0:54:03.102000
repacking file
done
> s:\users\aaj\projects\pythonhf\rfadumptohdf.py(132)generateFiles()
-> while single_date <= self.end_date:
(Pdb) c
*** 2012-08-08 ***
source file extracted at C:\rfadump\au\2012.08.08.txt
cought an exception while generating file for day 2012-08-08.
Traceback (most recent call last):
File "rfaDumpToHDF.py", line 175, in generateFile
lines = self.rawfile.read().split('|\n')
MemoryError
I am very sure that the Windows task manager shows the memory usage as 50 MB for this process. It looks like the garbage collector or memory manager for Python is not calculating the free memory correctly. There should be a lot of free memory, but it thinks there is not enough.
Any idea?
EDIT
Adding my code here. I will put in parts of my code. I am new to Python, so please pardon my coding style.
Module 1
def generateFile(self, current_date):
    try:
        print "*** %s ***" % current_date.strftime("%Y-%m-%d")
        weekday = current_date.weekday()
        if weekday >= 5:
            print "skipping weekend"
            return
        self.taqdb = taqDB(self.index, self.offset)
        cache_filename = os.path.join(self.cache_dir, current_date.strftime("%Y.%m.%d.h5"))
        outputFile = config.hdf5.filePath(self.index, date=current_date)
        print "cache file: ", cache_filename
        print "output file: ", outputFile
        tempdir = "C:\\rfadump\\" + self.region + "\\"
        input_filename = tempdir + filename
        print "source file extracted at %s " % input_filename
        ## universe
        reader = rfaTextToTAQ.rfaTextToTAQ(self.tickobj)  ## PARSER
        count = 0
        self.rawfile = open(input_filename, 'r')
        lines = self.rawfile.read().split('|\n')
        total_lines = len(lines)
        self.rawfile.close()
        del self.rawfile
        print "parsing started"
        start_time = dt.datetime.now()
        print "current time: %s" % start_time
        #while(len(lines) > 0):
        while(count < total_lines):
            #line = lines.pop(0)  ## This slows down processing
            result = reader.parseline(lines[count] + "|")
            count += 1
            if(count % 500000 == 0):
                print "%d lines parsed" % (count)
            if(result == None):
                continue
            ric, timestamp, quotes, trades, levelsUpdated, tradeupdate = result
            if(len(levelsUpdated) == 0 and tradeupdate == False):
                continue
            self.taqdb.insert(result)
        ## write to hdf5 TODO
        writer = h5Writer.h5Writer(cache_filename, self.tickobj)
        writer.write(self.taqdb.groups)
        writer.close()
        del lines
        del self.taqdb, self.tickobj
        ##########################################################
        print "parsing done."
        end_time = dt.datetime.now()
        print "end time is %s" % end_time
        print "total time elapsed %s" % (end_time - start_time)
        defragger = hdf.HDF5Defragmenter()
        defragger.Defrag(cache_filename, outputFile)
        del defragger
        print "done"
        gc.collect(2)
    except:
        print "cought an exception while generating file for day %s." % current_date.strftime("%Y-%m-%d")
        tb = traceback.format_exc()
        print tb
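An aside on the line the traceback points at (lines = self.rawfile.read().split('|\n')): it holds both the raw string and the full list of lines in memory at once. A hedged sketch of a generator that streams records instead, assuming records really are terminated by '|\n':

def records(path, terminator='|\n'):
    # stream records one at a time instead of materializing the whole file
    buf = ''
    with open(path, 'r') as f:
        for chunk in iter(lambda: f.read(1 << 20), ''):  # read 1 MB at a time
            buf += chunk
            parts = buf.split(terminator)
            buf = parts.pop()  # keep the (possibly incomplete) tail
            for part in parts:
                yield part + '|'
    if buf:
        yield buf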
Module 2 - taqDB - stores parsed data in an array
class taqDB:
    def __init__(self, index, offset):
        self.index = index
        self.tickcfg = config.hdf5.getTickConfig(index)
        self.offset = offset
        self.groups = {}

    def getGroup(self, ric):
        if (self.groups.has_key(ric) == False):
            self.groups[ric] = {}
        return self.groups[ric]

    def getOrderbookArray(self, ric, group):
        datasetname = orderBookName
        prodtype = self.tickcfg.getProdType(ric)
        if(prodtype == ProdType.INDEX):
            return
        orderbookArrayShape = self.tickcfg.getOrderBookArrayShape(prodtype)
        if(group.has_key(datasetname) == False):
            group[datasetname] = array.array("d")
            orderbookArray = self.tickcfg.getOrderBookArray(prodtype)
            return orderbookArray
        else:
            orderbookArray = group[datasetname]
            if(len(orderbookArray) == 0):
                return self.tickcfg.getOrderBookArray(prodtype)
            lastOrderbook = orderbookArray[-orderbookArrayShape[1]:]
            return np.array([lastOrderbook])

    def addToDataset(self, group, datasetname, timestamp, arr):
        if(group.has_key(datasetname) == False):
            group[datasetname] = array.array("d")
        arr[0,0] = timestamp
        a1 = group[datasetname]
        a1.extend(arr[0])

    def addToOrderBook(self, group, timestamp, arr):
        self.addToDataset(self, group, orderBookName, timestamp, arr)

    def insert(self, data):
        ric, timestamp, quotes, trades, levelsUpdated, tradeupdate = data
        delta = dt.timedelta(hours=timestamp.hour, minutes=timestamp.minute, seconds=timestamp.second, microseconds=(timestamp.microsecond/1000))
        timestamp = float(str(delta.seconds)+'.'+str(delta.microseconds)) + self.offset
        ## write to array
        group = self.getGroup(ric)
        orderbookUpdate = False
        orderbookArray = self.getOrderbookArray(ric, group)
        nonzero = quotes.nonzero()
        orderbookArray[nonzero] = quotes[nonzero]
        if(np.any(nonzero)):
            self.addToDataset(group, orderBookName, timestamp, orderbookArray)
        if(tradeupdate == True):
            self.addToDataset(group, tradeName, timestamp, trades)
Module 3 - Parser
class rfaTextToTAQ:
    """RFA raw dump file reader. Reads a single line (record) and returns an array of fid-value pairs."""
    def __init__(self, tickconfig):
        self.tickconfig = tickconfig
        self.token = ''
        self.state = ReadState.SEQ_NUM
        self.fvstate = fvstate.FID
        self.quotes = np.array([])  # read from tickconfig
        self.trades = np.array([])  # read from tickconfig
        self.prodtype = ProdType.STOCK
        self.allquotes = {}
        self.alltrades = {}
        self.acvol = 0
        self.levelsUpdated = []
        self.quoteUpdate = False
        self.tradeUpdate = False
        self.depth = 0

    def updateLevel(self, index):
        if(self.levelsUpdated.__contains__(index) == False):
            self.levelsUpdated.append(index)

    def updateQuote(self, fidindex, field):
        self.value = float(self.value)
        if(self.depth == 1):
            index = fidindex[0] + (len(self.tickconfig.stkQuotes)*(self.depth - 1))
            self.quotes[index[0]][fidindex[1][0]] = self.value
            self.updateLevel(index[0])
        else:
            self.quotes[fidindex] = self.value
            self.updateLevel(fidindex[0][0])
        self.quoteUpdate = True

    def updateTrade(self, fidindex, field):
        #self.value = float(self.value)
        if(self.tickconfig.tradeUpdate(self.depth) == False):
            return
        newacvol = float(self.value)
        if(field == acvol):
            if(self.value > self.acvol):
                tradesize = newacvol - self.acvol
                self.acvol = newacvol
                self.trades[fidindex] = tradesize
            if(self.trades.__contains__(0) == False):
                self.tradeUpdate = True
        else:
            self.trades[fidindex] = self.value
            if(not (self.trades[0,1] == 0 or self.trades[0,2] == 0)):
                self.tradeUpdate = True

    def updateResult(self):
        field = ''
        valid, field = field_dict.FIDToField(int(self.fid), field)
        if(valid == False):
            return
        if(self.value == '0'):
            return
        if(self.prodtype == ProdType.STOCK):
            fidindex = np.where(self.tickconfig.stkQuotes == field)
            if(len(fidindex[0]) == 0):
                fidindex = np.where(self.tickconfig.stkTrades == field)
                if(len(fidindex[0]) == 0):
                    return
                else:
                    self.updateTrade(fidindex, field)
            else:
                self.updateQuote(fidindex, field)
        else:
            fidindex = np.where(self.tickconfig.futQuotes == field)
            if(len(fidindex[0]) == 0):
                fidindex = np.where(self.tickconfig.futTrades == field)
                if(len(fidindex[0]) == 0):
                    return
                else:
                    self.updateTrade(fidindex, field)
            else:
                self.updateQuote(fidindex, field)

    def getOrderBookTrade(self):
        if (self.allquotes.has_key(self.ric) == False):
            acvol = 0
            self.allquotes[self.ric] = self.tickconfig.getOrderBookArray(self.prodtype)
            trades = self.tickconfig.getTradesArray()
            self.alltrades[self.ric] = [trades, acvol]
        return self.allquotes[self.ric], self.alltrades[self.ric]

    def parseline(self, line):
        self.tradeUpdate = False
        self.levelsUpdated = []
        pos = 0
        length = len(line)
        self.state = ReadState.SEQ_NUM
        self.fvstate = fvstate.FID
        self.token = ''
        ch = ''
        while(pos < length):
            prevChar = ch
            ch = line[pos]
            pos += 1
            #SEQ_NUM
            if(self.state == ReadState.SEQ_NUM):
                if(ch != ','):
                    self.token += ch
                else:
                    self.seq_num = int(self.token)
                    self.state = ReadState.TIMESTAMP
                    self.token = ''
            # TIMESTAMP
            elif(self.state == ReadState.TIMESTAMP):
                if(ch == ' '):
                    self.token = ''
                elif(ch != ','):
                    self.token += ch
                else:
                    if(len(self.token) != 12):
                        print "Invalid timestamp format. %s. skipping line.\n", self.token
                        self.state = ReadState.SKIPLINE
                    else:
                        self.timestamp = datetime.strptime(self.token, '%H:%M:%S.%f')
                        self.state = ReadState.RIC
                    self.token = ''
            # RIC
            elif(self.state == ReadState.RIC):
                if(ch != ','):
                    self.token += ch
                else:
                    self.ric = self.token
                    self.token = ''
                    self.ric, self.depth = self.tickconfig.replaceRic(self.ric)
                    self.prodtype = self.tickconfig.getProdType(self.ric)
                    if(self.tickconfig.subscribed(self.ric)):
                        self.state = ReadState.UPDATE_TYPE
                        self.quotes, trades = self.getOrderBookTrade()
                        self.trades = trades[0]
                        self.acvol = trades[1]
                    else:
                        self.state = ReadState.SKIPLINE
            # UPDATE_TYPE
            elif(self.state == ReadState.UPDATE_TYPE):
                if(ch != '|'):
                    self.token += ch
                else:
                    self.update_type = self.token
                    self.token = ''
                    self.state = ReadState.FVPAIRS
            #SKIPLINE
            elif(self.state == ReadState.SKIPLINE):
                return None
            # FV PAIRS
            elif(self.state == ReadState.FVPAIRS):
                # FID
                if(self.fvstate == fvstate.FID):
                    if(ch != ','):
                        if(ch.isdigit() == False):
                            self.token = self.value + ch
                            self.fvstate = fvstate.FIDVALUE
                            self.state = ReadState.FVPAIRS
                        else:
                            self.token += ch
                    else:
                        self.fid = self.token
                        self.token = ''
                        self.fvstate = fvstate.FIDVALUE
                        self.state = ReadState.FVPAIRS
                # FIDVALUE
                elif(self.fvstate == fvstate.FIDVALUE):
                    if(ch != '|'):
                        self.token += ch
                    else:
                        self.value = self.token
                        self.token = ''
                        self.state = ReadState.FVPAIRS
                        self.fvstate = fvstate.FID
                        # TODO set value
                        self.updateResult()
        return self.ric, self.timestamp, self.quotes, self.trades, self.levelsUpdated, self.tradeUpdate
Thanks.
The only reliable way to free memory is to terminate the process.
So, if your main program spawns a worker process to do most of the work (the stuff that is done in one day), then when that worker process completes, the memory used will be freed:
import multiprocessing as mp

def work(date):
    # Do most of the memory-intensive work here
    ...

while single_date <= self.end_date:
    proc = mp.Process(target=work, args=(single_date,))
    proc.start()
    proc.join()
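An equivalent sketch using a pool (an aside on the same idea, not from the original answer): multiprocessing.Pool's maxtasksperchild argument replaces the worker process after each task, so its memory is returned to the OS without managing Process objects by hand. Here `dates` stands in for the list of days to process:

import multiprocessing as mp

def work(date):
    ...  # the memory-intensive per-day work, as above

pool = mp.Pool(processes=1, maxtasksperchild=1)  # recycle the worker after every task
pool.map(work, dates)                            # `dates`: hypothetical list of days
pool.close()
pool.join()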

Python - GTK - applet similar to gnome dictionary error

I'm new to programming. This is my first Python-GTK applet, and I'm trying to make an applet similar to gnome-dictionary that retrieves the meaning of a word from the site http://www.priberam.pt/dlpo/. I'm doing it little by little, but now I'm stuck; can someone help me see what I'm doing wrong?
I get this error:
"TypeError: unbound method enter_callback() must be called with x instance as first argument (got Entry instance instead)"
The code is as follows:
from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
import urllib2
import re
import HTMLParser
import sys
import gtk
import pango
import string

class x:
    def enter_callback(self, widget, entry):
        entry_text = entry.get_text()
        wordTodefine = entry_text
        url = "http://www.priberam.pt/dlpo/dlpo.aspx?pal="
        url = '{0}{1}'.format(url, wordTodefine)
        g = urllib2.urlopen(url)
        s = g.read()

        def extract(text, sub1, sub2):
            """extract a substring between two substrings sub1 and sub2 in text"""
            return text.split(sub1)[-1].split(sub2)[0]

        str4 = extract(s, ' <?xml version="1.0" encoding="utf-16"?><div><table style="background-color:#eee; width:100%;" cellpadding="4" cellspacing="0" border="0" bordercolor="#cccccc"><tr><td><div>', '<div id="ctl00_ContentPlaceHolder1_pnl_relacionadas">')
        str5 = '{0}{1}{2}'.format('<html xmlns="http://www.w3.org/1999/xhtml" xmlns:svg="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><div><table style="background-color:#eee; width:100%;" cellpadding="4" cellspacing="0" border="0" bordercolor="#cccccc"><tr><td><div>', str4, '</html>')
        return str5

class HTMLBuffer(HTMLParser.HTMLParser):
    ignoreTags = ('title', 'table')
    noTagTags = ('html', 'head')
    newlineTags = ('p', 'h1', 'h2', 'li', 'div')
    whiteSpaceNuker = re.compile(r"""\s+""", re.MULTILINE)

    def __init__(self):
        self.buffer = gtk.TextBuffer()
        self.ignoreData = 0
        self.inList = 0
        self.currentTag = ''
        self.startOfP = 0
        HTMLParser.HTMLParser.__init__(self)
        if gtk.gdk.screen_width() >= 800:
            baseSize = 13
        else:
            baseSize = 10
        baseFont = 'Times'
        tag = self.buffer.create_tag('body')
        tag.set_property('font', '%s %d' % (baseFont, baseSize))
        tag = self.buffer.create_tag('p')
        tag.set_property('pixels-above-lines', 5)
        tag.set_property('pixels-below-lines', 5)
        tag = self.buffer.create_tag('tt')
        tag.set_property('font', 'Times %d' % (baseSize,))
        tag = self.buffer.create_tag('a')
        tag.set_property('font', '%s %d' % (baseFont, baseSize))
        tag = self.buffer.create_tag('h1')
        tag.set_property('font', '%s %d' % (baseFont, baseSize + 10))
        tag.set_property('weight', pango.WEIGHT_BOLD)
        tag = self.buffer.create_tag('h2')
        tag.set_property('font', '%s %d' % (baseFont, baseSize + 4))
        tag.set_property('weight', pango.WEIGHT_BOLD)
        tag = self.buffer.create_tag('b')
        tag.set_property('weight', pango.WEIGHT_BOLD)
        tag = self.buffer.create_tag('i')
        tag.set_property('style', pango.STYLE_ITALIC)
        tag = self.buffer.create_tag('em')
        tag.set_property('style', pango.STYLE_ITALIC)
        tag = self.buffer.create_tag('ul')
        tag.set_property('left-margin', 20)
        # reset spacing in paragraphs incase this list is inside <p>
        tag.set_property('pixels-above-lines', 0)
        tag.set_property('pixels-below-lines', 0)
        tag = self.buffer.create_tag('li')
        tag.set_property('indent', -9)
        self.iter = self.buffer.get_iter_at_offset(0)
        self.offsets = {}

    def get_buffer(self):
        return self.buffer

    def pushTag(self, tag, offset):
        if self.offsets.has_key(tag):
            self.offsets[tag].append(offset)
        else:
            self.offsets[tag] = [offset]

    def popTag(self, tag):
        if not self.offsets.has_key(tag):
            raise RuntimeError, "impossible"
        return self.offsets[tag].pop()

    # structure markup
    def handle_starttag(self, tag, attrs):
        if tag in self.ignoreTags:
            self.ignoreData += 1
            return
        self.currentTag = tag
        if tag in self.noTagTags:
            return
        self.pushTag(tag, self.iter.get_offset())
        if tag == 'li':
            self.inList += 1
            self.buffer.insert(self.iter, u'\u2022 ')
        elif tag == 'p':
            self.startOfP = 1

    def handle_endtag(self, tag):
        if tag in self.ignoreTags:
            self.ignoreData -= 1
            return
        if tag == 'li':
            self.inList -= 1
        if tag in self.noTagTags:
            return
        offset = self.popTag(tag)
        current = self.iter.get_offset()
        if tag in self.newlineTags and offset != current:
            if tag == 'p' and self.inList:
                offset -= 2
            # put a newline at the beginning
            start = self.buffer.get_iter_at_offset(offset)
            self.buffer.insert(start, '\n')
            offset += 1
            current += 1
            self.iter = self.buffer.get_iter_at_offset(current)
        start = self.buffer.get_iter_at_offset(offset)
        self.buffer.apply_tag_by_name(tag, start, self.iter)

    # all other markup
    def handle_data(self, data):
        if self.ignoreData == 0:
            data = data.replace('\n', ' ')
            data = self.whiteSpaceNuker.sub(' ', data)
            if self.startOfP:
                if data.startswith(' '):
                    data = data[1:]
                self.startOfP = 0
            #print '|%s|' % (data,)
            self.buffer.insert(self.iter, data)

if __name__ == '__main__':
    def quit(*args):
        gtk.main_quit()

    buffer = HTMLBuffer()
    buffer.feed(x)
    buffer.close()
    #if __name__ == '__main__':
    #def __init__():
    window = gtk.Window()
    vbox = gtk.VBox(False, 0)
    view = gtk.TextView()
    view.set_property("editable", False)
    view.set_property("cursor_visible", False)
    entry = gtk.Entry()
    entry.connect("activate", x.enter_callback, entry, view)
    vbox.pack_start(entry, False, False, 0)
    vbox.pack_end(view, False, False, 0)
    window.connect("destroy", lambda w: gtk.main_quit())
    window.add(vbox)
    window.show_all()
    x()
    gtk.main()
I used an HTML parser made by Matt Wilson and tried to integrate it into my file...
Thanks in advance, and sorry for the mess this code is.
Why is the function enter_callback a method of the class x? There doesn't seem to be any good structural reason for it to be in x in the first place. Take it out of x and the error message will go away (the error message is complaining that self isn't being passed to enter_callback). Well, at least this one will go away, probably to be replaced by another one :)
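For illustration, a hedged sketch of that change, adapted from the question's code: as a module-level function, GTK passes the widget as the first argument, so no instance is involved (the lookup body is elided):

def enter_callback(widget, entry):
    word = entry.get_text()
    # ... fetch and display the definition for `word` here ...

entry = gtk.Entry()
entry.connect("activate", enter_callback, entry)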
