I'm new to programing, this is my first python-gtk applet and I'm trying to make an applet similar to gnome-dictionary that retrieves the word meaning from the site http://www.priberam.pt/dlpo/. I'm doing it little-by-little but now I'm stuck, can someone help me to see what am I doing wrong?
I get this error:
"TypeError: unbound method enter_callback() must be called with x instance as first argument (got Entry instance instead)"
The code is as follows:
from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
import urllib2
import re
import HTMLParser
import sys
import gtk
import pango
import string
class x:
def enter_callback(self, widget, entry):
entry_text = entry.get_text()
wordTodefine = entry_text
url = "http://www.priberam.pt/dlpo/dlpo.aspx?pal="
url = '{0}{1}'.format(url, wordTodefine)
g = urllib2.urlopen(url)
s = g.read()
def extract(text, sub1, sub2):
"""extract a substring between two substrings sub1 and sub2 in text"""
return text.split(sub1)[-1].split(sub2)[0]
str4 = extract(s, ' <?xml version="1.0" encoding="utf-16"?><div><table style="background-color:#eee; width:100%;" cellpadding="4" cellspacing="0" border="0" bordercolor="#cccccc"><tr><td><div>', '<div id="ctl00_ContentPlaceHolder1_pnl_relacionadas">')
str5 = '{0}{1}{2}'.format('<html xmlns="http://www.w3.org/1999/xhtml" xmlns:svg="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><div><table style="background-color:#eee; width:100%;" cellpadding="4" cellspacing="0" border="0" bordercolor="#cccccc"><tr><td><div>', str4, '</html>')
return str5
class HTMLBuffer(HTMLParser.HTMLParser):
ignoreTags = ('title', 'table')
noTagTags = ('html', 'head')
newlineTags = ('p', 'h1', 'h2', 'li', 'div')
whiteSpaceNuker = re.compile(r"""\s+""", re.MULTILINE)
def __init__(self):
self.buffer = gtk.TextBuffer()
self.ignoreData = 0
self.inList = 0
self.currentTag = ''
self.startOfP = 0
HTMLParser.HTMLParser.__init__(self)
if gtk.gdk.screen_width() >= 800:
baseSize = 13
else:
baseSize = 10
baseFont = 'Times'
tag = self.buffer.create_tag('body')
tag.set_property('font', '%s %d' % (baseFont, baseSize))
tag = self.buffer.create_tag('p')
tag.set_property('pixels-above-lines', 5)
tag.set_property('pixels-below-lines', 5)
tag = self.buffer.create_tag('tt')
tag.set_property('font', 'Times %d' % (baseSize,))
tag = self.buffer.create_tag('a')
tag.set_property('font', '%s %d' % (baseFont, baseSize))
tag = self.buffer.create_tag('h1')
tag.set_property('font', '%s %d' % (baseFont, baseSize + 10))
tag.set_property('weight', pango.WEIGHT_BOLD)
tag = self.buffer.create_tag('h2')
tag.set_property('font', '%s %d' % (baseFont, baseSize + 4))
tag.set_property('weight', pango.WEIGHT_BOLD)
tag = self.buffer.create_tag('b')
tag.set_property('weight', pango.WEIGHT_BOLD)
tag = self.buffer.create_tag('i')
tag.set_property('style', pango.STYLE_ITALIC)
tag = self.buffer.create_tag('em')
tag.set_property('style', pango.STYLE_ITALIC)
tag = self.buffer.create_tag('ul')
tag.set_property('left-margin', 20)
# reset spacing in paragraphs incase this list is inside <p>
tag.set_property('pixels-above-lines', 0)
tag.set_property('pixels-below-lines', 0)
tag = self.buffer.create_tag('li')
tag.set_property('indent', -9)
self.iter = self.buffer.get_iter_at_offset(0)
self.offsets = {}
def get_buffer(self):
return self.buffer
def pushTag(self, tag, offset):
if self.offsets.has_key(tag):
self.offsets[tag].append(offset)
else:
self.offsets[tag] = [offset]
def popTag(self, tag):
if not self.offsets.has_key(tag):
raise RuntimeError, "impossible"
return self.offsets[tag].pop()
# structure markup
def handle_starttag(self, tag, attrs):
if tag in self.ignoreTags:
self.ignoreData += 1
return
self.currentTag = tag
if tag in self.noTagTags:
return
self.pushTag(tag, self.iter.get_offset())
if tag == 'li':
self.inList += 1
self.buffer.insert(self.iter, u'\u2022 ')
elif tag == 'p':
self.startOfP = 1
def handle_endtag(self, tag):
if tag in self.ignoreTags:
self.ignoreData -= 1
return
if tag == 'li':
self.inList -= 1
if tag in self.noTagTags:
return
offset = self.popTag(tag)
current = self.iter.get_offset()
if tag in self.newlineTags and offset != current:
if tag == 'p' and self.inList:
offset -= 2
# put a newline at the beginning
start = self.buffer.get_iter_at_offset(offset)
self.buffer.insert(start, '\n')
offset += 1
current += 1
self.iter = self.buffer.get_iter_at_offset(current)
start = self.buffer.get_iter_at_offset(offset)
self.buffer.apply_tag_by_name(tag, start, self.iter)
# all other markup
def handle_data(self, data):
if self.ignoreData == 0:
data = data.replace('\n', ' ')
data = self.whiteSpaceNuker.sub(' ', data)
if self.startOfP:
if data.startswith(' '):
data = data[1:]
self.startOfP = 0
#print '|%s|' % (data,)
self.buffer.insert(self.iter, data)
if __name__ == '__main__':
def quit(*args):
gtk.main_quit()
buffer = HTMLBuffer()
buffer.feed(x)
buffer.close()
#if __name__ == '__main__':
#def __init__():
window = gtk.Window()
vbox = gtk.VBox(False, 0)
view = gtk.TextView()
view.set_property("editable", False)
view.set_property("cursor_visible", False)
entry = gtk.Entry()
entry.connect("activate", x.enter_callback, entry, view)
vbox.pack_start(entry, False, False, 0)
vbox.pack_end(view, False, False, 0)
window.connect("destroy", lambda w: gtk.main_quit())
window.add(vbox)
window.show_all()
x()
gtk.main()
I used an HtmlParser made by Matt Wilson and tried to integrate it in my file...
Thanks in advance, and sorry for the mess that this code is.
Why is the function enter_callback a method of the class x? It doesn't seem like there is any good structural reason for it to be in x in the first place. Take it out of x and the error message will go away (the error message is complaining that self isn't being passed to enter_callback). Well, at least this one will go away, probably replaced by another one :)
Related
Does anybody know how to create/update a table with the docx library in a word document at a specific location.
So e.g. after a paragraph with text 'test1'?
The idea is to check if paragraph exists in the document, overwrite the existsing table underneath if not create a new paragraph and underneath at a certain location (at certain header level).
I managed to add a paragraph after a specific paragraph but it does not seem to work with tables in the same way.
I can not seem to find a link between the paragraph objects and creating a table object underneath that paragraph object or identifying the existing table object based on the paragraph.
A bit of context on the code, the code is first reading xaml files and writing this data in a word document. The first time the code is run it will create all subheadings and text/tables. With a second run the code will be updating the text/table values as the subheadings already exist.
e.g.
template:
02_BusinessProcess
00_Dispatcher
01_Initialization
after first run:
02_BusinessProcess
1.1 Xamlfilename
Text
Table
00_Dispatcher
2.1 Xamlfilenam
Text
01_Initialization
after second run:
02_BusinessProcess
1.1 Xamlfilename
Updated Text
Updated Table
00_Dispatcher
2.1 Xamlfilenam
Updated Text
01_Initialization
I want to add in the tables between these lines(newly created paragraphs).
paragraph1 = insert_paragraph_after(paragraph, xaml_obt.xamlfilename, style=document.styles['Heading 3'])
paragraph_annseq = insert_paragraph_after(paragraph1, xaml_obt.ann_seq, style=document.styles['No Spacing'])
paragraph_var = insert_paragraph_after(paragraph_annseq, "Variables - " + xaml_obt.xamlfilename,style=document.styles['Heading 4'])
paragraph_in_arg = insert_paragraph_after(paragraph_var, "In_Agruments - " + xaml_obt.xamlfilename, style=document.styles['Heading 4'])
paragraph_io_arg = insert_paragraph_after(paragraph_in_arg, "In_Out_Agruments - " + xaml_obt.xamlfilename, style=document.styles['Heading 4'])
insert_paragraph_after(paragraph_io_arg, "Out_Agruments - " + xaml_obt.xamlfilename, style=document.styles['Heading 4'])
Or update the table in this spot if the paragraph exists in the document:
if paragraph.style.name.startswith('Heading'):
if paragraph.text == xaml_obt.xamlfilename:
new_para = document.paragraphs[i + 1]
new_para.text = xaml_obt.ann_seq + "\n\n"
style = document.styles['No Spacing']
new_para.style = style
new_para.alignment = WD_ALIGN_PARAGRAPH.LEFT
Here is the complete code:
from tkinter import Tk, filedialog
import os
import json
from xml.etree import ElementTree as ET
import docx
from docx.oxml.xmlchemy import OxmlElement
from docx.text.paragraph import Paragraph
from docx.enum.text import WD_ALIGN_PARAGRAPH
import time
import win32com.client
import pandas as pd
class xamlinfo(object):
def __init__(self, name: object) -> object:
self.aut_block = str
self.xamlfilepath = str
self.xamlfilename = str
self.xaml_read = None
self.toplevelnaming = str
self.ann_seq = str
self.in_arguments = pd.DataFrame(columns=['Name', 'Type', 'Annotation'])
self.out_arguments = pd.DataFrame(columns=['Name', 'Type', 'Annotation'])
self.io_agruments = pd.DataFrame(columns=['Name', 'Type', 'Annotation'])
self.variables = pd.DataFrame(columns=['Name', 'Annotation'])
def selectfolder():
root = Tk() # pointing root to Tk() to use it as Tk() in program.
root.withdraw()
root.attributes('-topmost', True)
open_file = filedialog.askdirectory()
open_file = os.path.normpath(open_file)
print("Following filepath selected: ",open_file)
return open_file
def assignxamlobjects(listxamls, path):
if os.path.exists(path):
for root,dirs,files in os.walk(path):
for file in files:
xaml_obt = xamlinfo(os.path.basename(file))
xaml_obt.aut_block = os.path.basename(path)
xaml_obt.xamlfilename = file
xaml_obt.xamlfilepath = os.path.join(root,file)
tree = ET.parse(xaml_obt.xamlfilepath)
treeroot = tree.getroot()
xaml_obt.xaml_read = treeroot
top_sequence = treeroot.find(".//{*}Sequence")
xaml_obt.toplevelnaming = top_sequence.attrib["DisplayName"]
annotation = ""
annotationelements = [x for x in top_sequence.attrib if "annotationtext" in x.lower()]
if (len(annotationelements) > 0):
annotation = top_sequence.attrib[annotationelements[0]]
xaml_obt.ann_seq = annotation
listofelements = treeroot.findall(".//{*}Property")
for element in listofelements:
if "InArgument" in element.attrib["Type"]:
annotation = ""
annotationelements = [x for x in element.attrib if "annotationtext" in x.lower()]
if (len(annotationelements) > 0):
annotation = element.attrib[annotationelements[0]]
xaml_obt.in_arguments = xaml_obt.in_arguments.append({0:str(element.attrib["Name"]), 1:str(element.attrib["Type"]).replace("InArgument",""),2: annotation})
if "InOutArgument" in element.attrib["Type"]:
annotation = ""
annotationelements = [x for x in element.attrib if "annotationtext" in x.lower()]
if (len(annotationelements) > 0):
annotation = element.attrib[annotationelements[0]]
xaml_obt.io_agruments = xaml_obt.io_agruments.append({0:str(element.attrib["Name"]), 1:str(element.attrib["Type"]).replace("InOutArgument",""),2: annotation})
if "OutArgument" in element.attrib["Type"]:
annotation = ""
annotationelements = [x for x in element.attrib if "annotationtext" in x.lower()]
if (len(annotationelements) > 0):
annotation = element.attrib[annotationelements[0]]
xaml_obt.out_arguments = xaml_obt.out_arguments.append({0:str(element.attrib["Name"]), 1:str(element.attrib["Type"]).replace("OutArgument",""),2: annotation})
listofelements = treeroot.findall(".//{*}Variable")
for element in listofelements:
annotation = ""
annotationelements = [x for x in element.attrib if "annotationtext" in x.lower()]
if (len(annotationelements) > 0):
annotation = element.attrib[annotationelements[0]]
xaml_obt.variables = xaml_obt.variables.append({0:str(element.attrib["Name"]),1:annotation})
listxamls.append(xaml_obt)
else:
print("The following path does not exists, please amend your project structure: "+path)
return listxamls
def getworkflowinfo(openfile):
jsonpath = os.path.join(openfile,"project.json")
procestrans_path = os.path.join(openfile,"process","02_BusinessProcess")
dispatcher_path = os.path.join(openfile,"process","00_Dispatcher")
init_path = os.path.join(openfile,"process","01_Initialization")
process_path = os.path.join(openfile,"Process")
listxamls = []
listxamls = assignxamlobjects(listxamls, path=procestrans_path)
listxamls = assignxamlobjects(listxamls, path=dispatcher_path)
listxamls = assignxamlobjects(listxamls, path=init_path)
listxamls = assignxamlobjects(listxamls, path=process_path)
with open(jsonpath) as f:
uipathjson = json.load(f)
return uipathjson, listxamls
def insert_paragraph_after(paragraph, text, style):
new_p = OxmlElement('w:p')
paragraph._p.addnext(new_p)
new_para = Paragraph(new_p, paragraph._parent)
if text:
new_para.add_run(text)
if style is not None:
new_para.style = style
paragraph1 = new_para
return paragraph1
def fillxamldata(document, listofxamls):
print("Starting to update workflow information.")
for xaml_obt in listofxamls:
paraexists = False
for paragraph in document.paragraphs:
if paragraph.text == xaml_obt.xamlfilename:
paraexists = True
if paraexists is True:
for i, paragraph in enumerate(document.paragraphs):
# Check if the paragraph is a heading
if paragraph.style.name.startswith('Heading'):
if paragraph.text == xaml_obt.xamlfilename:
new_para = document.paragraphs[i + 1]
new_para.text = xaml_obt.ann_seq + "\n\n"
style = document.styles['No Spacing']
new_para.style = style
new_para.alignment = WD_ALIGN_PARAGRAPH.LEFT
else:
for paragraph in document.paragraphs:
# Check if the paragraph is a heading
if paragraph.style.name.startswith('Heading'):
if paragraph.text == xaml_obt.aut_block:
paragraph1 = insert_paragraph_after(paragraph, xaml_obt.xamlfilename, style=document.styles['Heading 3'])
paragraph_annseq = insert_paragraph_after(paragraph1, xaml_obt.ann_seq, style=document.styles['No Spacing'])
paragraph_var = insert_paragraph_after(paragraph_annseq, "Variables - " + xaml_obt.xamlfilename,style=document.styles['Heading 4'])
paragraph_in_arg = insert_paragraph_after(paragraph_var, "In_Agruments - " + xaml_obt.xamlfilename, style=document.styles['Heading 4'])
paragraph_io_arg = insert_paragraph_after(paragraph_in_arg, "In_Out_Agruments - " + xaml_obt.xamlfilename, style=document.styles['Heading 4'])
insert_paragraph_after(paragraph_io_arg, "Out_Agruments - " + xaml_obt.xamlfilename, style=document.styles['Heading 4'])
print("Workflow information updated successfully.\n")
return
def filldependencies(document, jsonUI):
print("Starting to fill dependencies.")
dict_depend = jsonUI['dependencies']
text = ""
for i in dict_depend:
text = text + i+": "+ dict_depend[i]+"\n"
for i, paragraph in enumerate(document.paragraphs): # Loop through all the paragraphs in the Word file
if paragraph.style.name.startswith('Heading'): # Check if the paragraph is a heading
if 'dependencies' == paragraph.text.lower():
new_para = document.paragraphs[i+1]
new_para.text = text
style = document.styles['No Spacing']
new_para.style = style
new_para.alignment = WD_ALIGN_PARAGRAPH.LEFT
print("Dependencies updated successfully.\n")
return
def fillgeneralinfo(document, jsonUI):
print("Starting to fill process info.")
text = ("Process name: "+"\t\t\t"+ jsonUI['name'] + "\n" +
"Process description:"+"\t\t" + jsonUI['description'] +"\n" +
"UIpath Studio version:"+"\t\t"+ jsonUI['studioVersion'] + "\n" +
"Project version:"+"\t\t\t" + jsonUI['projectVersion'] + "\n")
for i, paragraph in enumerate(document.paragraphs): # Loop through all the paragraphs in the Word file
if paragraph.style.name.startswith('Heading'): # Check if the paragraph is a heading
if 'general info' == paragraph.text.lower():
new_para = document.paragraphs[i+1]
new_para.text = text
style = document.styles['No Spacing']
new_para.style = style
new_para.alignment = WD_ALIGN_PARAGRAPH.LEFT
print("Process info successfully updated.\n")
return
def fillworddata(path, listofxamls):
print("You seleceted the following SDD file: "+path+"\n")
document = docx.Document(path)
with open(path, "w") as doc:
fillxamldata(document, listofxamls)
filldependencies(document, jsonUI)
fillgeneralinfo(document, jsonUI)
document.save(path)
return
def startmessage():
print("###############################################################\n"+
" SDD_AUT \n"+
"###############################################################\n")
starttimer = time.time()
startmessage()
openfile = selectfolder()
jsonUI, listxamls = getworkflowinfo(openfile)
correct_proc = input("The information for process | " + jsonUI['name'] + " | has been read.\n"+
"Do you want to continue? (y/n)\n")
if correct_proc.lower() == 'y':
sdd_doc = filedialog.askopenfilename(title='Select a file')
fillworddata(path=sdd_doc, listofxamls=listxamls)
print("Process has been executed successfully!")
else:
print("The process has been terminated as the incorrect project was selected.")
endtimer = time.time()
duration = endtimer - starttimer
print("Process took: " + str(duration))
This code is intended to create an inverted index but when working with a wikipedia xml dump (~80GB) it runs out of memory. I haven't been able to find out where the memory leak is happening and have explicitly deleted most of the data after using it. The xml dump is parsed using the sax parser, and I've attached the contentHandler and a cleaner class for reference.
Content handler:
class Handler(sx.ContentHandler):
def __init__(self, index_dir):
self.title = []
self.body = []
self.current = ''
self.id = None
self.cleaner = CleanerChunker()
self.pages = 0
self.index_dir = index_dir
self.titles = []
self.keys = ['t', 'b', 'i', 'c', 'r', 'l']
self.inv_index = {}
# This function is called whenever a page end tag is received
# It adds the words to the current inverted index which is
# written to a file every 1000 pages
def add_page(self, page=None, force_write=False):
if page:
c = 0
ind = {}
words = set()
for key in page.keys():
temp = {}
has = {}
for word in page[key]:
flag = False
for letter in word:
if letter not in has.keys():
has[letter] = 0
has[letter] += 1
for key in has.keys():
if has[key] > 5:
flag = True
has.clear()
if flag:
continue
if word not in temp.keys():
temp[word] = 0
temp[word] += 1
words.add(word)
ind[self.keys[c]] = temp
c += 1
del temp
del has
for word in words:
encoding = str(self.pages)
for key in ind.keys():
if word in ind[key].keys():
encoding += key + str(ind[key][word])
if word not in self.inv_index.keys():
self.inv_index[word] = []
self.inv_index[word].append(encoding)
del encoding
del ind
del words
if self.pages % 1000 == 0 or force_write:
f = open(f'{self.index_dir}/index{int((self.pages+9999)/10000)}.txt', "w")
for key in sorted(self.inv_index.keys()):
data = key + ' ' + ' '.join(self.inv_index[key]) + '\n'
f.write(data)
self.inv_index.clear()
f.close()
# if self.pages % 10000 == 0 or force_write:
# f = open(f'{self.index_dir}/titles{int((self.pages+9999)/10000)}.txt', 'w')
# f.write(' '.join(self.titles))
# del self.titles
# self.titles = []
# f.close()
if force_write:
f = open(f'{self.index_dir}/numdocs.txt', 'w')
f.write(str(self.pages))
f.close()
# Function called when parser receives an opening tag
def startElement(self, tag, attributes):
self.current = tag
# Function called whenever parser receives a closing tag
def endElement(self, tag):
if tag == 'page':
self.body = ' '.join(self.body)
self.title = ' '.join(self.title)
#self.titles.append(self.title.lower())
body, infobox, cat, ref, links = self.cleaner.chunk(self.body)
title = self.cleaner.clean(self.title)
page = {"title":title, "body":body, "infobox":infobox,
"categories":cat, "references":ref, "links":links}
self.pages += 1
self.add_page(page=page)
self.title = []
self.body = []
self.id = None
if self.pages % 1000 == 0:
print(f"Successfully parsed {self.pages} pages", flush=True)
if tag == 'mediawiki':
self.add_page(force_write=True)
# Function called whenever content is read
def characters(self, content):
if self.current == 'id' and not self.id:
self.id = content
elif self.current == 'text':
self.body.append(content)
elif self.current == 'title':
self.title.append(content)
def get_file_count(self):
return int((self.pages+9999)/10000)
Cleaner class:
from Stemmer import Stemmer
from nltk.corpus import stopwords
import re
class CleanerChunker:
def __init__(self):
self.stemmer = Stemmer('english')
self.stopwords = set(stopwords.words('english'))
extra_stops = set(['cite', 'https', 'http', 'com', 'url', 'categori'
'ref', 'reflist', 'title', 'name', 'author',
'data', 'also', 'link', 'org', 'publish', 'websit',
'caption', 'imag', 'infobox', 'wiki'])
self.stopwords = set.union(self.stopwords, extra_stops)
# Removes whitespace, non alphanumeric characters and stop words
def clean(self, text):
text = text.lower()
text = re.sub(r'http[^ ]*\ ', r' ', text)
text = re.sub(r'<|>|&|"|&apos| ', r' ', text)
text = re.sub(r'[^a-z0-9 ]', r' ', text)
tokens = text.split()
tokens_nostop = [word for word in tokens if word not in self.stopwords]
ret = self.stemmer.stemWords(tokens_nostop)
del tokens
del tokens_nostop
return ret
# Parses the wikipedia body from the entire page
def get_body(self, text):
body = []
prev = 0
for info in re.finditer(r'\{\{\ *infobox', text):
body.append(text[prev:info.start()])
i = info.start()+2
bracks = 2
while bracks != 0 and i < len(text):
if text[i] == '{':
bracks += 1
elif text[i] == '}':
bracks -= 1
i += 1
prev = i
body.append(text[prev:])
del prev
return self.clean(' '.join(body))
# Parses the infobox from the entire wikipedia body
def get_infobox(self, text):
infoboxes = []
for info in re.finditer(r'\{\{\ *infobox', text):
i = info.start()+2
bracks = 2
while bracks != 0 and i < len(text):
if text[i] == '{':
bracks += 1
elif text[i] == '}':
bracks -= 1
i += 1
infoboxes.append(text[info.start():i])
return self.clean(' '.join(infoboxes))
# Parses the references from the wikipedia body
def get_references(self, text):
res = []
for ref in re.finditer(r'==\ *references\ *==', text):
next_debar = re.search(r'==\ *[a-z]*\ *==|\[\[category', text[ref.end():])
if next_debar:
res.append(text[ref.end():ref.end()+next_debar.start()])
else:
res.append(text[ref.end():])
return self.clean(' '.join(res))
# Parses categories from the wiki body
def get_categories(self, text):
ret = re.findall(r'\[\[category:(.*)', text)
return self.clean(' '.join(ret))
# Parses links from the wiki body
def get_links(self, text):
res = []
for ref in re.finditer(r'==\ *external links\ *==', text):
next_debar = re.search(r'\[\[category', text[ref.end():])
if next_debar:
res.append(text[ref.end():ref.end()+next_debar.start()])
else:
res.append(text[ref.end():])
return self.clean(' '.join(res))
# Takes the wikipedia body as a string and returns separate
# strings for each part of the wikipedia article
def chunk(self, text):
text = text.lower()
chunks = (text, "")
res = re.search(r'==\ *references\ *==', text)
if res:
chunks = (text[:res.start()], text[res.start():])
return self.get_body(chunks[0]), \
self.get_infobox(chunks[0]), \
self.get_categories(chunks[1]), \
self.get_references(chunks[1]), \
self.get_links(chunks[1])
I am trying to create a list objects that holds data about professional golfers. The different data points are golfer name and putting percentages from different distances. I want to sort this list of objects by name once all the data has been entered for every player object. The list of these objects is called PlayerNumber. When I try to sort PlayerNumber by attribute 'name'. I get an error stating that 'int' has no attribute and I am not sure why PlayerNumber is being referred to as an integer and not a list.
Any help would be appreciated. Here is the code:
import operator
import numpy as np
import statistics
import matplotlib.pyplot as plt
from colour import Color
from bs4 import BeautifulSoup
import urllib3
############### ACCESS WEBPAGES ####################
def makeSoup(url):
http = urllib3.PoolManager()
response = http.request('GET', url)
soupdata = BeautifulSoup(response.data)
return soupdata
siteURL = []
for i in range(7):
siteURL.append(i)
siteURL[0] = ''
siteURL[1] = 'http://www.pgatour.com/stats/stat.408.html' #>25
siteURL[2] = 'http://www.pgatour.com/stats/stat.407.html' #20-25
siteURL[3] = 'http://www.pgatour.com/stats/stat.406.html' #15-20
siteURL[4] = 'http://www.pgatour.com/stats/stat.405.html' #10-15
siteURL[5] = 'http://www.pgatour.com/stats/stat.404.html' #5-10
siteURL[6] = 'http://www.pgatour.com/stats/stat.02427.html' #3-5
############### ACCESS TABLE DATA ###################
def row_number(soupdata):
for row in table.findAll('tr'):
tot_row = row
return tot_row
def parse_table(soupdata):
currRank = []
prevRank = []
playerName = []
rounds = []
pctMake = []
attempts = []
puttsMade = []
table = soupdata.find('tbody')
tot_row = 0
for row in table.findAll('tr'):
#for col in row.findAll('td'):
col = row.find_all('td')
#column_1 = col[0]
#currRank.append(column_1)
#column_2 = col[1]
#prevRank.append(column_2)
column_3 = col[2].text
column_3.strip()
playerName.append(column_3)
#column_4 = col[3]
#rounds.append(column_4)
column_5 = col[4].text
pctMake.append(column_5)
#column_6 = col[5]
#attempts.append(column_6)
#column_7 = col[6]
#puttsMade.append(column_7)
tot_row += 1
#return currRank, prevRank, playerName, rounds, pctMake, attempts, puttsMade
return playerName, pctMake, tot_row
"""
>25 ft: distance1
20-25 ft: distance2
15-20 ft: distance3
10-15 ft: distance4
5-10 ft: distance5
3-5 ft: distance6
"""
############### CLASS DEFINITION ###################
class Player:
id_list={}
def __init__(self,name, id, dis1=0.0, dis2=0.0, dis3=0.0, dis4=0.0, dis5=0.0, dis6=0.0):
self.name = name
self.dis1 = dis1
self.dis2 = dis2
self.dis3 = dis3
self.dis4 = dis4
self.dis5 = dis5
self.dis6 = dis6
self.id = id
Player.id_list[self.name] = self # save the id as key and self as he value
def addDis1(self,distance1):
self.dis1 = float(distance1)
def addDis2(self,distance2):
self.dis2 = float(distance2)
def addDis3(self,distance3):
self.dis3 = float(distance3)
def addDis4(self,distance4):
self.dis4 = float(distance4)
def addDis5(self,distance5):
self.dis5 = float(distance5)
def addDis6(self,distance6):
self.dis6 = float(distance6)
def displayPlayer(self):
print("Player: ", self.name, '\n'
">25 Ft %: ", self.dis1, '\n'
"20-25 Ft %: ", self.dis2, '\n'
"15-20 Ft %: ", self.dis3, '\n'
"10-15 Ft %: ", self.dis4, '\n'
"5-10 Ft %: ", self.dis5, '\n'
"3-5 Ft %: ", self.dis6, '\n')
#classmethod
def lookup_player_name_by_id(cls, name):
try:
return cls.id_list[name] # return the instance with the id
except KeyError: # error check for if id does not exist
raise KeyError("No user with id %s" % str(id))
############### DATA POPULATION ###################
PlayerNumber=[]
for i in range(0,195):
PlayerNumber.append(i)
for i in range(1,7):
soupdata = makeSoup(siteURL[i])
playerName, pctMake, tot_row = parse_table(soupdata)
for x in range(0,tot_row):
#PlayerNumber.append(x)
name = playerName[x]
name = name.replace("\xa0", " ")
name = name.replace("\n", "")
if i == 1:
PlayerNumber[x] = Player(name, x)
Player.addDis1(PlayerNumber[x],pctMake[x])
if i == 2:
val = Player.lookup_player_name_by_id(name)
Player.addDis2(PlayerNumber[val.id],pctMake[x])
if i == 3:
val = Player.lookup_player_name_by_id(name)
Player.addDis3(PlayerNumber[val.id],pctMake[x])
if i == 4:
val = Player.lookup_player_name_by_id(name)
Player.addDis4(PlayerNumber[val.id],pctMake[x])
if i == 5:
val = Player.lookup_player_name_by_id(name)
Player.addDis5(PlayerNumber[val.id],pctMake[x])
if i == 6:
val = Player.lookup_player_name_by_id(name)
Player.addDis6(PlayerNumber[val.id],pctMake[x])
PlayerNumber.sort(key = operator.attrgetter("name"))
#PlayerNumber[2].displayPlayer()
I'm using Python 3.4 spyder IDE. I'm relatively new to python as an FYI.
Thanks!
It isn't that PlayerNumber is being referred to as an integer, but rather that PlayerNumber is a list of integers, and every element of that list (and integer) doesn't has an attribute "name", which sort() is trying to access (in order to sort them).
Edit:
To elaborate, the second to last line in your sample:
PlayerNumber.sort(key = operator.attrgetter("name"))
is trying to sort PlayerNumber, using the comparison function: operator.attrgetter("name"), which means it must call that function on each element of PlayerNumber to get its rank in the sorted array. That is why you are trying to grab a .name attribute from the integers in PlayerNumber.
I have read the documentation for Labels in kivy. Properties like bold, italic, font, size, color, etc. work pretty well.
However, how can I make the Label text underlined?
Here is an implementation I found on Google Groups:
from kivy.app import App
from kivy import kivy_options
from extended_markup import LabelXMU
kivy_options['text']='pygame'
class LabelWithMarkup(App):
def build(self):
root = LabelXMU(text=r"Some [b]bold[/b] [i]italic[/i] [u] underlined[/u] [s] strikethrough[/s] and plain text", font_size=22)
return root
if __name__ == '__main__':
LabelWithMarkup().run()
extended.py:
from kivy.uix.label import Label
from kivy.core.text.markup import MarkupLabel
try:
import pygame
except:
raise
#
#pygame_cache = {}
#pygame_cache_order = []
#
#pygame.font.init()
class CoreLabelXMU(MarkupLabel):
''' A core label with extended markup capabilities (underline and strikethrough markups)
Brendan Scott 6 March 2013
'''
def __init__(self, *largs, **kwargs):
self._style_stack = {}
self._refs = {}
super(MarkupLabel, self).__init__(*largs, **kwargs)
self.options['underline'] = False
self.options['strike'] = False
def _pre_render(self):
# split markup, words, and lines
# result: list of word with position and width/height
# during the first pass, we don't care about h/valign
self._lines = lines = []
self._refs = {}
self._anchors = {}
spush = self._push_style
spop = self._pop_style
options = self.options
options['_ref'] = None
for item in self.markup:
if item == '[b]':
spush('bold')
options['bold'] = True
self.resolve_font_name()
elif item == '[/b]':
spop('bold')
self.resolve_font_name()
elif item == '[i]':
spush('italic')
options['italic'] = True
self.resolve_font_name()
elif item == '[/i]':
spop('italic')
self.resolve_font_name()
elif item =='[s]':
spush('strike')
options['strike']=True
elif item =='[/s]':
spop('strike')
elif item =='[u]':
spush('underline')
options['underline']=True
elif item =='[/u]':
spop('underline')
elif item[:6] == '[size=':
item = item[6:-1]
try:
if item[-2:] in ('px', 'pt', 'in', 'cm', 'mm', 'dp', 'sp'):
size = dpi2px(item[:-2], item[-2:])
else:
size = int(item)
except ValueError:
raise
size = options['font_size']
spush('font_size')
options['font_size'] = size
elif item == '[/size]':
spop('font_size')
elif item[:7] == '[color=':
color = parse_color(item[7:-1])
spush('color')
options['color'] = color
elif item == '[/color]':
spop('color')
elif item[:6] == '[font=':
fontname = item[6:-1]
spush('font_name')
options['font_name'] = fontname
self.resolve_font_name()
elif item == '[/font]':
spop('font_name')
self.resolve_font_name()
elif item[:5] == '[ref=':
ref = item[5:-1]
spush('_ref')
options['_ref'] = ref
elif item == '[/ref]':
spop('_ref')
elif item[:8] == '[anchor=':
ref = item[8:-1]
if len(lines):
x, y = lines[-1][0:2]
else:
x = y = 0
self._anchors[ref] = x, y
else:
item = item.replace('&bl;', '[').replace(
'&br;', ']').replace('&', '&')
self._pre_render_label(item, options, lines)
# calculate the texture size
w, h = self.text_size
if h < 0:
h = None
if w < 0:
w = None
if w is None:
w = max([line[0] for line in lines])
if h is None:
h = sum([line[1] for line in lines])
return w, h
def _render_text(self, text, x, y):
font = self._get_font()
if self.options['underline']:
font.set_underline(True)
else:
font.set_underline(False)
color = [c * 255 for c in self.options['color']]
color[0], color[2] = color[2], color[0]
try:
text = font.render(text, True, color)
if self.options['strike']:
''' draw a horizontal line through the vertical middle of this surface in the foreground colour'''
r = text.get_rect()
pygame.draw.line(text, color, r.midleft, r.midright )
self._pygame_surface.blit(text, (x, y), None, pygame.BLEND_RGBA_ADD)
except pygame.error:
pass
class LabelXMU(Label):
''' A label with extended markup capabilities (underline and strikethrough markups)
Brendan Scott 6 March 2013
'''
def __init__(self, **kwargs):
kwargs['markup']=True
super(LabelXMU, self).__init__(**kwargs)
d = Label._font_properties
dkw = dict(zip(d, [getattr(self, x) for x in d]))
self._label = CoreLabelXMU(**dkw)
trying to split text via text-defined boundary markers using recursion and create a list of lists and strings containing all of the organized parts of the original text file.
The split isn't happening.
Here is the short version: The real problem script:
def separate(text,boundary = None):
if boundary == None:
m = re.findall(r'(?<=boundary=).*',text)
i = 0
while i < len(m): #have all levels of Boundary/headers named
boundary = m[i]
textList = recursiveSplit(text,boundary)
i += 1
pdb.set_trace()
return textList
def recursiveSplit(chunk,boundary):
if type(chunk) is types.StringType:
ar = re.split(r'(?P<boundary>)(?!--)',chunk)
return ar
if type(chunk) is types.ListType:
i = 0
while i < len(chunk):
chunk[i] = recursiveSplit(chunk[i],boundary)
i += 1
return obj
I've posted this script before and people wanted me to post it in its entirety so I'll do that
#Textbasics email parser
#based on a "show original" file converted into text
from sys import argv
import re, os, pdb, types
script, filename = argv
text = open(filename).read()
type = "text only" #Set the default type of email
#cut the email up by sections
#--A section is defined as any time there are two line breaks in a row
textList = re.split(r"\n\n", text)
header = textList[0]
if re.search(r'MIME-Version',header):
type = "MIME"
# If mail has no attachments, parse as a text-only email
class Parser(object):
def __init__(self,textList):
a = 1
self.body = ""
self.textList = textList
self.header = textList[0]
while a < len(textList):
self.body = self.body + textList[a] + '\n\n'
a += 1
m = re.search(r'(?<=Subject: ).*', self.header)
self.subject = m.group(0)
m = re.search(r'(?<=From: ).*', self.header)
self.fromVar = m.group(0)
m = re.search(r'(?<=To: ).*', self.header)
self.toVar = m.group(0)
m = re.search(r'(?<=Date: )\w+\s\w+\s\w+', self.header)
self.date = m.group(0)
def returnParsed(self,descriptor = "all"):
if descriptor == "all":
retv = "Subject: " + self.subject + "\n" + "From: " + self.fromVar + "\n" + "To: " + self.toVar + "\n" + "Date: " + self.date + "\n" + "\n" + self.body
return retv
if descriptor == "subject":
return self.subject
if descriptor == "fromVar":
return self.fromVar
if descriptor == "toVar":
return self.toVar
if descriptor == "date":
return self.date
if descriptor == "body":
return self.body
class MIMEParser(Parser):
class MIMEDataDecoder(object):
def __init__(self,decodeString,type):
pass
def __init__(self,textList):
self.textList = textList
self.nestedItems = []
newItem = NestedItem(self)
newItem.setContentType("Header")
newItem.setValue(self.textList[0])
self.nestedItems.append(newItem)
if re.search(r'(boundary=)',newItem.value):
helperItem = NestedItem(self)
helperItem.value = (self.textList[0])
m = re.search(r'(?<=Content-Type: ).+(?=;)',newItem.value)
helperItem.setContentType(m.group(0))
self.nestedItems.append(helperItem)
self.organizeData()
"""i = 0
while i < len(self.textList):
newItem = NestedItem(self)
ct = self.nextContentType
newItem.setContentType(ct)
newItem.setValue(self.textList[i])
self.nestedItems.append(newItem)
m = re.search(r'(?<=Content-Type: ).+(?=;)',self.textList[i])
if m:
self.nextContentType = m.group(0)
i += 1
"""
def nestItem (self,item):
self.nestedItems.append(item)
def organizeData(self):
self.nestLevel = 1
self.currentSuper = self
m = re.search(r'(?<=boundary=).*',self.textList[0])
self.currentBoundary = m.group(0)
self.currentList = self.textList
self.currentList.remove(self.textList[0])
self.formerObjectDatabase = {}
pdb.set_trace()
while self.nestLevel > 0:
i = 0
while i < len(self.currentList):
boundary = self.currentBoundary
#If block is a "normal block", containing a current boundary identifier
p = re.search(r'--(?P<boundary>)(?!--)', text)
if p:
newItem = NestedItem(self.currentSuper)
newItem.setValue(self.currentList[i])
r = re.search(r'(?<=Content-Type: ).+(?=;)',newItem.value)
if r:
newItem.setContentType(r.group(0))
self.currentObject = newItem
self.currentSuper.nestItem(self.currentObject)
#If the block contains a new block boundary
m = re.search(r'(?<=boundary=).*',self.currentList[i])
if m:
#begin new layer of recursive commands
newFormerObject = self.FormerCurrentObject(self.currentList,self.currentSuper,self.currentBoundary)
self.formerObjectDatabase[self.nestLevel] = newFormerObject
self.currentSuper = self.currentObject
self.nestLevel += 1
self.currentBoundary = m.group(0)
boundary = self.currentBoundary
#self.currentList = re.split(r'--(?P<boundary>)(?!--)', self.currentList[i])
boundary = self.currentBoundary
#If block contains an "end of boundary" marker
q = re.search(r'(?P<boundary>)--', text)
if q:
self.nestLevel -= 1
currentObject = self.formerObjectDatabase[self.nestLevel]
self.currentList = currentObject.formerList
self.currentSuper = currentObject.formerSuper
self.currentBoundary = currentObject.formerBoundary
i += 1
class FormerCurrentObject:
def __init__(self,formerList,formerSuper,formerBoundary):
self.formerList = formerList
self.formerSuper = formerSuper
self.formerBoundary = formerBoundary
def printAll(self):
print "printing all: %d" % len(self.nestedItems)
i = 0
while i < len(self.nestedItems):
print "printing out item %d" % i
self.nestedItems[i].printOut()
i += 1
class NestedItem(object):
def __init__(self,superObject,contentType=" ",value = " "):
self.superObject = superObject
self.contentType = contentType
self.value = value
self.nestedItems = []
def nestItem(self,item):
self.nestedItems.append(item)
def printOut(self,printBuffer = ""):
print printBuffer + '++%s' % self.contentType
print printBuffer + self.value
a = 0
printBuffer = printBuffer + " "
while a < len(self.nestedItems):
self.nestedItems[a].printOut(printBuffer)
def setContentType(self,contentType):
self.contentType = contentType
def setValue(self,value):
self.value = value
if type == "text only":
p = Parser(textList)
print p.returnParsed()
# ---PROBLEM CODE STARTS HERE---
def separate(text,boundary = None):
pdb.set_trace()
if boundary == None:
m = re.findall(r'(?<=boundary=).*',text)
i = 0
textList = [text]
while i < len(m): #have all levels of Boundary/headers named
boundary = m[i]
textList = recursiveSplit(textList,boundary)
i += 1
return textList
def recursiveSplit(chunk,boundary):
if type(chunk) is types.ListType: #<<--error occurs here
for obj in chunk:
recursiveSplit(obj,boundary)
if type(chunk) is types.StringType:
list = re.split(r'(?P<boundary>)(?!--)',chunk)
return list
return None
#---PROBLEM CODE ENDS(?) HERE---
if type == "MIME":
#separate the text file instead by its boundary identifier
p = MIMEParser(separate(text))
p.printAll()
You can use any MIME type email for this to run. Here's the one I've been using for convenience
MIME-Version: 1.0
Received: by 10.112.170.40 with HTTP; Fri, 3 May 2013 05:08:21 -0700 (PDT)
Date: Fri, 3 May 2013 08:08:21 -0400
Delivered-To: MYEMAIL#gmail.com
Message-ID: <#mail.gmail.com>
Subject: MiB 5/3/13 7:43AM (EST)
From: ME<MYEMAIL#gmail.com>
To: SOMEONE <SOMEONE#aol.com>
Content-Type: multipart/mixed; boundary=BNDRY1
--BNDRY1
Content-Type: multipart/alternative; boundary=BNDRY2
--BNDRY2
Content-Type: text/plain; charset=ISO-8859-1
-changed signature methods to conform more to working clinic header
methods(please test/not testable in simulator)
-confirmed that signature image is showing up in simulator. Awaiting
further tests
-Modified findings spacing/buffer. See if you like it
--BNDRY2
Content-Type: text/html; charset=ISO-8859-1
<div dir="ltr">-changed signature methods to conform more to working clinic header methods(please test/not testable in simulator)<div style>-confirmed that signature image is showing up in simulator. Awaiting further tests</div>
<div style>-Modified findings spacing/buffer. See if you like it</div></div>
--BNDRY2--
--BNDRY1
Content-Type: application/zip; name="Make it Brief.ipa.zip"
Content-Disposition: attachment; filename="Make it Brief.ipa.zip"
Content-Transfer-Encoding: base64
X-Attachment-Id: f_hg9biuno0
<<FILE DATA>>
--BNDRY1--
The issue was in the regex. There may be a cooler way to do it, but I just created a search string literal based off of the variables.
def recursiveSplit(chunk,boundary):
if type(chunk) is types.StringType:
#ar = re.split(r'(?P<boundary>)(?!--)',chunk)
searchString = "--%s" % boundary
print searchString
ar = re.split(searchString,chunk)
return ar
if type(chunk) is types.ListType:
i = 0
while i < len(chunk):
chunk[i] = recursiveSplit(chunk[i],boundary)
i += 1
return obj