Parse HTML style text annotations to a list of dictionaries

Parse HTML style text annotations to a list of dictionaries - python

Currently I have the following problem:
Given a string
"<a>annotated <b>piece</b></a> of <c>text</c>"
construct a list such that the result is
[{"annotated": ["a"]}, {"piece": ["a", "b"]}, {"of": []}, {"text": ["c"]}].
My previous attempt looked something like
open_tag = '<[a-z0-9_]+>'
close_tag = '<\/[a-z0-9_]+>'
tag_def = "(" + open_tag + "|" + close_tag + ")"
def tokenize(str):
"""
Takes a string and converts it to a list of words or tokens
For example "<a>foo</a>, of" -> ['<a>', 'foo', '</a>', ',' 'of']
"""
tokens_by_tag = re.split(tag_def, str)
def tokenize(token):
if not re.match(tag_def, token):
return word_tokenize(token)
else:
return [token]
return list(chain.from_iterable([tokenize(token) for token in tokens_by_tag]))
def annotations(tokens):
"""
Process tokens into a list with {word : [tokens]} items
"""
mapping = []
curr = []
for token in tokens:
if re.match(open_tag, token):
curr.append(re.match('<([a-z0-9_]+)>',token).group(1))
elif re.match(close_tag, token):
tag = re.match('<\/([a-z0-9_]+)>',token).group(1)
try:
curr.remove(tag)
except ValueError:
pass
else:
mapping.append({token: list(curr)})
return mapping
Unfortunately this has a flaw since (n=54) resolves to {"n=54" : []} but (n=<n>52</n>) to [{"n=": []}, {52: ["n"]}] thus the length of the two lists differ, making it impossible to merge two different ones later on.
Is there a good strategy for doing parsing HTML/SGML style annotations in a way that two differently annotated (but otherwise identical) strings yield a list of equal size?
Note that I'm well aware that regexps are not suitable for this type of parsing, but also not the problem in this case.
EDIT Fixed a mistake in the example

Your xml data (or html) is not well formed.
Assuming an input file with following xml data well formed:
<root><a>annotated <b>piece</b></a> of <c>text</c></root>
you can use a sax parser to append and pop tags at start and end element events:
from xml.sax import make_parser
from xml.sax.handler import ContentHandler
import sys
class Xml2PseudoJson(ContentHandler):
def __init__(self):
self.tags = []
self.chars = []
self.json = []
def startElement(self, tag, attrs):
d = {''.join(self.chars): self.tags[:]}
self.json.append(d)
self.tags.append(tag)
self.chars = []
def endElement(self, tag):
d = {''.join(self.chars): self.tags[:]}
self.chars = []
self.tags.pop()
self.json.append(d)
def characters(self, content):
self.chars.append(content)
def endDocument(self):
print(list(filter(lambda x: '' not in x, self.json)))
parser = make_parser()
handler = Xml2PseudoJson()
parser.setContentHandler(handler)
parser.parse(open(sys.argv[1]))
Run it like:
python3 script.py xmlfile
That yields:
[
{'annotated ': ['root', 'a']},
{'piece': ['root', 'a', 'b']},
{' of ': ['root']},
{'text': ['root', 'c']}
]

Related

Is there a way to split the text I'm inputting into different strings that the dictionary looks at individually

I am mostly trying to create software that reads says the definition of every word you typed into the text box. Right now it only reads if there is one work and crashes if there is more than one. How would I go about fixing this?
import wolframalpha
client = wolframalpha.Client('8QR2WG-628657K83Q')
from multiprocessing import Process
import wikipedia
import PySimpleGUI as sg
import cv2
import random
import sys
import threading
import time
import nltk
nltk.download('punkt')
# from oxforddictionaries.words import OxfordDictionaries
# Oxford = OxfordDictionaries('b4170561','f32687e0ecbc219cfd723bb220dad34e')
# o = OxfordDictionaries('b4170561','f32687e0ecbc219cfd723bb220dad34e')
# relax = o.get_synonyms("Apple").json()
# synonyms = relax
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
filtered_list = []
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('averaged_perceptron_tagger')
stemmer = PorterStemmer()
trained_face_data = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
trained_body_data = cv2.CascadeClassifier('haarcascade_upperbody.xml')
trained_eye_data = cv2.CascadeClassifier('haarcascade_eye.xml')
webcam = cv2.VideoCapture(0)
sg.theme('graygraygray')
layout = [ [sg.Text("Enter Test Text")],
[sg.Input()],
[sg.Button('Ok')] ]
window = sg.Window('You', layout)
sg.Popup('About Me','Hello I am an AI devolped by Garrett Provence. I will be using your webcam to scan your suroundings for a quick few seconds and will open a text box where you will be able to ask me questions. By clicking ok below you agree to letting me acess everyhting said before. I am still in beta so please be patient.')
timeout = time.time() + 10;
while True:
##Webcam scanner
def infiniteloop1():
while True:
test = 0
if test == 5 or time.time() > timeout:
break
test = test - 1
successful_frame_read, frame = webcam.read()
grayscaled_img = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
face_coordinates = trained_face_data.detectMultiScale(grayscaled_img)
body_coordinates = trained_body_data.detectMultiScale(grayscaled_img)
eye_coordinates = trained_eye_data.detectMultiScale(grayscaled_img)
for (x,y,w,h) in face_coordinates:
cv2.rectangle(frame, (x, y),(x+w, y+h), (0,random.randrange(255),0), 2)
for (x,y,w,h) in body_coordinates:
cv2.rectangle(frame, (x, y),(x+w, y+h), (0,0,255), 2)
for (x,y,w,h) in eye_coordinates:
cv2.rectangle(frame, (x, y),(x+w, y+h), (random.randrange(255),0,0), 2)
cv2.imshow('FaceThing',frame)
cv2.waitKey(1)
thread1 = threading.Thread(target=infiniteloop1)
thread1.start()
event, values = window.read()
InputText = values[0]
import json
import requests
import os
import pprint
import Oxfordwords
from Oxfordwords import Word
import pprint
##end OF webcam scanner
#img = cv2.imread('Rdj.png')
while True:
##Test Text Scanner --
Text = values[0]
Word.get(Text)
if event == sg.WIN_CLOSED or event == 'Cancel':
break
sys.exit()
try:
words_in_excerpt = word_tokenize(Text)
nltk.pos_tag(words_in_excerpt)
print('Hello',nltk.pos_tag(words_in_excerpt), "")
sg.Popup('Test', nltk.pos_tag(words_in_excerpt))
sg.Popup('Def',Word.definitions())
break
except:
sg.Popup('There seems to be a error processing what you have said')
break
##End of test Text Scanner --
The oxford dictonary code -
#!/bin/env python3
""" oxford dictionary api """
from http import cookiejar
import requests
from bs4 import BeautifulSoup as soup
class WordNotFound(Exception):
""" word not found in dictionary (404 status code) """
pass
class BlockAll(cookiejar.CookiePolicy):
""" policy to block cookies """
return_ok = set_ok = domain_return_ok = path_return_ok = lambda self, *args, **kwargs: False
netscape = True
rfc2965 = hide_cookie2 = False
class Word(object):
""" retrive word info from oxford dictionary website """
entry_selector = '#entryContent > .entry'
header_selector = '.top-container'
title_selector = header_selector + ' .headword'
wordform_selector = header_selector + ' .pos'
property_global_selector = header_selector + ' .grammar'
br_pronounce_selector = '[geo=br] .phon'
am_pronounce_selector = '[geo=n_am] .phon'
br_pronounce_audio_selector = '[geo=br] [data-src-ogg]'
am_pronounce_audio_selector = '[geo=n_am] [data-src-ogg]'
definition_body_selector = '.senses_multiple'
namespaces_selector = '.senses_multiple > .shcut-g'
examples_selector = '.senses_multiple .sense > .examples .x'
definitions_selector = '.senses_multiple .sense > .def'
extra_examples_selector = '.res-g [title="Extra examples"] .x-gs .x'
phrasal_verbs_selector = '.phrasal_verb_links a'
idioms_selector = '.idioms > .idm-g'
other_results_selector = '#rightcolumn #relatedentries'
soup_data = None
#classmethod
def get_url(cls, word):
""" get url of word definition """
baseurl = 'https://www.oxfordlearnersdictionaries.com/definition/english/'
return baseurl + word
#classmethod
def delete(cls, selector):
""" remove tag with specified selector in cls.soup_data """
try:
for tag in cls.soup_data.select(selector):
tag.decompose()
except IndexError:
pass
#classmethod
def get(cls, word):
""" get html soup of word """
req = requests.Session()
req.cookies.set_policy(BlockAll())
page_html = req.get(cls.get_url(word), timeout=5, headers={'User-agent': 'mother animal'})
if page_html.status_code == 404:
raise WordNotFound
else:
cls.soup_data = soup(page_html.content, 'html.parser')
if cls.soup_data is not None:
# remove some unnecessary tags to prevent false positive results
cls.delete('[title="Oxford Collocations Dictionary"]')
cls.delete('[title="British/American"]') # edge case: 'phone'
cls.delete('[title="Express Yourself"]')
cls.delete('[title="Collocations"]')
cls.delete('[title="Word Origin"]')
#classmethod
def other_results(cls):
""" get similar words, idioms, phrases...
Return: {
'All matches': [
{'word1': word1, 'id1': id1, 'wordform1': wordform1},
{'word2': word2, 'id2': id2, 'wordform2': wordform2}
...
]
'Phrasal verbs': [
{'word1': word1, 'id1': id1, 'wordform1': wordform1},
{'word2': word2, 'id2': id2, 'wordform2': wordform2}
...
]
...
}
"""
info = []
try:
rightcolumn_tags = cls.soup_data.select(cls.other_results_selector)[0]
except IndexError:
return None
# there can be multiple other results table like All matches, Phrasal verbs, Idioms,...
header_tags = rightcolumn_tags.select('dt')
other_results_tags = rightcolumn_tags.select('dd')
# loop each other result table
for header_tag, other_results_tag in zip(header_tags, other_results_tags):
header = header_tag.text
other_results = []
for item_tag in other_results_tag.select('li'):
names = item_tag.select('span')[0].find_all(text=True, recursive=False)
wordform_tag = item_tag.select('pos')
names.append(wordform_tag[0].text if len(wordform_tag) > 0 else '')
other_results.append(names)
other_results = list(filter(None, other_results)) # remove empty list
ids = [cls.extract_id(tag.attrs['href'])
for tag in other_results_tag.select('li a')]
results = []
for other_result, id in zip(other_results, ids):
result = {}
result['name'] = ' '.join(list(map(lambda x: x.strip(), other_result[0:-1])))
result['id'] = id
try:
result['wordform'] = other_result[-1].strip()
except IndexError:
pass
results.append(result)
info.append({header: results})
return info
#classmethod
def name(cls):
""" get word name """
if cls.soup_data is None:
return None
return cls.soup_data.select(cls.title_selector)[0].text
#classmethod
def id(cls):
""" get id of a word. if a word has definitions in 2 seperate pages
(multiple wordform) it will return 'word_1' and 'word_2' depend on
which page it's on """
if cls.soup_data is None:
return None
return cls.soup_data.select(cls.entry_selector)[0].attrs['id']
#classmethod
def wordform(cls):
""" return wordform of word (verb, noun, adj...) """
if cls.soup_data is None:
return None
try:
return cls.soup_data.select(cls.wordform_selector)[0].text
except IndexError:
return None
#classmethod
def property_global(cls):
""" return global property (apply to all definitions) """
if cls.soup_data is None:
return None
try:
return cls.soup_data.select(cls.property_global_selector)[0].text
except IndexError:
return None
#classmethod
def get_prefix_from_filename(cls, filename):
""" get prefix (NAmE or BrE) from audio name when prefix is null """
if '_gb_' in filename:
return 'BrE'
elif '_us_' in filename:
return 'NAmE'
return None
#classmethod
def pronunciations(cls):
""" get britain and america pronunciations """
if cls.soup_data is None:
return None
britain = {'prefix': None, 'ipa': None, 'url': None}
america = {'prefix': None, 'ipa': None, 'url': None}
try:
britain_pron_tag = cls.soup_data.select(cls.br_pronounce_selector)[0]
america_pron_tag = cls.soup_data.select(cls.am_pronounce_selector)[0]
britain['ipa'] = britain_pron_tag.text
britain['prefix'] = 'BrE'
america['ipa'] = america_pron_tag.text
america['prefix'] = 'nAmE'
except IndexError:
pass
try:
britain['url'] = cls.soup_data.select(cls.br_pronounce_audio_selector)[0].attrs['data-src-ogg']
america['url'] = cls.soup_data.select(cls.am_pronounce_audio_selector)[0].attrs['data-src-ogg']
except IndexError:
pass
if britain['prefix'] == None and britain['url'] is not None:
britain['prefix'] = cls.get_prefix_from_filename(britain['url'])
if america['prefix'] == None and america['url'] is not None:
america['prefix'] = cls.get_prefix_from_filename(america['url'])
return [britain, america]
#classmethod
def extract_id(cls, link):
""" get word id from link
Argument: https://abc/definition/id
Return: id
"""
return link.split('/')[-1]
#classmethod
def get_references(cls, tags):
""" get info about references to other page
Argument: soup.select(<selector>)
Return: [{'id': <id>, 'name': <word>}, {'id': <id2>, 'name': <word2>}, ...]
"""
if cls.soup_data is None:
return None
references = []
for tag in tags.select('.xrefs a'): # see also <external link>
id = cls.extract_id(tag.attrs['href'])
word = tag.text
references.append({'id': id, 'name': word})
return references
#classmethod
def references(cls):
""" get global references """
if cls.soup_data is None:
return None
header_tag = cls.soup_data.select(cls.header_selector)[0]
return cls.get_references(header_tag)
#classmethod
def definitions(cls, full=False):
""" Return: list of definitions """
if cls.soup_data is None:
return None
if not full:
return [tag.text for tag in cls.soup_data.select(cls.definitions_selector)]
return cls.definition_full()
#classmethod
def examples(cls):
""" List of all examples (not categorized in seperate definitions) """
if cls.soup_data is None:
return None
return [tag.text for tag in cls.soup_data.select(cls.examples_selector)]
#classmethod
def phrasal_verbs(cls):
""" get phrasal verbs list (verb only) """
if cls.soup_data is None:
return None
phrasal_verbs = []
for tag in cls.soup_data.select(cls.phrasal_verbs_selector):
phrasal_verb = tag.select('.xh')[0].text
id = cls.extract_id(tag.attrs['href']) # https://abc/definition/id -> id
phrasal_verbs.append({'name': phrasal_verb, 'id': id})
return phrasal_verbs
#classmethod
def _parse_definition(cls, parent_tag):
""" return word definition + corresponding examples
A word can have a single (None) or multiple namespaces
Each namespace can have one or many definitions
Each definitions can have one, many or no examples
Some words can have specific property
(transitive/intransitive/countable/uncountable/singular/plural...)
A verb can have phrasal verbs
"""
if cls.soup_data is None:
return None
definition = {}
try: # property (countable, transitive, plural,...)
definition['property'] = parent_tag.select('.grammar')[0].text
except IndexError:
pass
try: # label: (old-fashioned), (informal), (saying)...
definition['label'] = parent_tag.select('.labels')[0].text
except IndexError:
pass
try: # refer to something (of people, of thing,...)
definition['refer'] = parent_tag.select('.dis-g')[0].text
except IndexError:
pass
definition['references'] = cls.get_references(parent_tag)
if not definition['references']:
definition.pop('references', None)
try: # sometimes, it just refers to other page without having a definition
definition['description'] = parent_tag.select('.def')[0].text
except IndexError:
pass
definition['examples'] = [example_tag.text
for example_tag in parent_tag.select('.examples .x')]
definition['extra_example'] = [
example_tag.text
for example_tag in parent_tag.select('[unbox=extra_examples] .examples .unx')
]
return definition
#classmethod
def definition_full(cls):
""" return word definition + corresponding examples
A word can have a single (None) or multiple namespaces
Each namespace can have one or many definitions
Each definitions can have one, many or no examples
Some words can have specific property
(transitive/intransitive/countable/uncountable/singular/plural...)
A verb can have phrasal verbs
"""
if cls.soup_data is None:
return None
namespace_tags = cls.soup_data.select(cls.namespaces_selector)
info = []
for namespace_tag in namespace_tags:
try:
namespace = namespace_tag.select('h2.shcut')[0].text
except IndexError:
# some word have similar definitions grouped in a multiple namespaces (time)
# some do not, and only have one namespace (woman)
namespace = None
definitions = []
definition_full_tags = namespace_tag.select('.sense')
for definition_full_tag in definition_full_tags:
definition = cls._parse_definition(definition_full_tag)
definitions.append(definition)
info.append({'namespace': namespace, 'definitions': definitions})
# no namespace. all definitions is global
if len(info) == 0:
info.append({'namespace': '__GLOBAL__', 'definitions': []})
def_body_tags = cls.soup_data.select(cls.definition_body_selector)
definitions = []
definition_full_tags = def_body_tags[0].select('.sense')
for definition_full_tag in definition_full_tags:
definition = cls._parse_definition(definition_full_tag)
definitions.append(definition)
info[0]['definitions'] = definitions
return info
#classmethod
def idioms(cls):
""" get word idioms
Idioms dont have namespace like regular definitions
Each idioms have one or more definitions
Each definitions can have one, many or no examples
"""
idiom_tags = cls.soup_data.select(cls.idioms_selector)
idioms = []
for idiom_tag in idiom_tags:
try:
# sometimes idiom is in multiple idm classes inside
# one idm-l class instead of a single idm class
idiom = idiom_tag.select('.idm-l')[0].text
except IndexError:
idiom = idiom_tag.select('.idm')[0].text
global_definition = {}
try: # label: (old-fashioned), (informal), (saying)...
global_definition['label'] = idiom_tag.select('.labels')[0].text
except IndexError:
pass
try: # refer to something (of people, of thing,...)
global_definition['refer'] = idiom_tag.select('.dis-g')[0].text
except IndexError:
pass
global_definition['references'] = cls.get_references(idiom_tag)
if not global_definition['references']:
global_definition.pop('references', None)
definitions = []
# one idiom can have multiple definitions, each can have multiple examples or no example
for definition_tag in idiom_tag.select('.sense'):
definition = {}
try: # sometimes, it just refers to other page without having a definition
definition['description'] = definition_tag.select('.def')[0].text
except IndexError:
pass
try: # label: (old-fashioned), (informal), (saying)...
definition['label'] = definition_tag.select('.labels')[0].text
except IndexError:
pass
try: # refer to something (of people, of thing,...)
definition['refer'] = definition_tag.select('.dis-g')[0].text
except IndexError:
pass
definition['references'] = cls.get_references(definition_tag)
if not definition['references']:
definition.pop('references', None)
definition['examples'] = [example_tag.text for example_tag in definition_tag.select('.x')]
definitions.append(definition)
idioms.append({'name': idiom, 'summary': global_definition, 'definitions': definitions})
return idioms
#classmethod
def info(cls):
""" return all info about a word """
if cls.soup_data is None:
return None
word = {
'id': cls.id(),
'name': cls.name(),
'wordform': cls.wordform(),
'pronunciations': cls.pronunciations(),
'property': cls.property_global(),
'definitions': cls.definitions(full=True),
'idioms': cls.idioms(),
'other_results': cls.other_results()
}
if not word['property']:
word.pop('property', None)
if not word['other_results']:
word.pop('other_results', None)
if word['wordform'] == 'verb':
word['phrasal_verbs'] = cls.phrasal_verbs()
return word
Any help will be appreciated thank you:)

just split values[0] into words and call Word.get(...) on each
import re
while True:
##Test Text Scanner --
words = re.findall(r"\w+", values[0].strip()) # can also use nltk.word_tokenize
for word in words:
Word.get(word)
if event == sg.WIN_CLOSED or event == 'Cancel':
break
try:
words_in_excerpt = word_tokenize(Text)
nltk.pos_tag(words_in_excerpt)
print('Hello', nltk.pos_tag(words_in_excerpt), "")
sg.Popup('Test', nltk.pos_tag(words_in_excerpt))
sg.Popup('Def', Word.definitions())
break
except:
sg.Popup('There seems to be a error processing what you have said')
break

How to get the scope of a class in terms of starting line and end line in Python?

I want to check if class c uses a certain module m. I am able to get two things:
- the module's usage in that file(f),
- the starting line number of the classes in file f.
However, in order to know whether a particular class uses the module, I need to know the starting and end line of class c. I don't know how to get the end line of class c.
I tried going through the documentation of ast but could not find any method of finding the scope of a class.
My current code is:
source_code = "car"
source_code_data = pyclbr.readmodule(source_code)
This gives the following in source_code_data variable:
1
As you can see in the image, there is lineno but no end lineno to denote the end line of a class.
I expect to get the end line of the class in order to know its scope and finally knowing the usage of the module. Currently, the module has this structure:
Module: vehicles used in file: /Users/aviralsrivastava/dev/generate_uml/inheritance_and_dependencies/car.py at: [('vehicles.Vehicle', 'Vehicle', 5), ('vehicles.Vehicle', 'Vehicle', 20)]
So, with this information, I will be able to know if a module is used in a range: used in a class.
My whole code base is:
import ast
import os
import pyclbr
import subprocess
import sys
from operator import itemgetter
from dependency_collector import ModuleUseCollector
from plot_uml_in_excel import WriteInExcel
class GenerateUML:
def __init__(self):
self.class_dict = {} # it will have a class:children mapping.
def show_class(self, name, class_data):
print(class_data)
self.class_dict[name] = []
self.show_super_classes(name, class_data)
def show_methods(self, class_name, class_data):
methods = []
for name, lineno in sorted(class_data.methods.items(),
key=itemgetter(1)):
# print(' Method: {0} [{1}]'.format(name, lineno))
methods.append(name)
return methods
def show_super_classes(self, name, class_data):
super_class_names = []
for super_class in class_data.super:
if super_class == 'object':
continue
if isinstance(super_class, str):
super_class_names.append(super_class)
else:
super_class_names.append(super_class.name)
for super_class_name in super_class_names:
if self.class_dict.get(super_class_name, None):
self.class_dict[super_class_name].append(name)
else:
self.class_dict[super_class_name] = [name]
# adding all parents for a class in one place for later usage: children
return super_class_names
def get_children(self, name):
if self.class_dict.get(name, None):
return self.class_dict[name]
return []
source_code = "car"
source_code_data = pyclbr.readmodule(source_code)
print(source_code_data)
generate_uml = GenerateUML()
for name, class_data in sorted(source_code_data.items(), key=lambda x: x[1].lineno):
print(
"Class: {}, Methods: {}, Parent(s): {}, File: {}".format(
name,
generate_uml.show_methods(
name, class_data
),
generate_uml.show_super_classes(name, class_data),
class_data.file
)
)
print('-----------------------------------------')
# create a list with all the data
# the frame of the list is: [{}, {}, {},....] where each dict is: {"name": <>, "methods": [], "children": []}
agg_data = []
files = {}
for name, class_data in sorted(source_code_data.items(), key=lambda x: x[1].lineno):
methods = generate_uml.show_methods(name, class_data)
children = generate_uml.get_children(name)
# print(
# "Class: {}, Methods: {}, Child(ren): {}, File: {}".format(
# name,
# methods,
# children,
# class_data.file
# )
# )
agg_data.append(
{
"Class": name,
"Methods": methods,
"Children": children,
"File": class_data.file
}
)
files[class_data.file] = name
print('-----------------------------------------')
# print(agg_data)
for data_index in range(len(agg_data)):
agg_data[data_index]['Dependents'] = None
module = agg_data[data_index]["File"].split('/')[-1].split('.py')[0]
used_in = []
for file_ in files.keys():
if file_ == agg_data[data_index]["File"]:
continue
collector = ModuleUseCollector(module)
source = open(file_).read()
collector.visit(ast.parse(source))
print('Module: {} used in file: {} at: {}'.format(
module, file_, collector.used_at))
if len(collector.used_at):
used_in.append(files[file_])
agg_data[data_index]['Dependents'] = used_in
'''
# checking the dependencies
dependencies = []
for data_index in range(len(agg_data)):
collector = ModuleUseCollector(
agg_data[data_index]['File'].split('/')[-1].split('.py')[0]
)
collector.visit(ast.parse(source))
dependencies.append(collector.used_at)
agg_data[source_code_index]['Dependencies'] = dependencies
# next thing, for each class, find the dependency in each class.
'''
print('------------------------------------------------------------------')
print('FINAL')
for data in agg_data:
print(data)
print('-----------')
print('\n')
# The whole data is now collected and we need to form the dataframe of it:
'''
write_in_excel = WriteInExcel(file_name='dependency_1.xlsx')
df = write_in_excel.create_pandas_dataframe(agg_data)
write_in_excel.write_df_to_excel(df, 'class_to_child_and_dependents')
'''
'''
print(generate_uml.class_dict)
write_in_excel = WriteInExcel(
classes=generate_uml.class_dict, file_name='{}.xlsx'.format(source_code))
write_in_excel.form_uml_sheet_for_classes()
'''

Edit: Invasive Method
I did manage to find a way to do this with pyclbr but it involves changing parts of the source code. Essentially you make the stack (which is normally a list) a custom class. When an item is removed from the stack (when its scope has ended) the ending line number is added. I tried to make it as un invasive as possible.
First define a stack class in the pyclbr module:
class Stack(list):
def __init__(self):
import inspect
super().__init__()
def __delitem__(self, key):
frames = inspect.stack()
setattr(self[key][0], 'endline', frames[1].frame.f_locals['start'][0] - 1)
super().__delitem__(key)
Then you change the stack in the _create_tree function which is originally on line 193:
stack = Stack()
Now you can access the ending line as well by using
class_data.endline
I was not able to find a solution that uses the libraries you are using. So I wrote a simple parser that finds the start and end lines of every class contained in a module. You must pass in either the text wrapper itself or the list created by readlines() on your file. Its also worth noting that if a class continues until the end of the file then the end is -1.
import re
def collect_classes(source):
classes = {}
current_class = None
for lineno, line in enumerate(source, start=1):
if current_class and not line.startswith(' '):
if line != '\n':
classes[current_class]['end'] = lineno - 1
current_class = None
if line.startswith('class'):
current_class = re.search(r'class (.+?)(?:\(|:)', line).group(1)
classes[current_class] = {'start': lineno}
if current_class:
classes[current_class]['end'] = -1
return classes
import datetime
file = open(datetime.__file__, 'r') # opening the source of datetime for a test
scopes = collect_classes(file)
print(scopes)
This outputs:
{'timedelta': {'start': 454, 'end': 768}, 'date': {'start': 774, 'end': 1084}, 'tzinfo': {'start': 1092, 'end': 1159}, 'time': {'start': 1162, 'end': 1502}, 'datetime': {'start': 1509, 'end': 2119}, 'timezone': {'start': 2136, 'end': 2251}}

Thanks! The endline is really a missing feature there.
The discussion on this led to an alternative (still invasive) method to achieve this but that is almost a one-liner that does not require to subclass list and override the del operator:
https://github.com/python/cpython/pull/16466#issuecomment-539664180
Basically, just add
stack[-1].endline = start - 1
before all del stack[-1] in pyclbr.py

Python: how to parse and and add contents to a text file

Hello I have a network in a particular format, i.e. .gdf. However this is a text file in the following format
network:
nodedef>name VARCHAR,label VARCHAR
0,' 0 '
1,' 1 '
2,' 2 '
edgedef>node1 VARCHAR,node2 VARCHAR,weight DOUBLE
0,1,0.2
0,2,0.2
0,3,0.2
0,4,0.333333
where the first part refers to nodes and the second part to edges.
I want to add feature to read the file and add a feature to the nodes and return the following:
network:
nodedef>name VARCHAR,label VARCHAR, att1 VARCHAR
0,' 0 ', 'Paul'
1,' 1 ', 'Jack'
2,' 2 ', 'John'
edgedef>node1 VARCHAR,node2 VARCHAR,weight DOUBLE
0,1,0.2
0,2,0.2
0,3,0.2
0,4,0.333333

Here is some code that does the first half of what you asked for. It will parse the .GDF file and make the information available to you. Adding attributes and writing them is left as an exercise for the reader.
import ast
import collections
import re
def main():
parser = GDFParser()
with open('network.gdf') as file:
parser.read(file)
print(*parser.data, sep='\n')
def pivot(iterable):
columns = []
for row in iterable:
columns.extend([] for _ in range(len(row) - len(columns)))
for column, cell in zip(columns, row):
column.append(cell)
return columns
class GDFParser:
HEADER = re.compile('\w+:')
DEF = re.compile('\w+>\w+ (?:DOUBLE|VARCHAR)(?:,\w+ (?:DOUBLE|VARCHAR))*')
CAST = dict(DOUBLE=float, VARCHAR=str)
def __init__(self):
self.__header = None
self.__type = []
self.__data = []
#property
def header(self):
return self.__header
#property
def data(self):
return tuple(self.__data)
def read(self, file):
for line in file:
self.__read_line(line.strip())
def __read_line(self, line):
if self.HEADER.fullmatch(line):
self.__process_header(line)
elif self.DEF.fullmatch(line):
self.__process_def(line)
else:
self.__process_data(line)
def __process_header(self, line):
if self.header:
raise ValueError('header was previously set')
self.__header = line[:-1]
def __process_def(self, line):
name, fields = line.split('>')
columns, casts = pivot(field.split() for field in fields.split(','))
self.__type.append((collections.namedtuple(name, columns),
tuple(map(self.CAST.__getitem__, casts))))
def __process_data(self, line):
if not self.__type:
raise ValueError('a definition must come before its data')
kind, casts = self.__type[-1]
self.__data.append(kind(*(cast(item) for cast, item in
zip(casts, ast.literal_eval(line)))))
if __name__ == '__main__':
main()

Best way to represent this using Python data structures

<specification>
<propertyName> string </propertyName>
<value>
<number>
<value> anyNumberHere </value>
</number>
<text>
<value> anyTextHere </value>
</text>
<URL>
<value> anyURIHere </value>
</URL>
</value>
<!-- ... 1 or more value nodes here ... -->
</specification>
<!-- ... 1 or more specification nodes here ... -->
I need to construct this request for an API, where user will pass these values. So, what should be the best way to represent this, so that it is easy for the user of the method to pass the respective values to the operation?
I am thinking:
List of List of Dicts:
[specification1, specification2, specification3]
Where:
specification1= [value1, value2]
Where:
value1 = {number:anyNumberHere, text:anyTextHere, URL:anyURIHere}
value2 = {number:anyNumberHere, text:anyTextHere, URL:anyURIHere}
But, I am not able to accommodate: <propertyName> here. Any suggestions?
More over, it sounds way to complicated. Can we have object encapsulation like we do it in Java? I understand, we can, but I am curious, what is the recommended way in python?
My logic for now, suggestions (incorrect due to propertyName):
#specification is a List of List of Dicts
for spec in specification:
specification_elem = etree.SubElement(root, "specification")
propertyName_elem = etree.SubElement(specification_elem,"propertyName")
propertyName_elem.text = spec_propertyName
for value in spec:
value_elem = etree.SubElement(specification_elem, "value")
for key in value:
key_elem = etree.SubElement(value_elem, key)
keyValue_elem = etree.SubElement(key_elem, "value")
keyValue_elem.text = value[key]
Here, I will pass spec_propertyName, as a diff parameter. So, user will pass: specification and spec_propertyName

How about a list of Specification objects, of which each has a property_name and a list of value dictionaries? (values could be a list of objects instead of dictionaries.)
For example:
class Value(object):
def __init__(self,
number=None,
text=None,
url=None):
self.number = number
self.text = text
self.url = url
class Specification(object):
def __init__(self, propertyName):
self.propertyName = propertyName
self.values = []
spec1 = Specification("Spec1")
spec1.values = [Value(number=3,
url="http://www.google.com"),
Value(text="Hello, World!",
url="http://www.jasonfruit.com")]
spec2 = Specification("Spec2")
spec2.values = [Value(number=27,
text="I can haz cheezburger?",
url="http://stackoverflow.com"),
Value(text="Running out of ideas.",
url="http://news.google.com")]

If propertyNames are unique, you can have a dict of list of dict. If they are not, you can have a list of list of list of dict.
If all those lists end up hard to keep track of, you can make a class to store the data:
class Specification:
def __init__(self):
self.propertyName = ""
self.values = []
Then, use:
spec = Specification()
spec.propertyName = "string"
value = {"URL":"someURI", "text":"someText"}
spec.values.append(value)

Here's a representation approach using named tuples. You can upgrade to using classes (adding code to do some validation on the caller's input, and allowing the omission of field=None for optional fields) at your leisure with no other change to the user API and little change to the ElementTree-building code.
# -*- coding: cp1252 -*-
from collections import namedtuple
import xml.etree.cElementTree as etree
Specifications = namedtuple('Specifications', 'specification_list')
Specification = namedtuple('Specification', 'propertyName value_list')
Value = namedtuple('Value', 'number text url')
def make_etree(specifications, encoding):
"""
Convert user's `specifications` to an ElementTree.
`encoding` is encoding of *input* `str` objects.
"""
def ensure_unicode(v):
if isinstance(v, str): return v.decode(encoding)
if isinstance(v, unicode): return v
return unicode(v) # convert numbers etc to unicode strings
root = etree.Element('specifications')
for spec in specifications.specification_list:
specification_elem = etree.SubElement(root, "specification")
propertyName_elem = etree.SubElement(specification_elem, "propertyName")
propertyName_elem.text = ensure_unicode(spec.propertyName)
for value in spec.value_list:
value_elem = etree.SubElement(specification_elem, "value")
for key in value._fields:
kv = getattr(value, key)
if kv is None: continue
key_elem = etree.SubElement(value_elem, key)
keyValue_elem = etree.SubElement(key_elem, "value")
keyValue_elem.text = ensure_unicode(kv)
return etree.ElementTree(root)
# === sample caller code follows ===
specs = Specifications(
specification_list=[
Specification(
propertyName='a prop',
value_list=[
Value(
number=42,
text='universe',
url='http://uww.everywhere',
),
Value(
number=0,
text=None, # optional
url='file:///dev/null',
),
],
),
Specification(
propertyName='b prop',
value_list=[
Value(
number=1,
text='Üñîçøðè', # str object, encoded in cp1252
url=u'Üñîçøðè', # unicode object
),
],
),
],
)
print repr(specs); print
import sys
tree = make_etree(specs, 'cp1252')
import cStringIO
f = cStringIO.StringIO()
tree.write(f, encoding='UTF-8', xml_declaration=True)
print repr(f.getvalue())
print
Output (folded at column 80):
Specifications(specification_list=[Specification(propertyName='a prop', value_li
st=[Value(number=42, text='universe', url='http://uww.everywhere'), Value(number
=0, text=None, url='file:///dev/null')]), Specification(propertyName='b prop', v
alue_list=[Value(number=1, text='\xdc\xf1\xee\xe7\xf8\xf0\xe8', url=u'\xdc\xf1\x
ee\xe7\xf8\xf0\xe8')])])
"<?xml version='1.0' encoding='UTF-8'?>\n<specifications><specification><propert
yName>a prop</propertyName><value><number><value>42</value></number><text><value
>universe</value></text><url><value>http://uww.everywhere</value></url></value><
value><number><value>0</value></number><url><value>file:///dev/null</value></url
></value></specification><specification><propertyName>b prop</propertyName><valu
e><number><value>1</value></number><text><value>\xc3\x9c\xc3\xb1\xc3\xae\xc3\xa7
\xc3\xb8\xc3\xb0\xc3\xa8</value></text><url><value>\xc3\x9c\xc3\xb1\xc3\xae\xc3\
xa7\xc3\xb8\xc3\xb0\xc3\xa8</value></url></value></specification></specification
s>"

Parse xml file while a tag is missing

I try to parse an xml file. The text which is in tags is parsed successfully (or it seems so) but I want to output as the text which is not contained in some tags and the following program just ignores it.
from xml.etree.ElementTree import XMLTreeBuilder
class HtmlLatex: # The target object of the parser
out = ''
var = ''
def start(self, tag, attrib): # Called for each opening tag.
pass
def end(self, tag): # Called for each closing tag.
if tag == 'i':
self.out += self.var
elif tag == 'sub':
self.out += '_{' + self.var + '}'
elif tag == 'sup':
self.out += '^{' + self.var + '}'
else:
self.out += self.var
def data(self, data):
self.var = data
def close(self):
print(self.out)
if __name__ == '__main__':
target = HtmlLatex()
parser = XMLTreeBuilder(target=target)
text = ''
with open('input.txt') as f1:
text = f1.read()
print(text)
parser.feed(text)
parser.close()
A part of the input I want to parse:
<p><i>p</i><sub>0</sub> = (<i>m</i><sup>3</sup>+(2<i>l</i><sub>2</sub>+<i>l</i><sub>1</sub>) <i>m</i><sup>2</sup>+(<i>l</i><sub>2</sub><sup>2</sup>+2<i>l</i><sub>1</sub> <i>l</i><sub>2</sub>+<i>l</i><sub>1</sub><sup>2</sup>) <i>m</i>) /(<i>m</i><sup>3</sup>+(3<i>l</i><sub>2</sub>+2<i>l</i><sub>1</sub>) ) }.</p>

Have a look at BeautifulSoup, a python library for parsing, navigating and manipulating html and xml. It has a handy interface and might solve your problem ...

Here's a pyparsing version - I hope the comments are sufficiently explanatory.
src = """<p><i>p</i><sub>0</sub> = (<i>m</i><sup>3</sup>+(2<i>l</i><sub>2</sub>+<i>l</i><sub>1</sub>) """ \
"""<i>m</i><sup>2</sup>+(<i>l</i><sub>2</sub><sup>2</sup>+2<i>l</i><sub>1</sub> <i>l</i><sub>2</sub>+""" \
"""<i>l</i><sub>1</sub><sup>2</sup>) <i>m</i>) /(<i>m</i><sup>3</sup>+(3<i>l</i><sub>2</sub>+""" \
"""2<i>l</i><sub>1</sub>) ) }.</p>"""
from pyparsing import makeHTMLTags, anyOpenTag, anyCloseTag, Suppress, replaceWith
# set up tag matching for <sub> and <sup> tags
SUB,endSUB = makeHTMLTags("sub")
SUP,endSUP = makeHTMLTags("sup")
# all other tags will be suppressed from the output
ANY,endANY = map(Suppress,(anyOpenTag,anyCloseTag))
SUB.setParseAction(replaceWith("_{"))
SUP.setParseAction(replaceWith("^{"))
endSUB.setParseAction(replaceWith("}"))
endSUP.setParseAction(replaceWith("}"))
transformer = (SUB | endSUB | SUP | endSUP | ANY | endANY)
# now use the transformer to apply these transforms to the input string
print transformer.transformString(src)
Gives
p_{0} = (m^{3}+(2l_{2}+l_{1}) m^{2}+(l_{2}^{2}+2l_{1} l_{2}+l_{1}^{2}) m) /(m^{3}+(3l_{2}+2l_{1}) ) }.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Parse HTML style text annotations to a list of dictionaries - python

Related

Is there a way to split the text I'm inputting into different strings that the dictionary looks at individually

How to get the scope of a class in terms of starting line and end line in Python?

Python: how to parse and and add contents to a text file

Best way to represent this using Python data structures

Parse xml file while a tag is missing

Categories

Resources