Why is text of HTML node empty with HTMLParser? - python

In the following example I am expecting to get Foo for the <h2> text:
from io import StringIO
from html5lib import HTMLParser
fp = StringIO('''
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
<h2>
<span class="section-number">1. </span>
Foo
<a class="headerlink" href="#foo">¶</a>
</h2>
</body>
</html>
''')
etree = HTMLParser(namespaceHTMLElements=False).parse(fp)
h2 = etree.findall('.//h2')[0]
h2.text
Unfortunately I get ''. Why?
Strangely, Foo does show up in the element's text content:
>>> list(h2.itertext())
['1. ', 'Foo', '¶']
>>> h2.getchildren()
[<Element 'span' at 0x7fa54c6a1bd8>, <Element 'a' at 0x7fa54c6a1c78>]
>>> [node.text for node in h2.getchildren()]
['1. ', '¶']
So where is Foo?

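I think you are looking one level too shallow in the tree. In the ElementTree model, text that follows a child element is stored in that child's tail attribute, not in the parent's text. Try this: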
from io import StringIO
from html5lib import HTMLParser
# Sample document: inside <h2>, "Foo" sits between the <span> and the <a>.
fp = StringIO('''
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
<h2>
<span class="section-number">1. </span>
Foo
<a class="headerlink" href="#foo">¶</a>
</h2>
</body>
</html>
''')
# NOTE(review): namespaceHTMLElements=False presumably keeps tag names
# un-namespaced so the plain './/h2' path matches -- confirm in html5lib docs.
etree = HTMLParser(namespaceHTMLElements=False).parse(fp)
# The first [0] picks the first <h2>; the second [0] picks its first child
# (the <span>). Text that follows a child's end tag is stored on that child's
# .tail, so "Foo" is read from the span's tail rather than from h2.text.
etree.findall('.//h2')[0][0].tail
More generally, to crawl all text and tail, try a loop like this:
# Walk the direct children of the first <h2>; each child's own text is in
# .text and the text that follows its end tag is in .tail.
for u in etree.findall('.//h2')[0]:
    print(u.text, u.tail)

Using lxml:
fp2 = '''
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
<h2>
<span class="section-number">1. </span>
Foo
<a class="headerlink" href="#foo">¶</a>
</h2>
</body>
</html>
'''
import lxml.html
tree = lxml.html.fromstring(fp2)
for item in tree.xpath('//h2'):
    # text_content() flattens the heading and its children into one string;
    # in this sample each piece ends up on its own line.
    target = item.text_content().strip()
    # Index 1 is the second line of that flattened text ("Foo" here). This
    # relies on the sample's exact line layout.
    print(target.split('\n')[1].strip())
Output:
Foo

Related

Mixing HTML and Python code - how to refer to a changing filename within the HTML code

I'm trying to automate a process where I take a snapshot every day and name the file after that date. For example, I'd like to reference today's file as "20200219 snapshot.png" and tomorrow's as "20200220 snapshot.png". The problem is that I can't use the variable filename inside the img src attribute; I have to hard-code the exact string.
date = date.strftime('%Y%m%d')
filename = date + " snapshot.png"
html = """\
<html>
<head></head>
<body>
<img src="Directory/snapshot.png"/>
</body>
</html>
"""
You can use ElementTree to parse through the HTML DOM, use the find method to search for img tag. Then you can assign the src attribute value. The attributes are returned as a dict with the attrib parameter and you just need to look for the 'src' key:
import datetime
import xml.etree.ElementTree as et

# Today's date as YYYYMMDD, used as the file name prefix.
date = datetime.datetime.now().strftime('%Y%m%d')
filename = date + " snapshot.png"
html = """\
<html>
<head></head>
<body>
<img src="Directory/snapshot.png"/>
</body>
</html>
"""
tree = et.fromstring(html)
image = tree.find('body/img')
if image is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError further down.
    raise ValueError("no <img> element found under <body>")
# Set the attribute directly. Looping over the keys with a substring test
# would also overwrite attributes such as "srcset" or "data-src".
image.set('src', filename)
html_new = et.tostring(tree)
print(html_new)
Output:
b'<html>\n <head />\n <body>\n <img src="20200220 snapshot.png" />\n </body>\n</html>'
To pretty print this output, you can use the method provided in official docs here and just do:
et.dump(tree)
Output:
<html>
<head />
<body>
<img src="20200220 snapshot.png" />
</body>
</html>
Just make it an f-string: prefix the string with f and put your variable between {} inside the string.
import datetime
date = datetime.datetime.now().strftime('%Y%m%d')
filename = date + " snapshot.png"
html = f"""\
<html>
<head></head>
<body>
<img src="Directory/{filename}"/>
</body>
</html>
"""
print(html)
Or use simple string concatenation instead
import datetime
# Today's date as YYYYMMDD, used as the file name prefix.
date = datetime.datetime.now().strftime('%Y%m%d')
filename = date + " snapshot.png"
# First half of the document, up to and including the opening quote of src.
html = """\
<html>
<head></head>
<body>
<img src="Directory/"""
html += filename
# The second half must start by closing the src attribute's quote; without
# it the emitted <img> tag is malformed.
html += '''"/>
</body>
</html>
'''
print(html)

Remove all the Html content from a string in python

I would like to remove the all HTML contents from the string.
I have a string
str= "I am happy with <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> 3333 <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> your code"
I want the final string
str= "I am happy with 3333 your code"
I have written this code to do above task.
def removetags(input_str):
    """Return input_str with tags and everything between them dropped.

    A depth counter goes up on every '<' that is not followed by '/', and
    goes down on the '>' that ends a closing tag. Characters are kept only
    while the counter is zero. Any '<' in plain text is treated as the start
    of a tag, so such input is not handled correctly.
    """
    kept = []
    depth = 0
    opening = True  # True while the tag being read is an opening tag
    last = len(input_str) - 1
    for pos, ch in enumerate(input_str):
        if ch == '<':
            # A '<' is never copied; it opens a level unless it begins "</"
            # or is the very last character.
            if pos != last and input_str[pos + 1] != '/':
                opening = True
                depth += 1
            continue
        if ch == '>' and depth:
            # Only the '>' of a closing tag ends a level.
            if not opening:
                depth -= 1
            opening = False
            continue
        if not depth:
            kept.append(ch)
    return ''.join(kept)
print(removetags(str))
This works fine, but if there is a < in the text the output is not correct. So I want to remove the tags using HTML parsing instead. Is there any way to do that? I found this library, but I couldn't work out how to do it. Thanks in advance.
from html.parser import HTMLParser
str = "I am happy with <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> 3333 > <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> your code"
class MyHTMLParser(HTMLParser):
    """Collect only the text that lies outside every HTML element.

    After feeding, ``html_free_text`` holds the kept text chunks in document
    order. ``got_html_in_tags`` is True while the parser is inside at least
    one open element.
    """

    # Elements that have no end tag; they must not change the nesting depth,
    # otherwise a lone <br> would suppress all following text.
    VOID_ELEMENTS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'source', 'track', 'wbr',
    ))

    def reset(self):
        # HTMLParser.__init__ calls reset(), so state set here is created per
        # instance instead of being shared through class attributes.
        super().reset()
        self._open_tags = 0
        self.got_html_in_tags = False
        self.html_free_text = []

    def handle_starttag(self, tag, attrs):
        if tag in self.VOID_ELEMENTS:
            return
        self._open_tags += 1
        self.got_html_in_tags = True

    def handle_endtag(self, tag):
        if tag in self.VOID_ELEMENTS:
            return
        # Count nesting levels: a simple flag would be cleared by the first
        # inner end tag and leak the rest of the outer element's text.
        # Clamp at zero so a stray end tag cannot push the depth negative.
        self._open_tags = max(0, self._open_tags - 1)
        self.got_html_in_tags = self._open_tags > 0

    def handle_data(self, data):
        if not self.got_html_in_tags:
            self.html_free_text.append(data)
parser = MyHTMLParser()
parser.feed(str)
print("".join(parser.html_free_text))
This will print I am happy with 3333 your code even with '>' or '<' in text
Another re solution:
re.sub(r'(<(?P<tag>[a-zA-Z0-9]+)>.*?</(?P=tag)>)', '', string)
Tests:
>>> re.sub(r'(<(?P<tag>[a-zA-Z0-9]+)>.*?</(?P=tag)>)', '', string)
'I am happy with 3333 your code'
>>> string = "I am happy with <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> 3333 > <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> your code"
>>> re.sub(r'(<(?P<tag>[a-zA-Z0-9]+)>.*?</(?P=tag)>)', '', string)
'I am happy with 3333 > your code'
>>> string = "I am <a happy with <body> </body> lal"
>>> re.sub(r'(<(?P<tag>[a-zA-Z0-9]+)>.*?</(?P=tag)>)', '', string)
'I am <a happy with lal'
You can use regex library for that,
import re
str= "I am happy with <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> 3333 <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> your code"
comp = re.compile(r'<([\w]+)[^>]*>(.*?)<\/\1>')
data = re.sub(comp, '', str)
print(data)
Maybe this helps.
Let's do this recursively ;)
Base case 1: when the text is an empty string,
return an empty string.
Case 2: when the first character of the text is an opening angle bracket (<),
look for the closing angle bracket (>) and return a recursive call on the text that remains after the tag.
def remove_tags(text, tags=None):
    """Recursively strip simple tags and the content they enclose.

    Only bare tags of the form ``<name>`` / ``</name>`` are recognised, where
    the name starts with a letter and is alphanumeric. A '<' that does not
    start such a tag (e.g. in ``x<=1``) is kept as ordinary text. Tags with
    attributes are not recognised.

    Args:
        text: The string to clean.
        tags: Stack of currently open tags, used by the recursive calls. A
            list passed by the caller is modified in place. Defaults to a
            fresh empty stack on every top-level call.

    Returns:
        ``text`` without the recognised tags and without anything that lies
        between an opening tag and its closing tag.

    Note:
        Recursion depth grows with the number of '<' characters in ``text``.
    """
    if tags is None:
        # A fresh stack per call: a shared default list would carry unclosed
        # tags over into later, unrelated calls.
        tags = []
    if text == '':
        return text
    if text[0] == '<':
        closing_caret_pos = text.find('>')
        # With no '>' at all, find() gives -1 and the slice is empty, which
        # fails the validity test below.
        tag = text[:closing_caret_pos + 1]
        is_close_tag = tag.startswith('</')
        name = tag[2:-1] if is_close_tag else tag[1:-1]
        # isalnum() rather than isalpha() so that <h1> ... <h6> count as tags.
        is_valid_tag = name.isalnum() and name[0].isalpha()
        if is_valid_tag:
            if is_close_tag:
                if tags:  # ignore a stray closing tag instead of crashing
                    tags.pop()
            else:
                tags.append(tag)
            return remove_tags(text[len(tag):], tags)
        # Not a tag: treat this '<' as a single ordinary character.
        head, rest = text[0], text[1:]
    else:
        # Take the whole run up to the next '<' in one step, so the recursion
        # does not go one level deeper for every character.
        next_open = text.find('<')
        if next_open == -1:
            next_open = len(text)
        head, rest = text[:next_open], text[next_open:]
    if tags:  # inside an open tag: drop the text and keep looking
        return remove_tags(rest, tags)
    return head + remove_tags(rest, tags)
Test runs:
text = "I am happy with <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> 3333 <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> your code"
print(remove_tags(text))
>
I am happy with 3333 your code
text = "x<=1 <div> cookies </div>"
print(remove_tags(text))
>
x<=1
text = "I am <a happy with <body> </body> lal"
print(remove_tags(text))
>
I am <a happy with lal

Python Transcrypt addEventListener

I have written a Python program for translation with Transcrypt to Javascript.
I can not get the addEventListener function to work. Any ideas?
Here is the code as dom7.py:
class TestSystem:
    """Wires a button to a handler that appends paragraphs to the page."""

    def __init__ (self):
        self.text = 'Hello, DOM!'
        self.para = 'A new paragraph'
        self.textblock = 'This is an expandable text block.'
        # Looks the button up in the DOM at construction time.
        self.button1 = document.getElementById("button1")
        self.button1.addEventListener('mousedown', self.pressed)

    def insert(self):
        # Writes the greeting into the page's <output> element.
        document.querySelector('output').innerText = self.text
        # document.querySelector('test').innerText = "Test"+self.button1+":"

    def pressed(self):
        # Appends a new <p> holding self.para to the #textblock container.
        container = document.getElementById('textblock')
        newElm = document.createElement('p')
        newElm.innerText = self.para
        container.appendChild(newElm)

# Created as soon as the module is loaded.
testSystem = TestSystem()
And here follows the corresponding dom7.html for it:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<script src="__javascript__/dom7.js"></script>
<title>Titel</title>
</head>
<body onload=dom7.testSystem.insert()>
<button id="button1">Click me</button><br>
<main>
<h1>DOM examples</h1>
<p>Testing DOM</p>
<p>
<output></output>
</p>
<p>
<test>Test String:</test>
</p>
<div id="textblock">
<p>This is an expandable text block.</p>
</div>
</main>
</body>
</html>
The problem is that your TestSystem constructor is called before the DOM tree is ready. There are three ways to deal with this, the last of which is the best.
The first way is to include your script after you populated your body:
class TestSystem:
def __init__ (self):
self.text = 'Hello, DOM!'
self.para = 'A new paragraph'
self.textblock = 'This is an expandable text block.'
self.button1 = document.getElementById("button1")
self.button1.addEventListener('mousedown', self.pressed)
def insert(self):
document.querySelector('output').innerText = self.text
# document.querySelector('test').innerText = "Test"+self.button1+":"
def pressed(self):
container = document.getElementById('textblock')
newElm = document.createElement('p')
newElm.innerText = self.para
container.appendChild(newElm)
testSystem = TestSystem()
and:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Titel</title>
</head>
<body onload=dom7.testSystem.insert()>
<button id="button1">Click me</button><br>
<main>
<h1>DOM examples</h1>
<p>
Testing DOM
</p>
<p>
<output></output>
</p>
<p>
<test>Test String:</test>
</p>
<div id="textblock">
<p>This is an expandable text block.</p>
</div>
<script src="__javascript__/dom7.js"></script>
</main>
</body>
</html>
Still your insert function may be called too early, so may not work.
The second way is to include the script at the beginning and call an initialization function to connect event handlers to the DOM:
class TestSystem:
def __init__ (self):
self.text = 'Hello, DOM!'
self.para = 'A new paragraph'
self.textblock = 'This is an expandable text block.'
self.button1 = document.getElementById("button1")
self.button1.addEventListener('mousedown', self.pressed)
def insert(self):
document.querySelector('output').innerText = self.text
# document.querySelector('test').innerText = "Test"+self.button1+":"
def pressed(self):
container = document.getElementById('textblock')
newElm = document.createElement('p')
newElm.innerText = self.para
container.appendChild(newElm)
def init ():
testSystem = TestSystem()
and:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<script src="__javascript__/dom7.js"></script>
<title>Titel</title>
</head>
<body onload=dom7.testSystem.insert()>
<button id="button1">Click me</button><br>
<main>
<h1>DOM examples</h1>
<p>
Testing DOM
</p>
<p>
<output></output>
</p>
<p>
<test>Test String:</test>
</p>
<div id="textblock">
<p>This is an expandable text block.</p>
</div>
<script>dom7.init ();</script>
</main>
</body>
</html>
Still there is a possibility that some browsers call the initialization function before the page is loaded, although this is rare. In addition to this the insert method is again called too early.
The third and best way, which solves both problems, is to run your initialization after the page load event and to call insert after you create your testSystem, e.g. in the initialization function:
class TestSystem:
def __init__ (self):
self.text = 'Hello, DOM!'
self.para = 'A new paragraph'
self.textblock = 'This is an expandable text block.'
self.button1 = document.getElementById("button1")
self.button1.addEventListener('mousedown', self.pressed)
def insert(self):
document.querySelector('output').innerHTML = self.text
# document.querySelector('test').innerText = "Test"+self.button1+":"
def pressed(self):
container = document.getElementById('textblock')
newElm = document.createElement('p')
newElm.innerText = self.para
container.appendChild(newElm)
def init ():
testSystem = TestSystem()
testSystem.insert ()
and:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<script src="__javascript__/dom7.js"></script>
<title>Titel</title>
</head>
<body onload="dom7.init ();">
<button id="button1">Click me</button><br>
<main>
<h1>DOM examples</h1>
<p>
Testing DOM
</p>
<p>
<output></output>
</p>
<p>
<test>Test String:</test>
</p>
<div id="textblock">
<p>This is an expandable text block.</p>
</div>
</main>
</body>
</html>
I looked at your mondrian example in the tutorial section and saw that there is also a very simple way to attach event listeners once the document has loaded. You can register a handler for the DOMContentLoaded event in a script in the header of the HTML document:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<script src="__javascript__/addEventListener_example1.js"></script>
<script>document.addEventListener("DOMContentLoaded", addEventListener_example1.init)</script>
<title>Titel</title>
</head>
<body>
<button id="button1">Click me</button><br>
<main>
<h1>DOM examples</h1>
<p>
Testing DOM
</p>
<p>
<output></output>
</p>
<p>
<test>Test String:</test>
</p>
<div id="textblock">
<p>This is an expandable text block.</p>
</div>
</main>
</body>
</html>
and the code for addEventListener_example1.py would be:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
def init():
insert()
def insert():
document.querySelector('output').innerHTML = 'Hello, DOM!'
button1 = document.getElementById("button1")
button1.addEventListener('mousedown', pressed)
def pressed():
para = 'A new paragraph'
container = document.getElementById('textblock')
newElm = document.createElement('p')
newElm.innerText = para
container.appendChild(newElm)

Add meta tag using BeautifulSoup

How to add a meta tag just after title tag in a HTML page by using Beautiful Soup(library). I am using python language for coding and unable to do this.
Use soup.new_tag() to create a new <meta> tag, set attributes on it and append it to your document's <head>.
metatag = soup.new_tag('meta')
metatag.attrs['http-equiv'] = 'Content-Type'
metatag.attrs['content'] = 'text/html'
soup.head.append(metatag)
Demo:
>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup('''\
... <html><head><title>Hello World!</title>
... </head><body>Foo bar</body></html>
... ''')
>>> metatag = soup.new_tag('meta')
>>> metatag.attrs['http-equiv'] = 'Content-Type'
>>> metatag.attrs['content'] = 'text/html'
>>> soup.head.append(metatag)
>>> print soup.prettify()
<html>
<head>
<title>
Hello World!
</title>
<meta content="text/html" http-equiv="Content-Type"/>
</head>
<body>
Foo bar
</body>
</html>

wrap implicit section of an HTML document into section tags using lxml.etree

I'm trying to write a small function to wrap implicit section of an HTML document into section tags. I'm trying to do so with lxml.etree.
Let say my input is:
<html>
<head></head>
<body>
<h1>title</h1>
<p>some text</p>
<h1>title</h1>
<p>some text</p>
</body>
</html>
I'd like to end up with:
<html>
<head></head>
<body>
<section>
<h1>title</h1>
<p>some text</p>
</section>
<section>
<h1>title</h1>
<p>some text</p>
</section>
</body>
</html>
Here is what I have at the moment
def outline(tree):
pattern = re.compile('^h(\d)')
section = None
for child in tree.iterchildren():
tag = child.tag
if tag is lxml.etree.Comment:
continue
match = pattern.match(tag.lower())
# If a header tag is found
if match:
depth = int(match.group(1))
if section is not None:
child.addprevious(section)
section = lxml.etree.Element('section')
section.append(child)
else:
if section is not None:
section.append(child)
else:
pass
if child is not None:
outline(child)
which I call like this
outline(tree.find('body'))
But it doesn't work with subheadings at the moment, e.g.:
<section>
<h1>ONE</h1>
<section>
<h3>TOO Deep</h3>
</section>
<section>
<h2>Level 2</h2>
</section>
</section>
<section>
<h1>TWO</h1>
</section>
Thanks
When it comes to transforming XML, XSLT is the best way to go; see the lxml and XSLT docs.
This is only a pointer in the right direction, as requested. Let me know if you need further help writing that XSLT.
Here is the code I ended up with, for the record:
def outline(tree, level=0):
    """Wrap implicit sections among tree's direct children in <section> tags.

    A heading (h1..h9, matched case-insensitively) opens a new section when it
    is the first heading seen or when its level number is less than or equal
    to the level of the heading that opened the latest section. Every other
    non-comment child, including deeper headings, is moved into the latest
    section. Children that come before the first heading stay where they are.
    Each section is then outlined recursively, so deeper headings get nested
    sections of their own.

    Args:
        tree: lxml element whose direct children are grouped in place.
        level: Indentation hint passed down the recursion; it does not affect
            the result.
    """
    pattern = re.compile(r'^h(\d)')
    last_depth = None
    sections = []  # pairs of [heading element, new <section> element]
    # Iterate over a snapshot: the loop appends children to other elements,
    # which takes them out of ``tree`` while it is being walked.
    for child in list(tree.iterchildren()):
        tag = child.tag
        if tag is lxml.etree.Comment:
            continue
        match = pattern.match(tag.lower())
        if match:
            depth = int(match.group(1))
            # Test for None first: ``depth <= None`` raises TypeError on
            # Python 3, which made the first heading fail.
            if last_depth is None or depth <= last_depth:
                last_depth = depth
                sections.append([child, lxml.etree.Element('section')])
                continue
        if sections:
            sections[-1][1].append(child)
    for heading, section in sections:
        # Recurse before the heading is moved in, so the section holds only
        # the content that followed the heading.
        outline(section, level=((level + 1) * 4))
        # Put the section where the heading is, then make the heading the
        # section's first child.
        heading.addprevious(section)
        section.insert(0, heading)
Works pretty well for me

Categories

Resources