Make BeautifulSoup recognize word breaks caused by HTML <li> elements - python

BeautifulSoup4 does not recognize that it should insert a word break between <li> elements when extracting text:
Demo program:
#!/usr/bin/env python3
HTML="""
<html>
<body>
<ul>
<li>First Element</li><li>Second element</li>
</ul>
</body>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup( HTML, 'html.parser' )
print(soup.find('body').text.strip())
Output:
First ElementSecond element
Desired output:
First Element Second element
I guess I could just globally add a space before all <li> elements. That seems like a hack?

Try using .stripped_strings of soup to extract each text fragment separately, then join the fragments with a space so that adjacent elements stay separated:
from bs4 import BeautifulSoup
HTML = """
<html>
<body>
<ul>
<li>First Element</li><li>Second element</li>
</ul>
</body>
"""
soup = BeautifulSoup(HTML, 'html.parser')
# stripped_strings yields each text fragment with surrounding whitespace
# removed; joining with a space separates the texts of adjacent <li> elements.
print(' '.join(soup.body.stripped_strings))
Or extract the text of each <li> element separately and then join them
from bs4 import BeautifulSoup
HTML="""
<html>
<body>
<ul>
<li>First Element</li><li>Second element</li>
</ul>
</body>
"""
soup = BeautifulSoup( HTML, 'html.parser' )
# Collect every <li> element, strip each one's text individually, and join
# the results with a single space.
lis = soup.find_all('li')
text = ' '.join([li.text.strip() for li in lis])
print(text)

Related

How do I soup this without it doing nothing?

I'm tinkering with BS4 and web scraping, and when I soup the file, the resulting variable is blank.
<!-- example.html -->
<html><head><title>The Website Title</title></head>
<body>
<p>Download my <strong>Python</strong> book from <a href="http://
inventwithpython.com">my website</a>.</p>
<p class="slogan">Learn Python the easy way!</p>
<p>By <span id="author">Al Sweigart</span></p>
</body></html>
import bs4
example = open('example.html')
soup = bs4.BeautifulSoup(example.read())
print(soup) # returns '[]'

Wrap text within element LXML

I have some HTML code like this:
<body>
<p> String </p>
Some string
</body>
I need to wrap all unwrapped text inside the body in a paragraph.
I can do it in JavaScript using Node.nodeType, but I need a solution in Python (I am trying to use lxml).
The output I need is:
<body>
<p> String </p>
<p> Some string </p>
</body>
My solution in JavaScript:
$(document).ready(function() {
$('article').contents().filter(function() {
return this.nodeType == 3 && $.trim(this.nodeValue).length;
}).wrap('</p>');
})
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<article>
<p>Some text</p>
Some unwrapped text
<p>Some text</p>
</article>
Here's how it can be done using lxml:
html = '''
<html>
<body>
Text
<p>String</p>
Tail
<p>String</p>
Tail
</body>
</html>
'''
from lxml import etree
import lxml.html
# Parse the document, then wrap every piece of bare text directly inside
# <body> in its own <p> element.
doc = lxml.html.fromstring(html)
for doc_child in doc:
    if doc_child.tag == 'body':
        body = doc_child
        # Text that appears before the first child element lives in body.text.
        if body.text and body.text.strip():
            p = etree.Element('p')
            p.text = body.text.strip()
            body.text = None
            body.insert(0, p)
        # Text that follows a child element lives in that child's .tail.
        for elem in body:
            if elem.tail and elem.tail.strip():
                p = etree.Element('p')
                p.text = elem.tail.strip()
                # Clear the tail before inserting so the text is not kept
                # outside the new <p> as well.
                elem.tail = None
                elem.addnext(p)
print(lxml.html.tostring(doc).decode('utf8'))
Output:
<html>
<body><p>Text</p><p>String</p><p>Tail</p><p>String</p><p>Tail</p></body>
</html>
You can use the BeautifulSoup module to parse HTML pages.
There are many ways to do this,
but this is one of the easiest methods for converting HTML to text.
from bs4 import BeautifulSoup # from BeautifulSoup import BeautifulSoup
html = '''<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<article>
<p>Some text</p>
Some unwrapped text
<p>Some text</p>
</article>'''
# Parse with the lxml backend and print the text content of the document.
parsed_html = BeautifulSoup(html, "lxml")
# print() in call form works on both Python 2 and Python 3.
print(parsed_html.text)
Output:
Some text
Some unwrapped text
Some text
Python, with lxml:
from lxml.etree import fromstring
body = fromstring("""
<body>
<p> String </p>
Some string
</body>
""")
# Wrap every non-blank text node that is not already inside a <p> in a new <p>.
# xpath("//text()") returns lxml "smart strings", not elements, so they cannot
# be passed to replace()/append(); the text has to be moved via .text / .tail.
for text_node in body.xpath("//text()"):
    if not text_node.strip():
        continue  # whitespace-only nodes need no wrapper
    owner = text_node.getparent()
    if text_node.is_text:
        # Leading text: owner is the element that contains the text.
        if owner.tag == "p":
            continue
        wrapper = fromstring("<p/>")
        wrapper.text = owner.text
        owner.text = None
        owner.insert(0, wrapper)
    elif text_node.is_tail:
        # Tail text: owner is the preceding sibling, so the element that
        # actually contains the text is owner's parent.
        container = owner.getparent()
        if container is None or container.tag == "p":
            continue
        wrapper = fromstring("<p/>")
        wrapper.text = owner.tail
        # Clear the tail before inserting so the text is not left behind
        # outside the new <p>.
        owner.tail = None
        owner.addnext(wrapper)

Wrap found tag inside new tag in bs4

How can I wrap a tag in a new tag with bs4?
For example, I have HTML like this:
<html>
<body>
<p>Demo</p>
<p>world</p>
</body>
</html>
I want to convert it to this.
<html>
<body>
<b><p>Demo</p></b>
<b> <p>world</p> </b>
</body>
</html>
Here is an example of what I have so far:
from bs4 import BeautifulSoup
html = """
<html>
<body>
<p>Demo</p>
<p>world</p>
</body>
</html>"""
soup = BeautifulSoup(html, 'html.parser')
for tag in soup.find_all('p'):
# wrap tag with '<b>'
Document:
from bs4 import BeautifulSoup
html = """
<html>
<body>
<p>Demo</p>
<p>world</p>
</body>
</html>"""
soup = BeautifulSoup(html, 'html.parser')
# Wrap each <p> in its own freshly created <b> tag; a new tag is needed per
# iteration because one tag object can only live in one place in the tree.
for p in soup('p'): # shortcut for soup.find_all('p')
    p.wrap(soup.new_tag("b"))
out:
<html>
<body>
<b><p>Demo</p></b>
<b><p>world</p></b>
</body>
</html>

How can I find a comment with specified text string

I'm using robobrowser to parse some HTML content. It uses BeautifulSoup internally. How can I find a comment that contains a specified string?
<html>
<body>
<div>
<!-- some commented code here!!!<div><ul><li><div id='ANY_ID'>TEXT_1</div></li>
<li><div>other text</div></li></ul></div>-->
</div>
</body>
</html>
In fact I need to get TEXT_1 if I know ANY_ID
Thanks
Use the text argument and check the type to be Comment. Then, load the contents with BeautifulSoup again and find the desired element by id:
from bs4 import BeautifulSoup
from bs4 import Comment
data = """
<html>
<body>
<div>
<!-- some commented code here!!!<div><ul><li><div id='ANY_ID'>TEXT_1</div></li>
<li><div>other text</div></li></ul></div>-->
</div>
</body>
</html>
"""
soup = BeautifulSoup(data, "html.parser")
# Find the first text node that is a Comment and mentions the target id.
comment = soup.find(text=lambda text: isinstance(text, Comment) and "ANY_ID" in text)
# The comment body is itself HTML markup, so parse it as a separate document
# before searching it for the element with the wanted id.
soup_comment = BeautifulSoup(comment, "html.parser")
text = soup_comment.find("div", id="ANY_ID").get_text()
print(text)
Prints TEXT_1.

Parsing Html with python using lxml

I have this html page:
<html>
<head></head>
<body>
Some Text
<a href="aLink">
Other Text
</a>
<a href="aLink2.html">
Another Text
</a>
</body>
</html>
I am interested in capturing the three texts in the file. With the following code, I get the texts from the two links as output:
from lxml import html
from lxml import etree
import requests
page = requests.get('myUrl')
tree = html.fromstring(page.text)
aLink = tree.xpath('//a')
for link in aLink:
print link.text #it works
But I am not able to obtain the text from the body section, since the following code doesn't work:
body = tree.xpath('//body')
print body.text
What am I supposed to do? Thanks for the answers

Categories

Resources