Selecting second child using BeautifulSoup - python

Let's say I have the following HTML:
<div>
<p>this is some text</p>
<p>...and this is some other text</p>
</div>
How can I retrieve the text from the second paragraph using BeautifulSoup?

You can use a CSS selector to do this:
>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup("""<div>
.... <p>this is some text</p>
.... <p>...and this is some other text</p>
.... </div>""", "html.parser")
>>> soup.select('div > p')[1].get_text(strip=True)
'...and this is some other text'

You can use nth-of-type:
from bs4 import BeautifulSoup

h = """<div>
<p>this is some text</p>
<p>...and this is some other text</p>
</div>"""

# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed and warns about the guess, so results can differ per machine.
soup = BeautifulSoup(h, "html.parser")

# p:nth-of-type(2) matches a <p> that is the second <p> among its siblings.
print(soup.select_one("div p:nth-of-type(2)").text)

# Iterating over the <div> directly would also yield the whitespace text nodes
# between the tags, so ask for the <p> tags instead. find_all() returns them in
# document order, which makes index 1 the second paragraph.
secondp = soup.find('div').find_all('p')
print(secondp[1].text)
# ...and this is some other text
Or you can use the findChildren directly -
# recursive=False limits the result to the direct child tags of the <div>
# (text nodes are never included), so position 1 is the second child tag.
div_ = soup.find('div').findChildren(recursive=False)
for i, child in enumerate(div_):
    if i == 1:
        print(child.text)

You could solve this with gazpacho:
from gazpacho import Soup
html = """\
<div>
<p>this is some text</p>
<p>...and this is some other text</p>
</div>
"""
soup = Soup(html)
soup.find('p')[1].text
Which would output:
'...and this is some other text'

Related

Python: Deleting all divs without class

I want to delete all divs without classes (but not the content that is in the div).
My input
<h1>Test</h1>
<div>
<div>
<div class="test">
<p>abc</p>
</div>
</div>
</div>
The output I want
<h1>Test</h1>
<div class="test">
<p>abc</p>
</div>
My try 1
Based on "Deleting a div with a particular class":
from bs4 import BeautifulSoup
soup = BeautifulSoup('<h1>Test</h1><div><div><div class="test"><p>abc</p></div></div></div>', 'html.parser')
for div in soup.find_all("div", {'class':''}):
div.decompose()
print(soup)
# <h1>Test</h1>
My try 2
from htmllaundry import sanitize
myinput = '<h1>Test</h1><div><div><div class="test"><p>abc</p></div></div></div>'
myoutput = sanitize(myinput)
print myoutput
# <p>Test</p><p>abc</p> instead of <h1>Test</h1><div class="test"><p>abc</p></div>
My try 3
Based on "Clean up HTML in python"
from lxml.html.clean import Cleaner
def sanitize(dirty_html):
    """Return *dirty_html* cleaned by an lxml Cleaner built with remove_tags=('font', 'div')."""
    tag_stripper = Cleaner(remove_tags=('font', 'div'))
    cleaned = tag_stripper.clean_html(dirty_html)
    return cleaned
myhtml = '<h1>Test</h1><div><div><div class="test"><p>abc</p></div></div></div>'
print(sanitize(myhtml))
# <div><h1>Test</h1><p>abc</p></div>
My try 4
from html_sanitizer import Sanitizer
sanitizer = Sanitizer() # default configuration
output = sanitizer.sanitize('<h1>Test</h1><div><div><div class="test"><p>abc</p></div></div></div>')
print(output)
# <h1>Test</h1><p>abc</p>
Problem: A div element is used to wrap the HTML fragment for the parser, therefore div tags are not allowed. (Source: Manual)
If you want to exclude div without class, preserving its content:
from bs4 import BeautifulSoup

markup = '<h1>Test</h1><div><div><div class="test"><p>abc</p></div></div></div>'
soup = BeautifulSoup(markup, "html.parser")

# find_all() returns a list snapshot, so the tree can be edited while looping.
for div in soup.find_all("div"):
    if not div.has_attr("class"):
        # unwrap() removes the tag itself but leaves its children in place.
        div.unwrap()

print(soup)
# Output:
# <h1>Test</h1><div class="test"><p>abc</p></div>
Please check this out:
from bs4 import BeautifulSoup
data="""
<div>
<div>
<div class="test">
<p>abc</p>
</div>
</div>
</div>
"""
soup = BeautifulSoup(data, features="html5lib")
for div in soup.find_all("div", class_=True):
print(div)

Exclude data from tag

I want to exclude a specific text inside an html span tag. In the given example below I just wanted to fetch all test2 text from span with class under a-list-item.
my code:
<span class="a-list-item">test1</span>
<span class="a-list-item">test2</span>
<span class="a-list-item">test2</span>
my code: tag = tag.find_all("span", {"class" : "a-list-item"})
How can I get only the test2 spans? Thanks for your response.
It looks like you are using Beautiful Soup. In Beautiful Soup 4.7+, this is easy to do just by using select instead of find_all. You can use :contains() wrapped in :not() to exclude spans that contain specific text.
from bs4 import BeautifulSoup
markup = '''
<span class="a-list-item">test1</span>
<span class="a-list-item">test2</span>
<span class="a-list-item">test2</span>
'''
soup = BeautifulSoup(markup)
print(soup.select("span.a-list-item:not(:contains(test1))"))
Output
[<span class="a-list-item">test2</span>, <span class="a-list-item">test2</span>]
You could go with applying an xpath to exclude containing test1
//span[@class='a-list-item' and not(contains(text(), 'test1'))]
E.g.
from lxml.html import fromstring
# url = ''
# tree = html.fromstring( requests.get(url).content)
h = '''
<html>
<head></head>
<body>
<span class="a-list-item">test1</span>
<span class="a-list-item">test2</span>
<span class="a-list-item">test2</span>
</body>
</html>
'''
tree = fromstring(h)
# XPath addresses attributes with "@", so the class test is written @class.
# The predicate keeps spans of that class whose text does not contain 'test1'.
items = [item.text for item in tree.xpath("//span[@class='a-list-item' and not(contains(text(), 'test1'))]")]
print(items)
Or test each css qualifying node (based on tag and class) text value
from bs4 import BeautifulSoup as bs
h = '''
<html>
<head></head>
<body>
<span class="a-list-item">test1</span>
<span class="a-list-item">test2</span>
<span class="a-list-item">test2</span>
</body>
</html>
'''
soup = bs(h, 'lxml')
items = [item.text for item in soup.select('span.a-list-item') if 'test1' not in item.text]
print(items)
Use regular expression re to find specific text.
from bs4 import BeautifulSoup
import re
html = '''
<span class="a-list-item">test1</span>
<span class="a-list-item">test2</span>
<span class="a-list-item">test2</span>
'''
soup = BeautifulSoup(html,'html.parser')
items=soup.find_all('span',text=re.compile("test2"))
for item in items:
print(item.text)
Output:
test2
test2

Find tag with certain child tag using bs4 python

I have an html in the below format.
<div class="consider">
<div class="row">
<p>Text1</p>
</div>
</div>
<div class="consider">
<h2>Hello</h2>
</div>
<div class="Consider">
<div class="row">
<p>Text2
</div>
</div>
I want to get only the outer div tags whose child div has the class "row".
This is how you can access it:
from bs4 import BeautifulSoup

content = '<div class="consider"><div class="row"><p>Text1</p></div></div><div class="consider"><h2>Hello</h2></div><div class="Consider"><div class="row"><p>Text2</p></div></div>'
soup = BeautifulSoup(content, 'lxml')

for div in soup.find_all('div', class_='row'):
    # A "row" div qualifies its parent only when that parent is itself a div.
    if div.parent.name == "div":
        # div.parent is the wanted element; swap print() for your own handling.
        print(div.parent)
With select('div > div.row') we select all div tags with class row as direct children of div tag and then through list comprehension we select all parents of these tags:
data = '<div class="consider"><div class="row"><p>Text1</p></div></div><div class="consider"><h2>Hello</h2></div><div class="Consider"><div class="row"><p>Text2</p></div></div>'
from bs4 import BeautifulSoup
soup = BeautifulSoup(data, 'lxml')
divs = [div.parent for div in soup.select('div > div.row')]
print(divs)
Outputs:
[<div class="consider"><div class="row"><p>Text1</p></div></div>, <div class="Consider"><div class="row"><p>Text2</p></div></div>]

Get links from HTML after a certain title

See this HTML code:
<html>
<body>
<p class="fixedfonts">
<a href="A.pdf">LINK1</a>
</p>
<h2>Results</h2>
<p class="fixedfonts">
<a href="B.pdf">LINK2</a>
</p>
<p class="fixedfonts">
<a href="C.pdf">LINK3</a>
</p>
</body>
</html>
It contains 3 links. However I need to retrieve only the links after the title Results
I am using python with BeautifulSoup:
from bs4 import BeautifulSoup, SoupStrainer
# at this point html contains the code as string
# parse the HTML file
soup = BeautifulSoup(html.replace('\n', ''), parse_only=SoupStrainer('a'))
# kill all script and style elements
for script in soup(["script", "style"]):
script.extract() # rip it out
links = list()
for link in soup:
if link.has_attr('href'):
links.append(link['href'].replace('%20', ' '))
print(links)
With the presented code I get all the links in the document, but as I said, I only need those that come after the Results tag/title.
Guidance is appreciated
You can solve that using the find_all_next() method:
results = soup.find("h2", text="Results")
for link in results.find_all_next("a"):
print(link.get("href"))
Demo:
>>> from bs4 import BeautifulSoup
>>>
>>> data = """
... <html>
... <body>
... <p class="fixedfonts">
... LINK1
... </p>
...
... <h2>Results</h2>
...
... <p class="fixedfonts">
... LINK2
... </p>
...
... <p class="fixedfonts">
... LINK3
... </p>
... </body>
... </html>"""
>>>
>>> soup = BeautifulSoup(data, "html.parser")
>>> results = soup.find("h2", text="Results")
>>> for link in results.find_all_next("a"):
... print(link.get("href"))
...
B.pdf
C.pdf
Split the HTML data into two parts, before and after "Results", then process only the part after it:
data = html.split("Results")
need = data[1]
So just implement that:
from bs4 import BeautifulSoup, SoupStrainer
data = html.split("Results")
need = data[1]
soup = BeautifulSoup(need.replace('\n', ''), parse_only=SoupStrainer('a'))
Tested and seemed to work.
from bs4 import BeautifulSoup, SoupStrainer

html = '''<html>
<body>
<p class="fixedfonts">
<a href="A.pdf">LINK1</a>
</p>
<h2>Results</h2>
<p class="fixedfonts">
<a href="B.pdf">LINK2</a>
</p>
<p class="fixedfonts">
<a href="B.pdf">LINK2</a>
</p>
<p class="fixedfonts">
<a href="C.pdf">LINK3</a>
</p>
</body>
</html>'''
# at this point html contains the code as string

# Keep only the markup after the first "Results"; partition() yields an empty
# tail rather than raising IndexError when the heading is absent.
_, _, need = html.partition("Results")

# Parse just the <a> tags of that tail, so every link found comes after the
# heading and repeated links keep their document order.
soup = BeautifulSoup(need.replace('\n', ''), 'html.parser', parse_only=SoupStrainer('a'))

# href=True skips anchors that carry no href attribute.
n_links = [
    link['href'].replace('%20', ' ')
    for link in soup.find_all('a', href=True)
]
print(n_links)
# ['B.pdf', 'B.pdf', 'C.pdf']

Using page text to select `html` element using`Beautiful Soup`

I have a page which contains several repetitions of: <div...><h4>...<p>... For example:
html = '''
<div class="proletariat">
<h4>sickle</h4>
<p>Ignore this text</p>
</div>
<div class="proletariat">
<h4>hammer</h4>
<p>This is the text we want</p>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html)
If I write print soup.select('div[class^="proletariat"] > h4 ~ p'), I get:
[<p>Ignore this text</p>, <p>This is the text we want</p>]
How do I specify that I only want the text of p when it is preceded by <h4>hammer</h4>?
Thanks
import re
from bs4 import BeautifulSoup

html = '''
<div class="proletariat">
<h4>sickle</h4>
<p>Ignore this text</p>
</div>
<div class="proletariat">
<h4>hammer</h4>
<p>This is the text we want</p>
</div>
'''

# Name the parser explicitly so the result does not depend on what is installed.
soup = BeautifulSoup(html, 'html.parser')

# Locate the <h4> whose text contains "hammer", then take its next <p> sibling.
# find_next_sibling('p') skips the whitespace text node between the two tags,
# so the lookup does not depend on how the markup is laid out.
heading = soup.find("h4", text=re.compile('hammer'))
print(heading.find_next_sibling('p').text)
This is the text we want
:contains() could help here, but it was not supported when this answer was written (Beautiful Soup 4.7+ does support it in select()).
Taking this into account, you can use select() in conjunction with the find_next_sibling():
# Take the first <h4> whose text is exactly "hammer" and print the <p> that
# follows it; the None default avoids StopIteration when nothing matches.
print(next(
    (h4.find_next_sibling('p').text
     for h4 in soup.select('div[class^="proletariat"] > h4')
     if h4.text == "hammer"),
    None,
))

Categories

Resources