Beautiful Soup: Test if a div is a child of a div - python

Is it possible to test with Beautiful Soup whether a div is a (not necessarily immediate) child of a div?
Eg.
<div class='a'>
<div class='aa'>
<div class='aaa'>
<div class='aaaa'>
</div>
</div>
</div>
<div class='ab'>
<div class='aba'>
<div class='abaa'>
</div>
</div>
</div>
</div>
Now I want to test whether the div with class aaaa and the div with class abaa are (not necessarily immediate) children of the div with class aa.
import bs4

# Parse the document once; the `with` block closes the file afterwards.
with open('test.html', 'r') as i_file:
    soup = bs4.BeautifulSoup(i_file.read(), 'lxml')

div0 = soup.find('div', {'class': 'aa'})
div1 = soup.find('div', {'class': 'aaaa'})
div2 = soup.find('div', {'class': 'abaa'})

# `x in tag` only looks at the tag's direct children, not deeper descendants.
print(div1 in div0) # must return True, but returns False
print(div2 in div0) # must return False
How can this be done?
(Of course, the actual HTML is more complicated, with more nested divs.)

You can use the find_parent method from Beautiful Soup.
import bs4

# Parse the document once; the `with` block closes the file afterwards.
with open("test.html", "r") as i_file:
    soup = bs4.BeautifulSoup(i_file.read(), "lxml")

div0 = soup.find("div", {"class": "aa"})
div1 = soup.find("div", {"class": "aaaa"})
div2 = soup.find("div", {"class": "abaa"})

# find_parent() walks up the ancestors and returns the first one that matches
# the given tag name and attributes, or None when no ancestor matches.
# NOTE: this matches any ancestor with the same name and attributes as div0,
# not necessarily the div0 object itself.
print(div1.find_parent(div0.name, attrs=div0.attrs) is not None)  # Returns True
print(div2.find_parent(div0.name, attrs=div0.attrs) is not None)  # Returns False

Try finding all the elements nested inside the parent element and check whether any of them has the required class attribute.
from bs4 import BeautifulSoup
soup = BeautifulSoup(text, "html.parser")
def is_child(element, parent_class, child_class):
    """Tell whether a tag with class ``child_class`` is nested inside a parent div.

    The parent is the first ``<div>`` in the module-level ``soup`` whose class
    is ``parent_class``.

    Args:
        element: Tag name of the candidate descendants (e.g. ``"div"``).
        parent_class: Class of the div to search inside.
        child_class: Class the descendant must carry.

    Returns:
        True if some ``element`` tag below the parent (at any depth) has
        ``child_class`` among its classes; False otherwise, including when
        no parent div with ``parent_class`` exists.
    """
    parent = soup.find("div", attrs={"class": parent_class})
    if parent is None:
        return False
    # find_all() searches only the parent's descendants. find_all_next() would
    # also match tags that merely appear later in the document, outside the
    # parent, and so report false positives.
    return any(
        # .get() avoids a KeyError for tags that have no class attribute.
        child_class in tag.get("class", [])
        for tag in parent.find_all(element)
    )
print(is_child("div", "aa", "aaa")) # True
print(is_child("div", "abaa", "aa")) # False

Okay, I think I found a way. You have to get all the descendant divs of the parent div with find_all:
import bs4

# Parse the document once; the `with` block closes the file afterwards.
with open('test.html', 'r') as i_file:
    soup = bs4.BeautifulSoup(i_file.read(), 'lxml')

div0 = soup.find('div', {'class': 'aa'})
div1 = soup.find('div', {'class': 'aaaa'})
div2 = soup.find('div', {'class': 'abaa'})

# find_all() collects every div nested below div0, at any depth.
children = div0.find_all('div')
print(div1 in children)
print(div2 in children)

Related

Extracting only single tags in beautifulsoup

I'm looking for a way to extract only the tags that don't have another tag inside them.
For example:
from bs4 import BeautifulSoup
html = """
<p><a href='XYZ'>Text1</a></p>
<p>Text2</p>
<p><a href='QWERTY'>Text3</a></p>
<p>Text4</p>
"""
soup = BeautifulSoup(html, 'html.parser')
soup.find_all('p')
Gives
[<p>Text1</p>,
<p>Text2</p>,
<p>Text3</p>,
<p>Text4</p>]
This is what I want to achieve:
[<p>Text2</p>,
<p>Text4</p>]
You can filter Tags without other tags in them as follows:
for tag in soup.find_all('p'):
    # tag.next is the node parsed right after the opening <p>; keep the tag
    # only when that node is a string rather than another tag.
    if isinstance(tag.next, str):
        print(tag)
Which returns
<p>Text2</p>
<p>Text4</p>
I would simply filter afterwards on the number of nested tags: if a p contains no other tags the list is empty and the p is kept, otherwise it gets filtered out:
for x in soup.find_all('p'):
    # find_all() with no arguments returns every tag nested inside x;
    # an empty result means the <p> holds no other tags.
    if len(x.find_all()) == 0:
        print(x)
Returns only:
<p>Text2</p>
<p>Text4</p>
from bs4 import BeautifulSoup
html = """
<p><a href='XYZ'>Text1</a></p>
<p>Text2</p>
<p><a href='QWERTY'>Text3</a></p>
<p>Text4</p>
<p>Text6: <a href='QWERTY'>Text5</a></p>
"""
soup = BeautifulSoup(html, 'html.parser')
def p_tag_with_only_strings_as_children(tag):
    """Filter function for ``find_all``: match <p> tags that contain no other tags.

    Args:
        tag: Object exposing ``name`` and an iterable ``children``.

    Returns:
        True when ``tag.name`` is ``"p"`` and every direct child is a string
        (an empty <p> also qualifies); False otherwise.
    """
    return tag.name == "p" and all(isinstance(x, str) for x in tag.children)
result = soup.find_all(p_tag_with_only_strings_as_children)
print(result)
Result:
[<p>Text2</p>, <p>Text4</p>]
BeautifulSoup-Documentation for using function-filters on .find_all().
Credit for the technique of checking the types within a list goes to
https://stackoverflow.com/a/32705845/5288820.
Or use CSS-Selectors:
https://beautiful-soup-4.readthedocs.io/en/latest/#css-selectors
from bs4 import BeautifulSoup
html = """
<p><a href='XYZ'>Text1</a></p>
<p>Text2</p>
<p><a href='QWERTY'>Text3</a></p>
<p>Text4</p>
<p>Text6: <a href='QWERTY'>Text5</a></p>
""".replace('\n',"")
soup = BeautifulSoup(html, 'html.parser')
print(soup.select('p:not(:has(*))'))
#or in case you only want to filter out "a" tags:
print(soup.select('p:not(:has(a))'))
Result:
[<p>Text2</p>, <p>Text4</p>]

How to get text inside tag using python and BeautifulSoup

I'm trying to get text (example text) inside tags using beautiful soup
The html structure looks like this:
...
<div>
<div>Description</div>
<span>
<div><span>example text</span></div>
</span>
</div>
...
What I tried:
r = requests.get(url)
soup = bs(r.content, 'html.parser')
desc = soup.find('div.div.span.div.span')
print(str(desc))
You cannot pass several stacked tag names to .find() like this. You need to call .find() repeatedly to get the desired result. Check out the docs for more information. The code below will give you the desired output:
soup.find('div').find('span').get_text()
Your selector is wrong.
>>> from bs4 import BeautifulSoup
>>> data = '''\
... <div>
... <div>Description</div>
... <span>
... <div><span>example text</span></div>
... </span>
... </div>'''
>>> soup = BeautifulSoup(data, 'html.parser')
>>> desc = soup.select_one('div span div span')
>>> desc.text
'example text'
>>>
r = requests.get(url)
soup = bs(r.content, 'html.parser')
desc = soup.find('div').find('span')
print(desc.getText())
Check this out -
soup = BeautifulSoup('''<div>
<div>Description</div>
<span>
<div><span>example text</span></div>
</span>
</div>''',"html.parser")
text = soup.span.get_text()
print(text)

How to take a link from the onclick value in BeautifulSoup?

I need help scraping a link to an image that is stored in the onclick= value.
I tried this, but I am stuck on how to remove everything in onclick except for the link.
<a onclick="ShowEnlargedImagePreview( 'https://steamuserimages-a.akamaihd.net/ugc/794261971268711656/69C39CF2A2BBCDDC7C04C17DF1E88A6ED875DBE7/' );"></a>
links = soup.find('div', class_='workshopItemPreviewImageMain')
links = links.findChild('a', attrs={'onclick': re.compile("^https://")})
But nothing is output.
links = soup.find('div', class_='workshopItemPreviewImageMain')
links = links.findChild('a')
links = links.get("onclick")
The entire value of onclick is displayed:
ShowEnlargedImagePreview( 'https://steamuserimages-a.akamaihd.net/ugc/794261971268711656/69C39CF2A2BBCDDC7C04C17DF1E88A6ED875DBE7/' )
But only a link is needed.
You just need to change your regular expression.
from bs4 import BeautifulSoup
import re
pattern = re.compile(r'''(?P<quote>['"])(?P<href>https?://.+?)(?P=quote)''')
data = '''
<div class="workshopItemPreviewImageMain">
<a onclick="ShowEnlargedImagePreview( 'https://steamuserimages-a.akamaihd.net/ugc/794261971268711656/69C39CF2A2BBCDDC7C04C17DF1E88A6ED875DBE7/' );"></a>
</div>
'''
soup = BeautifulSoup(data, 'html.parser')
div = soup.find('div', class_='workshopItemPreviewImageMain')
links = div.find_all('a', {'onclick': pattern})
for a in links:
    # Re-run the pattern on the attribute value; the named group 'href'
    # holds the URL found between the matching quotes.
    print(pattern.search(a['onclick']).group('href'))

Is it possible to just get the tags without a class or id with BeautifulSoup?

I have several thousand HTML sites and I am trying to filter the text from these sites.
I am doing this with Beautiful Soup. get_text() gives me too much unnecessary information from these sites.
Therefore I wrote a loop:
l = []
for line in text5:
    soup = bs(line, 'html.parser')
    # Join the text of every <p> tag, whatever attributes it carries.
    p_text = ' '.join(p.text for p in soup.find_all('p'))
    k = p_text.replace('\n', '')
    l.append(k)
But this loop gives me everything that was in a tag that starts with <p.
For example:
I want everything between two plain <p> tags.
But I also get the content from something like this:
<p class="header-main__label"> bla ba </p>.
Can I tell BeautifulSoup to just get the plain <p> tags?
You can set False for class and id and it will get tags without class and id
soup.find_all('p', {'class': False, 'id': False})
or, using keyword arguments (the name is class_ with a trailing underscore because class is a reserved keyword in Python)
soup.find_all('p', class_=False, id=False)
from bs4 import BeautifulSoup as BS

text = '<p class="A">text A</p> <p>text B</p> <p id="C">text C</p>'
soup = BS(text, 'html.parser')

# ---- dictionary form: False selects tags that lack the attribute
all_items = soup.find_all('p', {'class': False, 'id': False})
for item in all_items:
    print(item.text)

# --- keyword form of the same filter
all_items = soup.find_all('p', class_=False, id=False)
for item in all_items:
    print(item.text)
EDIT: If you want tags without any attributes then you can filter items using not item.attrs
for item in all_items:
    # item.attrs is the tag's attribute dictionary; empty means no attributes.
    if not item.attrs:
        print(item.text)
from bs4 import BeautifulSoup as BS

text = '<p class="A">text A</p> <p>text B</p> <p id="C">text C</p> <p data="D">text D</p>'
soup = BS(text, 'html.parser')

all_items = soup.find_all('p')
for item in all_items:
    # item.attrs is the tag's attribute dictionary; empty means no attributes.
    if not item.attrs:
        print(item.text)

Python HTML Parsing via CSS Selectors

I'm trying to collect the plain text/business title from the following:
<div class = "business-detail-text>
<h1 class = "business-title" style="position:relative;" itemprop="name">H&H Construction Co.</h1>
What is the best way to do this? The style and itemprop attributes are where I get stuck. I know I can use soup.select, but I've had no luck so far.
Here is my code so far:
def bbb_profiles(profile_urls):
    """Fetch a profile page and print the string of each business-title heading.

    Args:
        profile_urls: URL passed straight to ``requests.get``.
    """
    sauce_code = requests.get(profile_urls)
    plain_text = sauce_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    # Every <h1> whose class is "business-title"; .string is its text content.
    for profile_info in soup.findAll("h1", {"class": "business-title"}):
        print(profile_info.string)
Is this what you need?
>>> from bs4 import BeautifulSoup
>>> txt='''<div class = "business-detail-text">
<h1 class = "business-title" style="position:relative;" itemprop="name">H&H Construction Co.</h1></div>'''
>>> soup = BeautifulSoup(txt, "html.parser")
>>> soup.find_all('h1', 'business-title')
[<h1 class="business-title" itemprop="name" style="position:relative;">H&H; Construction Co.</h1>]
>>> soup.find_all('h1', 'business-title')[0].text
u'H&H; Construction Co.'
I see your HTML is missing a " after "business-detail-text and a </div> at the very end.

Categories

Resources