python Beautifulsoup count element which has no content - python

How can I count elements that have no content?
By an element that has no content, I mean something like <div class="myclass" id="myid"></div>
Here is the code I wrote in an attempt to achieve that:
from bs4 import BeautifulSoup
html_doc = """
<dl>
<dt class="details-row-7">Overall</dt>
<dd id="c0r11" class=" alternate details-row-7">
<div class="mobile-headings">Overall</div>
<div class="mobile-value">
<div class="ca-rating-star" data-size="1"><i class="icon-star icon-1x" style="color: #FF9900"></i>
<i class="icon-star icon-1x" style="color: #FF9900"></i>
<i class="icon-star icon-1x" style="color: #FF9900"></i>
<i class="icon-star icon-1x" style="color: #FF9900"></i>
<i class="icon-star-empty icon-1x" style="color: #FF9900"></i>
</div>
</div>
</dd>
</dl>
"""
soup = BeautifulSoup(html_doc)
ele = soup.find("dd", {"id": "c0r11"}, {"class": "alternate details-row-7"})
if ele.find(text=False):
con_str = ele.find("div", {"class":"mobile-value"})
if con_str.find(text=False):
star_ele = con_str.find("div", {"class":"ca-rating-star"})
if star_ele.find(text=False):
star = star_ele.find_all("i", {"class":"icon-star icon-1x"})
i = 0
for s in star:
if s.find(text=False):
i += 1
print(i)
But the result is 0.....

I answered your question in a gist here.
https://gist.github.com/greatghoul/c2fab58e798a91a736a4

The problem is you're looking for children of the <i> elements where text=False when you say s.find(text=False), but the <i> tags don't have children. You want to see if the <i> tags themselves have empty text. So replace s.find(text=False) with s.get_text() == "".
from bs4 import BeautifulSoup
html_doc = """
<dl>
<dt class="details-row-7">Overall</dt>
<dd id="c0r11" class=" alternate details-row-7">
<div class="mobile-headings">Overall</div>
<div class="mobile-value">
<div class="ca-rating-star" data-size="1"><i class="icon-star icon-1x" style="color: #FF9900"></i>
<i class="icon-star icon-1x" style="color: #FF9900"></i>
<i class="icon-star icon-1x" style="color: #FF9900"></i>
<i class="icon-star icon-1x" style="color: #FF9900"></i>
<i class="icon-star-empty icon-1x" style="color: #FF9900"></i>
</div>
</div>
</dd>
</dl>
"""
# Name the parser explicitly so the parse does not depend on which parsers are installed.
soup = BeautifulSoup(html_doc, "html.parser")
# find() takes a single attrs dict; a second positional dict is bound to
# `recursive` and filters nothing, so look the <dd> up by its id alone.
ele = soup.find("dd", {"id": "c0r11"})
i = 0
# Each guard checks that the element has something nested inside it before descending.
if ele.find(text=False):
    con_str = ele.find("div", {"class": "mobile-value"})
    if con_str.find(text=False):
        star_ele = con_str.find("div", {"class": "ca-rating-star"})
        if star_ele.find(text=False):
            star = star_ele.find_all("i", {"class": "icon-star icon-1x"})
            for s in star:
                # An <i> with no content has empty text; those are the ones to count.
                if s.get_text() == "":  # CHANGE ON THIS LINE
                    i += 1
print(i)

Related

how to get text of the latest post with BeautifulSoup, select()

I'd like to get the text of the latest post using BeautifulSoup and the select() method.
import requests
from bs4 import BeautifulSoup
headers = 'User-Agent':'Mozilla/5.0'
url = "https:// "
req = requests.get(url, headers=headers)
html = req.text
soup = BeautifulSoup(html, 'html.parser')
link = soup.select('#flagList > div.clear.ab-webzine > div > a')
title = soup.select('#flagList > div.clear.ab-webzine > div > div.wz-item-header > a > span')
latest_link = link[0] # link of latest post
latest_title = title[0].text # title of latest post
# to get the text of latest post
t_url = latest_link
t_req = requests.get(t_url, headers=headers)
t_html = c_res.text
t_soup = BeautifulSoup(t_html, 'html.parser')
maintext = t_soup.select ('#flagArticle > div.document_1234567_0.rhymix_content.xe_content')
print(maintext)
It returns [].
I copied the selector #flagArticle > div.document_1234567_0.rhymix_content.xe_content from Chrome developer tools while viewing one post, so it contains that post's specific number, "1234567".
But I want the text of the latest post, not of one particular post.
So I changed the selector to just #flagArticle,
and it returns the following:
[<article id="flagArticle">
<!--BeforeDocument(1234567,0)-->
<div class="document_1234567_0 rhymix_content xe_content"><p>TEXTTEXTTEXT 1</p>
<p>TEXTTEXTTEXT 2</p>
<p>TEXTTEXTTEXT 3</p></div><!--AfterDocument(1234567,0)-->
<!--
-- color class --
vb-white
vb-green
vb-blue
vb-skyblue
vb-orange
vb-red
-->
<div class="vote">
<button class="vb-btn vb-orange" onclick="vote_doVote('Up','1234567');return false;" type="button">
<span class="lang">
<i class="fas fa-star fa-spin fa-fw"></i>
recommended </span>
<span class="num" id="vm_v_count">
4 </span>
</button> <button class="vb-btn vb-skyblue" onclick="vote_doVote('Declare','1234567');return false;" type="button">
<span class="lang">
<i class="fa fa-times-circle"></i>
report </span>
<span class="num" id="vm_d_count">
</span>
</button></div> </article>]
But I want to get
TEXTTEXTTEXT 1
TEXTTEXTTEXT 2
TEXTTEXTTEXT 3
What should I change?
(I can't share the URL because it's a private site.)
Just get the first div.
from bs4 import BeautifulSoup
data = '''\
<article id="flagArticle">
<!--BeforeDocument(1234567,0)-->
<div class="document_1234567_0 rhymix_content xe_content"><p>TEXTTEXTTEXT 1</p>
<p>TEXTTEXTTEXT 2</p>
<p>TEXTTEXTTEXT 3</p></div><!--AfterDocument(1234567,0)-->
<!--
-- color class --
vb-white
vb-green
vb-blue
vb-skyblue
vb-orange
vb-red
-->
<div class="vote">
<button class="vb-btn vb-orange" onclick="vote_doVote('Up','1234567');return false;" type="button">
<span class="lang">
<i class="fas fa-star fa-spin fa-fw"></i>
recommended </span>
<span class="num" id="vm_v_count">
4 </span>
</button> <button class="vb-btn vb-skyblue" onclick="vote_doVote('Declare','1234567');return false;" type="button">
<span class="lang">
<i class="fa fa-times-circle"></i>
report </span>
<span class="num" id="vm_d_count">
</span>
</button></div> </article>
'''
soup = BeautifulSoup(data, 'html.parser')
# Select the content <div> by its fixed classes instead of the
# post-specific "document_<id>_0" class, so the selector works for any post.
div = soup.select_one('#flagArticle div.xe_content.rhymix_content')
for p in div.select('p'):
    print(p.text)

Trying to loop through profile lists using Selenium

I'm trying to loop through all profiles and store the name of the person, the job profile and the location in a list. Here is a screenshot of the LinkedIn screen I am on:
Here is the li html tag that I'll have to loop over:
<li class="reusable-search__result-container ">
<div class="entity-result ">
<div class="entity-result__item">
<div class="entity-result__image">
<div class="display-flex align-items-center">
<a class="app-aware-link" aria-hidden="true" href="https://www.linkedin.com/search/results/people/headless?geoUrn=%5B103644278%5D&origin=FACETED_SEARCH&keywords=python%20developer">
<div id="ember522" class="ivm-image-view-model ember-view"> <div class="
ivm-view-attr__img-wrapper ivm-view-attr__img-wrapper--use-img-tag display-flex
">
<div class="EntityPhoto-circle-3-ghost-person ivm-view-attr__ghost-entity ">
<!----> </div>
</div>
</div>
</a>
</div>
</div>
<div class="entity-result__content entity-result__divider pt3 pb3 t-12 t-black--light">
<div class="mb1">
<div class="linked-area flex-1 cursor-pointer">
<div class="t-roman t-sans">
<span class="entity-result__title">
<div class="display-flex">
<span class="entity-result__title-line flex-shrink-1 entity-result__title-text--black ">
<span class="entity-result__title-text t-16">
<a class="app-aware-link" href="https://www.linkedin.com/search/results/people/headless?geoUrn=%5B103644278%5D&origin=FACETED_SEARCH&keywords=python%20developer">
<!---->LinkedIn Member<!---->
</a>
<!----> </span>
</span>
<!----></div>
</span>
</div>
<div>
<div class="entity-result__primary-subtitle t-14 t-black">
<!---->Software Developer<!---->
</div>
<div class="entity-result__secondary-subtitle t-14">
<!---->United States<!---->
</div>
</div>
</div>
</div>
<div class="linked-area flex-1 cursor-pointer">
<p class="entity-result__summary entity-result__summary--2-lines t-12 t-black--light ">
<!---->Current: Full Stack Software<span class="white-space-pre"> </span><strong><!---->Developer<!----></strong><span class="white-space-pre"> </span>at GE Healthcare<!---->
</p>
</div>
<!----> </div>
<div class="entity-result__actions entity-result__divider entity-result__actions--empty">
<!----> <!---->
</div>
</div>
</div>
</li>
Currently, I'm able to get the profile names using this code:
profile_names = []
linkedin_members = browser.find_elements_by_xpath('//span[#class="entity-result__title"]')
for linkedin_member in linkedin_members:
name = linkedin_member.find_element_by_xpath('.//a[#class="app-aware-link"]').get_attribute('text').strip()
profile_names.append(name)
But I'm unable to get the job locations and job profiles. Can anyone guide me on the code for that?
I tried something like this but it threw an error:
profile_names = []
job_profiles = []
linkedin_members = browser.find_elements_by_xpath('//div[#class="linked-area flex-1 cursor-pointer"]')
for linkedin_member in linkedin_members:
name = linkedin_member.find_element_by_xpath('.//a[#class="app-aware-link"]').get_attribute('text').strip()
job_profile = linkedin_member.find_element_by_xpath('.//div[#class="entity-result__primary-subtitle"]').text
profile_names.append(name)
job_profiles.append(job_profiles)
Another way to do this is:
# Locators are kept together, outside the scraping loop, so they are easy to update.
members_serach_results_xpath = '//div[@class="entity-result__item"]'
# The per-member locators start with "//"; prefixing "." in the loop scopes
# each of them to the single search result being processed.
member_name_xpath = '//span[contains(@class,"entity-result__title-text")]//span[@dir]'
member_location_xpath = '//div[contains(@class,"entity-result__secondary-subtitle")]'
# Relative to one result item; it must not search for the item <div> again inside itself.
member_job_title_xpath = '//div[contains(@class,"entity-result__primary-subtitle")]'
profile_names = []
profile_addresses = []
profile_job_titles = []
linkedin_members = browser.find_elements_by_xpath(members_serach_results_xpath)
for linkedin_member in linkedin_members:
    # Read the rendered text through .text: these locators match <span>/<div>
    # elements, not the <a> link that the 'text' lookup was used on elsewhere.
    name = linkedin_member.find_element_by_xpath('.' + member_name_xpath).text.strip()
    profile_names.append(name)
    address = linkedin_member.find_element_by_xpath('.' + member_location_xpath).text.strip()
    profile_addresses.append(address)
    job_title = linkedin_member.find_element_by_xpath('.' + member_job_title_xpath).text.strip()
    profile_job_titles.append(job_title)
Here I have pulled the locators out of the code into named variables.
It is a best practice not to hard-code locators inside the methods that use them.
You just have to identify those elements (and I think you can do so using the class with a css selector), then loop through the elements and append the text to the appropriate array.
# Names: each title span wraps the profile link whose text is the member name.
profile_names = []
linkedin_members = browser.find_elements_by_xpath('//span[@class="entity-result__title"]')
for linkedin_member in linkedin_members:
    name = linkedin_member.find_element_by_xpath('.//a[@class="app-aware-link"]').get_attribute('text').strip()
    profile_names.append(name)
# Job titles: matched by CSS class across the whole page.
positions = browser.find_elements_by_css_selector('div.entity-result__primary-subtitle')
user_positions = [position.text.strip() for position in positions]
# Locations: same approach with the secondary subtitle class.
locations = browser.find_elements_by_css_selector('div.entity-result__secondary-subtitle')
user_locations = [location.text.strip() for location in locations]

Python BeautifulSoup extracting the text right after a particular tag

I'm trying to extract information from a webpage using BeautifulSoup and Python. I want to extract the information right below a particular tag. To know whether it's the right tag, I would like to compare its text and then extract the text of the immediately following tag.
Say for example, if the following is a part of an HTML page-source,
<div class="row">
::before
<div class="four columns">
<p class="title">Procurement type</p>
<p class="data strong">Services</p>
</div>
<div class="four columns">
<p class="title">Reference</p>
<p class="data strong">ANAJSKJD23423-Commission</p>
</div>
<div class="four columns">
<p class="title">Funding Agency</p>
<p class="data strong">Health Commission</p>
</div>
::after
</div>
<div class="row">
::before
::after
</div>
<hr>
<div class="row">
::before
<div class="twelve columns">
<p class="title">Countries</p>
<p class="data strong">
<span class>Belgium</span>
", "
<span class>France</span>
", "
<span class>Luxembourg</span>
</p>
<p></p>
</div>
::after
</div>
I want to check if the <p class="title"> has text value as Procurement type then I want to print out Services Similarly, if the <p class="title"> has text value as Reference then I want to print out ANAJSKJD23423-Commission and if <p class="title"> has value as Countries then print out all the countries i.e. Belgium,France,Luxembourg.
I know I can extract all the texts with <p class="data strong">, append them to a list, and later fetch the values by index. But the order in which these <p class="title"> elements occur is not fixed: in some places Countries could be mentioned before Procurement type. I therefore want to check the text value and then extract the text of the immediately following tag. I'm still new to BeautifulSoup, so any help is appreciated. Thanks
You can do this in many ways. Here is one.
from bs4 import BeautifulSoup
htmldata='''<div class="row">
::before
<div class="four columns">
<p class="title">Procurement type</p>
<p class="data strong">Services</p>
</div>
<div class="four columns">
<p class="title">Reference</p>
<p class="data strong">ANAJSKJD23423-Commission</p>
</div>
<div class="four columns">
<p class="title">Funding Agency</p>
<p class="data strong">Health Commission</p>
</div>
::after
</div>
<div class="row">
::before
::after
</div>
<hr>
<div class="row">
::before
<div class="twelve columns">
<p class="title">Countries</p>
<p class="data strong">
<span class>Belgium</span>
", "
<span class>France</span>
", "
<span class>Luxembourg</span>
</p>
<p></p>
</div>
::after
</div>'''
soup = BeautifulSoup(htmldata, 'html.parser')
items = soup.find_all('p', class_='title')
for item in items:
    # Only the titles of interest; the value is the <p> that follows the title.
    if ('Procurement type' in item.text) or ('Reference' in item.text):
        print(item.find_next('p').text)
You can also use the :contains pseudo-class with bs4 4.7.1 (newer Soup Sieve releases spell it :-soup-contains). Although I have passed the conditions as a single selector list, you can separate out each condition.
from bs4 import BeautifulSoup as bs
import re
html = 'yourHTML'
soup = bs(html, 'lxml')
items=[re.sub(r'\n\s+','', item.text.strip()) for item in soup.select('p.title:contains("Procurement type") + p, p.title:contains(Reference) + p, p.title:contains(Countries) + p')]
print(items)
Output:
You can pass an argument that matches specific text when you use .find() or .find_all(), then use .next_sibling or findNext() to grab the following tag with the content.
For example:
soup.find('p', {'class':'title'}, text = 'Procurement type')
Given:
html = '''<div class="row">
::before
<div class="four columns">
<p class="title">Procurement type</p>
<p class="data strong">Services</p>
</div>
<div class="four columns">
<p class="title">Reference</p>
<p class="data strong">ANAJSKJD23423-Commission</p>
</div>
<div class="four columns">
<p class="title">Funding Agency</p>
<p class="data strong">Health Commission</p>
</div>
::after
</div>
<div class="row">
::before
::after
</div>
<hr>
<div class="row">
::before
<div class="twelve columns">
<p class="title">Countries</p>
<p class="data strong">
<span class>Belgium</span>
", "
<span class>France</span>
", "
<span class>Luxembourg</span>
</p>
<p></p>
</div>
::after
</div>'''
you could do something like:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
alpha = soup.find('p', {'class':'title'}, text = 'Procurement type')
# Ask only for sibling <p> tags: the whitespace between tags is skipped
# without needing a catch-all except clause.
for sibling in alpha.find_next_siblings('p'):
    print (sibling.text)
Output:
Services
or
ref = soup.find('p', {'class':'title'}, text = 'Reference')
# Ask only for sibling <p> tags: the whitespace between tags is skipped
# without needing a catch-all except clause.
for sibling in ref.find_next_siblings('p'):
    print (sibling.text)
Output:
ANAJSKJD23423-Commission
or
countries = soup.find('p', {'class':'title'}, text = 'Countries')
names = countries.findNext('p', {'class':'data strong'}).text.replace('", "','').strip().split('\n')
names = [name.strip() for name in names if not name.isspace()]
for country in names:
print (country)
Output:
Belgium
France
Luxembourg

how to extract span info from div with soup

I have a piece of HTML code below:
<div class="user-tagline ">
<span class="username " data-avatar="aaaaaaa">player1</span>
<span class="user-rating">(1357)</span>
<span class="country-flag-small flag-113" tip="Portugal"></span>
</div>
<div class="user-tagline ">
<span class="username " data-avatar="bbbbbbb">player2</span>
<span class="user-rating">(1387)</span>
<span class="country-flag-small flag-70" tip="Indonesia"></span>
</div>
I want to extract "Portugal" from it. Note that the span class is dynamic: it is not always class="country-flag-small flag-113", but changes according to the country generated for this div block.
To get the player1 and 1357, I am using the following cumbersome code:
player1info = soup.findAll('div', attrs={'class':'user-tagline'})[0].text.split("\n")
player1 = player1info[1]
pscore1 = player1info[1].replace('(','').replace(')', '')
It would be appreciated if someone could share a better solution. Thank you in advance.
UPDATE:
With the initial HTML div info extracted, now I would like to expand it to extract more for the entire row, here is the row:
<tr board-popover="" fen="r1bk2r1/1p2n3/pN6/1B1qQp2/P2Pp2p/1P6/2P2PPP/R3K1R1 b Q -" flip-board="1" highlight-squares="c4b6">
<td>
<a class="clickable-link td-user" href="https://www.chess.com/live/game/2249663029?username=belemnarmada" target="_self">
<span class="time-control">
<i class="icon-rapid">
</i>
</span>
<div class="user-tagline ">
<span class="username " data-avatar="https://betacssjs.chesscomfiles.com/bundles/web/images/noavatar_l.1c5172d5.gif" data-country="Portugal" data-enabled="true" data-flag="113" data-joined="Joined Jun 19, 2016" data-logged="Online 6 hrs ago" data-membership="basic" data-name="Atikinounette" data-popup="hover" data-title="" data-username="Atikinounette">
Atikinounette
</span>
<span class="user-rating">
(1357)
</span>
<span class="country-flag-small flag-113" tip="Portugal">
</span>
</div>
<div class="user-tagline ">
<span class="username " data-avatar="https://images.chesscomfiles.com/uploads/v1/user/28196414.83e31ff1.50x50o.3a6f77e4aa44.jpeg" data-country="Indonesia" data-enabled="true" data-flag="70" data-joined="Joined May 15, 2016" data-logged="Online Nov 7, 2017" data-membership="basic" data-name="belemnarmada" data-popup="hover" data-title="" data-username="belemnarmada">
belemnarmada
</span>
<span class="user-rating">
(1387)
</span>
<span class="country-flag-small flag-70" tip="Indonesia">
</span>
</div>
</a>
</td>
<td>
<a class="clickable-link text-middle" href="https://www.chess.com/live/game/2249663029?username=belemnarmada" target="_self">
<div class="pull-left">
<span class="game-result">
1
</span>
<span class="game-result">
0
</span>
</div>
<div class="result">
<i class="icon-square-minus loss" tip="Lost">
</i>
</div>
</a>
</td>
<td class="text-center">
<a class="clickable-link" href="https://www.chess.com/live/game/2249663029?username=belemnarmada" target="_self">
30 min
</a>
</td>
<td class="text-right">
<a class="clickable-link text-middle moves" href="https://www.chess.com/live/game/2249663029?username=belemnarmada" target="_self">
25
</a>
</td>
<td class="text-right miniboard">
<a class="clickable-link archive-date" href="https://www.chess.com/live/game/2249663029?username=belemnarmada" target="_self">
Aug 9, 2017
</a>
</td>
<td class="text-center miniboard">
<input class="checkbox" game-checkbox="" game-id="2249663029" game-is-live="true" ng-model="model.gameIds[2249663029].checked" type="checkbox"/>
</td>
</tr>
Needed info are:
player's info (the answer provided by @balderman already covers that)
game-result (1, 0)
playing time (30 min in this row)
total moves (25)
playing date (Aug 9, 2017)
Thank you so much here.
How about the code below?
The idea is that the user attributes are the three spans under the div, so the code points to those spans and extracts the data.
from bs4 import BeautifulSoup
html = '''<html><body> <div class="user-tagline ">
<span class="username " data-avatar="aaaaaaa">player1</span>
<span class="user-rating">(1357)</span>
<span class="country-flag-small flag-113" tip="Portugal"></span>
</div>
<div class="user-tagline ">
<span class="username " data-avatar="bbbbbbb">player2</span>
<span class="user-rating">(1387)</span>
<span class="country-flag-small flag-70" tip="Indonesia"></span>
</div><body></html>'''
soup = BeautifulSoup(html, 'html.parser')
users = soup.findAll('div', attrs={'class': 'user-tagline'})
for user in users:
    # Work on the spans themselves rather than on every child of the div,
    # so the result does not depend on whitespace between the tags.
    user_properties = user.findAll('span')
    if len(user_properties) < 3:
        # Not a complete tagline (name, rating, country); nothing to report.
        continue
    name_span, rating_span, country_span = user_properties[:3]
    print('user name: {}'.format(name_span.text))
    print('user rating: {}'.format(rating_span.text))
    # The country is carried by the "tip" attribute; the span has no text.
    print('user country: {}'.format(country_span.attrs['tip']))
Output
user name: player1
user rating: (1357)
user country: Portugal
user name: player2
user rating: (1387)
user country: Indonesia
This is a more readable solution:
div1 = soup.select("div.user-tagline")[0]
# The name is in span.username and the rating in span.user-rating.
player1 = div1.select_one("span.username").text.strip()
# The rating is rendered as "(1357)"; drop the surrounding parentheses.
pscore1 = div1.select_one("span.user-rating").text.strip().strip("()")
# The flag span has no text; the country name is in its "tip" attribute.
country1 = div1.select_one("span.country-flag-small")["tip"]
To extract the data from all the divs, loop over the result of select() and use the loop index "i" in place of "0".
If you are interested only in the first div, you can go with this:
res = bsobj.find('div', {'class':'user-tagline'}).findAll('span')
print(res[0].text, res[1].text, res[2]['tip'])

Python Beautifulsoup: finding an element after a specific string

I have the following html code:
<div class="xyOfqd">
<div class="aAAD">
<div class="Bgbcca">Updated</div>
<span class="hthtb">
<div>
<span class="hthtb">September 30, 2018</span>
</div>
</span>
</div>
<div class="aAAD">
<div class="Bgbcca">Text1</div>
<span class="hthtb">
<div><span class="hthtb">Text2</span></div>
</span>
</div>
<div
class="aAAD">
<div class="Bgbcca">MyText</div>
<span class="hthtb">
<div>
<span class="hthtb">Text3</span>
</div>
</span>
</div>
<div class="aAAD">
<div class="Bgbcca">Text4</div>
<span class="hthtb">
<div><span
class="hthtb">Text5</span></div>
</span>
</div>
<div class="aAAD">
<div
class="Bgbcca">Text6</div>
<span class="hthtb">
<div><span
class="hthtb">Text7</span></div>
</span>
</div>
<div class="aAAD">
<div
class="Bgbcca">
Text8/div>
<span class="hthtb">
<div>
<span class="hthtb">
<div>Text9</div>
<div>Text10</div>
</span>
</div>
</span>
</div>
<div class="aAAD">
<div
class="Bgbcca">Text11</div>
<span class="hthtb">
<div><span class="hthtb">Text12</span></div>
</span>
</div>
How can I find Text3 which is located right after the div element with the string of MyText?
You can use lxml.html solution:
from lxml import html
source = """
<div class="xyOfqd">
<div class="aAAD">
<div class="Bgbcca">Updated</div>
...
<span class="hthtb">
<div><span class="hthtb">Text12</span></div>
</span>
</div>"""
tree = html.fromstring(source)
print(tree.xpath('//div[.="MyText"]/following-sibling::span/div/span/text()'))
This works only if your HTML structure stays as shown; in that case you can get the right value like this:
from bs4 import BeautifulSoup as bfs
html = """<div class="xyOfqd">
<div class="aAAD">
<div class="Bgbcca">Updated</div>
<span class="hthtb">
<div>
<span class="hthtb">September 30, 2018</span>
</div>
</span>
</div>
<div class="aAAD">
<div class="Bgbcca">Text1</div>
<span class="hthtb">
<div><span class="hthtb">Text2</span></div>
</span>
</div>
<div
class="aAAD">
<div class="Bgbcca">MyText</div>
<span class="hthtb">
<div>
<span class="hthtb">Text3</span>
</div>
</span>
</div>
<div class="aAAD">
<div class="Bgbcca">Text4</div>
<span class="hthtb">
<div><span
class="hthtb">Text5</span></div>
</span>
</div>
<div class="aAAD">
<div
class="Bgbcca">Text6</div>
<span class="hthtb">
<div><span
class="hthtb">Text7</span></div>
</span>
</div>
<div class="aAAD">
<div
class="Bgbcca">
Text8/div>
<span class="hthtb">
<div>
<span class="hthtb">
<div>Text9</div>
<div>Text10</div>
</span>
</div>
</span>
</div>
<div class="aAAD">
<div
class="Bgbcca">Text11</div>
<span class="hthtb">
<div><span class="hthtb">Text12</span></div>
</span>
</div>"""
soup = bfs(html, 'html.parser')
result = ''
# Walk each "aAAD" row, find the row whose label is 'MyText', and read the
# value from the inner span nested inside the row's outer span.
for div0 in soup.find_all('div',{'class':'aAAD'}):
    for div1 in div0.find_all('div', {'class':'Bgbcca'}):
        # strip=True so a label surrounded by newlines or spaces still matches.
        if div1.get_text(strip=True) == 'MyText':
            span = div0.find('span',{'class':'hthtb'})
            if span:
                span_to_return = span.find('span',{'class':'hthtb'})
                if span_to_return:
                    result = span_to_return.get_text()
print(result)
You can build a custom query function to pass into find():
def has_my_text(tag):
    """Return True when *tag* has a ``.Bgbcca`` descendant whose text is "MyText".

    Intended to be passed to ``find()`` as a filter function. Always returns
    a bool, including when no ``.Bgbcca`` element is present.
    """
    found = tag.select_one('.Bgbcca')
    # select_one() gives None when nothing matches; test for that first so
    # .get_text() is never called on None.
    return found is not None and found.get_text() == "MyText"
soup = bs4.... # assign your soup object
found = soup.find(has_my_text)
# <div class="Bgbcca">MyText</div>
# <span class="hthtb">
# <div>
# <span class="hthtb">Text3</span>
# </div>
# </span>
# </div>
# Note your span class is nested so we go two level in
result = found.select_one('.hthtb').select_one('.hthtb').get_text()
# 'Text3'
# This below also works if your other span are always empty texts
result = found.select_one('.hthtb').get_text().strip()
Note, the find() and select_one assume we only need the first match found. If you need to handle multiple matches, you'll need to use find_all() and select() and make changes to your code accordingly.
If you want to handle variable texts, you can define your function like this:
def has_my_text(tag, text):
    """Return True when *tag* has a ``.Bgbcca`` descendant whose text equals *text*.

    Always returns a bool, including when no ``.Bgbcca`` element is present.
    """
    found = tag.select_one('.Bgbcca')
    # select_one() gives None when nothing matches; test for that first so
    # .get_text() is never called on None.
    return found is not None and found.get_text() == text
And wrap the function in your find() like this:
txt = "MyText"
soup.find(lambda tag: has_my_text(tag, txt))

Categories

Resources