I have the following code to scrape this page:
soup = BeautifulSoup(html)
result = u''
# Find Starting point
start = soup.find('div', class_='main-content-column')
if start:
    news.image_url_list = []
    for item in start.find_all('p'):
The problem I'm facing is that it also grabs the <p> tags inside <div class="type-gallery">, which I would like to avoid, but I can't find a way to achieve that. Any ideas, please?
You want direct children, not just any descendant, which is what element.find_all() returns. Your best bet here is to use a CSS selector instead:
for item in soup.select('div.main-content-column > div > p'):
The > operator limits this to p tags that are direct children of div tags within the div with the given class. You can make this as specific as you like; adding in the itemprop attribute, for example:
for item in soup.select('div.main-content-column > div[itemprop="articleBody"] > p'):
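For illustration, here is a minimal, self-contained sketch (the markup and the itemprop value are assumptions modelled on the question) showing that the child combinator skips the paragraphs inside the gallery div:
from bs4 import BeautifulSoup

# Hypothetical markup mirroring the structure described in the question
html = '''
<div class="main-content-column">
  <div itemprop="articleBody">
    <p>Article paragraph 1</p>
    <p>Article paragraph 2</p>
    <div class="type-gallery">
      <p>Gallery caption that should be skipped</p>
    </div>
  </div>
</div>'''

soup = BeautifulSoup(html, 'html.parser')
for item in soup.select('div.main-content-column > div[itemprop="articleBody"] > p'):
    print(item.get_text())
# Prints only the two article paragraphs; the gallery <p> is excluded
# because it is not a direct child of the articleBody div.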
The alternative is to loop over the element.children iterable:
start = soup.find('div', class_='main-content-column')
if start:
    news.image_url_list = []
    for item in start.children:
        if item.name != 'div':
            # skip children that are not <div> tags
            continue
        for para in item.children:
            if para.name != 'p':
                # skip children that are not <p> tags
                continue
I am trying to scrape the 4.1 rating in the span tag below using this Python code, but it returns nothing.
for item in soup.select("._9uwBC wY0my"):
n = soup.find("span").text()
print(n)
---------------------------------------
<div class="_9uwBC wY0my">
<span class="icon-star _537e4"></span>
<span>4.1</span>
</div>
@Aditya, I think soup.find("span") will only return the first "span", and you want the text from the second one.
I would try:
for item in soup.select("div._9uwBC.wY0my"):
spans = item.find_all("span")
for span in spans:
n = span.text
if n != '':
print(n)
That should print the text of the non-empty span tags under the <div> you specified. Does that accomplish what you want?
OK, here's one approach for getting the names and stars for each restaurant on the page. It's not necessarily the most elegant way to do it, but I've tried it a couple of times and it seems to work:
divs = soup.find_all('div')
for div in divs:
    if div.has_attr('class'):
        if div['class'] == ['nA6kb']:  ## the class of the divs with the name
            name = div.text
            k = div.find_next('div')  ## the next div
            l = k.find_next('div')  ## the div with the stars
            spans = l.find_all('span')  ## this part is the same as the answer above
            for span in spans:
                n = span.text
                if n != '':
                    print(name, n)
This assumes that the div that contains the stars span is always the second div after the div that contains the restaurant name. It looks like that's always the case, but I'm not positive that it never changes.
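As a sanity check, here is a minimal self-contained sketch; the markup below is invented to mirror the class names in the question, so the real page may differ:
from bs4 import BeautifulSoup

# Hypothetical snippet: a name div, one div in between, then the stars div
html = '''
<div class="nA6kb">Pizza Palace</div>
<div class="other">something in between</div>
<div class="_9uwBC wY0my">
  <span class="icon-star _537e4"></span>
  <span>4.1</span>
</div>'''

soup = BeautifulSoup(html, 'html.parser')
for div in soup.find_all('div'):
    if div.has_attr('class') and div['class'] == ['nA6kb']:
        name = div.text
        stars_div = div.find_next('div').find_next('div')  # the second div after the name
        for span in stars_div.find_all('span'):
            if span.text != '':
                print(name, span.text)  # Pizza Palace 4.1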
If I find a certain tag using beautifulsoup:
styling = paragraphs.find_all('w:rpr')
I look at the next tag. I only want to use that tag if it is a <w:t> tag. How do I check what type of tag the next tag is?
I tried element.find_next_sibling().startswith('<w:t') for the element, but it says NoneType object is not callable. I also tried element.find_next_sibling().find_all('<w:t>'), but it doesn't return anything.
for element in styling:
    next = element.find_next_sibling()
    if ...:  # next is a <w:t> tag
        ...
I am using BeautifulSoup and would like to stick with it, rather than adding etree or another parser, if this is possible with bs4.
Using item.name you can see a tag's name. The problem is that between tags there are NavigableString elements, which are also treated as siblings and whose name is None.
You would have to skip these elements, or you could get all the siblings and use a for loop to find the first <w:t>, exiting the loop with break.
from bs4 import BeautifulSoup as BS

text = '''<div>
<w:rpr></w:rpr>
<w:t>A</w:t>
</div>'''

soup = BS(text, 'html.parser')

all_wrpr = soup.find_all('w:rpr')

for wrpr in all_wrpr:
    next_tag = wrpr.next_sibling
    print('name:', next_tag.name)  # None

    next_tag = wrpr.next_sibling.next_sibling
    #next_tag = next_tag.next_sibling
    print('name:', next_tag.name)  # w:t
    print('text:', next_tag.text)  # A

    #name: None
    #name: w:t
    #text: A

    print('---')

    all_siblings = wrpr.next_siblings
    for item in all_siblings:
        if item.name == 'w:t':
            print('name:', item.name)  # w:t
            print('text:', item.text)  # A
            break  # exit after the first <w:t>

    #name: w:t
    #text: A
EDIT: If you test the code with the HTML formatted a little differently,
text = '''<div>
<w:rpr></w:rpr><w:t>A</w:t>
</div>'''
then there will be no NavigableString between the tags, so the first method will fail but the second method will still work.
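To make that concrete, here is a quick check with the compact markup (same idea as the code above, just with the reformatted HTML string):
from bs4 import BeautifulSoup as BS

text = '''<div>
<w:rpr></w:rpr><w:t>A</w:t>
</div>'''

soup = BS(text, 'html.parser')
wrpr = soup.find('w:rpr')

print(wrpr.next_sibling.name)               # w:t  (no NavigableString in between this time)
print(wrpr.next_sibling.next_sibling.name)  # None (the newline before </div>)

# The loop over next_siblings finds the tag either way
for item in wrpr.next_siblings:
    if item.name == 'w:t':
        print(item.text)  # A
        break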
My HTML page looks like this:
<div class="some class">
<p>
<i class="class1"></i>
Some Text
</p>
<p>
<i class="class2"></i>
Some Text
</p>
. . .
. . .
. . .
</div>
I want to get "Some Text". Currently I am trying:
elem = browser.find_element_by_xpath("//div[@class='some class']")
text = elem.find_element_by_xpath("//p/i[@class='class1']").text
But it returns an empty string. I can't understand why. I am new to Selenium. Please help.
You can use the XPath expressions below:
# Find the "i" element with the "class1" CSS class and get its first parent "p" element
elem = browser.find_element_by_xpath("//i[@class='class1']/ancestor::p[1]")
# Same as the previous, with the "div" added
elem = browser.find_element_by_xpath("//div[@class='some class']//i[@class='class1']/ancestor::p[1]")
# Find the "p" element that has a child "i" element with the "class1" CSS class
elem = browser.find_element_by_xpath("//p[./i[@class='class1']]")
# Same as the previous, with the "div" added
elem = browser.find_element_by_xpath("//div[@class='some class']//p[./i[@class='class1']]")
Your selector is grabbing the i element that has the attribute class="class1". i has no text, which is why you get an empty string. To fix that:
elem = browser.find_element_by_xpath("//div[@class='some class']")
# Now let's find the i element you want
i_elem = elem.find_element_by_xpath(".//i[@class='class1']")
# Now find the parent of that i_elem, which is the p
p_elem = i_elem.find_element_by_xpath("..")
txt = p_elem.text
You can use execute_script:
xPath = "//div[#class='some class']"
try:
element = driver.find_element_by_xpath(xPath)
b1Text = driver.execute_script("return arguments[0].childNodes[2].textContent", element);
print(b1Text)
except:
print()
Try changing the value inside childNodes[N], for example childNodes[2] or childNodes[1].
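The trial and error is needed because whitespace between tags also shows up as text nodes in childNodes, which shifts the indices. As a sketch, you can sidestep the indexing by filtering for text nodes inside the injected script; the XPath below is an assumption based on the question's markup:
xPath = "//div[@class='some class']/p[i[@class='class1']]"
try:
    p_elem = driver.find_element_by_xpath(xPath)
    # Join only the text nodes of the <p>, ignoring the <i> child and the whitespace nodes
    text = driver.execute_script(
        "return Array.from(arguments[0].childNodes)"
        ".filter(function(n) { return n.nodeType === Node.TEXT_NODE; })"
        ".map(function(n) { return n.textContent.trim(); })"
        ".join(' ').trim();",
        p_elem)
    print(text)  # Some Text
except Exception:
    print()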
Assuming that your class1 and class2 are different, you can use the CSS selector div.some.class > p:nth-child(1) to get the text (the space in class="some class" means the element has the two classes some and class, so each gets its own dot in the selector). Since the text is inside the <p> tag, you can get it from the first <p> tag.
elem = browser.find_element_by_css_selector("div.some.class > p:nth-child(1)")
text = elem.text
This should get you the text inside the element.
I have multiple inline divs (which act as 'headers') and paragraph tags beneath them (not IN the divs) that are conceptually 'children'. I would like to convert this to a dictionary, but I can't quite figure out the best way to do it. Here is roughly what the site looks like:
<div><span>This should be dict key1</span></div>
<p>This should be the value of key1</p>
<p>This should be the value of key1</p>
<div><span>This should be dict key2</span></div>
<p>This should be the value of key2</p>
The Python code I have working looks like this:
soup = bs.BeautifulSoup(source, 'lxml')
full_discussion = soup.find(attrs={'class': 'field field-type-text field-field-discussion'})
ava_discussion = full_discussion.find(attrs={'class': 'field-item odd'})

for div in ava_discussion.find_all("div"):
    discussion = []
    if div.findNextSibling('p'):
        discussion.append(div.findNextSibling('p').get_text())
    location = div.get_text()
    ava_dict.update({location: {"discussion": discussion}})
However, the problem is that this code only adds the FIRST <p> tag and then moves on to the next div. Ultimately, I'd like to append each <p> to the discussion list. Help!
UPDATE:
Adding a while loop yields duplicates of the first <p> tag, one for each sibling it walks over. Here is the code:
for div in ava_discussion.find_all("div"):
    ns = div.nextSibling
    discussion = []
    while ns is not None and ns.name != "div":
        if ns.name == "p":
            discussion.append(div.findNextSibling('p').get_text())
        ns = ns.nextSibling
    location = div.get_text()
    ava_dict.update({location: {"discussion": discussion}})

print(json.dumps(ava_dict, indent=2))
I wasn't adding the correct text. This code works:
for div in ava_discussion.find_all("div"):
    ns = div.nextSibling
    discussion = []
    while ns is not None and ns.name != "div":
        if ns.name == "p":
            discussion.append(ns.get_text())
        ns = ns.nextSibling
    location = div.get_text()
    ava_dict.update({location: {"discussion": discussion}})

print(json.dumps(ava_dict, indent=2))
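For anyone trying this in isolation, here is a self-contained sketch of the same sibling walk run against the sample markup from the question (ava_dict and the imports are set up locally just for the demo):
import json
from bs4 import BeautifulSoup

html = '''
<div><span>This should be dict key1</span></div>
<p>This should be the value of key1</p>
<p>This should be the value of key1</p>
<div><span>This should be dict key2</span></div>
<p>This should be the value of key2</p>'''

soup = BeautifulSoup(html, 'html.parser')
ava_dict = {}

for div in soup.find_all("div"):
    ns = div.nextSibling
    discussion = []
    while ns is not None and ns.name != "div":
        if ns.name == "p":
            discussion.append(ns.get_text())
        ns = ns.nextSibling
    ava_dict[div.get_text()] = {"discussion": discussion}

print(json.dumps(ava_dict, indent=2))
# key1 ends up with both of its paragraphs, key2 with its single paragraph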
What about this?
paragraphs = div.findNextSiblings('p')
for sibling in div.findNextSiblings():
    if sibling in paragraphs:
        discussion.append(sibling.get_text())
    else:
        break
Now, who can show me how to make this more elegant? :)
This is the HTML:
<div><div id="NhsjLK">
<li class="EditableListItem NavListItem FollowersNavItem NavItem not_removable">
Followers <span class="list_count">92</span></li></div></div>
I want to extract the text 92, convert it to an integer, and print it in Python 2. How can I do that?
Code:
i = soup.find('div', id='NhsjLK')
print "Followers :", i.find('span', id='list_count').text
I'd not go with getting it by the class directly, since I think "list_count" is too broad of a class value and might be used for other things on the page.
There are definitely several different options judging by this HTML snippet alone, but one of the nicest, from my point of view, is to use that "Followers" text/label and get its next sibling:
from bs4 import BeautifulSoup
data = """
<div><div id="NhsjLK">
<li class="EditableListItem NavListItem FollowersNavItem NavItem not_removable">
Followers <span class="list_count">92</span></li></div></div>"""
soup = BeautifulSoup(data, "html.parser")
count = soup.find(text=lambda text: text and text.strip().startswith('Followers')).next_sibling.get_text()
count = int(count)
print(count)
Or, another very concise and reliable approach would be to use a partial match (the *= part below) on the href value of the parent a element:
count = int(soup.select_one("a[href*=followers] .list_count").get_text())
Or, you might check the class value of the parent li element:
count = int(soup.select_one("li.FollowersNavItem .list_count").get_text())