Get links from HTML after a certain title - python

See this HTML code:
<html>
<body>
<p class="fixedfonts">
LINK1
</p>
<h2>Results</h2>
<p class="fixedfonts">
LINK2
</p>
<p class="fixedfonts">
LINK3
</p>
</body>
</html>
It contains 3 links. However I need to retrieve only the links after the title Results
I am using python with BeautifulSoup:
from bs4 import BeautifulSoup, SoupStrainer

# At this point `html` contains the page source as a string.
# Parse only the <a> tags; pass an explicit parser to avoid the
# "no parser was explicitly specified" warning.
soup = BeautifulSoup(html.replace('\n', ''), 'html.parser',
                     parse_only=SoupStrainer('a'))

# kill all script and style elements
for script in soup(["script", "style"]):
    script.extract()  # rip it out

links = []
for link in soup:
    if link.has_attr('href'):
        # Decode percent-encoded spaces in the URL.
        links.append(link['href'].replace('%20', ' '))
print(links)
With the presented code I get all the links in the document, but as I said, I only need those that come after the Results title.
Guidance is appreciated

You can solve that using the find_all_next() method:
# Locate the heading, then walk everything that appears after it in
# document order and pick out the <a> tags.
results = soup.find("h2", text="Results")
for link in results.find_all_next("a"):
    print(link.get("href"))
Demo:
>>> from bs4 import BeautifulSoup
>>>
>>> data = """
... <html>
... <body>
... <p class="fixedfonts">
... LINK1
... </p>
...
... <h2>Results</h2>
...
... <p class="fixedfonts">
... LINK2
... </p>
...
... <p class="fixedfonts">
... LINK3
... </p>
... </body>
... </html>"""
>>>
>>> soup = BeautifulSoup(data, "html.parser")
>>> results = soup.find("h2", text="Results")
>>> for link in results.find_all_next("a"):
... print(link.get("href"))
...
B.pdf
C.pdf

Split the HTML data into two parts, before and after "Results". Then use the part after it for processing:
data = html.split("Results")
need = data[1]
So just implement that:
from bs4 import BeautifulSoup, SoupStrainer

# Keep only the markup after the "Results" marker. Splitting the raw
# string is fragile (it breaks if "Results" occurs earlier in the
# page), but it works for this document.
data = html.split("Results")
need = data[1]
# Explicit parser avoids the "no parser was explicitly specified" warning.
soup = BeautifulSoup(need.replace('\n', ''), 'html.parser',
                     parse_only=SoupStrainer('a'))

Tested and seemed to work.
from bs4 import BeautifulSoup, SoupStrainer

html = '''<html>
<body>
<p class="fixedfonts">
LINK1
</p>
<h2>Results</h2>
<p class="fixedfonts">
LINK2
</p>
<p class="fixedfonts">
LINK2
</p>
<p class="fixedfonts">
LINK3
</p>
</body>
</html>'''

# at this point html contains the code as string
# Keep the raw markup that follows the marker. Note the marker used
# here is "Result", which also matches "Results".
dat = html.split("Result")
need = dat[1]

# Parse the FULL document for <a> tags; the explicit parser avoids the
# "no parser was explicitly specified" warning.
soup = BeautifulSoup(html.replace('\n', ''), 'html.parser',
                     parse_only=SoupStrainer('a'))

# kill all script and style elements
for script in soup(["script", "style"]):
    script.extract()  # rip it out

# Collect every href in the whole document.
links = []
for link in soup:
    if link.has_attr('href'):
        links.append(link['href'].replace('%20', ' '))

# Keep a link only as many times as it occurs in the "after Results"
# part of the raw markup.
n_links = []
for i in set(links):
    if need.count(i) > 0:
        for x in range(1, need.count(i) + 1):
            n_links.append(i)
print(n_links)

Related

Python: Deleting all divs without class

I want to delete all divs without classes (but not the content that is in the div).
My input
<h1>Test</h1>
<div>
<div>
<div class="test">
<p>abc</p>
</div>
</div>
</div>
The output I want
<h1>Test</h1>
<div class="test">
<p>abc</p>
</div>
My try 1
Based on "Deleting a div with a particular class":
from bs4 import BeautifulSoup
soup = BeautifulSoup('<h1>Test</h1><div><div><div class="test"><p>abc</p></div></div></div>', 'html.parser')
# Remove every div with no class attribute. decompose() deletes the
# tag AND everything inside it, which is why the inner content is
# lost here too.
for div in soup.find_all("div", {'class':''}):
    div.decompose()
print(soup)
# <h1>Test</h1>
My try 2
from htmllaundry import sanitize

myinput = '<h1>Test</h1><div><div><div class="test"><p>abc</p></div></div></div>'
myoutput = sanitize(myinput)
# Python 3 print call (the original used the Python 2 statement form).
print(myoutput)
# <p>Test</p><p>abc</p> instead of <h1>Test</h1><div class="test"><p>abc</p></div>
My try 3
Based on "Clean up HTML in python"
from lxml.html.clean import Cleaner

def sanitize(dirty_html):
    """Strip <font> and <div> tags from dirty_html (their content is kept)."""
    cleaner = Cleaner(remove_tags=('font', 'div'))
    return cleaner.clean_html(dirty_html)

myhtml = '<h1>Test</h1><div><div><div class="test"><p>abc</p></div></div></div>'
print(sanitize(myhtml))
# <div><h1>Test</h1><p>abc</p></div>
My try 4
from html_sanitizer import Sanitizer
sanitizer = Sanitizer() # default configuration
# The default configuration drops div tags but keeps their content,
# as the printed result below shows.
output = sanitizer.sanitize('<h1>Test</h1><div><div><div class="test"><p>abc</p></div></div></div>')
print(output)
# <h1>Test</h1><p>abc</p>
Problem: A div element is used to wrap the HTML fragment for the parser, therefore div tags are not allowed. (Source: Manual)
If you want to exclude div without class, preserving its content:
from bs4 import BeautifulSoup

markup = '<h1>Test</h1><div><div><div class="test"><p>abc</p></div></div></div>'
soup = BeautifulSoup(markup,"html.parser")
# Print every tag except class-less divs. Nested tags are visited in
# their own right, which is why inner content still appears (and the
# <p> is printed both inside its div and on its own).
for tag in soup.find_all():
    empty = tag.name == 'div' and not(tag.has_attr('class'))
    if not(empty):
        print(tag)
Output:
<h1>Test</h1>
<div class="test"><p>abc</p></div>
<p>abc</p>
Please check out this approach.
from bs4 import BeautifulSoup

data="""
<div>
<div>
<div class="test">
<p>abc</p>
</div>
</div>
</div>
"""

soup = BeautifulSoup(data, features="html5lib")
# class_=True matches only tags that carry a class attribute, so the
# bare wrapper divs are skipped.
for div in soup.find_all("div", class_=True):
    print(div)

BeautifulSoup 4: Extracting multiple titles and links from different ptag(s)

HTML Code:
<div>
<p class="title">
title_1
</p>
</div>
<div>
<p class="title">
title_2
</p>
</div>
My Code:
def web(WebUrl):
    """Fetch WebUrl and print each p.title's text followed by its links.

    The original version looked links up via ``s.find(...)`` -- i.e. on
    the FIRST p.title only -- so every title re-printed the first
    block's links. Searching inside each ``title`` tag fixes that.
    """
    site = urlparse(WebUrl)
    code = requests.get(WebUrl)
    plain = code.text
    s = BeautifulSoup(plain, "html.parser")
    for title in s.find_all('p', {'class':'title'}):
        line = title.get_text()
        print(line)
        # Look for anchors inside THIS title block only.
        for link in title.find_all('a'):
            line2 = link.get('href')
            print(site.netloc + str(line2))
Hi guys, I need some help with this, my task is to extract titles and links from a webpage, I was able to extract the titles but not the links. When I try to scrape the links, I got only the first link successfully scraped, the following links got ignored and replaced with the first scraped link.
You have most of the bits in your code, but are just a little bit off. I think the most simple way to get the titles and links is by using the below.
site = """<div>
<p class="title">
title_1
</p>
</div>
<div>
<p class="title">
title_2
</p>
</div>"""

s = BeautifulSoup(site, "html.parser")
# Search for anchors inside each title block, so every title is paired
# with its own links rather than the first block's.
for title in s.find_all('p', {'class':'title'}):
    links = [x['href'] for x in title.find_all('a', href=True)]
    line = title.get_text()
    print(line)
    print(links)
You can see that the links object is a list, that's just in case there's a situation where there's multiple links for each title.
Try this approach; it uses find_all() to extract the values.
from bs4 import BeautifulSoup

text = """<div>
<p class="title">
title_1
</p>
</div>
<div>
<p class="title">
title_2
</p>
</div>
"""

soup = BeautifulSoup(text, 'html.parser')
for i in soup.find_all('p', attrs={'class': 'title'}):
    link = None
    # Not every title block necessarily contains an anchor.
    if i.find('a'):
        link = i.find('a').get('href')
    print('Title:', i.get_text(strip=True), 'Link:', link)
# Output as:
# Title: title_1 Link: /news/123456
# Title: title_2 Link: /news/789000

BeautifulSoup extract text from different tags

I'm trying to extract text of article from webpage with the following HTML:
<html>
<body>
<div id='article_body'>
<h2 class='article_subtitle'>subtitle_1</h2>
<p class ='article_paragraph'>text_1</p>
<p class ='article_paragraph'>text_2</p>
<p class ='article_paragraph'>text_3</p>
<h2 class='article_subtitle'>subtitle_2</h2>
<p class ='article_paragraph'>text_4</p>
<p class ='article_paragraph'>text_5</p>
<h4 class='videoTitle'>I DONT WANT THIS TEXT</h4>
</div>
</body>
</html>
I've tried:
import urllib.request
from bs4 import BeautifulSoup
url = "http://......."
source = urllib.request.urlopen(url).read()
soup = BeautifulSoup(source, 'lxml')
article_text = ''
article = soup.find('div', {'id': 'article_body'}).find_all(text=True)
for element in article:
article_text += '\n'+ ''.join(element)
print(article_text)
But then I'm getting also text from <h4>. Any advice how to avoid this?
try this
soup = BeautifulSoup(source, 'lxml')
article_text = ''
# find_all() with no arguments returns every descendant tag of the
# article body.
article = soup.find('div', {'id': 'article_body'}).find_all()
for element in article:
    # Skip the <h4> (the unwanted video title). NOTE(review): this
    # string test also skips any element that merely CONTAINS an <h4>.
    if '<h4' in str(element):
        continue
    # with html tag
    # article_text += '\n'+ ''.join(str(element))
    # inner text only
    article_text += '\n'+ ''.join(element.text)
print(article_text)
Assuming everything you want has a class that starts with article
articles = soup.find_all(class_=lambda c: c and c.startswith('article'))
# str.join needs strings, not Tag objects, so extract each tag's text
# first (the original passed Tags straight to join, raising TypeError).
print('\n'.join(tag.get_text() for tag in articles))

Selecting second child using BeautifulSoup

Let's say I have the following HTML:
<div>
<p>this is some text</p>
<p>...and this is some other text</p>
</div>
How can I retrieve the text from the second paragraph using BeautifulSoup?
You can use a CSS selector to do this:
>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup("""<div>
.... <p>this is some text</p>
.... <p>...and this is some other text</p>
.... </div>""", "html.parser")
>>> soup.select('div > p')[1].get_text(strip=True)
'...and this is some other text'
You can use nth-of-type:
h = """<div>
<p>this is some text</p>
<p>...and this is some other text</p>
</div>"""

# Pass an explicit parser; a bare BeautifulSoup(h) makes bs4 guess one
# and emit a warning, and the guess can differ between machines.
soup = BeautifulSoup(h, "html.parser")
print(soup.select_one("div p:nth-of-type(2)").text)
# find_all('p') returns the <p> tags directly. The original list
# comprehension iterated the div's children -- including bare newline
# NavigableStrings, whose .find is str.find and returns ints -- and so
# secondp[1] was not actually the second paragraph.
secondp = soup.find('div').find_all('p')
In : secondp[1].text
Out : Your text
Or you can use the findChildren directly -
div_ = soup.find('div').findChildren()
# Print the second child (index 1). Python 3 print call; the original
# used the Python 2 statement form.
for i, child in enumerate(div_):
    if i == 1:
        print(child.text)
You could solve this with gazpacho:
from gazpacho import Soup
# The trailing backslash after the opening quotes suppresses the
# leading newline in the literal.
html = """\
<div>
<p>this is some text</p>
<p>...and this is some other text</p>
</div>
"""
soup = Soup(html)
# find('p') returns a list here (several <p> tags match), so index
# into it for the second paragraph.
soup.find('p')[1].text
Which would output:
'...and this is some other text'

Using page text to select `html` element using`Beautiful Soup`

I have a page which contains several repetitions of: <div...><h4>...<p>... For example:
# Two structurally identical blocks; only the <h4> text tells them apart.
html = '''
<div class="proletariat">
<h4>sickle</h4>
<p>Ignore this text</p>
</div>
<div class="proletariat">
<h4>hammer</h4>
<p>This is the text we want</p>
</div>
'''
from bs4 import BeautifulSoup
# NOTE(review): no parser specified -- bs4 will guess one and warn.
soup = BeautifulSoup(html)
If I write print soup.select('div[class^="proletariat"] > h4 ~ p'), I get:
[<p>Ignore this text</p>, <p>This is the text we want</p>]
How do I specify that I only want the text of p when it is preceded by <h4>hammer</h4>?
Thanks
html = '''
<div class="proletariat">
<h4>sickle</h4>
<p>Ignore this text</p>
</div>
<div class="proletariat">
<h4>hammer</h4>
<p>This is the text we want</p>
</div>
'''
import re
from bs4 import BeautifulSoup
soup = BeautifulSoup(html)
# Find the <h4> whose text matches "hammer"; its next_sibling is the
# whitespace between tags, and .next on that reaches the <p>.
print(soup.find("h4", text=re.compile('hammer')).next_sibling.next.text)
This is the text we want
:contains() could help here, but it is not supported.
Taking this into account, you can use select() in conjunction with the find_next_sibling():
# Take the <p> that follows the <h4> whose text is exactly "hammer".
# (Python 3 print call; the original used the Python 2 statement form.)
print(next(h4.find_next_sibling('p').text
           for h4 in soup.select('div[class^="proletariat"] > h4')
           if h4.text == "hammer"))

Categories

Resources