How to access text from both <p> using beautifulsoup4? - python

I want to grab the text from both <p> elements; how do I get that?
My code works for the first <p>, but I can't get the second <p>.
<p>
<a href="https://www.japantimes.co.jp/news/2019/03/19/world/crime-legal-world/emerging-online-threats-changing-homeland-securitys-role-merely-fighting-terrorism/">
Emerging online threats changing Homeland Security's role from merely fighting terrorism
</a>
</p>
</hgroup>
</header>
<p>
Homeland Security Secretary Kirstjen Nielsen said Monday that her department may have been founded to combat terrorism, but its mission is shifting to also confront emerging online threats.
China, Iran and other countries are mimicking the approach that Russia used to interfere in the U.S. ...
<a class="more_link" href="https://www.japantimes.co.jp/news/2019/03/19/world/crime-legal-world/emerging-online-threats-changing-homeland-securitys-role-merely-fighting-terrorism/">
<span class="icon-arrow-2">
</span>
</a>
</p>
My code is:
import ssl
import urllib.request
from bs4 import BeautifulSoup

ssl._create_default_https_context = ssl._create_unverified_context
article = "https://www.japantimes.co.jp/tag/cybersecurity/page/1/"
page = urllib.request.urlopen(article)
soup = BeautifulSoup(page, 'html.parser')
article = soup.find('div', class_="content_col")
date = article.h3.find('span', class_= "right date")
date = date.text
headline = article.p.find('a')
headline = headline.text
content = article.p.text
print(date, headline,content)

Use the parent id with a p selector, then index into the returned list for the number of paragraphs you need. You can use the time tag for when the article was posted:
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://www.japantimes.co.jp/news/2019/03/19/world/crime-legal-world/emerging-online-threats-changing-homeland-securitys-role-merely-fighting-terrorism/#.XJIQNDj7TX4')
soup = bs(r.content, 'lxml')
posted = soup.select_one('time').text
print(posted)
paras = [item.text.strip() for item in soup.select('#jtarticle p')]
print(paras[:2])

You could use .find_next(). However, it doesn't give you the full article:
from bs4 import BeautifulSoup
import requests
article = "https://www.japantimes.co.jp/tag/cybersecurity/page/1/"
page = requests.get(article)
soup = BeautifulSoup(page.text, 'html.parser')
article = soup.find('div', class_="content_col")
date = article.h3.find('span', class_= "right date")
date_text = date.text
headline = article.p.find('a')
headline_text = headline.text
content_text = article.p.find_next('p').text
print(date_text, headline_text, content_text)
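If you do want the full article, a rough sketch (not from the original answers) is to follow the headline link and reuse the '#jtarticle p' selector from the first answer, assuming it matches the article body:

# hedged sketch: follow the headline link from the listing page and join all
# paragraphs of the article body ('#jtarticle p' as used in the first answer)
article_url = headline['href']
article_soup = BeautifulSoup(requests.get(article_url).text, 'html.parser')
full_text = ' '.join(p.get_text(strip=True) for p in article_soup.select('#jtarticle p'))
print(full_text[:300])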


Getting only numbers from BeautifulSoup instead of whole div

I am trying to learn Python by creating a small web scraping program to make life easier, although I am having issues with getting only the number when using BS4. I was able to get the price when I scraped an individual ad, but I would like to get all the prices from the page.
Here is my code:
from bs4 import BeautifulSoup
import requests
prices = []
url = 'https://www.kijiji.ca/b-cars-trucks/calgary/new__used/c174l1700199a49?ll=51.044733%2C-114.071883&address=Calgary%2C+AB&radius=50.0'
result = requests.get(url)
print (result.status_code)
src = result.content
soup = BeautifulSoup(src, 'html.parser')
print ("CLEARING")
price = soup.findAll("div", class_="price")
prices.append(price)
print (prices)
Here is my output
[<div class="price">
$46,999.00
<div class="dealer-logo">
<div class="dealer-logo-image">
<img src="https://i.ebayimg.com/00/s/NjBYMTIw/z/xMQAAOSwi9ZfoW7r/$_69.PNG"/>
</div>
</div>
</div>
Ideally, I would only want the output to be "46,999.00".
I tried with text=True, but that did not work; I got nothing besides an empty list.
Thank you
You need to get the text portion of the tag and then do some regex processing on it.
import re

def get_price_from_div(div_item):
    str_price = re.sub('[^0-9\.]', '', div_item.text)
    float_price = float(str_price)
    return float_price
Just call this function in your code after you find the divs:
from bs4 import BeautifulSoup
import requests
prices = []
url = 'https://www.kijiji.ca/b-cars-trucks/calgary/new__used/c174l1700199a49?ll=51.044733%2C-114.071883&address=Calgary%2C+AB&radius=50.0'
result = requests.get(url)
print (result.status_code)
src = result.content
soup = BeautifulSoup(src, 'html.parser')
print ("CLEARING")
price = soup.findAll("div", class_="price")
prices.extend([get_price_from_div(curr_div) for curr_div in price])
print (prices)
An option without using regex is to filter the tags whose text starts with a dollar sign $, using startswith():
import requests
from bs4 import BeautifulSoup
URL = 'https://www.kijiji.ca/b-cars-trucks/calgary/new__used/c174l1700199a49?ll=51.044733%2C-114.071883&address=Calgary%2C+AB&radius=50.0'
soup = BeautifulSoup(requests.get(URL).content, "html.parser")
price_tags = soup.find_all("div", class_="price")
prices = [
    tag.get_text(strip=True)[1:] for tag in price_tags
    if tag.get_text(strip=True).startswith('$')
]
print(prices)
Output:
['48,888.00', '21,999.00', '44,488.00', '5,500.00', '33,000.00', '14,900.00', '1,750.00', '35,600.00', '1,800.00', '25,888.00', '36,888.00', '32,888.00', '30,888.00', '18,888.00', '21,888.00', '29,888.00', '22,888.00', '30,888.00', '17,888.00', '17,888.00', '16,888.00', '22,888.00', '22,888.00', '34,888.00', '31,888.00', '32,888.00', '30,888.00', '21,888.00', '15,888.00', '21,888.00', '28,888.00', '19,888.00', '18,888.00', '30,995.00', '30,995.00', '30,995.00', '19,888.00', '47,995.00', '21,888.00', '46,995.00', '32,888.00', '29,888.00', '26,888.00', '21,888.00']

How would I scrape the sic code description?

Hi, I am using BS4 to scrape SIC codes and descriptions. I currently have the following code, which mostly does what I want, but I don't know how to scrape the description shown below, which appears both in the inspect element view and in the page source.
To be clear the bit I want is "State commercial banks" and "LABORATORY ANALYTICAL INSTRUMENTS"
https://www.sec.gov/cgi-bin/browse-edgar?CIK=866054&owner=exclude&action=getcompany&Find=Search
<div class="companyInfo">
<span class="companyName">COMMERCIAL NATIONAL FINANCIAL CORP /PA <acronym title="Central Index Key">CIK</acronym>#: 0000866054 (see all company filings)</span>
<p class="identInfo"><acronym title="Standard Industrial Code">SIC</acronym>: 6022 - STATE COMMERCIAL BANKS<br />State location: PA | State of Inc.: <strong>PA</strong> | Fiscal Year End: 1231<br />(Office of Finance)<br />Get <b>insider transactions</b> for this <b>issuer</b>.
for cik_num in cik_num_list:
    try:
        url = r"https://www.sec.gov/cgi-bin/browse-edgar?CIK={}&owner=exclude&action=getcompany".format(cik_num)
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        try:
            comp_name = soup.find_all('div', {'class':'companyInfo'})[0].find('span').text
            sic_code = soup.find_all('p', {'class':'identInfo'})[0].find('a').text
import requests
from bs4 import BeautifulSoup
url = 'https://www.sec.gov/cgi-bin/browse-edgar?CIK=866054&owner=exclude&action=getcompany&Find=Search'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
sic_code_desc = soup.select_one('.identInfo').a.find_next_sibling(text=True).split(maxsplit=1)[-1]
print(sic_code_desc)
Prints:
STATE COMMERCIAL BANKS
For url = 'https://www.sec.gov/cgi-bin/browse-edgar?CIK=1090872&owner=exclude&action=getcompany&Find=Search' it prints:
LABORATORY ANALYTICAL INSTRUMENTS
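If you need this inside the original loop over cik_num_list, a rough sketch combining the question's loop with the extraction above could look like the following (the two CIKs are just the ones mentioned in this thread, and error handling is omitted):

import requests
from bs4 import BeautifulSoup

cik_num_list = ['866054', '1090872']  # example CIKs taken from this thread

for cik_num in cik_num_list:
    # note: sec.gov may require a User-Agent header on requests
    url = "https://www.sec.gov/cgi-bin/browse-edgar?CIK={}&owner=exclude&action=getcompany".format(cik_num)
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    comp_name = soup.find('div', class_='companyInfo').find('span').text
    sic_code = soup.find('p', class_='identInfo').a.text
    sic_desc = soup.select_one('.identInfo').a.find_next_sibling(text=True).split(maxsplit=1)[-1]
    print(comp_name, sic_code, sic_desc)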

Finding tag of text-searched element in HTML

I am trying to scrape multiple web pages to compare the prices of books. Because every site has a different layout (and class names), I want to find the title of the book using regex and then the surrounding elements. An example of the code is given below.
from bs4 import BeautifulSoup
import re
html_page1 = """
<div class='product-box'>
<h2 class='title'>Title Book</h2>
<p class='price>18.45</p>
</div>
"""
html_page2 = """
<div class='page-box'>
<h2 class='orange-heading'>Title Book</h2>
<p class='blue-price'>18.45</p>
</div>
"""
# turn page into soup
soup1 = BeautifulSoup(html_page1, 'html.parser')
# find book titles
names1 = soup1.find_all(string=re.compile("[A-Z]([a-z]+,|\.|[a-z]+)(?:\s{1}[A-Z]([a-z]+,|\.|[a-z]+))"))
# print titles
print('Names1: ', names1)
# turn page into soup
soup2 = BeautifulSoup(html_page2, 'html.parser')
# find book titles
names2 = soup2.find_all(string=re.compile("[A-Z]([a-z]+,|\.|[a-z]+)(?:\s{1}[A-Z]([a-z]+,|\.|[a-z]+))"))
# print titles
print('Names2: ', names2)
This returns:
Names1: ['Title Book']
Names2: ['Title Book']
Now I want to use this information to find the corresponding price. I know that when an element has been selected using tags and class names, "next_sibling" can be used; however, this doesn't work for an element selected by its text:
select_title = soup1.find('h2', {"class": "title"})
next_sib = select_title.next_sibling
print(next_sib) # returns <p class='price>18.45
# now try the same thing on element selected by name, this will result in an error
next_sib = names1.next_sibling
How can I use the same method to find the price when I have found the element using its text?
A similar question can be found here: Find data within HTML tags using Python. However, it still uses the HTML tags.
EDIT The problem is that I have many pages with different layouts and class names. Because of that I cannot use the tag/class/id name to find the elements and I have to find the book titles using regex.
To get the price, include the 'h2' tag when calling find_all(), and then use find_next('p').
In html_page1 the closing quote was missing from the p tag's class attribute, so I added it: class='price'.
from bs4 import BeautifulSoup
import re
html_page1 = """
<div class='product-box'>
<h2 class='title'>Title Book</h2>
<p class='price'>18.45</p>
</div>
"""
html_page2 = """
<div class='page-box'>
<h2 class='orange-heading'>Title Book</h2>
<p class='blue-price'>18.45</p>
</div>
"""
# turn page into soup
soup1 = BeautifulSoup(html_page1, 'html.parser')
# find book titles
names1 = soup1.find_all('h2',string=re.compile("[A-Z]([a-z]+,|\.|[a-z]+)(?:\s{1}[A-Z]([a-z]+,|\.|[a-z]+))"))
# print titles
print('Names1: ', names1[0].find_next('p').text)
# turn page into soup
soup2 = BeautifulSoup(html_page2, 'html.parser')
# find book titles
names2 = soup2.find_all('h2',string=re.compile("[A-Z]([a-z]+,|\.|[a-z]+)(?:\s{1}[A-Z]([a-z]+,|\.|[a-z]+))"))
# print titles
print('Names2: ', names2[0].find_next('p').text)
Or change string to text
from bs4 import BeautifulSoup
import re
html_page1 = """
<div class='product-box'>
<h2 class='title'>Title Book</h2>
<p class='price'>18.45</p>
</div>
"""
html_page2 = """
<div class='page-box'>
<h2 class='orange-heading'>Title Book</h2>
<p class='blue-price'>18.45</p>
</div>
"""
# turn page into soup
soup1 = BeautifulSoup(html_page1, 'html.parser')
# find book titles
names1 = soup1.find_all(text=re.compile("[A-Z]([a-z]+,|\.|[a-z]+)(?:\s{1}[A-Z]([a-z]+,|\.|[a-z]+))"))
# print titles
print('Names1: ', names1[0].find_next('p').text)
# turn page into soup
soup2 = BeautifulSoup(html_page2, 'html.parser')
# find book titles
names2 = soup2.find_all(text=re.compile("[A-Z]([a-z]+,|\.|[a-z]+)(?:\s{1}[A-Z]([a-z]+,|\.|[a-z]+))"))
# print titles
print('Names2: ', names2[0].find_next('p').text)
EDITED
Use text to get the element without the tag, and next_element to get the price value.
from bs4 import BeautifulSoup
import re
html_page1 = """
<div class='product-box'>
<h2 class='title'>Title Book</h2>
<p class='price'>18.45</p>
</div>
"""
html_page2 = """
<div class='page-box'>
<h2 class='orange-heading'>Title Book</h2>
<p class='blue-price'>18.45</p>
</div>
"""
# turn page into soup
soup1 = BeautifulSoup(html_page1, 'html.parser')
# find book titles
names1 = soup1.find_all(text=re.compile("[A-Z]([a-z]+,|\.|[a-z]+)(?:\s{1}[A-Z]([a-z]+,|\.|[a-z]+))"))
# print titles
print('Names1: ', names1[0])
print('Price1: ', names1[0].next_element.next_element.next_element)
# turn page into soup
soup2 = BeautifulSoup(html_page2, 'html.parser')
# find book titles
names2 = soup2.find_all(text=re.compile("[A-Z]([a-z]+,|\.|[a-z]+)(?:\s{1}[A-Z]([a-z]+,|\.|[a-z]+))"))
# print titles
print('Names2: ', names2[0])
print('Price2: ', names2[0].next_element.next_element.next_element)
Output:
Names1: Title Book
Price1: 18.45
Names2: Title Book
Price2: 18.45
You missed the closing quote on the class attribute of p.price in html_page1.
With names1 = soup1.find_all(text=re.compile("[A-Z]([a-z]+,|\.|[a-z]+)(?:\s{1}[A-Z]([a-z]+,|\.|[a-z]+))")) you get NavigableString objects, which is why next_sibling gives you None.
You can find a solution with regex in @Kunduk's answer.
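For completeness, reusing names2 from the question (html_page2, where the quotes are intact), a minimal sketch that climbs from the matched NavigableString to its parent tag and then to the next p:

title = names2[0]                         # NavigableString 'Title Book'
price = title.parent.find_next('p').text  # parent is the <h2>; find_next('p') reaches the price
print(title, price)                       # Title Book 18.45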
An alternative, clearer and simpler solution that works for both html_page1 and html_page2:
soup = BeautifulSoup(html_page1, 'html.parser')
# or BeautifulSoup(html_page2, 'html.parser')
books = soup.select('div[class*=box]')
for book in books:
    book_title = book.select_one('h2').text
    book_price = book.select_one('p[class*=price]').text
    print(book_title, book_price)
div[class*=box] means a div whose class attribute contains box.

Certain content not loading when scraping a site with Beautiful Soup

I'm trying to scrape the ratings off recipes on NYT Cooking but having issues getting the content I need. When I look at the source on the NYT page, I see the following:
<div class="ratings-rating">
<span class="ratings-header ratings-content">194 ratings</span>
<div class="ratings-stars-wrap">
<div class="ratings-stars ratings-content four-star-rating avg-rating">
The content I'm trying to pull out is 194 ratings and four-star-rating. However, when I pull in the page source via Beautiful Soup I only see this:
<div class="ratings-rating">
<span class="ratings-header ratings-content"><%= header %></span>
<div class="ratings-stars-wrap">
<div class="ratings-stars ratings-content <%= ratingClass %> <%= state %>">
The code I'm using is:
url = 'https://cooking.nytimes.com/recipes/1020049-lemony-chicken-soup-with-fennel-and-dill'
r = get(url, headers = headers, timeout=15)
page_soup = soup(r.text,'html.parser')
Any thoughts why that information isn't pulling through?
Try using the code below:
import requests
import lxml
from lxml import html
import re
url = "https://cooking.nytimes.com/recipes/1019706-spiced-roasted-cauliflower-with-feta-and-garlic?action=click&module=Recirculation%20Band%20Recipe%20Card&region=More%20recipes%20from%20Alison%20Roman&pgType=recipedetails&rank=1"
r = requests.get(url)
tree = html.fromstring(r.content)
t = tree.xpath('/html/body/script[14]')[0]
# look for value for bootstrap.recipe.avg_rating
m = re.search("bootstrap.recipe.avg_rating = ", t.text)
colon = re.search(";", t.text[m.end()::])
rating = t.text[m.end():m.end()+colon.start()]
print(rating)
# look for value for bootstrap.recipe.num_ratings =
n = re.search("bootstrap.recipe.num_ratings = ", t.text)
colon2 = re.search(";", t.text[n.end()::])
star = t.text[n.end():n.end()+colon2.start()]
print(star)
Much easier to use attribute = value selectors to grab the values from the span with class ratings-metadata:
import requests
from bs4 import BeautifulSoup
data = requests.get('https://cooking.nytimes.com/recipes/1020049-lemony-chicken-soup-with-fennel-and-dill')
soup = BeautifulSoup(data.content, 'lxml')
rating = soup.select_one('[itemprop=ratingValue]').text
ratingCount = soup.select_one('[itemprop=ratingCount]').text
print(rating, ratingCount)

Webscraping HTML-tags including the ones inside all links

I'm using BeautifulSoup with Python 3.5 and I'm trying to scrape a website for all h tags (h1, h2, etc.). My problem is making the program open the other links on the website and scrape their tags too.
So let's say I have a website with a navigational menu with some links that go throughout the website and all include h-tags of some sort. How would I go about scraping all of them on my selected site?
This is the code I'm using so far to scrape just the h1 tags from a specific URL:
import requests
from bs4 import BeautifulSoup
url = "http://dsv.su.se/en/research"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
h1_data = soup.find_all("h1")
for item in h1_data:
    print(item.contents[0])
I hope I made myself clear enough. Thanks.
Using your example URL, we can get all the URLs from the HeadRowMenu and use a loop to extract all the h tags from each page:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from itertools import chain

url = "http://dsv.su.se/en"
base = "http://dsv.su.se"

def crawl(start, base):
    # yield all heading tags from the start page, then from each page
    # linked in the HeadRowMenu
    r = requests.get(start)
    soup = BeautifulSoup(r.content, "lxml")
    hs = ["h1", "h2", "h3", "h4", "h5", "h6"]
    menu_links = [urljoin(base, a["href"]) for a in soup.select("#HeadRowMenu a")][1:]
    for h in hs:
        yield soup.find_all(h)
    for lnk in menu_links:
        soup = BeautifulSoup(requests.get(lnk).content, "lxml")
        for h in hs:
            yield soup.find_all(h)
If we run it:
In [17]: print(list(chain.from_iterable(crawl(url, base))))
[<h1 class="visuallyhidden">Department of Computer and Systems Sciences</h1>, <h1>
Improve your digital competences on-line with eSkills Match
</h1>, <h1>
Envisioning Equality in Computer Science - Tomorrow Today
</h1>, <h1>
eGovlab develops online democracy
</h1>, <h1>
Vinnova and DSV invite you to a seminar about Horizon 2020
</h1>, <h1>
Significant increase of applicants for international programmes
</h1>, <h1>News</h1>, <h2>Semester start information</h2>, <h2>Meet our students</h2>, <h1 class="visuallyhidden">Education</h1>, <h1>Welcome to the education web at DSV!</h1>, <h1>Master's Programmes at DSV</h1>, <h2>
Master's Programmes in English:</h2>, <h1 class="visuallyhidden">Research</h1>, <h1>Research highlights</h1>, <h2>Research news</h2>, <h1 class="visuallyhidden">About us</h1>, <h1>About DSV</h1>, <h2>Sweden's oldest IT department</h2>, <h2>Interdisciplinary education and research</h2>, <h2>Right in the middle of one of the world's leading ICT clusters</h2>, <h1 class="visuallyhidden">Internal</h1>, <h1>Internal</h1>, <h2>Semester start information</h2>, <h2>Meet our students</h2>]
If you want to literally scrape every link on the site, you should look at Scrapy; it is not trivial to do yourself, as you cannot just blindly visit every link you find, since that could take you anywhere and loop infinitely. You need to make sure you only visit the domain you want, which you can do quite easily with Scrapy. Have a look at the CrawlSpider.
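For reference, a rough CrawlSpider sketch (untested, assuming Scrapy is installed; the spider name and item fields are just placeholders). It stays on the dsv.su.se domain and yields every h1-h6 it finds:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class HeadingSpider(CrawlSpider):
    name = "headings"
    allowed_domains = ["dsv.su.se"]
    start_urls = ["http://dsv.su.se/en"]
    # follow every link on the allowed domain and parse each page for headings
    rules = (Rule(LinkExtractor(), callback="parse_item", follow=True),)

    def parse_item(self, response):
        for level in range(1, 7):
            tag = "h{}".format(level)
            for heading in response.css(tag + "::text").getall():
                yield {"tag": tag, "text": heading.strip()}

You would run it with something like scrapy runspider spider.py -o headings.json; Scrapy then handles deduplication, throttling and domain filtering for you.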
To roll your own:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

class Crawl:
    def __init__(self, start_url, allowed, base, select):
        self.start_url = start_url
        self.base = base
        self.allowed_domain = allowed
        self.crawled = set()
        self.select = select

    def start(self):
        # crawl every page linked from the menu matched by self.select
        r = requests.get(self.start_url)
        soup = BeautifulSoup(r.content, "lxml")
        menu_links = [urljoin(self.base, a["href"]) for a in soup.select(self.select)]
        for lnk in menu_links:
            yield from self.crawl(lnk)

    def crawl(self, lnk):
        # yield all heading tags from every http link found on the page
        r = requests.get(lnk)
        soup = BeautifulSoup(r.content, "lxml")
        hs = ["h1", "h2", "h3", "h4", "h5", "h6"]
        page_links = (a["href"] for a in soup.select("a[href]"))
        joined = (urljoin(self.base, lnk) if lnk.startswith("/en/") else lnk for lnk in page_links)
        for lnk in filter(lambda link: link.startswith("http"), joined):
            if lnk not in self.crawled:
                soup = BeautifulSoup(requests.get(lnk).content, "lxml")
                for h in hs:
                    yield soup.find_all(h)
                self.crawled.add(lnk)
A sample run:
In [2]: from itertools import chain
In [3]: url = "http://dsv.su.se/en"
In [4]: base = "http://dsv.su.se"
In [5]: crawler = Crawl(url, "dsv.su.se", base, "#HeadRowMenu a")
In [6]: for h in chain.from_iterable(crawler.start()):
...: print(h)
...:
<h1 class="visuallyhidden">Institutionen för data- och systemvetenskap</h1>
<h1>
*DSV 50 år* - Digitala affärer - öppet jubileumsseminarium
</h1>
<h1>
Premiär för vandringsdramat Exil - fria poeter på flykt
</h1>
<h1>
Nu bör det stå klart att något måste göras
</h1>
<h1>
Hur enkelt är det för FBI att knäcka en Iphone utan Apples hjälp?
</h1>
<h1>
Svårt att backa tillbaka från ökad övervakning
</h1>
<h1>Senaste nyheterna</h1>
<h2 class="category">Kommande evenemang</h2>
<h2>Information inför terminsstart</h2>
<h1 class="visuallyhidden">Other languages</h1>
<h1>Other languages</h1>
<h2>
Information in Chinese and Russian</h2>
<h2>Contact The Administration of Studies</h2>
<h1 class="visuallyhidden">Department of Computer and Systems Sciences</h1>
<h1>
Improve your digital competences on-line with eSkills Match
</h1>
<h1>
Envisioning Equality in Computer Science - Tomorrow Today
</h1>
<h1>
eGovlab develops online democracy
</h1>
<h1>
Vinnova and DSV invite you to a seminar about Horizon 2020
</h1>
<h1>
Significant increase of applicants for international programmes
</h1>
<h1>News</h1>
<h2>Semester start information</h2>
<h2>Meet our students</h2>
...................................
Obviously, if you want to go deeper, you will need to add more logic: store all the links in one structure and loop until it is empty. Something like:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from time import sleep

class Crawl:
    def __init__(self, start_url, allowed, base, select):
        self.start_url = start_url
        self.base = base
        self.allowed_domain = allowed
        self.crawled = set()
        self.select = select
        self.urls = set()

    def start(self):
        r = requests.get(self.start_url)
        soup = BeautifulSoup(r.content, "lxml")
        menu_links = [urljoin(self.base, a["href"]) for a in soup.select(self.select)]
        print(menu_links)
        for lnk in menu_links:
            yield from self.crawl(lnk)

    def filter_urls(self, soup):
        # keep only links on the allowed domain, resolving relative /en/ links
        page_links = [a["href"] for a in soup.select("a[href]")]
        joined = (urljoin(self.base, lnk) if lnk.startswith("/en/") else lnk for lnk in page_links)
        return set(filter(lambda lnk: self.allowed_domain in lnk, joined))

    def crawl(self, lnk):
        r = requests.get(lnk)
        soup = BeautifulSoup(r.content, "lxml")
        hs = ["h1", "h2", "h3", "h4", "h5", "h6"]
        self.urls.update(self.filter_urls(soup))
        # keep popping unseen urls, adding any new ones discovered along the way
        while self.urls:
            nxt = self.urls.pop()
            if nxt not in self.crawled:
                try:
                    soup = BeautifulSoup(requests.get(nxt).content, "lxml")
                except requests.exceptions.RequestException as e:
                    print(e.strerror)
                    self.crawled.add(nxt)
                    continue
                self.urls.update((self.filter_urls(soup) - self.crawled))
                for h in hs:
                    yield soup.find_all(h)
                self.crawled.add(nxt)
                sleep(.1)
That will visit every link on the site that has dsv.su.se in the URL, but be warned: there are a lot of links to scrape, so be prepared to wait a while.
Here is a demo version (untested) that does what you describe. Basically, you add each discovered URL to a queue and continue until all links have been crawled:
import requests
from bs4 import BeautifulSoup
from collections import deque
from urllib.parse import urljoin

seen = set()
queue = deque(['http://dsv.su.se/en/research'])

while len(queue):
    url = queue.popleft()
    if url not in seen:
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "html.parser")
        h1_data = soup.find_all("h1")
        for item in h1_data:
            print(item.contents[0])
        # queue every link discovered on this page (add domain filtering as needed)
        for a in soup.find_all("a", href=True):
            queue.append(urljoin(url, a["href"]))
        seen.add(url)
I think you need something similar.
