Python BeautifulSoup soup.find

Python BeautifulSoup soup.find - python

I want to scrape some specific data from a website using urllib and BeautifulSoup.
I'm trying to fetch the text "190.0 kg". As you can see in my code, I have tried using attrs={'class': 'col-md-7'},
but this returns the wrong result. Is there any way to specify that I want the text inside the <h3> element?
from urllib.request import urlopen
from bs4 import BeautifulSoup

# URL of the page to scrape
quote_page = 'https://styrkeloft.no/live.styrkeloft.no/v2/?test-stevne'

# Query the website; the context manager closes the HTTP response afterwards
with urlopen(quote_page) as page:
    # Parse the HTML using Beautiful Soup
    soup = BeautifulSoup(page, 'html.parser')

# Take the first <div class="col-md-7">; find() returns None when nothing matches
weight_box = soup.find('div', attrs={'class': 'col-md-7'})
if weight_box is None:
    print('No <div class="col-md-7"> found')
else:
    # Read from the element found above (the snippet previously referenced
    # an undefined name here, which raised NameError)
    name = weight_box.text.strip()
    print(name)

Since this content is generated dynamically by JavaScript, it is not part of the HTML returned by a plain HTTP request (urllib or requests), so it cannot be scraped that way.
You can use Selenium WebDriver to render the page first:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# Run Chrome without opening a visible window
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_driver = "path_to_chromedriver"

# `options=` is the current keyword; `chrome_options=` is deprecated
driver = webdriver.Chrome(options=chrome_options, executable_path=chrome_driver)
try:
    driver.get('https://styrkeloft.no/live.styrkeloft.no/v2/?test-stevne')
    # HTML of the page as loaded by the browser
    html = driver.page_source
finally:
    # Always shut the browser down, even if loading the page fails
    driver.quit()

soup = BeautifulSoup(html, "lxml")
current_lifter = soup.find("div", {"id": "current_lifter"})
# Third "row" div inside #current_lifter, then the first <h3> within it
value = current_lifter.find_all("div", {'class': 'row'})[2].find_all("h3")[0].text
print(value)
Just be sure to have the chromedriver executable on your machine and to replace "path_to_chromedriver" with its actual path.

Related

python bs4, how to scrape this text in html?

the site url:https://n.news.naver.com/mnews/article/421/0006111920
I want to scrape "5" on the below html.
I used this code: soup.select_one('span.u_likeit_text._count').get_text()
the result is '추천'
html code
<span class="u_likeit_text _count num">5</span>

The main issue here is that the count is generated dynamically by JavaScript, so it is not present in the response and therefore not in your soup.
You could use Selenium to render the page as a browser would, and then pass driver.page_source to BeautifulSoup:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome(ChromeDriverManager().install())
try:
    driver.get("https://n.news.naver.com/mnews/article/421/0006111920")
    # Fixed pause to give the page's JavaScript time to fill in the count
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Close the browser once the rendered HTML has been captured
    driver.quit()

# Print the result so it is also shown when this runs as a script
print(soup.select_one('span.u_likeit_text._count').get_text())
Output:
8

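When you pass several classes to find_all through class_, write them separated by spaces, exactly as they appear in the element's class attribute, instead of joining them with dots.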
from bs4 import BeautifulSoup

# Minimal markup reproducing the element from the question
markup = "<span class='u_likeit_text _count num'>5</span>"
soup = BeautifulSoup(markup, 'html.parser')
print(soup)

# class_ is given the full, space-separated class string
spans = soup.find_all("span", class_="u_likeit_text _count num")
first_span = spans[0]
print(first_span.text)

Can't get anything back on some web pages using Beautiful Soup

I'm trying to scrape this page of job posts. The posts are buried in a bunch of divs but are ultimately contained in an unordered list. When I try to retrieve the list using find_all, I get nothing back, whether I search by the tag id or by its class. Is there anything I'm doing wrong?
# Search-results page to scrape
url = "https://resumes.indeed.com/search?l=&q=python&searchFields=jt"

# Download the page and keep the response body as text
response = requests.get(url)
source = response.text
soup = BeautifulSoup(source, "lxml")

# Every <ul> carrying the results-list class
match = soup.find_all("ul", class_="rezemp-ResumeSearchPage-resultsList")
print(match)

Try this instead
from bs4 import BeautifulSoup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

# webdriver_manager installs chromedriver and returns its path
driver_path = ChromeDriverManager().install()
browser = Browser('chrome', executable_path=driver_path, headless=False)

url = "https://resumes.indeed.com/search?l=&q=python&searchFields=jt"
browser.visit(url)

# Parse the HTML held by the browser
soup = BeautifulSoup(browser.html, 'html.parser')
match = soup.find_all("ul", class_="rezemp-ResumeSearchPage-resultsList")

# This is the text you get, but it's not structured; not sure if that's what you are looking for.
first_list = match[0]
first_list.text

First, I noticed that the page at the given URL doesn't contain any element with the class
rezemp-ResumeSearchPage-resultsList
So I changed the URL to "https://resumes.indeed.com/search?l=&lmd=all&q=python&searchFields=jt", i.e. I chose 'show all resumes' on that website.
Then use Selenium to load the JavaScript-rendered content:
import lxml
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# Run Chrome without opening a visible window
chrome_options = Options()
chrome_options.add_argument('--headless')

url = "https://resumes.indeed.com/search?l=&lmd=all&q=python&searchFields=jt"
browser = webdriver.Chrome(options=chrome_options, executable_path="***\\chromedriver.exe")
try:
    browser.get(url)
    soup = BeautifulSoup(browser.page_source, "lxml")
finally:
    # Shut the headless browser down so no Chrome process is left running
    browser.quit()

match = soup.find_all("ul", class_="rezemp-ResumeSearchPage-resultsList")
print(match)
Got it

You should try this
# Copy the matched elements into a new list. The result gets its own name so
# the built-in `list` is not shadowed.
matches = list(match)
print(matches)

Parsing data scraped from Javascript rendered webpage with python

I am trying to use .find off of a soup variable but when I go to the webpage and try to find the right class it returns none.
from bs4 import *
import time
import pandas as pd
import pickle
import html5lib
from requests_html import HTMLSession
# Session object that get_data() uses for its HTTP request
s = HTMLSession()
# Page to scrape
url = "https://cryptoli.st/lists/fixed-supply"
def get_data(url):
    """Fetch *url* with the module-level session and parse the response.

    The parsed document is also stored in the module-level ``soup`` variable,
    which the rest of the script reads after this call.

    Returns the ``BeautifulSoup`` object built from the response text.
    """
    global soup
    r = s.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup
def get_next_page(soup):
    """Return the pagination container found in *soup*.

    Looks up the first ``<div>`` whose class is
    ``dataTables_paginate paging_simple_numbers`` and returns whatever
    ``soup.find`` yields for it (``None`` when nothing matches).
    """
    page = soup.find('div', {'class': 'dataTables_paginate paging_simple_numbers'})
    return page
# Fetch and parse the page, then look up its pagination container.
# get_data() returns the same object it stores in the global `soup`.
parsed_page = get_data(url)
print(get_next_page(parsed_page))
The "page" variable returns "None" even though I copied the class from the browser's element inspector. I suspect it has something to do with the fact that the website is rendered with JavaScript, but I can't figure out why. If I take away the {'class': 'dataTables_paginate paging_simple_numbers'} and just try to find 'div', then it works and returns the first div tag, so I don't know what else to do.

So you want to scrape dynamic page content. You can use Beautiful Soup together with Selenium WebDriver. This answer is based on the explanation here: https://www.geeksforgeeks.org/scrape-content-from-dynamic-websites/
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

url = "https://cryptoli.st/lists/fixed-supply"
driver = webdriver.Chrome('./chromedriver')
try:
    driver.get(url)
    # this is just to ensure that the page is loaded
    time.sleep(5)
    # page_source is the HTML after the browser has rendered the JS code,
    # so all of the information is available as static HTML
    html = driver.page_source
finally:
    # Close the browser once the HTML has been captured
    driver.quit()

# Now we can simply apply bs4 to the html variable
soup = BeautifulSoup(html, "html.parser")

Python, BS and Selenium

I am trying to scrape a JavaScript-rendered page with Selenium, BeautifulSoup and Python. I've read a lot to come up with the code below, which tries to scrape a price rendered by JavaScript on a well-known website, for example:
from bs4 import BeautifulSoup
from selenium import webdriver

url = "https://www.nespresso.com/fr/fr/order/capsules/original/"

# Start PhantomJS from its local executable and load the page
phantomjs_path = "C:/phantomjs-2.1.1-windows/bin/phantomjs.exe"
browser = webdriver.PhantomJS(executable_path=phantomjs_path)
browser.get(url)

# Parse the browser's HTML and pick the first price element
soup = BeautifulSoup(browser.page_source, 'lxml')
price_tag = soup.find("span", {'class': 'ProductListElement__price'})
price_tag.text
But the only result I get is '\xa0', which is the value in the page source rather than the value filled in by JavaScript, and I don't really know what I did wrong ...
Best regards

You don't need the expense of a browser. The info is in a script tag so you can regex that out and handle with json library
import json
import re

import requests

# Download the page; the product data sits inside a window.ui.push(...) call
response = requests.get('https://www.nespresso.com/fr/fr/order/capsules/original/')
pattern = re.compile(r'window\.ui\.push\((.*ProductList.*)\)')

# First captured argument is the JSON payload
payload = pattern.findall(response.text)[0]
data = json.loads(payload)

# Map each product name to its price
products = {}
for product in data['configuration']['eCommerceData']['products']:
    products[product['name']] = product['price']
print(products)
Regex:

Here are two ways to get the prices
from bs4 import BeautifulSoup
from selenium import webdriver

url = "https://www.nespresso.com/fr/fr/order/capsules/original/"
browser = webdriver.Chrome()
try:
    browser.get(url)
    html = browser.page_source

    # Getting the prices using bs4
    soup = BeautifulSoup(html, 'lxml')
    prices = soup.select('.ProductListElement__price')
    print([p.text for p in prices])

    # Getting the prices using selenium (needs the browser to still be open)
    prices = browser.find_elements_by_class_name("ProductListElement__price")
    print([p.text for p in prices])
finally:
    # Close the browser whether or not the lookups succeeded
    browser.quit()

BeautifulSoup4 not finding div

I've been experimenting with BeautifulSoup4 lately and have found pretty good success across a range of different sites. Though I have stumbled into an issue when trying to scrape amazon.com.
Using the code below, when printing 'soup' I can see the div, but when I search for the div itself, BS4 brings back null. I think the issue is in how the html is being processed. I've tried with LXML and html5lib. Any ideas?
import bs4 as bs
import urllib3

# Turn off urllib3's warnings before making the request
urllib3.disable_warnings()
pool = urllib3.PoolManager()

url = 'https://www.amazon.com/gp/goldbox/ref=gbps_fcr_s-4_a870_dls_UPCM?gb_f_deals1=dealStates:AVAILABLE%252CWAITLIST%252CWAITLISTFULL,includedAccessTypes:GIVEAWAY_DEAL,sortOrder:BY_SCORE,enforcedCategories:2619533011,dealTypes:LIGHTNING_DEAL&pf_rd_p=56200e05-4eb2-42ca-9723-af0811ada870&pf_rd_s=slot-4&pf_rd_t=701&pf_rd_i=gb_main&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=PQNZWZRKVD93HXXVG5A7&ie=UTF8'

# Fetch the page and parse the raw response body with lxml
response = pool.request('Get', url)
soup = bs.BeautifulSoup(response.data, 'lxml')

# All divs whose class is "a-row padCenterContainer"
div = soup.find_all('div', {'class': 'a-row padCenterContainer'})

You could use selenium to allow the javascript to load before grabbing the html.
from bs4 import BeautifulSoup
from selenium import webdriver

# Run Chrome without opening a visible window
options = webdriver.ChromeOptions()
options.add_argument('--headless')

# `options=` is the current keyword; `chrome_options=` is deprecated
driver = webdriver.Chrome(options=options)
url = 'https://www.amazon.com/gp/goldbox/ref=gbps_fcr_s-4_a870_dls_UPCM?gb_f_deals1=dealStates:AVAILABLE%252CWAITLIST%252CWAITLISTFULL,includedAccessTypes:GIVEAWAY_DEAL,sortOrder:BY_SCORE,enforcedCategories:2619533011,dealTypes:LIGHTNING_DEAL&pf_rd_p=56200e05-4eb2-42ca-9723-af0811ada870&pf_rd_s=slot-4&pf_rd_t=701&pf_rd_i=gb_main&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=PQNZWZRKVD93HXXVG5A7&ie=UTF8'
try:
    driver.get(url)
    html = driver.page_source
finally:
    # Shut the headless browser down so no Chrome process is left running
    driver.quit()

soup = BeautifulSoup(html, 'lxml')
div = soup.find('div', {'class': 'a-row padCenterContainer'})
# find() returns None when the div is absent; avoid calling prettify() on None
if div is None:
    print('No <div class="a-row padCenterContainer"> found')
else:
    print(div.prettify())
The output of this script was too long to put in this question but here is a link to it

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python BeautifulSoup soup.find - python

Related

python bs4, how to scrape this text in html?

Can't get anything back on some web pages using Beautiful Soup

Parsing data scraped from Javascript rendered webpage with python

Python, BS and Selenium

BeautifulSoup4 not finding div

Categories

Resources