I want to scrape some specific data from a website using urllib and BeautifulSoup.
Im trying to fetch the text "190.0 kg". I have tried as you can see in my code to use attrs={'class': 'col-md-7'}
but this returns the wrong result. Is there any way to specify that I want it to return the text between <h3>?
from urllib.request import urlopen
from bs4 import BeautifulSoup
# specify the url
quote_page = 'https://styrkeloft.no/live.styrkeloft.no/v2/?test-stevne'
# query the website and return the html to the variable 'page'
page = urlopen(quote_page)
# parse the html using beautiful soup
soup = BeautifulSoup(page, 'html.parser')
# take out the <div> of name and get its value
Weight_box = soup.find('div', attrs={'class': 'col-md-7'})
name = name_box.text.strip()
print (name)
Since this content is dynamically generated there is no way to access that data using the requests module.
You can use selenium webdriver to accomplish this:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_driver = "path_to_chromedriver"
driver = webdriver.Chrome(chrome_options=chrome_options,executable_path=chrome_driver)
driver.get('https://styrkeloft.no/live.styrkeloft.no/v2/?test-stevne')
html = driver.page_source
soup = BeautifulSoup(html, "lxml")
current_lifter = soup.find("div", {"id":"current_lifter"})
value = current_lifter.find_all("div", {'class':'row'})[2].find_all("h3")[0].text
driver.quit()
print(value)
Just be sure to have the chromedriver executable in your machine.
Related
the site url:https://n.news.naver.com/mnews/article/421/0006111920
I want to scrape "5" on the below html.
I used this code: soup.select_one('span.u_likeit_text._count').get_text()
the result is '추천'
html code
<span class="u_likeit_text _count num">5</span>
Main issue here that the count is dynamically generated by JavaScript and not present in response and so your soup.
You could use selenium to render the page like a browser will do and convert the driver.page_source to your BeautifulSoup object:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://n.news.naver.com/mnews/article/421/0006111920")
time.sleep(3)
soup = BeautifulSoup(driver.page_source, 'html.parser')
soup.select_one('span.u_likeit_text._count').get_text()
Output:
8
You have to separate the classes using space, instead of connecting over dot.
from bs4 import BeautifulSoup
soup = BeautifulSoup("<span class='u_likeit_text _count num'>5</span>", 'html.parser')
print(soup)
seven_day = soup.find_all("span" , class_="u_likeit_text _count num")
print(seven_day[0].text)
I'm trying to scrape this page of job posts, the posts are buried in a bunch of divs but ultimately contained in an unordered list, when I try to retrieve the list using find_all I get None returned either by using the tag id or class. Is there anything I'm doing wrong?
url = "https://resumes.indeed.com/search?l=&q=python&searchFields=jt"
source = requests.get(url).text
soup = BeautifulSoup(source, "lxml")
match = soup.find_all("ul", class_="rezemp-ResumeSearchPage-resultsList")
print(match)
Try this instead
from bs4 import BeautifulSoup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
executable_path = {'executable_path':ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless = False)
url = "https://resumes.indeed.com/search?l=&q=python&searchFields=jt"
browser.visit(url)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
match = soup.find_all("ul", class_="rezemp-ResumeSearchPage-resultsList")
# This is the next you get but it's not structured, not sure if that's what you are looking for.
match[0].text
First, I noticed that the given URL doesn't have any element using class
rezemp-ResumeSearchPage-resultsList
So I change this to "https://resumes.indeed.com/search?l=&lmd=all&q=python&searchFields=jt", i.e. choose 'show all resumes' on that website.
then use selenium to load js:
import lxml
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
chrome_options = Options()
chrome_options.add_argument('--headless')
url = "https://resumes.indeed.com/search?l=&lmd=all&q=python&searchFields=jt"
browser=webdriver.Chrome(options=chrome_options,executable_path="***\\chromedriver.exe")
browser.get(url)
soup = BeautifulSoup(browser.page_source, "lxml")
match = soup.find_all("ul", class_="rezemp-ResumeSearchPage-resultsList")
print(match)
Got it
You should try this
list = []
for m in match:
list.append(m)
print(list)
I am trying to use .find off of a soup variable but when I go to the webpage and try to find the right class it returns none.
from bs4 import *
import time
import pandas as pd
import pickle
import html5lib
from requests_html import HTMLSession
s = HTMLSession()
url = "https://cryptoli.st/lists/fixed-supply"
def get_data(url):
r = s.get(url)
global soup
soup = BeautifulSoup(r.text, 'html.parser')
return soup
def get_next_page(soup):
page = soup.find('div', {'class': 'dataTables_paginate paging_simple_numbers'})
return page
get_data(url)
print(get_next_page(soup))
The "page" variable returns "None" even though I pulled it from the website element inspector. I suspect it has something to do with the fact that the website is rendered with javascript but can't figure out why. If I take away the {'class' : ''datatables_paginate paging_simple_numbers'} and just try to find 'div' then it works and returns the first div tag so I don't know what else to do.
So you want to scrape dynamic page content , You can use beautiful soup with selenium webdriver. This answer is based on explanation here https://www.geeksforgeeks.org/scrape-content-from-dynamic-websites/
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
url = "https://cryptoli.st/lists/fixed-supply"
driver = webdriver.Chrome('./chromedriver')
driver.get(url)
# this is just to ensure that the page is loaded
time.sleep(5)
html = driver.page_source
# this renders the JS code and stores all
# of the information in static HTML code.
# Now, we could simply apply bs4 to html variable
soup = BeautifulSoup(html, "html.parser")
I try to webscrape with javascript dynamic + bs + python and Ive read a lot of things to come up with this code where I try to scrape a price rendered with javascript on a famous website for example:
from bs4 import BeautifulSoup
from selenium import webdriver
url = "https://www.nespresso.com/fr/fr/order/capsules/original/"
browser = webdriver.PhantomJS(executable_path = "C:/phantomjs-2.1.1-windows/bin/phantomjs.exe")
browser.get(url)
html = browser.page_source
soup = BeautifulSoup(html, 'lxml')
soup.find("span", {'class':'ProductListElement__price'}).text
But I only have as a result '\xa0' which is the source value, not the javascript value and I don't know really what I did wrong ...
Best regards
You don't need the expense of a browser. The info is in a script tag so you can regex that out and handle with json library
import requests, re, json
r = requests.get('https://www.nespresso.com/fr/fr/order/capsules/original/')
p = re.compile(r'window\.ui\.push\((.*ProductList.*)\)')
data = json.loads(p.findall(r.text)[0])
products = {product['name']:product['price'] for product in data['configuration']['eCommerceData']['products']}
print(products)
Regex:
Here are two ways to get the prices
from bs4 import BeautifulSoup
from selenium import webdriver
url = "https://www.nespresso.com/fr/fr/order/capsules/original/"
browser = webdriver.Chrome()
browser.get(url)
html = browser.page_source
# Getting the prices using bs4
soup = BeautifulSoup(html, 'lxml')
prices = soup.select('.ProductListElement__price')
print([p.text for p in prices])
# Getting the prices using selenium
prices =browser.find_elements_by_class_name("ProductListElement__price")
print([p.text for p in prices])
I've been experimenting with BeautifulSoup4 lately and have found pretty good success across a range of different sites. Though I have stumbled into an issue when trying to scrape amazon.com.
Using the code below, when printing 'soup' I can see the div, but when I search for the div itself, BS4 brings back null. I think the issue is in how the html is being processed. I've tried with LXML and html5lib. Any ideas?
import bs4 as bs
import urllib3
urllib3.disable_warnings()
http = urllib3.PoolManager()
url = 'https://www.amazon.com/gp/goldbox/ref=gbps_fcr_s-4_a870_dls_UPCM?gb_f_deals1=dealStates:AVAILABLE%252CWAITLIST%252CWAITLISTFULL,includedAccessTypes:GIVEAWAY_DEAL,sortOrder:BY_SCORE,enforcedCategories:2619533011,dealTypes:LIGHTNING_DEAL&pf_rd_p=56200e05-4eb2-42ca-9723-af0811ada870&pf_rd_s=slot-4&pf_rd_t=701&pf_rd_i=gb_main&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=PQNZWZRKVD93HXXVG5A7&ie=UTF8'
original = http.request('Get',url)
soup = bs.BeautifulSoup(original.data, 'lxml')
div = soup.find_all('div', {'class':'a-row padCenterContainer'})
You could use selenium to allow the javascript to load before grabbing the html.
from bs4 import BeautifulSoup
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(chrome_options=options)
url = 'https://www.amazon.com/gp/goldbox/ref=gbps_fcr_s-4_a870_dls_UPCM?gb_f_deals1=dealStates:AVAILABLE%252CWAITLIST%252CWAITLISTFULL,includedAccessTypes:GIVEAWAY_DEAL,sortOrder:BY_SCORE,enforcedCategories:2619533011,dealTypes:LIGHTNING_DEAL&pf_rd_p=56200e05-4eb2-42ca-9723-af0811ada870&pf_rd_s=slot-4&pf_rd_t=701&pf_rd_i=gb_main&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=PQNZWZRKVD93HXXVG5A7&ie=UTF8'
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')
div = soup.find('div', {'class': 'a-row padCenterContainer'})
print(div.prettify())
The output of this script was too long to put in this question but here is a link to it