I am struggling to scrape as per the code below. Would appreciate it if someone could have a look at what I am missing?
Regards
PyProg70
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from bs4 import BeautifulSoup
import pandas as pd
import re, time
binary = FirefoxBinary('/usr/bin/firefox')
opts = FirefoxOptions()
opts.add_argument("--headless")
browser = webdriver.Firefox(options=opts, firefox_binary=binary)
browser.implicitly_wait(10)
url = 'http://tenderbulletin.eskom.co.za/'
browser.get(url)
html = browser.page_source
soup = BeautifulSoup(html, 'lxml')
print(soup.prettify())
Not Java but JavaScript. It's a dynamic page: you need to wait and check whether the Ajax request has finished and the content has rendered, using WebDriverWait.
....
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
.....
browser.get(url)
# wait a maximum of 30 seconds until the table has loaded
WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'table.CSSTableGenerator .ng-binding')))
table = browser.find_element(By.CSS_SELECTOR, 'table.CSSTableGenerator')
soup = BeautifulSoup(table.get_attribute("outerHTML"), 'lxml')
print(soup.prettify().encode('utf-8'))
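Since pandas is already imported in the question, a possible follow-up (a sketch, assuming the rendered markup is a plain HTML table) is to hand the same markup to pandas.read_html and get a DataFrame directly:
import pandas as pd
from io import StringIO
# parse the rendered <table> markup into a DataFrame; the column
# names depend on what the page actually renders
df = pd.read_html(StringIO(table.get_attribute("outerHTML")))[0]
print(df.head())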
I'm trying to scrape the following table from this URL: https://baseballsavant.mlb.com/leaderboard/outs_above_average?type=Fielder&startYear=2022&endYear=2022&split=no&team=&range=year&min=10&pos=of&roles=&viz=show
This is my code:
import requests
from bs4 import BeautifulSoup
url = "https://baseballsavant.mlb.com/leaderboard/outs_above_average?type=Fielder&startYear=2022&endYear=2022&split=no&team=&range=year&min=10&pos=of&roles=&viz=show"
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
table = soup.find("table")
for row in table.findAll("tr"):
    print([i.text for i in row.findAll("td")])
However, my table variable is None, even though there is clearly a table tag in the HTML of the website. How do I get it?
The webpage is loaded dynamically and relies on JavaScript, which requests doesn't execute. You could use a browser-automation tool such as Selenium instead.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
url = "https://baseballsavant.mlb.com/leaderboard/outs_above_average?type=Fielder&startYear=2022&endYear=2022&split=no&team=&range=year&min=10&pos=of&roles=&viz=show"
driver.get(url)
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.TAG_NAME, 'table')))
table = driver.find_element(By.TAG_NAME, 'table')
table_html = table.get_attribute('innerHTML')
# print('table html:', table_html)
for tr_web_element in table.find_elements(By.TAG_NAME, 'tr'):
    for td_web_element in tr_web_element.find_elements(By.TAG_NAME, 'td'):
        print(td_web_element.text)
# quit() ends the whole session and stops the driver process
driver.quit()
Or see this answer to incorporate Selenium with BeautifulSoup.
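A minimal sketch of that combination, reusing the table element located above (the selectors are an assumption; adjust them to the actual markup):
from bs4 import BeautifulSoup
# parse the rendered table markup with BeautifulSoup instead of
# walking it through WebElements
soup = BeautifulSoup(table.get_attribute('outerHTML'), 'lxml')
for row in soup.find_all('tr'):
    print([cell.get_text(strip=True) for cell in row.find_all('td')])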
While accessing this link https://www.dickssportinggoods.com/f/tents-accessories?pageNumber=2 with requests_html, I need to wait some time before the page actually loads. Is it possible with this?
My code:
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from lxml import etree
s = HTMLSession()
response = s.get(
'https://www.dickssportinggoods.com/f/tents-accessories?pageNumber=2')
response.html.render()
soup = BeautifulSoup(response.content, "html.parser")
dom = etree.HTML(str(soup))
item = dom.xpath('//a[@class="rs_product_description d-block"]/text()')[0]
print(item)
It looks like the data you are looking for can be fetched using HTTP GET to
https://prod-catalog-product-api.dickssportinggoods.com/v2/search?searchVO=%7B%22selectedCategory%22%3A%2212301_1809051%22%2C%22selectedStore%22%3A%220%22%2C%22selectedSort%22%3A1%2C%22selectedFilters%22%3A%7B%7D%2C%22storeId%22%3A15108%2C%22pageNumber%22%3A2%2C%22pageSize%22%3A48%2C%22totalCount%22%3A112%2C%22searchTypes%22%3A%5B%22PINNING%22%5D%2C%22isFamilyPage%22%3Atrue%2C%22appliedSeoFilters%22%3Afalse%2C%22snbAudience%22%3A%22%22%2C%22zipcode%22%3A%22%22%7D
The call returns JSON, so you can use it directly with zero scraping code.
Copy/Paste the URL into the browser --> see the data.
You can specify the page number in the url:
searchVO={"selectedCategory":"12301_1809051","selectedStore":"0","selectedSort":1,"selectedFilters":{},"storeId":15108,"pageNumber":2,"pageSize":48,"totalCount":112,"searchTypes":["PINNING"],"isFamilyPage":true,"appliedSeoFilters":false,"snbAudience":"","zipcode":""}
Working code below:
import requests
import pprint
page_num = 2
url = f'https://prod-catalog-product-api.dickssportinggoods.com/v2/search?searchVO=%7B%22selectedCategory%22%3A%2212301_1809051%22%2C%22selectedStore%22%3A%220%22%2C%22selectedSort%22%3A1%2C%22selectedFilters%22%3A%7B%7D%2C%22storeId%22%3A15108%2C%22pageNumber%22%3A{page_num}%2C%22pageSize%22%3A48%2C%22totalCount%22%3A112%2C%22searchTypes%22%3A%5B%22PINNING%22%5D%2C%22isFamilyPage%22%3Atrue%2C%22appliedSeoFilters%22%3Afalse%2C%22snbAudience%22%3A%22%22%2C%22zipcode%22%3A%22%22%7D'
r = requests.get(url)
if r.status_code == 200:
    pprint.pprint(r.json())
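If hand-editing the percent-encoded string feels fragile, an alternative sketch builds searchVO from the decoded JSON shown above and encodes it programmatically (the payload fields are taken verbatim from that JSON):
import json
import requests
from urllib.parse import quote
page_num = 2
# the decoded searchVO payload from above, with the page number substituted
search_vo = {"selectedCategory": "12301_1809051", "selectedStore": "0",
             "selectedSort": 1, "selectedFilters": {}, "storeId": 15108,
             "pageNumber": page_num, "pageSize": 48, "totalCount": 112,
             "searchTypes": ["PINNING"], "isFamilyPage": True,
             "appliedSeoFilters": False, "snbAudience": "", "zipcode": ""}
# serialize without spaces (matching the original URL) and percent-encode
url = ('https://prod-catalog-product-api.dickssportinggoods.com/v2/search'
       '?searchVO=' + quote(json.dumps(search_vo, separators=(',', ':'))))
r = requests.get(url)
if r.status_code == 200:
    print(r.json())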
You can also use Selenium in headless mode. Selenium can wait until elements are found, using explicit waits.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = webdriver.ChromeOptions()
options.add_argument('--window-size=1920,1080')
options.add_argument("--headless")
driver = webdriver.Chrome(executable_path=driver_path, options=options)
driver.get("URL here")
wait = WebDriverWait(driver, 20)
wait.until(EC.visibility_of_element_located((By.XPATH, "//a[@class='rs_product_description d-block']")))
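A possible continuation, not part of the original answer: once the wait succeeds, collect the matching links and print their text before quitting.
# hypothetical continuation: read the product descriptions once visible
links = driver.find_elements(By.XPATH, "//a[@class='rs_product_description d-block']")
for link in links:
    print(link.text)
driver.quit()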
PS: You'd have to download chromedriver from here
I'm trying to scrape a web page but before accessing the page, there is a banner for accepting cookies. I am using selenium to click on the button "Accept all cookies" but even after clicking on the button I can't access the right HTML page.
This is my code :
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
url = 'https://www.wikiparfum.fr/explore/by-name?query=dior'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
driver.get(url)
driver.find_element_by_id('onetrust-accept-btn-handler').click()
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')
print(soup)
And this is the beginning of the HTML page that is printed:
If anyone can help me with this one, thank you!
You should wait for the accept-cookies button element to appear before clicking it:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
url = 'https://www.wikiparfum.fr/explore/by-name?query=dior'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
wait = WebDriverWait(driver, 20)
driver.get(url)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#onetrust-accept-btn-handler"))).click()
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')
print(soup)
My code goes to a webpage, and I want to scrape the href/HTML of each listing on that page.
(The website this code goes to has 2 listings.)
I tried XPath and BeautifulSoup, but both return an empty list.
Here is the code-
import time
from selenium import webdriver
import pandas as pd
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
bracket=[]
driver.get('https://casehippo.com/spa/symposium/national-kidney-foundation-2021-spring-clinical-meetings/event/gallery/?search=Patiromer')
time.sleep(3)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
eachRow=driver.find_element_by_partial_link_text('symposium')
print(eachRow.text)
I just ran the code you provided; BeautifulSoup set the soup variable to the full page source successfully:
soup = BeautifulSoup(page_source, 'html.parser')
and in the next line:
eachRow=driver.find_element_by_partial_link_text('symposium')
an exception was raised with the message:
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"partial link text","selector":"symposium"}
It seems like you're using an incorrect selector; try something like:
element = driver.find_element_by_xpath("//a[@class='title ng-binding']")
The code I'm using:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome()
try:
    bracket = []
    driver.get(
        'https://casehippo.com/spa/symposium/national-kidney-foundation-2021-spring-clinical-meetings/event/gallery/?search=Patiromer')
    time.sleep(3)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    print(soup)
    element = driver.find_element_by_xpath("//a[@class='title ng-binding']")
    print(element.get_attribute('href'))
    elements = driver.find_elements_by_xpath("//a[@class='title ng-binding']")
    for el in elements:
        print(el.get_attribute('href'))
finally:
    driver.quit()
Updated code:
import time
from selenium import webdriver
import pandas as pd
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
bracket=[]
driver.get('https://casehippo.com/spa/symposium/national-kidney-foundation-2021-spring-clinical-meetings/event/gallery/?search=Patiromer')
time.sleep(3)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
eachRow = driver.find_elements_by_xpath("//a[contains(@ui-sref,'symposium')]")
for row in eachRow:
    print(row.text)
You need to use find_elements (not find_element) when there is more than one match, and then iterate over them to see their values. Also, partial link text won't work because the 'symposium' text is embedded in an attribute of the element (ui-sref), not in regular link text, so XPath is needed.
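Since the original goal was the href of each listing, the same elements expose it through get_attribute (a sketch; it assumes the Angular router has filled in the href on these links):
# read the link target from each matched element
for row in eachRow:
    print(row.get_attribute('href'))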
I have been trying to extract data from a sports site and so far failing. I am trying to extract the 35, "Shots on Goal", and 23 below.
<div class="statTextGroup">
<div class="statText statText--homeValue">35</div>
<div class="statText statText--titleValue">Shots on Goal</div>
<div class="statText statText--awayValue">23</div></div>
from bs4 import BeautifulSoup
import requests
result = requests.get("https://www.scoreboard.com/uk/match/lvbns58C/#match-statistics;0")
src = result.content
soup = BeautifulSoup(src, 'html.parser')
stats = soup.find("div", {"class": "tab-statistics-0-statistic"})
print(stats)
This is the code I have been trying to use, and when I run it I get "None" printed. Could someone help me print out the data?
Full page found here: https://www.scoreboard.com/uk/match/lvbns58C/#match-statistics;0
As the website is rendered by JavaScript, a possible option is to load the page using Selenium and then parse it with BeautifulSoup:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
# initialize selenium driver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
wd = webdriver.Chrome('<<PATH_TO_SELENIUMDRIVER>>', options=chrome_options)
# load page via selenium
wd.get("https://www.scoreboard.com/uk/match/lvbns58C/#match-statistics;0")
# wait up to 30 seconds until the element with id 'statistics-content' is loaded
table = WebDriverWait(wd, 30).until(EC.presence_of_element_located((By.ID, 'statistics-content')))
# parse content of the table
soup = BeautifulSoup(table.get_attribute('innerHTML'), 'html.parser')
print(soup)
# close selenium driver
wd.quit()
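To then pull the individual figures out of soup, you can iterate over the statTextGroup divs shown in the question's HTML (a sketch based on that snippet; it assumes those rows sit inside the statistics-content element):
# each statTextGroup holds the home value, the title and the away value
for group in soup.find_all("div", {"class": "statTextGroup"}):
    print([div.get_text(strip=True) for div in group.find_all("div")])
    # e.g. ['35', 'Shots on Goal', '23']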