Python 3 extract html data from sports site

Python 3 extract html data from sports site - python

I have been trying to extract data from a sports site and so far failing. I am Trying to extract the 35, Shots on Goal and 23 but have been failing.
<div class="statTextGroup">
<div class="statText statText--homeValue">35</div>
<div class="statText statText--titleValue">Shots on Goal</div>
<div class="statText statText--awayValue">23</div></div>
from bs4 import BeautifulSoup
import requests
result = requests.get("https://www.scoreboard.com/uk/match/lvbns58C/#match-statistics;0")
src = result.content
soup = BeautifulSoup(src, 'html.parser')
stats = soup.find("div", {"class": "tab-statistics-0-statistic"})
print(stats)
This is the code I have been trying to use and when I run it I get "None" printed to me. Could someone help me so I can print out the data.
Full page found here: https://www.scoreboard.com/uk/match/lvbns58C/#match-statistics;0

As the website is rendered by javascript, possible option would load the page using selenium and then parse it with BeautifulSoup:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
# initialize selenium driver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
wd = webdriver.Chrome('<<PATH_TO_SELENIUMDRIVER>>', options=chrome_options)
# load page via selenium
wd.get("https://www.scoreboard.com/uk/match/lvbns58C/#match-statistics;0")
# wait 30 seconds until element with class mainGrid will be loaded
table = WebDriverWait(wd, 30).until(EC.presence_of_element_located((By.ID, 'statistics-content')))
# parse content of the table
soup = BeautifulSoup(table.get_attribute('innerHTML'), 'html.parser')
print(soup)
# close selenium driver
wd.quit()

Related

Can't scrape table BeautifulSoup

I'm trying to scrape the following table from this URL: https://baseballsavant.mlb.com/leaderboard/outs_above_average?type=Fielder&startYear=2022&endYear=2022&split=no&team=&range=year&min=10&pos=of&roles=&viz=show
This is my code:
import requests
from bs4 import BeautifulSoup
url = "https://baseballsavant.mlb.com/leaderboard/outs_above_average?type=Fielder&startYear=2022&endYear=2022&split=no&team=&range=year&min=10&pos=of&roles=&viz=show"
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
table = soup.find("table")
for row in table.findAll("tr"):
print([i.text for i in row.findAll("td")])
However, my variable table returns None, even though there is clearly a table tag in the HTML code of the website. How do I get it?

The webpage is loaded dynamically and relies on JavaScript, therefore requests won't support it. You could use another parser library such as selenium.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
url = "https://baseballsavant.mlb.com/leaderboard/outs_above_average?type=Fielder&startYear=2022&endYear=2022&split=no&team=&range=year&min=10&pos=of&roles=&viz=show"
driver.get(url)
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.TAG_NAME, 'table')))
table = driver.find_element(By.TAG_NAME, 'table')
table_html = table.get_attribute('innerHTML')
# print('table html:', table_html)
for tr_web_element in table.find_elements(By.TAG_NAME, 'tr'):
for td_web_element in tr_web_element.find_elements(By.TAG_NAME, 'td'):
print(td_web_element.text)
driver.close()
Or see this answer to incorporate Selenium with BeautifulSoup.

AttributeError: 'NoneType' object has no attribute 'find' Web Scraping Python

I am working on an office project to get data to check active status on different websites but whenever I want to get data sometimes it shows none and sometimes it shows this Attribute error, I follow youtube videos steps but still get this error. help, please.
//Python Code
from bs4 import BeautifulSoup
import requests
html_text = requests.get(
"https://www.mintscan.io/cosmos/validators/cosmosvaloper1we6knm8qartmmh2r0qfpsz6pq0s7emv3e0meuw").text
soup = BeautifulSoup(html_text, 'lxml')
status = soup.find('div', {'class': "ValidatorInfo_statusBadge__PBIGr"})
para = status.find('p').text
print(para)

The url is dynamic meaning data is populated by javascript. So you need automation tool something like selenium.
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
url = 'https://www.mintscan.io/cosmos/validators/cosmosvaloper1we6knm8qartmmh2r0qfpsz6pq0s7emv3e0meuw'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
time.sleep(8)
driver.get(url)
time.sleep(10)
soup = BeautifulSoup(driver.page_source, 'lxml')
#driver.close()
status = soup.find('div', {'class': "ValidatorInfo_statusBadge__PBIGr"})
para = status.find('p').text
print(para)
Output:
Active

You have the most common problem - modern pages use JavaScript to add elements but requests/BeautifulSoup can't run JavaScript.
So soup.find('div',...) gives None instead expected element and later it makes problem with None.find('p')
You may use Selenium to control real web browser which can run JavaScript.
from selenium import webdriver
#from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
#from selenium.common.exceptions import NoSuchElementException, TimeoutException
#from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
url = "https://www.mintscan.io/cosmos/validators/cosmosvaloper1we6knm8qartmmh2r0qfpsz6pq0s7emv3e0meuw"
#driver = webdriver.Chrome(executable_path=ChromeDriverManager().install())
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
driver.get(url)
#status = driver.find_element(By.XPATH, '//div[#class="ValidatorInfo_statusBadge__PBIGr"]')
wait = WebDriverWait(driver, 10)
status = wait.until(EC.visibility_of_element_located((By.XPATH, '//div[#class="ValidatorInfo_statusBadge__PBIGr"]')))
print(status.text)
Eventually you should check if page gives some API to get data.
You may also use DevTools (tab: Network) to check if JavaScript reads data from some URL and you may try to use this URL with requests. It could work faster than with Selenium but server may detect script/bot and block it.
JavaScript usually get data as JSON so it may not need to scrape HTML with BeautifulSoup

Python BSoup doesn't find hidden elements, problems with Selenium as alternative

Hey I asked a similar question before and from what I've learned BSoup doesn't find what I'm searching for because the element is hidden.
For context I find FancyCompLabel when I have the cursor above Rathberger and examine the html code
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = "https://www.techpilot.de/zulieferer-suchen?laserschneiden%202d%20(laserstrahlschneiden)"
page = urlopen(url)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
cclass=soup.find("div",class_="fancyCompLabel")
print(cclass)
It seems like Selenium could be a fix but I have really big problems implementing it.
heres what i tried:
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
# Set some Selenium Options
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Webdriver
wd = webdriver.Chrome('chromedriver',options=options)
# URL
url = 'https://www.techpilot.de/zulieferer-suchen?laserschneiden%202d%20(laserstrahlschneiden)'
# Load URL
wd.get(url)
# Get HTML
soup = BeautifulSoup(wd.page_source, 'html.parser')
cclass=soup.find_all("div",class_="fancyCompLabel")
print(cclass)

1 There are two "Accept cookies" modals, one of them is inside an iframe. You need to accept both and to stitch the the iframe of the seconds one.
2 You will need for your elements to become present in DOM after you accept the cookies.
I've managed to accomplish this with Selenium.
Solution
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Set some Selenium Options
options = webdriver.ChromeOptions()
# options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Webdriver
wd = webdriver.Chrome(executable_path='/snap/bin/chromium.chromedriver', options=options)
# URL
url = 'https://www.techpilot.de/zulieferer-suchen?laserschneiden%202d%20(laserstrahlschneiden)'
# Load URL
wd.get(url)
# Get HTML
soup = BeautifulSoup(wd.page_source, 'html.parser')
wait = WebDriverWait(wd, 15)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#bodyJSP #CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))).click()
wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "#efficientSearchIframe")))
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".hideFunctionalScrollbar #CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))).click()
#wd.switch_to.default_content() # you do not need to switch to default content because iframe is closed already
wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".fancyCompLabel")))
results = wd.find_elements_by_css_selector(".fancyCompLabel")
for p in results:
print(p.text)
Result:
Rathberger GmbH
Deschberger Metall- und Blechbearbeit...
Anant GmbH
Gröbmer GmbH
Berma Plaatwerk BV
Punzonado y Láser METALKOR S.L.
Blankart AG - Werkzeugbau + Fertigung
Goodwill Precision Machinery (German ...
Bechtold GmbH
PMT GmbH

Scraping: cannot extract content from webpage

I am trying to scrape the news content from the following page, but with no success.
https://www.business-humanrights.org/en/latest-news/?&search=nike
I have tried with Beautifulsoup :
r = requests.get("https://www.business-humanrights.org/en/latest-news/?&search=nike")
soup = BeautifulSoup(r.content, 'lxml')
soup
but the content that I am looking for - the bits of news that are tagged as div class = 'card__content', do not appear in the soup output.
I also checked, but I could not find any frames to switch to.
Finally, I tried with phantomjs and the following code but with no success:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
url = "https://www.business-humanrights.org/en/latest-news/?&search=nike"
driver = webdriver.PhantomJS(executable_path= '~\Chromedriver\phantomjs-2.1.1-windows\bin\phantomjs.exe')
driver.get(url)
time.sleep(7)
page = driver.page_source
driver.quit()
soup = BeautifulSoup(page, 'html.parser')
container = soup.find_all('div', attrs={
'class':'card__content'})
print(container)
I am running out of options, anyone can help?

Use API
import requests
r = requests.get("https://www.business-humanrights.org/en/api/internal/explore/?format=json&search=nike")
print(r.json())

I didn't understand why you're facing this. I tried the same above but not with requests and bs4. I used requests_html. xpaths can be used directly in this library without any other libraries.
import requests_html
session = requests_html.HTMLSession()
URL = 'https://www.business-humanrights.org/en/latest-news/?&search=nike'
res = session.get(URL)
divs_with_required_class = res.html.xpath(r'//div[#class="card__content"]')
for item in divs_with_required_class:
print(f'Div {divs_with_required_class.index(item) + 1}:\n', item.text, end='\n\n')

driver.page_source returns initial HTML-doc content no matter how long you wait (time.sleep(7) has no effect).
Try below:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver.get(url)
cards = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//div[#class='card__content' and normalize-space(.)]")))
texts = [card.text for card in cards]
print(texts)
driver.quit()

Selenium Scraping Javascript Table

I am stuggling to scrape as per code below. Would apprciate it if someone can have a look at what I am missing?
Regards
PyProg70
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from bs4 import BeautifulSoup
import pandas as pd
import re, time
binary = FirefoxBinary('/usr/bin/firefox')
opts = FirefoxOptions()
opts.add_argument("--headless")
browser = webdriver.Firefox(options=opts, firefox_binary=binary)
browser.implicitly_wait(10)
url = 'http://tenderbulletin.eskom.co.za/'
browser.get(url)
html = browser.page_source
soup = BeautifulSoup(html, 'lxml')
print(soup.prettify())

not Java but Javascript. it dynamic page you need to wait and check if Ajax finished the request and content rendered using WebDriverWait.
....
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
.....
browser.get(url)
# wait max 30 second until table loaded
WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR , 'table.CSSTableGenerator .ng-binding')))
html = browser.find_element_by_css_selector('table.CSSTableGenerator')
soup = BeautifulSoup(html.get_attribute("outerHTML"), 'lxml')
print(soup.prettify().encode('utf-8'))

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python 3 extract html data from sports site - python

Related

Can't scrape table BeautifulSoup

AttributeError: 'NoneType' object has no attribute 'find' Web Scraping Python

Python BSoup doesn't find hidden elements, problems with Selenium as alternative

Scraping: cannot extract content from webpage

Selenium Scraping Javascript Table

Categories

Resources