Scraping: cannot extract content from webpage - python

I am trying to scrape the news content from the following page, but with no success.
https://www.business-humanrights.org/en/latest-news/?&search=nike
I have tried with BeautifulSoup:
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.business-humanrights.org/en/latest-news/?&search=nike")
soup = BeautifulSoup(r.content, 'lxml')
soup
but the content that I am looking for - the news items tagged as div class='card__content' - does not appear in the soup output.
I also checked, but I could not find any frames to switch to.
Finally, I tried with PhantomJS and the following code, but with no success:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
url = "https://www.business-humanrights.org/en/latest-news/?&search=nike"
driver = webdriver.PhantomJS(executable_path='~\Chromedriver\phantomjs-2.1.1-windows\bin\phantomjs.exe')
driver.get(url)
time.sleep(7)
page = driver.page_source
driver.quit()
soup = BeautifulSoup(page, 'html.parser')
container = soup.find_all('div', attrs={'class': 'card__content'})
print(container)
I am running out of options - can anyone help?

Use the site's internal API - the page loads its results from a JSON endpoint that you can call directly:
import requests
r = requests.get("https://www.business-humanrights.org/en/api/internal/explore/?format=json&search=nike")
print(r.json())
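You can discover endpoints like this by watching the browser's DevTools Network tab while the page loads. The JSON schema is undocumented, so here is a minimal sketch for inspecting the payload before hard-coding any field names (the output filename is just an example):
import json
import requests

# Save the undocumented JSON payload for inspection before relying on
# any particular field names.
r = requests.get("https://www.business-humanrights.org/en/api/internal/explore/?format=json&search=nike")
with open("bhr_nike.json", "w", encoding="utf-8") as f:
    json.dump(r.json(), f, indent=2)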

I'm not sure why you're running into this. I tried the same page, but with requests_html instead of requests and bs4; this library lets you use XPath directly without any other libraries.
import requests_html

session = requests_html.HTMLSession()
URL = 'https://www.business-humanrights.org/en/latest-news/?&search=nike'
res = session.get(URL)
divs_with_required_class = res.html.xpath(r'//div[@class="card__content"]')
for i, item in enumerate(divs_with_required_class, start=1):
    print(f'Div {i}:\n', item.text, end='\n\n')
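As an aside, if the cards had been injected client-side, requests_html can also execute the page's JavaScript before you query it (the first render() call downloads a local Chromium build):
# Only needed for client-side-rendered content; render() executes the
# page's JavaScript (the first call downloads Chromium).
res.html.render(timeout=20)
divs_with_required_class = res.html.xpath(r'//div[@class="card__content"]')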

A fixed time.sleep(7) is unreliable here - with PhantomJS, driver.page_source may still hold the initial HTML before the JavaScript has populated the cards. Use an explicit wait instead:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver.get(url)
cards = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='card__content' and normalize-space(.)]")))
texts = [card.text for card in cards]
print(texts)
driver.quit()
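Note that PhantomJS is deprecated and recent Selenium releases have dropped support for it; a minimal headless-Chrome setup (assuming a matching chromedriver is installed and on your PATH) would be:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Headless Chrome is the usual replacement for the deprecated PhantomJS.
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)  # assumes chromedriver on PATH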

Related

Can't scrape table BeautifulSoup

I'm trying to scrape the following table from this URL: https://baseballsavant.mlb.com/leaderboard/outs_above_average?type=Fielder&startYear=2022&endYear=2022&split=no&team=&range=year&min=10&pos=of&roles=&viz=show
This is my code:
import requests
from bs4 import BeautifulSoup
url = "https://baseballsavant.mlb.com/leaderboard/outs_above_average?type=Fielder&startYear=2022&endYear=2022&split=no&team=&range=year&min=10&pos=of&roles=&viz=show"
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
table = soup.find("table")
for row in table.findAll("tr"):
    print([i.text for i in row.findAll("td")])
However, my variable table is None, even though there is clearly a table tag in the HTML code of the website. How do I get it?
The webpage is loaded dynamically with JavaScript, so requests alone won't see the table. You could use a browser-automation tool such as Selenium instead.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
url = "https://baseballsavant.mlb.com/leaderboard/outs_above_average?type=Fielder&startYear=2022&endYear=2022&split=no&team=&range=year&min=10&pos=of&roles=&viz=show"
driver.get(url)
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.TAG_NAME, 'table')))
table = driver.find_element(By.TAG_NAME, 'table')
table_html = table.get_attribute('innerHTML')
# print('table html:', table_html)
for tr_web_element in table.find_elements(By.TAG_NAME, 'tr'):
    for td_web_element in tr_web_element.find_elements(By.TAG_NAME, 'td'):
        print(td_web_element.text)
driver.close()
Or see this answer to incorporate Selenium with BeautifulSoup.
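Since the snippet above already extracts the table's HTML, you could also hand it to pandas instead of looping over cells - a sketch continuing from the Selenium code (requires pandas and lxml; the column layout of the live page is an assumption to verify):
from io import StringIO

import pandas as pd

# Parse the rendered table into a DataFrame. outerHTML keeps the <table>
# tag itself; read_html returns a list, and the leaderboard is assumed
# to be the first table on the page.
df = pd.read_html(StringIO(table.get_attribute("outerHTML")))[0]
print(df.head())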

Parsing web-page with search bar

I need to parse store names (<div class="LocationName">) from https://www.comicshoplocator.com/StoreLocator.
The thing is - when you enter a zip code (for instance 73533) in the search box, it does not appear in the URL.
Because of that, Python can't see the result elements on the page.
Here is my code snippet. I am receiving no output because of that.
How do I make Python see the input with the zip code?
Thanks
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
browser = webdriver.Firefox(executable_path=r'C:\Geckodriver\Geckodriver.exe')
browser.get('https://www.comicshoplocator.com/StoreLocator')
browser.find_element(By.NAME, 'query').send_keys('73533' + Keys.RETURN)
html = browser.page_source
soup = BeautifulSoup(html, features="html.parser")
for tag in soup.find_all('div', class_="LocationName"):
    print(tag.text)
The problem is here: browser.find_element(By.NAME, 'query').send_keys('73533' + Keys.RETURN)
The correct version would be:
search = browser.find_element(By.NAME, 'query')
search.send_keys('73533')
search.send_keys(Keys.RETURN)
Full working code:
I used ChromeDriver (via webdriver_manager); you can swap in a different driver without changing the rest.
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.comicshoplocator.com/StoreLocator')
driver.maximize_window()
time.sleep(2)
d = driver.find_element(By.NAME, 'query')
d.send_keys('73533')
d.send_keys(Keys.ENTER)
soup = BeautifulSoup(driver.page_source, 'lxml')
for tag in soup.find_all('div', class_="LocationName"):
    print(tag.text)
Output:
MARK DOWN COMICS
WWW.DCBSERVICE.COM
Well, actually, this can be done with requests alone - there's no need for Selenium. You can send a POST request to:
https://www.comicshoplocator.com/StoreLocator
import re
import requests
from bs4 import BeautifulSoup
data = {
    "showAll": "false",
    "showCsls": "true",
    "query": "73533",
}
response = requests.post(
    "https://www.comicshoplocator.com/StoreLocator",
    data=data,
)
soup = BeautifulSoup(response.text, "html.parser")
string = soup.select_one("script:-soup-contains('address')").string
unformatted_data = re.search(r"\(({.*?})\)", string, re.DOTALL).group(1)
# remove all the whitespace
formatted_data = re.sub(r"\s+", "", unformatted_data)
print(formatted_data)
Prints:
{storeno:"8816",lat:"41.0671081542969",lng:"-85.1372680664063",name:"WWW.DCBSERVICE.COM",address:"6005ESHELBYDR",address2:"WWW.DCBSERVICE.COM",city:"MEMPHIS",state:"TN",zip:"38141",phone:"",hasProfile:"True",storeLogo:'/Image/CslsLogo/'+"8816"}
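If the POST response also contains the server-rendered listing markup (worth checking in response.text), you can skip the regex entirely and reuse the original selector - a sketch under that assumption:
# Assumes the POST response includes the rendered store listings;
# verify by searching response.text for "LocationName" first.
for tag in soup.find_all("div", class_="LocationName"):
    print(tag.get_text(strip=True))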
This code worked for me:
listings = browser.find_elements(By.CLASS_NAME, 'CslsLocationItem')
for listing in listings:
    print(listing.find_element(By.CLASS_NAME, 'LocationName').get_attribute('innerText'))

How to wait to page to fully load using requests_html

While accessing this link https://www.dickssportinggoods.com/f/tents-accessories?pageNumber=2 with requests_html, I need to wait some time before the page actually loads. Is that possible with this library?
My code:
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from lxml import etree
s = HTMLSession()
response = s.get('https://www.dickssportinggoods.com/f/tents-accessories?pageNumber=2')
response.html.render()
soup = BeautifulSoup(response.content, "html.parser")
dom = etree.HTML(str(soup))
item = dom.xpath('//a[@class="rs_product_description d-block"]/text()')[0]
print(item)
It looks like the data you are looking for can be fetched using HTTP GET to
https://prod-catalog-product-api.dickssportinggoods.com/v2/search?searchVO=%7B%22selectedCategory%22%3A%2212301_1809051%22%2C%22selectedStore%22%3A%220%22%2C%22selectedSort%22%3A1%2C%22selectedFilters%22%3A%7B%7D%2C%22storeId%22%3A15108%2C%22pageNumber%22%3A2%2C%22pageSize%22%3A48%2C%22totalCount%22%3A112%2C%22searchTypes%22%3A%5B%22PINNING%22%5D%2C%22isFamilyPage%22%3Atrue%2C%22appliedSeoFilters%22%3Afalse%2C%22snbAudience%22%3A%22%22%2C%22zipcode%22%3A%22%22%7D
The call returns JSON, so you can use the data directly with zero scraping code.
Copy/paste the URL into the browser to see the data.
You can specify the page number in the url:
searchVO={"selectedCategory":"12301_1809051","selectedStore":"0","selectedSort":1,"selectedFilters":{},"storeId":15108,"pageNumber":2,"pageSize":48,"totalCount":112,"searchTypes":["PINNING"],"isFamilyPage":true,"appliedSeoFilters":false,"snbAudience":"","zipcode":""}
Working code below:
import requests
import pprint
page_num = 2
url = f'https://prod-catalog-product-api.dickssportinggoods.com/v2/search?searchVO=%7B%22selectedCategory%22%3A%2212301_1809051%22%2C%22selectedStore%22%3A%220%22%2C%22selectedSort%22%3A1%2C%22selectedFilters%22%3A%7B%7D%2C%22storeId%22%3A15108%2C%22pageNumber%22%3A{page_num}%2C%22pageSize%22%3A48%2C%22totalCount%22%3A112%2C%22searchTypes%22%3A%5B%22PINNING%22%5D%2C%22isFamilyPage%22%3Atrue%2C%22appliedSeoFilters%22%3Afalse%2C%22snbAudience%22%3A%22%22%2C%22zipcode%22%3A%22%22%7D'
r = requests.get(url)
if r.status_code == 200:
    pprint.pprint(r.json())
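Rather than splicing the page number into the pre-encoded string, you can build the searchVO payload as a dict and encode it yourself - a sketch (the endpoint may also expect browser-like headers, which are omitted here):
import json
import urllib.parse

import requests

# Build the searchVO payload as a dict, then URL-encode the compact JSON.
search_vo = {
    "selectedCategory": "12301_1809051",
    "selectedStore": "0",
    "selectedSort": 1,
    "selectedFilters": {},
    "storeId": 15108,
    "pageNumber": 2,  # change this per page
    "pageSize": 48,
    "totalCount": 112,
    "searchTypes": ["PINNING"],
    "isFamilyPage": True,
    "appliedSeoFilters": False,
    "snbAudience": "",
    "zipcode": "",
}
encoded = urllib.parse.quote(json.dumps(search_vo, separators=(",", ":")))
url = f"https://prod-catalog-product-api.dickssportinggoods.com/v2/search?searchVO={encoded}"
r = requests.get(url)
print(r.status_code)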
You can also use Selenium in headless mode.
Selenium can wait until elements are found using explicit waits.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = webdriver.ChromeOptions()
options.add_argument('--window-size=1920,1080')
options.add_argument("--headless")
driver = webdriver.Chrome(executable_path=driver_path, options=options)  # driver_path: path to your chromedriver binary
driver.get("URL here")
wait = WebDriverWait(driver, 20)
wait.until(EC.visibility_of_element_located((By.XPATH, "//a[@class='rs_product_description d-block']")))
PS: You'd have to download a ChromeDriver build that matches your Chrome version and point driver_path at it.
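Once the wait resolves, you can collect the matched links in the same session - for example:
# Continue from the wait above: grab the product links and print their text.
links = driver.find_elements(By.XPATH, "//a[@class='rs_product_description d-block']")
for link in links:
    print(link.text)
driver.quit()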

Python 3 extract html data from sports site

I have been trying to extract data from a sports site and so far failing. I am trying to extract the 35, the "Shots on Goal" label, and the 23, but have been failing.
<div class="statTextGroup">
<div class="statText statText--homeValue">35</div>
<div class="statText statText--titleValue">Shots on Goal</div>
<div class="statText statText--awayValue">23</div></div>
from bs4 import BeautifulSoup
import requests
result = requests.get("https://www.scoreboard.com/uk/match/lvbns58C/#match-statistics;0")
src = result.content
soup = BeautifulSoup(src, 'html.parser')
stats = soup.find("div", {"class": "tab-statistics-0-statistic"})
print(stats)
This is the code I have been trying to use, and when I run it, "None" is printed. Could someone help me print out the data?
Full page found here: https://www.scoreboard.com/uk/match/lvbns58C/#match-statistics;0
As the website is rendered by JavaScript, one option is to load the page using Selenium and then parse it with BeautifulSoup:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
# initialize selenium driver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
wd = webdriver.Chrome('<<PATH_TO_SELENIUMDRIVER>>', options=chrome_options)
# load page via selenium
wd.get("https://www.scoreboard.com/uk/match/lvbns58C/#match-statistics;0")
# wait up to 30 seconds until the element with id 'statistics-content' is loaded
table = WebDriverWait(wd, 30).until(EC.presence_of_element_located((By.ID, 'statistics-content')))
# parse content of the table
soup = BeautifulSoup(table.get_attribute('innerHTML'), 'html.parser')
print(soup)
# close selenium driver
wd.quit()
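To pull the individual numbers out of the parsed fragment, here is a small sketch based on the statTextGroup markup shown in the question (it assumes each group holds exactly three statText divs):
# Each statTextGroup contains home value, stat title, away value in order.
for group in soup.find_all("div", class_="statTextGroup"):
    cells = [d.get_text(strip=True) for d in group.find_all("div")]
    if len(cells) == 3:
        home, title, away = cells
        print(f"{title}: {home} - {away}")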

Trying to use selenium to webscrape ncbi, the data doesn't load and isn't contained in an element with an ID that I can wait for

I am trying to scrape genetic data from web pages like https://www.ncbi.nlm.nih.gov//nuccore/KC208619.1?report=fasta.
I am using beautiful soup and selenium.
The data is located inside an element with the id viewercontent1.
When I print that out with this code:
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import re
secondDriver = webdriver.Chrome(executable_path='/Users/me/Documents/chloroPlastGenScrape/chromedriver')
newLink = "https://www.ncbi.nlm.nih.gov//nuccore/KC208619.1?report=fasta"
secondDriver.implicitly_wait(10)
WebDriverWait(secondDriver, 10).until(lambda driver: driver.execute_script('return document.readyState') == 'complete')
secondDriver.get(newLink)
html2 = secondDriver.page_source
subSoup = BeautifulSoup(html2, 'html.parser')
viewercontent1 = subSoup.findAll("div", {"id" : "viewercontent1"})[0]
print(viewercontent1)
I print out :
<div class="seq gbff" id="viewercontent1" sequencesize="450826" style="display: block;" val="426261815" virtualsequence=""><div class="loading">Loading ... <img alt="record loading animation" src="/core/extjs/ext-2.1/resources/images/default/grid/loading.gif"/></div></div>
It seems the content hasn't finished loading.
I tried waiting implicitly and checking whether the content had finished loading (before and after calling the .get() function), but that doesn't seem to have done anything.
I can't wait for the content to load by ID (presence_of_element_located) because the data is contained directly within a <pre></pre> element with no id.
Any help would be greatly appreciated.
To get content of the <div>, you can use this script:
import requests
from bs4 import BeautifulSoup
url = 'https://www.ncbi.nlm.nih.gov//nuccore/KC208619.1?report=fasta'
fasta_url = 'https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id={id}&report=fasta'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
id_ = soup.select_one('meta[name="ncbi_uidlist"]')['content']
fasta_txt = requests.get(fasta_url.format(id=id_)).text
print(fasta_txt)
Prints:
>KC208619.1 Butomus umbellatus mitochondrion, complete genome
CCGCCTCTCCCCCCCCCCCCCCGCTCCGTTGTTGAAGCGGGCCCCCCCCATACTCATGAATCTGCATTCC
CAACCAAGGAGTTGTCTCATATAGACAGAGTTGGGCCCCCGTGTTCTGAGATCTTTTTCAACTTGATTAA
TAAAGAGGATTTCTCGGCCGTCTTTTTCGGCTAGGCTCCATTCGGGGTGGGTGTCCAGCTCGTCCCGCTT
CTCGTTAAAGAAATCGATAAAGGCTTCTTCGGGGGTGTAGGCGGCATTTTCCCCCAAGTGGGGATGTCGA
GAAAGCACTTCTTGAAAACGAGAATAAGCTGCGTGCTTACGTTCCCGGATTTGGAGATCCCGGTTTTCGA
...and so on.
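Alternatively, NCBI publishes the E-utilities API for exactly this kind of download; a minimal sketch using efetch (unauthenticated requests are rate-limited, so keep request volume low):
import requests

# Fetch the FASTA record for KC208619.1 via NCBI's documented efetch endpoint.
params = {
    "db": "nuccore",
    "id": "KC208619.1",
    "rettype": "fasta",
    "retmode": "text",
}
r = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi", params=params)
print(r.text[:300])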
@Andrej's solution seems much simpler, but if you still want to go the waiting route...
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
driver = webdriver.Chrome()
newLink = "https://www.ncbi.nlm.nih.gov//nuccore/KC208619.1?report=fasta"
driver.get(newLink)
WebDriverWait(driver, 10).until(lambda driver: driver.execute_script('return document.readyState') == 'complete')
element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, "#viewercontent1 pre"))
)
html2 = driver.page_source
subSoup = BeautifulSoup(html2, 'html.parser')
viewercontent1 = subSoup.findAll("div", {"id" : "viewercontent1"})[0]
print(viewercontent1)
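Once the wait succeeds, the located element already holds the sequence text, so you could also read it directly instead of re-parsing the page:
# element is the <pre> located by the wait above; its text is the FASTA record.
print(element.text[:300])
driver.quit()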
