Python: BeautifulSoup doesn't find hidden elements, problems with Selenium as an alternative

Hey, I asked a similar question before, and from what I've learned BeautifulSoup doesn't find what I'm searching for because the element is hidden.
For context: I see fancyCompLabel when I hover the cursor over Rathberger and inspect the HTML code.
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = "https://www.techpilot.de/zulieferer-suchen?laserschneiden%202d%20(laserstrahlschneiden)"
page = urlopen(url)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
cclass = soup.find("div", class_="fancyCompLabel")
print(cclass)
It seems like Selenium could be a fix, but I'm having real trouble implementing it.
Here's what I tried:
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
# Set some Selenium Options
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Webdriver
wd = webdriver.Chrome('chromedriver',options=options)
# URL
url = 'https://www.techpilot.de/zulieferer-suchen?laserschneiden%202d%20(laserstrahlschneiden)'
# Load URL
wd.get(url)
# Get HTML
soup = BeautifulSoup(wd.page_source, 'html.parser')
cclass = soup.find_all("div", class_="fancyCompLabel")
print(cclass)

1. There are two "Accept cookies" modals, and one of them is inside an iframe. You need to accept both and switch to the iframe for the second one.
2. You will need to wait for your elements to become present in the DOM after you accept the cookies.
I've managed to accomplish this with Selenium.
Solution
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Set some Selenium Options
options = webdriver.ChromeOptions()
# options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Webdriver
wd = webdriver.Chrome(executable_path='/snap/bin/chromium.chromedriver', options=options)
# URL
url = 'https://www.techpilot.de/zulieferer-suchen?laserschneiden%202d%20(laserstrahlschneiden)'
# Load URL
wd.get(url)
# Handle both cookie dialogs, then wait for the results
wait = WebDriverWait(wd, 15)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#bodyJSP #CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))).click()
wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "#efficientSearchIframe")))
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".hideFunctionalScrollbar #CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))).click()
# wd.switch_to.default_content()  # not needed: the results live inside this iframe
wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".fancyCompLabel")))
results = wd.find_elements_by_css_selector(".fancyCompLabel")
for p in results:
    print(p.text)
Result:
Rathberger GmbH
Deschberger Metall- und Blechbearbeit...
Anant GmbH
Gröbmer GmbH
Berma Plaatwerk BV
Punzonado y Láser METALKOR S.L.
Blankart AG - Werkzeugbau + Fertigung
Goodwill Precision Machinery (German ...
Bechtold GmbH
PMT GmbH
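If you would rather keep BeautifulSoup for the parsing step, as in the original question, you can re-read wd.page_source once the waits above have completed and feed it to BeautifulSoup (which the solution already imports). A minimal sketch, assuming wd is the driver from the solution and both cookie dialogs have already been accepted:
# Sketch: parse the now-rendered page with BeautifulSoup instead of Selenium's finders.
# Assumes `wd` is the WebDriver from the solution above, still switched into the results iframe.
soup = BeautifulSoup(wd.page_source, 'html.parser')
for label in soup.find_all("div", class_="fancyCompLabel"):
    print(label.get_text(strip=True))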

Related

BeautifulSoup scrapes only a few positions

I am learning scraping.
A few months ago I scraped a website with house prices for learning purposes. It is on GitHub:
https://github.com/MariuszTP/Beautifulsoup
Now when I try to scrape it, it returns only a few positions.
I simplified the code:
url = "https://www.otodom.pl/pl/oferty/sprzedaz/mieszkanie/poznan?distanceRadius=0&market=ALL&locations=%5Bcities_6-1%5D&viewType=listing&lang=pl&searchingCriteria=sprzedaz&searchingCriteria=mieszkanie&searchingCriteria=cala-polska"
html_text = requests.get( url ).text
soup = BeautifulSoup(html_text, 'lxml')
houses = soup.find_all('article', class_ = 'css-n8rq67 es62z2j16')
for i in houses:
print(i.text)
Can somebody please tell me what the problem is and how to solve it?
Today most websites do not have the data in the HTML directly; it is rendered with JavaScript, which BeautifulSoup alone cannot handle. Selenium is the solution.
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

options = Options()
# options.add_argument('--disable-blink-features=AutomationControlled')
service = ChromeService(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)
URL = 'https://www.otodom.pl/pl/oferty/sprzedaz/mieszkanie/poznan?distanceRadius=0&market=ALL&locations=%5Bcities_6-1%5D&viewType=listing&lang=pl&searchingCriteria=sprzedaz&searchingCriteria=mieszkanie&searchingCriteria=cala-polska'
driver.get(URL)
houses = driver.find_elements(By.XPATH, "//article[@class='css-n8rq67 es62z2j16']")
for i in houses:
    print(i.text)
print(len(houses))
driver.quit()
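If you prefer to keep the BeautifulSoup parsing step from the question, you can also hand Selenium's rendered page source to it instead of calling find_elements. A minimal sketch, reusing the setup above and the article class from the question; it only sees the listings present in the initial render, and URL is the otodom listing URL defined above:
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from bs4 import BeautifulSoup

# Same driver setup as above; URL is the otodom listing URL from the question
driver = webdriver.Chrome(service=ChromeService(executable_path=ChromeDriverManager().install()))
driver.get(URL)
soup = BeautifulSoup(driver.page_source, 'lxml')
houses = soup.find_all('article', class_='css-n8rq67 es62z2j16')
print(len(houses))
for house in houses:
    print(house.text)
driver.quit()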

Using a proxy with Selenium does not get correct results

I have this function that works properly, but without a proxy.
It returns the HTML content I need when I extract it from a website:
def extract_listing_html(url):
    driver_path = "C:/Users/parkj/Downloads/chromedriver_win32/chromedriver.exe"
    driver = webdriver.Chrome(service=Service(driver_path))
    driver.get(url)
    time.sleep(5)
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    return soup
I want to use a proxy and this is what I have so far, but I am not getting the same results as when I am not using a proxy:
def extract_listing_html(url):
    PROXY = "164.155.145.1:80"
    driver_path = "C:/Users/parkj/Downloads/chromedriver_win32/chromedriver.exe"
    chrome_options = Options()
    chrome_options.add_argument('--proxy-server=%s' "http://" + PROXY)
    driver = webdriver.Chrome(service=Service(driver_path), options=chrome_options)
    driver.get(url)
    time.sleep(5)
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    return soup
I played around with it and found that passing options=chrome_options to webdriver.Chrome() is what causes it to not return the same HTML, but I'm not sure why. The HTML returned without the proxy and the HTML returned with the proxy look quite different.
Imports:
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
Using a user-agent instead has given me results.
Use pip install pyyaml ua-parser user-agents fake-useragent to install fake_useragent
from fake_useragent import UserAgent

driver_path = "C:/Users/parkj/Downloads/chromedriver_win32/chromedriver.exe"  # same path as in the question

def extract_listing_html(url):
    opts = Options()
    ua = UserAgent()
    userAgent = ua.random
    print(userAgent)
    opts.add_argument(f'user-agent={userAgent}')
    driver = webdriver.Chrome(service=Service(driver_path), options=opts)
    driver.get(url)
    time.sleep(5)
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    return soup
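Separately, one detail worth double-checking in the proxy version from the question is how the flag is built: '--proxy-server=%s' "http://" + PROXY concatenates a literal %s into the argument instead of substituting the address. A minimal sketch of the usual form (named extract_listing_html_with_proxy here just for illustration), using the imports already listed in the question and the same PROXY and driver_path values; a free proxy may of course still return different content than a direct connection:
def extract_listing_html_with_proxy(url):
    PROXY = "164.155.145.1:80"
    driver_path = "C:/Users/parkj/Downloads/chromedriver_win32/chromedriver.exe"
    chrome_options = Options()
    # Use an f-string so the proxy address is actually substituted into the flag
    chrome_options.add_argument(f'--proxy-server=http://{PROXY}')
    driver = webdriver.Chrome(service=Service(driver_path), options=chrome_options)
    driver.get(url)
    time.sleep(5)
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    driver.quit()
    return soup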

How to wait to page to fully load using requests_html

While accessing this link https://www.dickssportinggoods.com/f/tents-accessories?pageNumber=2 with requests_html, I need to wait some time before the page actually loads. Is that possible with this library?
My code:
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from lxml import etree

s = HTMLSession()
response = s.get('https://www.dickssportinggoods.com/f/tents-accessories?pageNumber=2')
response.html.render()
soup = BeautifulSoup(response.content, "html.parser")
dom = etree.HTML(str(soup))
item = dom.xpath('//a[@class="rs_product_description d-block"]/text()')[0]
print(item)
It looks like the data you are looking for can be fetched using HTTP GET to
https://prod-catalog-product-api.dickssportinggoods.com/v2/search?searchVO=%7B%22selectedCategory%22%3A%2212301_1809051%22%2C%22selectedStore%22%3A%220%22%2C%22selectedSort%22%3A1%2C%22selectedFilters%22%3A%7B%7D%2C%22storeId%22%3A15108%2C%22pageNumber%22%3A2%2C%22pageSize%22%3A48%2C%22totalCount%22%3A112%2C%22searchTypes%22%3A%5B%22PINNING%22%5D%2C%22isFamilyPage%22%3Atrue%2C%22appliedSeoFilters%22%3Afalse%2C%22snbAudience%22%3A%22%22%2C%22zipcode%22%3A%22%22%7D
The call will return JSON, and you can use that directly with zero scraping code.
Copy/Paste the URL into the browser --> see the data.
You can specify the page number in the url:
searchVO={"selectedCategory":"12301_1809051","selectedStore":"0","selectedSort":1,"selectedFilters":{},"storeId":15108,"pageNumber":2,"pageSize":48,"totalCount":112,"searchTypes":["PINNING"],"isFamilyPage":true,"appliedSeoFilters":false,"snbAudience":"","zipcode":""}
Working code below:
import requests
import pprint

page_num = 2
url = f'https://prod-catalog-product-api.dickssportinggoods.com/v2/search?searchVO=%7B%22selectedCategory%22%3A%2212301_1809051%22%2C%22selectedStore%22%3A%220%22%2C%22selectedSort%22%3A1%2C%22selectedFilters%22%3A%7B%7D%2C%22storeId%22%3A15108%2C%22pageNumber%22%3A{page_num}%2C%22pageSize%22%3A48%2C%22totalCount%22%3A112%2C%22searchTypes%22%3A%5B%22PINNING%22%5D%2C%22isFamilyPage%22%3Atrue%2C%22appliedSeoFilters%22%3Afalse%2C%22snbAudience%22%3A%22%22%2C%22zipcode%22%3A%22%22%7D'
r = requests.get(url)
if r.status_code == 200:
    pprint.pprint(r.json())
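Since searchVO is just URL-encoded JSON, an alternative sketch is to build it as a dict and let requests do the encoding; the field names below are taken from the decoded query string above, and the endpoint may be picky about formatting, so treat this as an untested variant:
import json
import requests

page_num = 2
search_vo = {
    "selectedCategory": "12301_1809051",
    "selectedStore": "0",
    "selectedSort": 1,
    "selectedFilters": {},
    "storeId": 15108,
    "pageNumber": page_num,
    "pageSize": 48,
    "totalCount": 112,
    "searchTypes": ["PINNING"],
    "isFamilyPage": True,
    "appliedSeoFilters": False,
    "snbAudience": "",
    "zipcode": "",
}
# separators=(',', ':') reproduces the compact JSON seen in the original URL
r = requests.get(
    'https://prod-catalog-product-api.dickssportinggoods.com/v2/search',
    params={'searchVO': json.dumps(search_vo, separators=(',', ':'))},
)
if r.status_code == 200:
    print(r.json())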
You can use Selenium as well, in headless mode.
Selenium has the capability to wait until elements are found, using explicit waits.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = webdriver.ChromeOptions()
options.add_argument('--window-size=1920,1080')
options.add_argument("--headless")
driver = webdriver.Chrome(executable_path = driver_path, options = options)
driver.get("URL here")
wait = WebDriverWait(driver, 20)
wait.until(EC.visibility_of_element_located((By.XPATH, "//a[@class='rs_product_description d-block']")))
PS: You'd have to download the chromedriver matching your Chrome version.
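Once the wait succeeds, the matched links can be read right away; a short follow-up sketch using the same XPath and the driver created above:
# After the explicit wait above, the product links are present in the DOM
items = driver.find_elements(By.XPATH, "//a[@class='rs_product_description d-block']")
for item in items:
    print(item.text)
driver.quit()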

Python 3 extract html data from sports site

I have been trying to extract data from a sports site and so far failing. I am trying to extract the 35, "Shots on Goal" and 23, but have not managed to.
<div class="statTextGroup">
<div class="statText statText--homeValue">35</div>
<div class="statText statText--titleValue">Shots on Goal</div>
<div class="statText statText--awayValue">23</div></div>
from bs4 import BeautifulSoup
import requests
result = requests.get("https://www.scoreboard.com/uk/match/lvbns58C/#match-statistics;0")
src = result.content
soup = BeautifulSoup(src, 'html.parser')
stats = soup.find("div", {"class": "tab-statistics-0-statistic"})
print(stats)
This is the code I have been trying to use, and when I run it I get "None" printed. Could someone help me so I can print out the data?
Full page found here: https://www.scoreboard.com/uk/match/lvbns58C/#match-statistics;0
As the website is rendered by JavaScript, a possible option would be to load the page using Selenium and then parse it with BeautifulSoup:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
# initialize selenium driver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
wd = webdriver.Chrome('<<PATH_TO_SELENIUMDRIVER>>', options=chrome_options)
# load page via selenium
wd.get("https://www.scoreboard.com/uk/match/lvbns58C/#match-statistics;0")
# wait up to 30 seconds until the element with id 'statistics-content' is present
table = WebDriverWait(wd, 30).until(EC.presence_of_element_located((By.ID, 'statistics-content')))
# parse content of the table
soup = BeautifulSoup(table.get_attribute('innerHTML'), 'html.parser')
print(soup)
# close selenium driver
wd.quit()
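To pull out the individual numbers from the question (35, Shots on Goal, 23) rather than printing the whole block, the statTextGroup divs can be walked after parsing; a sketch assuming soup is the parsed statistics content from the code above:
# Each statTextGroup holds the home value, the stat title and the away value
for group in soup.find_all("div", class_="statTextGroup"):
    home = group.find("div", class_="statText--homeValue")
    title = group.find("div", class_="statText--titleValue")
    away = group.find("div", class_="statText--awayValue")
    if home and title and away:
        print(home.text, title.text, away.text)  # e.g. 35 Shots on Goal 23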

Scraping paginated data loaded with Javascript

I am trying to use Selenium and BeautifulSoup to scrape videos off a website. The videos are loaded when the 'Videos' tab is clicked (via JS, I guess). When the videos are loaded, there is also pagination, where the videos on each page are loaded on click (via JS, I guess).
Here is how it looks
When I inspect element, here is what I get
My issue is that I can't seem to get all the videos across all pages; I can only get the first page. Here is my code:
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as soup
import random
import time
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications": 2}
chrome_options.add_experimental_option("prefs", prefs)
chrome_options.add_argument('--headless')
seconds = 5 + (random.random() * 5)
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.implicitly_wait(30)
driver.get("https://")
time.sleep(seconds)
time.sleep(seconds)
for i in range(1):
    element = driver.find_element_by_id("tab-videos")
    driver.execute_script("arguments[0].click();", element)
    time.sleep(seconds)
    time.sleep(seconds)
    html = driver.page_source
    page_soup = soup(html, "html.parser")
    containers = page_soup.findAll("div", {"id": "tabVideos"})
    for videos in containers:
        main_videos = videos.find_all("div", {"class": "thumb-block tbm-init-ok"})
        print(main_videos)
driver.quit()
Please what am I missing here?
The content is loaded from the URL 'https://www.x***s.com/amateur-channels/ajibola_elizabeth/videos/best/{page}', where page starts from 0.
This script will print all video URLs:
import requests
from bs4 import BeautifulSoup
url = 'https://www.x***s.com/amateur-channels/ajibola_elizabeth/videos/best/{page}'
page = 0
while True:
    soup = BeautifulSoup(requests.get(url.format(page=page)).content, 'html.parser')
    for video in soup.select('div[id^="video_"] .title a'):
        u = video['href'].rsplit('/', maxsplit=2)
        print('https://www.x***s.com/video' + u[-2] + '/' + u[-1])
    next_page = soup.select_one('a.next-page')
    if not next_page:
        break
    page += 1
