How do I get the value of the status= attribute on a Twitch page using bs4 or Selenium?
Example of someone who is offline.
How it shows in the HTML source:
<a class="ScHalo-sc-1l14b0i-0 dcbwCs tw-halo" size="72" status="offline" href="/mizkif">........</a>
code:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
op = Options()
op.add_argument("user-data-dir=C:\\Users\\bestg\\AppData\\Local\\Google\\Chrome\\bor")
driver = webdriver.Chrome(options=op)
driver.get('https://www.twitch.tv/mizkif')
html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
# HOW DO I GET THE VALUE OF "status=" in soup?
#unrelated: (used to focus offline streams)
#click the avatar
stream = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, ".//a[@status='offline']")))
stream.click()
You can use the get_attribute(name) function in Selenium
element = driver.find_element(By.XPATH,"/the/X/path")
attribute = element.get_attribute('status')
PS: driver.find_element with that syntax works with Selenium 4
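If you want to read the attribute on the BeautifulSoup side instead, attribute values are exposed dict-style on a tag. A minimal sketch, parsing a static copy of the anchor from the question rather than the live page:

```python
from bs4 import BeautifulSoup

# Static copy of the anchor from the question; on the live page you would
# pass driver.page_source here instead.
html = '<a class="ScHalo-sc-1l14b0i-0 dcbwCs tw-halo" size="72" status="offline" href="/mizkif">...</a>'
soup = BeautifulSoup(html, features="html.parser")

# Find the first <a> that carries a status attribute, then read it dict-style.
anchor = soup.find("a", attrs={"status": True})
print(anchor["status"])  # -> offline
```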
Related
I am trying to scrape the text of the labels in the Product Search section of
url='https://www.hydac.com/shop/en/GP_1000188028'
I've tried all the solutions I know but got nowhere.
Here is my code:
items = soup.find_all('div', attrs={'class': 'filter-options-item'})
for item in items:
    p = item.find('label', attrs={'data-bind': 'attr: {for: id}'}).find_all('span')
    for q in p:
        print(q.text)
BeautifulSoup only parses the HTML; it does not handle requesting or rendering, which seems to be your issue.
Check the behaviour of the website in your browser: it needs some time to render the labels, so you simply have to wait.
Option#1
Simply use time.sleep() to wait:
...
driver.get(url)
time.sleep(5)
...
Option#2
Use Selenium waits (recommended) to solve the issue:
...
driver.get(url)
WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '[data-bind="text: label"]')))
...
Example
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = 'https://www.hydac.com/shop/en/GP_1000188028'
driver.get(url)
WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '[data-bind="text: label"]')))
soup = BeautifulSoup(driver.page_source, 'html.parser')
[x.get_text(strip=True) for x in soup.select('#narrow-by-list label')]
Output
['3.5 m/s (piston 2)58',
'0.8 m/s (piston 3)8',
'Aluminium31',
'Carbon steel35',
'NBR / PTFE compound58',
'PUR8',
'10 l6',
'100 l5',
'120 l3',...]
I am trying to scrape Nasdaq webpage and have some issue with locating elements:
My code:
from selenium import webdriver
import time
import pandas as pd
driver = webdriver.Chrome()
driver.get('http://www.nasdaqomxnordic.com/shares/microsite?Instrument=CSE32679&symbol=ALK%20B&name=ALK-Abell%C3%B3%20B')
time.sleep(5)
btn_overview = driver.find_element_by_xpath('//*[@id="tabarea"]/section/nav/ul/li[2]/a')
btn_overview.click()
time.sleep(5)
employees = driver.find_element_by_xpath('//*[@id="CompanyProfile"]/div[6]')
After the last call, I receive the following error:
NoSuchElementException: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="CompanyProfile"]/div[6]"}
Normally the problem would be a wrong XPath, but I tried several selectors, also by id. I suspect it has something to do with the tabs (in my case navigating to "Overview"). Visually the webpage changes, but if, for example, I scrape the table, it comes from the first page:
table_test = pd.read_html(driver.page_source)[0]
What am I missing or doing wrong?
The overview page is inside an iframe, so you have to switch to it first:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
option = webdriver.ChromeOptions()
option.add_argument("start-maximized")
#chrome to stay open
option.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=option)
driver.get('http://www.nasdaqomxnordic.com/shares/microsite?Instrument=CSE32679&symbol=ALK%20B&name=ALK-Abell%C3%B3%20B')
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="tabarea"]/section/nav/ul/li[2]/a'))).click()
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="cookieConsentOK"]'))).click()
WebDriverWait(driver, 20).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "iframe#MorningstarIFrame")))
employees = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, '//*[@id="CompanyProfile"]/div[6]'))).text.split()[1]
print(employees)
Output:
2,537
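The .text.split()[1] at the end assumes the profile row reads like a label followed by the value, e.g. "Employees 2,537" (the exact row text is an assumption). A tiny pure-Python illustration of that split:

```python
# Hypothetical text of the CompanyProfile row; the real wording may differ.
row_text = "Employees 2,537"

# split() breaks on whitespace; index 1 is the value after the label.
employees = row_text.split()[1]
print(employees)  # -> 2,537
```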
Are you sure you need Selenium?
import requests
from bs4 import BeautifulSoup
url = 'http://lt.morningstar.com/gj8uge2g9k/stockreport/default.aspx'
payload = {
    'SecurityToken': '0P0000A5LL]3]1]E0EXG$XCSE_3060'}
response = requests.get(url, params=payload)
soup = BeautifulSoup(response.text, 'html.parser')
employees = soup.find('h3', text='Employees').next_sibling.text
print(employees)
Output:
2,537
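The next_sibling step above relies on the value sitting immediately after the <h3> in the document tree. A sketch on a made-up fragment (the tag names are assumptions, not the real Morningstar markup):

```python
from bs4 import BeautifulSoup

# Hypothetical fragment mimicking a label/value pair like the Morningstar page.
html = '<div><h3>Employees</h3><span>2,537</span></div>'
soup = BeautifulSoup(html, 'html.parser')

# find() the label tag, then step to the element right after it.
value = soup.find('h3', string='Employees').next_sibling.text
print(value)  # -> 2,537
```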
I am trying to web-scrape a site using Python, Selenium, and BeautifulSoup.
When I try to get all the links, it returns an invalid string.
This is what I have tried.
Can someone help me, please?
from time import sleep
from selenium.webdriver.common.by import By
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://www.hirist.com/c/filter/mobile-applications-jobs-in-cochin%20kochi_trivandrum%20thiruvananthapuram-5-70_75-0-0-1-0-0-0-0-2.html?ref=homepagecat')
sleep(10)
links = driver.find_elements(by=By.XPATH, value='.//div[@class="jobfeed-wrapper multiple-wrapper"]')
for link in links:
    link.get_attribute('href')
    print(link)
The problem is your XPath selection: you select the <div>, which does not have an href attribute. Also select its first <a>, like .//div[@class="jobfeed-wrapper multiple-wrapper"]/a, and it will work:
links = driver.find_elements(by=By.XPATH, value='.//div[@class="jobfeed-wrapper multiple-wrapper"]/a')
for link in links:
    print(link.get_attribute('href'))
Example
Instead of time.sleep(), use WebDriverWait to check that specific elements are available.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = 'https://www.hirist.com/c/filter/mobile-applications-jobs-in-cochin%20kochi_trivandrum%20thiruvananthapuram-5-70_75-0-0-1-0-0-0-0-2.html?ref=homepagecat'
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.maximize_window()
driver.get(url)
wait = WebDriverWait(driver, 10)
links = wait.until(EC.presence_of_all_elements_located((By.XPATH, './/div[@class="jobfeed-wrapper multiple-wrapper"]/a')))
for link in links:
print(link.get_attribute('href'))
Output
https://www.hirist.com/j/xforia-technologies-android-developer-javakotlin-10-15-yrs-1011605.html?ref=cl&jobpos=1&jobversion=2
https://www.hirist.com/j/firminiq-system-ios-developer-swiftobjective-c-3-10-yrs-1011762.html?ref=cl&jobpos=2&jobversion=2
https://www.hirist.com/j/firminiq-system-android-developer-kotlin-3-10-yrs-1011761.html?ref=cl&jobpos=3&jobversion=2
https://www.hirist.com/j/react-native-developer-mobile-app-designing-3-5-yrs-1009438.html?ref=cl&jobpos=4&jobversion=2
https://www.hirist.com/j/flutter-developer-iosandroid-apps-2-3-yrs-1008214.html?ref=cl&jobpos=5&jobversion=2
https://www.hirist.com/j/accubits-technologies-react-native-developer-ios-android-platforms-3-7-yrs-1003520.html?ref=cl&jobpos=6&jobversion=2
https://www.hirist.com/j/appincubator-react-native-developer-iosandroid-platform-2-7-yrs-1001957.html?ref=cl&jobpos=7&jobversion=2
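The same href extraction can also be done with BeautifulSoup once you have the rendered page source. A sketch on a made-up fragment (the class names are copied from the question; the URLs are invented for illustration):

```python
from bs4 import BeautifulSoup

# Hypothetical page source using the wrapper structure from the question.
html = '''
<div class="jobfeed-wrapper multiple-wrapper"><a href="/job/1">Job 1</a></div>
<div class="jobfeed-wrapper multiple-wrapper"><a href="/job/2">Job 2</a></div>
'''
soup = BeautifulSoup(html, 'html.parser')

# Select the direct child <a> of each wrapper div and pull its href.
hrefs = [a['href'] for a in soup.select('div.jobfeed-wrapper.multiple-wrapper > a')]
print(hrefs)  # -> ['/job/1', '/job/2']
```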
You didn't declare the path to chromedriver on your computer. Check where chromedriver is, then try:
driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH)
I'm trying to scrape a web page, but before accessing the page there is a banner for accepting cookies. I am using Selenium to click on the "Accept all cookies" button, but even after clicking it I can't access the right HTML page.
This is my code :
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
url = 'https://www.wikiparfum.fr/explore/by-name?query=dior'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
driver.get(url)
driver.find_element_by_id('onetrust-accept-btn-handler').click()
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')
print(soup)
And this is the beginning of the HTML page that is printed:
If anyone can help me with this one, thank you!
You should wait for the accept-cookies button element to appear before clicking it:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
url = 'https://www.wikiparfum.fr/explore/by-name?query=dior'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
wait = WebDriverWait(driver, 20)
driver.get(url)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#onetrust-accept-btn-handler"))).click()
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')
print(soup)
I am trying to write a script using Selenium to access Pastebin, do a search, and print the URL results as text. I need the visible URL results and nothing else.
<div class="gs-bidi-start-align gs-visibleUrl gs-visibleUrl-long" dir="ltr" style="word-break:break-all;">pastebin.com/VYQTSbzY</div>
Current script is:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
browser = webdriver.Firefox()
browser.get('http://www.pastebin.com')
search = browser.find_element_by_name('q')
search.send_keys("test")
search.send_keys(Keys.RETURN)
soup=BeautifulSoup(browser.page_source)
for link in soup.find_all('a'):
    print(link.get('href', None), link.get_text())
You don't actually need BeautifulSoup; Selenium itself is very powerful at locating elements:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
browser = webdriver.Firefox()
browser.get('http://www.pastebin.com')
search = browser.find_element_by_name('q')
search.send_keys("test")
search.send_keys(Keys.RETURN)
# wait for results to appear
wait = WebDriverWait(browser, 10)
results = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.gsc-resultsbox-visible")))
# grab results
for link in results.find_elements_by_css_selector("a.gs-title"):
    print(link.get_attribute("href"))
browser.close()
Prints:
http://pastebin.com/VYQTSbzY
http://pastebin.com/VYQTSbzY
http://pastebin.com/VAAQCjkj
...
http://pastebin.com/fVUejyRK
http://pastebin.com/fVUejyRK
Note the use of an explicit wait, which waits for the search results to appear.
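If you do prefer BeautifulSoup for the extraction step, the visible-URL div from the question can be matched on one of its classes. A sketch parsing a static copy of that markup:

```python
from bs4 import BeautifulSoup

# Static copy of the result div from the question.
html = '<div class="gs-bidi-start-align gs-visibleUrl gs-visibleUrl-long" dir="ltr">pastebin.com/VYQTSbzY</div>'
soup = BeautifulSoup(html, 'html.parser')

# class_ matches a tag that has this class among its class list.
visible = soup.find('div', class_='gs-visibleUrl').get_text()
print(visible)  # -> pastebin.com/VYQTSbzY
```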