Scraping a webpage with tabs that do not change the URL - Python

I am trying to scrape a Nasdaq webpage and am having issues locating elements:
My code:
from selenium import webdriver
import time
import pandas as pd

driver = webdriver.Chrome()
driver.get('http://www.nasdaqomxnordic.com/shares/microsite?Instrument=CSE32679&symbol=ALK%20B&name=ALK-Abell%C3%B3%20B')
time.sleep(5)
btn_overview = driver.find_element_by_xpath('//*[@id="tabarea"]/section/nav/ul/li[2]/a')
btn_overview.click()
time.sleep(5)
employees = driver.find_element_by_xpath('//*[@id="CompanyProfile"]/div[6]')
After the last call, I receive the following error:
NoSuchElementException: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="CompanyProfile"]/div[6]"}
Normally the problem would be a wrong XPath, but I have tried several variants, also locating by id. I suspect it has something to do with the tabs (in my case navigating to "Overview"). Visually the webpage changes, but if, for example, I scrape the table, it still comes from the first page:
table_test = pd.read_html(driver.page_source)[0]
What am I missing or doing wrong?

The overview page is inside an iframe, so you have to switch to it before locating the element:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
option = webdriver.ChromeOptions()
option.add_argument("start-maximized")
#chrome to stay open
option.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=option)
driver.get('http://www.nasdaqomxnordic.com/shares/microsite?Instrument=CSE32679&symbol=ALK%20B&name=ALK-Abell%C3%B3%20B')
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="tabarea"]/section/nav/ul/li[2]/a'))).click()
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="cookieConsentOK"]'))).click()
WebDriverWait(driver, 20).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "iframe#MorningstarIFrame")))
employees = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, '//*[@id="CompanyProfile"]/div[6]'))).text.split()[1]
print(employees)
Output:
2,537
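If you need to interact with anything outside the iframe afterwards (the tabs, for instance), remember to switch back out of it first:
driver.switch_to.default_content()  # leave the iframe, back to the top-level document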

You sure you need Selenium?
import requests
from bs4 import BeautifulSoup
url = 'http://lt.morningstar.com/gj8uge2g9k/stockreport/default.aspx'
payload = {
    'SecurityToken': '0P0000A5LL]3]1]E0EXG$XCSE_3060'
}
response = requests.get(url, params=payload)
soup = BeautifulSoup(response.text, 'html.parser')
employees = soup.find('h3', text='Employees').next_sibling.text
print(employees)
Output:
2,537
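Since the original code already uses pandas, you could also feed that same Morningstar URL to pd.read_html and get any HTML tables on the report page as DataFrames directly (a sketch, assuming the page actually renders its figures as table elements; the index may need adjusting):
import pandas as pd

# same URL and token as above, with the query string inlined
url = ('http://lt.morningstar.com/gj8uge2g9k/stockreport/default.aspx'
       '?SecurityToken=0P0000A5LL]3]1]E0EXG$XCSE_3060')
tables = pd.read_html(url)  # parses every table on the page
print(tables[0])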

Related

Why are elements missing in HTML while using selenium?

I am trying to scrape the text of the labels in the Product Search section of
url = 'https://www.hydac.com/shop/en/GP_1000188028'
I've tried all the solutions I know but have gotten nowhere.
Here is my code:
items = soup.find_all('div', attrs={'class': 'filter-options-item'})
for item in items:
    p = item.find('label', attrs={'data-bind': 'attr: {for: id}'}).find_all('span')
    for q in p:
        print(q.text)
BeautifulSoup only parses the HTML; it does not handle requesting or rendering, which seems to be your issue.
Check the behaviour of the website in your browser: it needs some time to render the labels, so you simply have to wait.
Option#1
Simply use time.sleep() to wait:
...
driver.get(url)
time.sleep(5)
...
Option#2
Use Selenium waits (recommended) to solve the issue:
...
driver.get(url)
WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '[data-bind="text: label"]')))
...
Example
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = 'https://www.hydac.com/shop/en/GP_1000188028'
driver.get(url)
WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '[data-bind="text: label"]')))
soup = BeautifulSoup(driver.page_source)
print([x.get_text(strip=True) for x in soup.select('#narrow-by-list label')])
Output
['3.5 m/s (piston 2)58',
'0.8 m/s (piston 3)8',
'Aluminium31',
'Carbon steel35',
'NBR / PTFE compound58',
'PUR8',
'10 l6',
'100 l5',
'120 l3',...]

How to get all links from a webpage using selenium?

I am trying to web scrape a site using Python, Selenium and BeautifulSoup.
When I try to get all the links, it returns an invalid string.
This is what I have tried. Can someone help me, please?
from time import sleep
from selenium.webdriver.common.by import By
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://www.hirist.com/c/filter/mobile-applications-jobs-in-cochin%20kochi_trivandrum%20thiruvananthapuram-5-70_75-0-0-1-0-0-0-0-2.html?ref=homepagecat')
sleep(10)
links = driver.find_elements(by=By.XPATH, value='.//div[@class="jobfeed-wrapper multiple-wrapper"]')
for link in links:
    link.get_attribute('href')
    print(link)
It is your selection with XPath: you are selecting the <div>, which does not have an href attribute. Select its <a> as well, .//div[@class="jobfeed-wrapper multiple-wrapper"]/a, and it will work:
links = driver.find_elements(by=By.XPATH, value='.//div[@class="jobfeed-wrapper multiple-wrapper"]/a')
for link in links:
    print(link.get_attribute('href'))
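Equivalently, if you prefer CSS selectors over XPath, the same selection could look like this (a sketch):
# each class in the div's class attribute becomes one .segment in the selector
links = driver.find_elements(By.CSS_SELECTOR, 'div.jobfeed-wrapper.multiple-wrapper a')
for link in links:
    print(link.get_attribute('href'))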
Example
Instead of time.sleep() use WebDriverWait to check if specific elements are available.
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = 'https://www.hirist.com/c/filter/mobile-applications-jobs-in-cochin%20kochi_trivandrum%20thiruvananthapuram-5-70_75-0-0-1-0-0-0-0-2.html?ref=homepagecat'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
driver.get(url)
wait = WebDriverWait(driver, 10)
links = wait.until(EC.presence_of_all_elements_located((By.XPATH, './/div[@class="jobfeed-wrapper multiple-wrapper"]/a')))
for link in links:
    print(link.get_attribute('href'))
Output
https://www.hirist.com/j/xforia-technologies-android-developer-javakotlin-10-15-yrs-1011605.html?ref=cl&jobpos=1&jobversion=2
https://www.hirist.com/j/firminiq-system-ios-developer-swiftobjective-c-3-10-yrs-1011762.html?ref=cl&jobpos=2&jobversion=2
https://www.hirist.com/j/firminiq-system-android-developer-kotlin-3-10-yrs-1011761.html?ref=cl&jobpos=3&jobversion=2
https://www.hirist.com/j/react-native-developer-mobile-app-designing-3-5-yrs-1009438.html?ref=cl&jobpos=4&jobversion=2
https://www.hirist.com/j/flutter-developer-iosandroid-apps-2-3-yrs-1008214.html?ref=cl&jobpos=5&jobversion=2
https://www.hirist.com/j/accubits-technologies-react-native-developer-ios-android-platforms-3-7-yrs-1003520.html?ref=cl&jobpos=6&jobversion=2
https://www.hirist.com/j/appincubator-react-native-developer-iosandroid-platform-2-7-yrs-1001957.html?ref=cl&jobpos=7&jobversion=2
You didn't declare the path to chromedriver on your computer. Check where chromedriver is, then try:
driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH)
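Note that executable_path is deprecated in Selenium 4; there the path is wrapped in a Service object instead. A minimal sketch, assuming chromedriver sits at /path/to/chromedriver:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# adjust the path to wherever chromedriver lives on your machine
driver = webdriver.Chrome(service=Service('/path/to/chromedriver'))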

AttributeError: 'NoneType' object has no attribute 'find' Web Scraping Python

I am working on an office project to get data to check the active status of different websites, but whenever I try to get the data, it sometimes shows None and sometimes this AttributeError. I followed the steps in YouTube videos but still get this error. Help, please.
My Python code:
from bs4 import BeautifulSoup
import requests
html_text = requests.get(
    "https://www.mintscan.io/cosmos/validators/cosmosvaloper1we6knm8qartmmh2r0qfpsz6pq0s7emv3e0meuw").text
soup = BeautifulSoup(html_text, 'lxml')
status = soup.find('div', {'class': "ValidatorInfo_statusBadge__PBIGr"})
para = status.find('p').text
print(para)
The URL is dynamic, meaning the data is populated by JavaScript, so you need an automation tool, something like Selenium.
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
url = 'https://www.mintscan.io/cosmos/validators/cosmosvaloper1we6knm8qartmmh2r0qfpsz6pq0s7emv3e0meuw'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
time.sleep(8)
driver.get(url)
time.sleep(10)
soup = BeautifulSoup(driver.page_source, 'lxml')
#driver.close()
status = soup.find('div', {'class': "ValidatorInfo_statusBadge__PBIGr"})
para = status.find('p').text
print(para)
Output:
Active
You have the most common problem: modern pages use JavaScript to add elements, but requests/BeautifulSoup can't run JavaScript.
So soup.find('div', ...) gives None instead of the expected element, and later that causes the problem with None.find('p').
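A quick guard makes that failure mode obvious (a small sketch using the names from the question):
status = soup.find('div', {'class': 'ValidatorInfo_statusBadge__PBIGr'})
if status is None:
    # the element is not in the raw HTML - it is added later by JavaScript
    print('element not found in static HTML')
else:
    print(status.find('p').text)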
You may use Selenium to control a real web browser, which can run JavaScript.
from selenium import webdriver
#from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
#from selenium.common.exceptions import NoSuchElementException, TimeoutException
#from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
url = "https://www.mintscan.io/cosmos/validators/cosmosvaloper1we6knm8qartmmh2r0qfpsz6pq0s7emv3e0meuw"
#driver = webdriver.Chrome(executable_path=ChromeDriverManager().install())
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
driver.get(url)
#status = driver.find_element(By.XPATH, '//div[@class="ValidatorInfo_statusBadge__PBIGr"]')
wait = WebDriverWait(driver, 10)
status = wait.until(EC.visibility_of_element_located((By.XPATH, '//div[@class="ValidatorInfo_statusBadge__PBIGr"]')))
print(status.text)
Eventually you should check if the page provides some API to get the data.
You may also use DevTools (tab: Network) to check if JavaScript reads the data from some URL, and then try to use that URL with requests. It could work faster than Selenium, but the server may detect the script/bot and block it.
JavaScript usually gets the data as JSON, so you may not even need to scrape the HTML with BeautifulSoup.
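A sketch of that approach; the endpoint below is hypothetical, so copy the real request URL from DevTools (tab: Network, filter: XHR) before using it:
import requests

# hypothetical endpoint - replace with the URL JavaScript actually calls
url = 'https://api.example.com/v1/validators/cosmosvaloper1we6knm8qartmmh2r0qfpsz6pq0s7emv3e0meuw'
data = requests.get(url).json()  # already JSON, no HTML parsing needed
print(data)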

Get html attribute? selenium / bs4

How do I get the value of status= on a Twitch page using bs4 or Selenium?
Example of someone offline.
How it shows in the HTML source:
<a class="ScHalo-sc-1l14b0i-0 dcbwCs tw-halo" size="72" status="offline" href="/mizkif">........</a>
code:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
op = Options()
op.add_argument("user-data-dir=C:\\Users\\bestg\\AppData\\Local\\Google\\Chrome\\bor")
driver = webdriver.Chrome(options=op)
driver.get('https://www.twitch.tv/mizkif')
html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
# HOW DO I GET THE VALUE OF "STATUS=" in soup?
#unrelated: (used to focus offline streams)
#click the avatar
stream = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, ".//a[@status='offline']")))
stream.click()
You can use the get_attribute(name) function in Selenium
element = driver.find_element(By.XPATH,"/the/X/path")
attribute = element.get_attribute('status')
PS: driver.find_element with that syntax works with Selenium 4
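Since the question also asks about bs4, a minimal sketch of the same lookup with BeautifulSoup, reusing the soup object built in the question's code:
# attrs={'status': True} matches any <a> tag that has a status attribute at all
anchor = soup.find('a', attrs={'status': True})
print(anchor['status'])  # e.g. "offline"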

Getting the element on page source but unable to locate using xpath

I am trying to find the element by using XPath, but it cannot be located, even though the element appears in the page source that Selenium returns. I have also checked that the element is not inside an iframe.
Here is my code:
from requests_html import HTMLSession
import pandas as pd
from fake_useragent import UserAgent
from requests_html import AsyncHTMLSession
from selenium import webdriver
from shutil import which
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
ua = UserAgent()
s = HTMLSession()
asession = AsyncHTMLSession()
url = 'https://ordiamond-frame-categoryembed-catid23621.jewelershowcase.com/search/results?query=124405'
try:
    User_Agent = str(ua.chrome)
except:
    pass
headers = {'User-Agent': User_Agent}
response = s.get(url, headers=headers)
print(response)
link = response.html.xpath('//a[@class="image logClick containerFix"]/@href')
if link:
    p_url = "https://ordiamond-frame-categoryembed-catid23621.jewelershowcase.com" + link[0]
chrome_path = which('chromedriver')
driver = webdriver.Chrome(executable_path=chrome_path)
driver.maximize_window()
driver.get(p_url)
time.sleep(20)
with open('data.html', 'w') as file:
    file.write(str(driver.page_source))
print(driver.page_source)
driver.page_source
WebDriverWait(driver, 50).until(EC.visibility_of_element_located((By.XPATH, '(//h3[@class="description"])[2]')))
# time.sleep(16)
na = driver.find_element_by_xpath('(//h3[@class="description"])[2]')
print(na.text)
Hoping to get the solution. Thanks
If there are multiple matching nodes, Selenium will always fetch the first one when we use find_element rather than find_elements; the same applies to WebDriverWait.
driver = webdriver.Chrome(driver_path)
driver.maximize_window()
driver.implicitly_wait(30)
wait = WebDriverWait(driver, 30)
driver.get("https://ordiamond-frame-categoryembed-catid23621.jewelershowcase.com/search/results?query=124405")
product = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@id='results']/descendant::a")))
product.click()
heading = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, "//h3[@class='description']")))
print(heading.text)
Imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
Output:
14K Yellow 9x7 mm Oval Engagement Ring Mounting
Considering that you want the XPath of the link to the ring, here it is:
link = response.html.xpath("//*[@id='results']//a[1]")
