Locating an element in bs4 - python

I am trying to scrape all the information for every dozer listed on this page.
I have just started and only have a rough idea of how scraping works, so I am not sure how to go about it.
driver=webdriver.Firefox()
driver.get('https://www.rbauction.com/dozers?keywords=&category=21261693092')
soup=BeautifulSoup(driver.page_source,'html.parser')
# tried several different ways but only get NoneType or no element
get= soup.findAll('div' , attrs={'class' : 'sc-gisBJw eHFfwj'})
get2= soup.findAll('div' , attrs={'id' : 'searchResultsList'})
get3= soup.find('div.searchResultsList').find_all('a')
I need to get into each class/id, loop over the a['href'] values, and collect the information for each dozer.
Please help.

You need to wait for the data you are looking for to load before reading it into
the BeautifulSoup object. Use WebDriverWait in selenium to wait for the page to load as it takes a while to render fully:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
driver = webdriver.Firefox()
driver.get('https://www.rbauction.com/dozers?keywords=&category=21261693092')
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'searchResultsList')))
soup = BeautifulSoup(driver.page_source,'html.parser')
Then this line should return the hrefs from the page:
hrefs = [el.attrs.get('href') for el in soup.find('div', attrs={'id': 'searchResultsList'}).find_all('a')]
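From there you can loop over those hrefs and load each detail page the same way. A minimal sketch, assuming the hrefs may be relative to the site root and using an h1 lookup purely as a placeholder for whatever fields you actually need:
from urllib.parse import urljoin
base = 'https://www.rbauction.com'
for href in hrefs:
    driver.get(urljoin(base, href))  # handles relative links
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'h1')))
    item_soup = BeautifulSoup(driver.page_source, 'html.parser')
    # placeholder: pull whichever fields you need for each dozer from item_soup
    print(item_soup.find('h1').get_text(strip=True))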

You can just use requests
import requests
headers = {'Referrer':'https://www.rbauction.com/dozers?keywords=&category=21261693092'}
data = requests.get('https://www.rbauction.com/rba-msapi/search?keywords=&searchParams=%7B%22category%22%3A%2221261693092%22%7D&page=0&maxCount=48&trackingType=2&withResults=true&withFacets=true&withBreadcrumbs=true&catalog=ci&locale=en_US', headers = headers).json()
for item in data['response']['results']:
    print(item['name'], item['url'])
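The URL above already carries page and maxCount parameters, so if you need every result rather than just the first 48, you can page through until the API stops returning items. A sketch under the assumption that an empty results list marks the last page:
import requests
headers = {'Referrer': 'https://www.rbauction.com/dozers?keywords=&category=21261693092'}
base = ('https://www.rbauction.com/rba-msapi/search?keywords=&searchParams=%7B%22category%22%3A%2221261693092%22%7D'
        '&page={}&maxCount=48&trackingType=2&withResults=true&withFacets=true&withBreadcrumbs=true&catalog=ci&locale=en_US')
page = 0
while True:
    results = requests.get(base.format(page), headers=headers).json()['response']['results']
    if not results:  # assumption: an empty list signals the last page
        break
    for item in results:
        print(item['name'], item['url'])
    page += 1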

Related

Can't scrape table BeautifulSoup

I'm trying to scrape the following table from this URL: https://baseballsavant.mlb.com/leaderboard/outs_above_average?type=Fielder&startYear=2022&endYear=2022&split=no&team=&range=year&min=10&pos=of&roles=&viz=show
This is my code:
import requests
from bs4 import BeautifulSoup
url = "https://baseballsavant.mlb.com/leaderboard/outs_above_average?type=Fielder&startYear=2022&endYear=2022&split=no&team=&range=year&min=10&pos=of&roles=&viz=show"
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
table = soup.find("table")
for row in table.findAll("tr"):
    print([i.text for i in row.findAll("td")])
However, my table variable is None, even though there is clearly a table tag in the HTML code of the website. How do I get it?
The webpage is loaded dynamically and relies on JavaScript, so requests alone won't get you the rendered table. You could use a browser automation tool such as Selenium instead.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
url = "https://baseballsavant.mlb.com/leaderboard/outs_above_average?type=Fielder&startYear=2022&endYear=2022&split=no&team=&range=year&min=10&pos=of&roles=&viz=show"
driver.get(url)
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.TAG_NAME, 'table')))
table = driver.find_element(By.TAG_NAME, 'table')
table_html = table.get_attribute('innerHTML')
# print('table html:', table_html)
for tr_web_element in table.find_elements(By.TAG_NAME, 'tr'):
    for td_web_element in tr_web_element.find_elements(By.TAG_NAME, 'td'):
        print(td_web_element.text)
driver.close()
Or see this answer to incorporate Selenium with BeautifulSoup.
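If you prefer to keep the question's BeautifulSoup loop, a minimal sketch of that combination is to hand driver.page_source to BeautifulSoup once the wait above has completed (and before driver.close()):
from bs4 import BeautifulSoup
soup = BeautifulSoup(driver.page_source, "lxml")
table = soup.find("table")
for row in table.findAll("tr"):
    print([i.text for i in row.findAll("td")])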

Scraping webpage with tabs that do not change url

I am trying to scrape a Nasdaq webpage and have some issues with locating elements:
My code:
from selenium import webdriver
import time
import pandas as pd
driver.get('http://www.nasdaqomxnordic.com/shares/microsite?Instrument=CSE32679&symbol=ALK%20B&name=ALK-Abell%C3%B3%20B')
time.sleep(5)
btn_overview = driver.find_element_by_xpath('//*[@id="tabarea"]/section/nav/ul/li[2]/a')
btn_overview.click()
time.sleep(5)
employees = driver.find_element_by_xpath('//*[@id="CompanyProfile"]/div[6]')
After the last call, I receive the following error:
NoSuchElementException: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="CompanyProfile"]/div[6]"}
Normally the problem would be a wrong xpath, but I tried several variants, and also located by id. I suspect it has something to do with the tabs (in my case navigating to "Overview"). Visually the webpage changes, but if, for example, I scrape the table, it still comes from the first page:
table_test = pd.read_html(driver.page_source)[0]
What am I missing or doing wrong?
The Overview page is inside an iframe, so you have to switch into the frame before locating the element:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
option = webdriver.ChromeOptions()
option.add_argument("start-maximized")
#chrome to stay open
option.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=option)
driver.get('http://www.nasdaqomxnordic.com/shares/microsite?Instrument=CSE32679&symbol=ALK%20B&name=ALK-Abell%C3%B3%20B')
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="tabarea"]/section/nav/ul/li[2]/a'))).click()
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="cookieConsentOK"]'))).click()
WebDriverWait(driver, 20).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "iframe#MorningstarIFrame")))
employees = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, '//*[@id="CompanyProfile"]/div[6]'))).text.split()[1]
print(employees)
Output:
2,537
(The snippet uses webdriver_manager to download a matching ChromeDriver automatically.)
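One caveat with this approach: after reading the value, the driver is still switched into the iframe, so if you want to keep scraping the main page (for example the table you read with pd.read_html), switch back out first. A small sketch:
import pandas as pd
driver.switch_to.default_content()  # leave the Morningstar iframe again
table_test = pd.read_html(driver.page_source)[0]  # now parsed from the main document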
You sure you need Selenium?
import requests
from bs4 import BeautifulSoup
url = 'http://lt.morningstar.com/gj8uge2g9k/stockreport/default.aspx'
payload = {
    'SecurityToken': '0P0000A5LL]3]1]E0EXG$XCSE_3060'}
response = requests.get(url, params=payload)
soup = BeautifulSoup(response.text, 'html.parser')
employees = soup.find('h3', text='Employees').next_sibling.text
print(employees)
Output:
2,537

WebScrape - Python/Selenium

My code goes to a webpage, and I want to scrape the href/HTML of each listing on that page.
(The page this code visits has 2 listings.)
I tried XPath and BeautifulSoup, but they return an empty list for me.
Here is the code:
import time
from selenium import webdriver
import pandas as pd
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
bracket=[]
driver.get('https://casehippo.com/spa/symposium/national-kidney-foundation-2021-spring-clinical-meetings/event/gallery/?search=Patiromer')
time.sleep(3)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
eachRow=driver.find_element_by_partial_link_text('symposium')
print(eachRow.text)
I just ran the code you provided; BeautifulSoup set the soup variable with the full page source successfully:
soup = BeautifulSoup(page_source, 'html.parser')
and in the next line:
eachRow=driver.find_element_by_partial_link_text('symposium')
an exception is raised with the message:
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"partial link text","selector":"symposium"}
It seems like you're using an incorrect selector; try something like:
element = driver.find_element_by_xpath("//a[@class='title ng-binding']")
The code I'm using:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome()
try:
    bracket = []
    driver.get(
        'https://casehippo.com/spa/symposium/national-kidney-foundation-2021-spring-clinical-meetings/event/gallery/?search=Patiromer')
    time.sleep(3)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    print(soup)
    element = driver.find_element_by_xpath("//a[@class='title ng-binding']")
    print(element.get_attribute('href'))
    elements = driver.find_elements_by_xpath("//a[@class='title ng-binding']")
    for el in elements:
        print(el.get_attribute('href'))
finally:
    driver.quit()
Updated code:
import time
from selenium import webdriver
import pandas as pd
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
bracket=[]
driver.get('https://casehippo.com/spa/symposium/national-kidney-foundation-2021-spring-clinical-meetings/event/gallery/?search=Patiromer')
time.sleep(3)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
eachRow = driver.find_elements_by_xpath("//a[contains(@ui-sref,'symposium')]")
for row in eachRow:
    print(row.text)
You need to use find_elements (not find_element) if there is more than one match, and then iterate over them to see their values. Also, partial link text won't work because the symposium text is embedded in another element's attribute, not in regular link text, so XPath is needed.
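And since the original goal was the href of each listing, the same elements expose it through get_attribute (assuming the rendered anchors carry an href, as the output of the first snippet suggests):
hrefs = [row.get_attribute('href') for row in eachRow]
print(hrefs)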

BeautifulSoup does not extract comment tags in a dynamic page

What I need: count the number of reviews for an extension in the Chrome Web Store, across all languages.
What I did: tried BeautifulSoup to extract a certain tag. I researched the HTML code of the page and found a review tag:
Tried this code:
from bs4 import BeautifulSoup
import requests
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html5lib')
comments = soup.find_all('div', class_ = 'ba-bc-Xb ba-ua-zl-Xb')
But print(comments) shows that the list is empty.
I am stuck at the moment, and I see two further problems I need to handle:
How do I cope with the language selection button? How do I count reviews in all languages if only one language is selected by default?
The reviews are stored in different tabs. I read about extracting dynamic content but didn't get the point.
You could use Selenium to perform the interactions, wait for the page changes, and extract the review count from the PaginationMessage. Tested with a few links. You may need to add error handling for items with no reviews. There also seems to be some POST XHR activity yielding review JSON strings that you may wish to explore.
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
url = 'https://chrome.google.com/webstore/detail/evernote-web-clipper/pioclpoplcdbaefihamjohnefbikjilc?hl=en/'
#url = 'https://chrome.google.com/webstore/detail/https-everywhere/gcbommkclmclpchllfjekcdonpmejbdp?hl=en/'
d = webdriver.Chrome()
d.get(url)
WebDriverWait(d, 5).until(EC.visibility_of_element_located((By.ID, ':21'))).click()
ActionChains(d).click_and_hold(WebDriverWait(d, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.h-z-Ba-ca.ga-dd-Va.g-aa-ca')))).perform()
languageSelection = WebDriverWait(d, 5).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, '.g-aa-ca-ma-x-L')))
languageSelection[1].click()
s= WebDriverWait(d, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.Aa.dc-tf + span'))).text
print(s.split()[-1])
d.quit()
try this
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
driver = webdriver.Chrome()
driver.get('https://chrome.google.com/webstore/detail/evernote-web-clipper/pioclpoplcdbaefihamjohnefbikjilc?hl=en')
wait = WebDriverWait(driver, 5)
wait.until(EC.visibility_of_element_located((By.ID, ':21'))).click()
wait.until(
    EC.visibility_of_element_located((By.CSS_SELECTOR, '.h-z-Ba-ca.ga-dd-Va.g-aa-ca'))
).click()
english = driver.find_element_by_xpath('//div[@class="ah-mg-j"]/span').text
print('English: ' + english.split()[-1])
wait.until(
    EC.visibility_of_element_located((By.XPATH, '//div[@class="g-aa-ca-ma-x-L" and text() = "All languages"]'))
).click()
wait.until_not(EC.text_to_be_present_in_element((By.XPATH, '//div[@class="ah-mg-j"]/span'), english))
time.sleep(2)
AllCount = driver.find_element_by_xpath('//div[@class="ah-mg-j"]/span').text
print('All languages: ' + AllCount.split()[-1])
driver.close()

beautifulsoup unable to extract href link

So I am using Selenium, PhantomJS as my webdriver, and BeautifulSoup.
Currently I want to extract all the links which are underneath the title attribute.
The site I want to extract:
However, it does not seem to be picking up these links at all! What is going on?
# The standard library modules
import os
import sys
import re
# The wget module
import wget
# The BeautifulSoup module
from bs4 import BeautifulSoup
# The selenium module
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
def getListLinks(link):
    # setup drivers
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    driver.get(link)  # load the web page
    src = driver.page_source
    # Get text and split it
    soup = BeautifulSoup(src, 'html5lib')
    print soup
    links = soup.find_all('a')
    print links
    driver.close()

getListLinks("http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=FA&sub_category=FA1&alphabetical=All&company=9695&date_from=01/01/2012&date_to=31/12/2016")
Here is an example of a link I want to extract:
Quarterly rpt on consolidated results for the financial period ended 31/03/2017
What I really don't understand is why you are mixing BeautifulSoup with Selenium. Selenium has its own API for extracting DOM elements; you don't need to bring BS4 into the picture. Besides, BS4 can only work with static HTML and ignores dynamically generated HTML, which your Selenium instance is capable of handling.
Just do
driver.find_element_by_tag_name('a')
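For example, a minimal Selenium-only sketch, dropped into the question's getListLinks right after driver.get(link), using find_elements (plural) and a short wait so the JavaScript-rendered rows are present:
from selenium.webdriver.support.ui import WebDriverWait
# wait until at least one anchor has been rendered, then read hrefs straight from Selenium
WebDriverWait(driver, 10).until(lambda d: d.find_elements_by_tag_name('a'))
for a in driver.find_elements_by_tag_name('a'):
    print(a.get_attribute('href'))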
You want the links under the title column, which is the 4th column of the table. You can use an nth-of-type selector to restrict matches to table cells (td elements) in the 4th column of each row of the target table. A wait condition is added to ensure the elements are present.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
d = webdriver.Chrome()
url = 'http://www.bursamalaysia.com/market/listed-companies/company-announcements/#/?category=all'
d.get(url)
links = [link.get_attribute('href') for link in WebDriverWait(d, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'tr td:nth-of-type(4) a')))]
print(links)
d.quit()
