Unable to obtain table info through python selenium - python

I am a newbie in the Python Selenium environment. I am trying to get the SQL version table from https://www.sqlserverversions.com using the code below:
from selenium.webdriver.common.by import By
from selenium import webdriver

# Define the website to scrape and the path where chromedriver is located.
website = "https://www.sqlserverversions.com"
# Open Google Chrome through chromedriver and bind it to `driver`.
driver = webdriver.Chrome(executable_path='/Users//Downloads/chromedriver/chromedriver.exe')
driver.get(website)

# Walk every table row on the page and look at its first cell.
matches = driver.find_elements(By.TAG_NAME, 'tr')
for match in matches:
    b = match.find_elements(By.XPATH, "./td[1]")
    # Kept as posted: find_elements returns a list, so b.text is the call
    # that raises the AttributeError quoted right after this snippet.
    print(b.text)
It says AttributeError: 'list' object has no attribute 'text'. Am I choosing the right syntax and the right parameters to grab the data?
Below is the table which i am trying to get data.
enter image description here
Below are the parameters which i am trying to put in code.
enter image description here
Please advise what needs to be modified in the code to obtain the data in table format.
Thanks,
Arun

If you need data only from first table:
from selenium.webdriver.common.by import By
from selenium import webdriver

website = "https://www.sqlserverversions.com"
driver = webdriver.Chrome(executable_path='/Users//Downloads/chromedriver/chromedriver.exe')
driver.get(website)

# Set to False to skip the service-pack columns of each row.
show_service_pack_versions = True

# Rows of the first table with class "tbl" that contain a link whose text
# starts with "SQL Server"; the trailing ancestor step climbs back to the <tr>.
xpath_first_table_sql_rows = "(//table[@class='tbl'])[1]//tr/td/a[starts-with(text(),'SQL Server')]//ancestor::tr"
matches = driver.find_elements(By.XPATH, xpath_first_table_sql_rows)

for match in matches:
    # Second link inside a cell of the row is printed as the row heading.
    sql_server_a_element = match.find_element(By.XPATH, "./td/a[2]")
    print(sql_server_a_element.text)

    sql_server_rtm_version_a_element = match.find_element(By.XPATH, ".//td[@class='rtm']")
    print('RTMs:')
    print(sql_server_rtm_version_a_element.text)

    if show_service_pack_versions:
        print('SPs:')
        sql_server_sp_version_td_elements = match.find_elements(By.XPATH, ".//td[@class='sp']")
        for td in sql_server_sp_version_td_elements:
            print('---')
            print(td.text)

    # Separator between rows.
    print('----------------------------------')
If you set show_service_pack_versions = False, the information about service packs will be skipped.

There was a part of your code where you were calling b.text after getting the result of find_elements, which returns a list. You can only call b.text on a single WebElement (not a list of them). Here's the updated code:
from selenium.webdriver.common.by import By
from selenium import webdriver

website = "https://www.sqlserverversions.com"
driver = webdriver.Chrome(executable_path='/Users//Downloads/chromedriver/chromedriver.exe')
driver.get(website)

# find_elements returns a list, so iterate it and read .text on each
# individual element instead of on the list.
matches = driver.find_elements(By.CSS_SELECTOR, "tr")
for match in matches[1:]:  # the first <tr> is skipped
    items = match.find_elements(By.CSS_SELECTOR, "td")
    for item in items:
        print(item.text)
That will print out A LOT of rows, unless you limit the loop.

If you just need text it's simpler to do it on the browser side:
# The script maps every <tr> in the document to the innerText of its <td>
# cells and returns the result of that mapping in a single call.
data = driver.execute_script("""
return [...document.querySelectorAll('tr')].map(tr => [...tr.querySelectorAll('td')].map(td => td.innerText))
""")

Related

Cant get text while webscraping a site

I'm trying to get the part where it says Avvisami on this website: https://www.nike.com/it/launch/t/womens-air-jordan-3-sp-a-ma-maniere
to appear as a string on my code. Every time I try anything it doesn't work. This is the part of the code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# Start Chrome with an existing user-data directory and the "Profile 1" profile.
options = webdriver.ChromeOptions()
options.add_argument(r'--user-data-dir=C:\Users\mainuser\AppData\Local\Google\Chrome\User Data')
options.add_argument('--profile-directory=Profile 1')
driver = webdriver.Chrome(options = options)
driver.get('https://www.nike.com/it/launch/t/womens-air-jordan-3-sp-a-ma-maniere')
# Two space-separated class names are passed to a single class-name lookup,
# and the find_elements_* result (not any element's text) is what gets printed.
instock = (driver.find_elements_by_class_name('ncss-btn-primary-dark btn-lg'))
print(instock)
and in that, this is the part I think I need to change:
instock = (driver.find_elements_by_class_name('ncss-btn-primary-dark btn-lg'))
print(in stock)
I've been trying to fix it for an hour or so but I just can't wrap my head around how.
# Both class names are chained in one CSS selector; .text is read from the
# single element that find_element_* returns.
instock = driver.find_element_by_css_selector(".ncss-btn-primary-dark.btn-lg").text
print(instock)
Multiple class names should be used with css selector and to grab the text just use .text and then place it in your variable.
You are trying to get a text from a list of elements. Iterate it and use .text:
# find_elements_* returns a list, so read .text from each element in turn.
elems = driver.find_elements_by_css_selector(".ncss-btn-primary-dark.btn-lg")
for el in elems:
    print(el.text)
All of these elements are buttons.

scraping yahoo stock news

I am scraping the news articles related to Infosys at the end of the page, but I am getting this error:
selenium.common.exceptions.InvalidSelectorException: Message: invalid selector .
I want to scrape all articles related to Infosys.
from bs4 import BeautifulSoup
import re
from selenium import webdriver
import chromedriver_binary
import string
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

driver = webdriver.Chrome("/Users/abhishekgupta/Downloads/chromedriver")
driver.get("https://finance.yahoo.com/quote/INFY/news?p=INFY")

# Scroll down in small steps, pausing after each step.
for i in range(20):  # adjust integer value for need
    # you can change right side number for scroll convenience or destination
    driver.execute_script("window.scrollBy(0, 250)")
    # you can change time integer to float or remove
    time.sleep(1)

# Kept as posted: the trailing /text() step and the .text() call are what the
# answers after this snippet address.
print(driver.find_element_by_xpath('//*[@id="latestQuoteNewsStream-0-Stream"]/ul/li[9]/div/div/div[2]/h3/a/text()').text())
You could use a less detailed XPath by using // instead of /div/div/div[2].
If you want the last item, get all li elements as a list and then use [-1] to get the last element of the list:
from selenium import webdriver
import time

driver = webdriver.Chrome("/Users/abhishekgupta/Downloads/chromedriver")
#driver = webdriver.Firefox()
driver.get("https://finance.yahoo.com/quote/INFY/news?p=INFY")

# Scroll down in small steps, pausing one second after each step.
for i in range(20):
    driver.execute_script("window.scrollBy(0, 250)")
    time.sleep(1)

# Every <li> of the news stream; relative XPaths (.//) are then resolved
# against the chosen item.
all_items = driver.find_elements_by_xpath('//*[@id="latestQuoteNewsStream-0-Stream"]/ul/li')

#for item in all_items:
#    print(item.find_element_by_xpath('.//h3/a').text)
#    print(item.find_element_by_xpath('.//p').text)
#    print('---')

# Headline and summary of the last item in the list.
print(all_items[-1].find_element_by_xpath('.//h3/a').text)
print(all_items[-1].find_element_by_xpath('.//p').text)
The XPath you provided does not exist in the page.
Download the xPath Finder Chrome extension to find the correct XPath for the articles.
Here is an example XPath for the articles list; you need to loop over ID:
/html/body/div[1]/div/div/div[1]/div/div[3]/div[1]/div/div[5]/div/div/div/ul/li[ID]/div/div/div[2]/h3/a/u
I think your code is fine except for one thing: retrieving text or links with XPath works a little differently in Selenium than in Scrapy or with lxml's fromstring. In Selenium the XPath has to select an element rather than a text() node, and .text is an attribute rather than a method, so here is something that should work for you:
# Use this for printing instead: select the <a> element itself and read its
# .text attribute (no trailing /text() step, no call parentheses).
print(driver.find_element_by_xpath('//*[@id="latestQuoteNewsStream-0-Stream"]/ul/li[9]/div/div/div[2]/h3/a').text)
Even if you do this it will work the same way since there is only one element with this id so simply use
# This should also work: read the text of the element carrying the stream id.
print(driver.find_element_by_xpath('//*[@id="latestQuoteNewsStream-0-Stream"]').text)

Selenium how to extract href and label name Python?

I'm trying to pull the href and the data-promoname from the
URL:
https://www2.deloitte.com/global/en/pages/about-deloitte/topics/combating-covid-19-with-resilience.html?icid=covid-19_article-nav
I tried the code below but can only extract href under the class "promo-focus", but I also want to get the COVID-19 Economic cases: Scenarios for business leaders from data-promoname
driver = webdriver.Chrome(executable_path=r'C:\chromedriver.exe')
url = "https://www2.deloitte.com/global/en/pages/about-deloitte/topics/combating-covid-19-with-resilience.html?icid=covid-19_article-nav"
driver.get(url)

# Print the href of every element carrying the "promo-focus" class.
for i in driver.find_elements_by_class_name('promo-focus'):
    print(i.get_attribute('href'))
Can anyone tell me how to do that using Python?
Try using the .text attribute to get the text.
Example
from selenium import webdriver

chrome_browser = webdriver.Chrome()
url = "https://www2.deloitte.com/global/en/pages/about-deloitte/topics/combating-covid-19-with-resilience.html?icid=covid-19_article-nav"
chrome_browser.get(url)

# For each "promo-focus" element, print its link target and then its text.
for a in chrome_browser.find_elements_by_class_name('promo-focus'):
    print(a.get_attribute('href'))
    print(a.text)
To get the value from data-promoname you can do this by using .get_attribute method. This method can be used to get the value of any attribute corresponding to its tag.
driver_path = 'C:/chromedriver.exe' #the path to your chrome driver
browser = webdriver.Chrome(driver_path)
url_to_open = 'https://www2.deloitte.com/global/en/pages/about-deloitte/topics/combating-covid-19-with-resilience.html?icid=covid-19_article-nav'
browser.get(url_to_open)

# get_attribute reads any attribute of the tag, including data-* attributes.
for a in browser.find_elements_by_class_name('promo-focus'):
    print(a.get_attribute('href'))
    print(a.get_attribute("data-promoname"))
If you are looking for the content being displayed on the page under the anchor tags, you can use .text instead
print(a.text)

extract the text using Selenium WebDriver when page source is not shown fully

How to extract the text using Selenium WebDriver when page source is not shown fully?
or
how do you deal with it using your tools?
It looks like this site has blocked extracting the fields directly
and only allows manually downloading a preprocessed CSV file.
When I try to check the searched data in Page Source (Ctrl+U or Strg+U) I do not see the table.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from time import sleep

# driver = webdriver.PhantomJS()
driver = webdriver.Chrome()
driverurl = "https://officialrecords.broward.org/AcclaimWeb"
driver.get(driverurl)
driver.find_element_by_id("btnButton").click()

# Search criteria typed into the form.
Name = "John"
DocType = "DEED TRANSFERS OF REAL PROPERTY (D)"
RecordDate = "8/1/2017"

driver.find_element_by_id("SearchOnName").send_keys(Name)
driver.find_element_by_id("DocTypesDisplay-input").clear()
sleep(1)
driver.find_element_by_id("DocTypesDisplay-input").send_keys(DocType)
driver.find_element_by_id("RecordDateFrom").clear()
driver.find_element_by_id("RecordDateFrom").send_keys(RecordDate)
driver.find_element_by_id("RecordDateTo").clear()
driver.find_element_by_id("RecordDateTo").send_keys(RecordDate)
driver.find_element_by_id("btnSearch").click()

# Hand the page source to BeautifulSoup and read two cells of one row.
html = driver.page_source
soup = BeautifulSoup(html, "lxml")
# Kept as posted: this is the line that raises the IndexError in the
# traceback quoted after this snippet.
Row = soup.findAll("tr", { "class" : "t-state-selected" })[0].findAll("td")
SearchedName = Row[1].get_text()
RecordDate = Row[5].get_text()
# print(...) with a single argument is valid on both Python 2 and Python 3.
print(SearchedName)
print(RecordDate)
# driver.close()
Traceback (most recent call last): File "test.py", line 34, in
Row = soup.findAll("tr", { "class" : "t-state-selected" })[0].findAll("td") IndexError: list index out of range
really the list has more elements
I would like to automate saving the generated CSV files, or write a bot to go through all the rows and save to my csv file.
I simplified the question and ask only how to extract any 1-2 fields from the 1 st row.
open https://officialrecords.broward.org/AcclaimWeb/search/SearchTypeName
or
https://officialrecords.broward.org/AcclaimWeb/search/SearchTypeDocType
and see.
Two things. First, I got an error:
selenium.common.exceptions.ElementNotVisibleException: Message: element not visible
I added two lines of code around your existing line like this to fix it:
...
driver.execute_script("return arguments[0].scrollIntoView();", driver.find_element_by_id("btnSearch"))
driver.find_element_by_id("btnSearch").click()
sleep(10) # maybe it doesn't need this long
....
Also it looks to me like the class t-state-selected is not applied to an element until it is clicked so I changed:
Row = soup.findAll("tr", { "class" : "t-state-selected" })[0].findAll("td")
to
Row = soup.findAll("tr")[1].findAll("td")
Now it outputs:
JOHNSON,BRIAN S
08/01/2017 02:05:25

Python Selenium Screen Scrape

I am trying to screen scrape a website (snippet below)
The website takes an input, navigates to a second page and takes more inputs and finally displays a table. I fail at this step:
driver.find_element_by_xpath("//select[@id='agencies']/option[@value='13156']").click()
The error I get is:
selenium.common.exceptions.NoSuchElementException: Message: 'Unable to locate element:
Which is strange because I do see the element (Commented out Display id). Any help/pointers, please?
(I tried requests/RoboBrowser -- can't seem to get the post to work but failed there as well)
from selenium import webdriver
from selenium import selenium
from bs4 import BeautifulSoup

driver = webdriver.Firefox()
url = 'http://www.ucrdatatool.gov/Search/Crime/Local/OneYearofData.cfm'
driver.get(url)

# First page: pick a state and move on to the next page.
driver.find_element_by_xpath("//select[@id='state']/option[@value='1']").click()
#driver.find_element_by_xpath("//select[@id='groups']/option[@value='8']").click()
driver.find_element_by_xpath("//input[@type='submit' and @value='Next']").click()
driver.implicitly_wait(5) # seconds

# Display id tags
#elementsAll = driver.find_elements_by_xpath('//*[@id]')
#for elements in elementsAll:
#    print("id: ", repr(elements))
#    print("idName: ",elements.get_attribute("id"))
#    driver.implicitly_wait(5) # seconds

# Second page: choose group, year and agency.
driver.find_element_by_xpath("//select[@id='groups']/option[@value='2']").click()
driver.find_element_by_xpath("//select[@id='year']/option[@value=1986]").click()
# Kept as posted: this is the lookup reported to raise NoSuchElementException.
driver.find_element_by_xpath("//select[@id='agencies']/option[@value='13156']").click()
Update -- the below works on Selenium. I intended to choose all options in the list box and save the query results...Thanks for the pointer, Alecxe!
from selenium.webdriver.support.ui import Select

# Select every option of the agencies list box by its visible text.
select = Select(driver.find_element_by_id('agencies'))
for options in select.options:
    select.select_by_visible_text(options.text)

# Do the same for the groups list box.
select = Select(driver.find_element_by_id('groups'))
for options in select.options:
    select.select_by_visible_text(options.text)

driver.find_element_by_xpath("//select[@id='year']/option[@value=1985]").click()
driver.find_element_by_xpath("//input[@type='submit' and @value='Get Table']").click()
There is no option with 13156 value in select with agencies id. There are values from 102 to 522, you can see them by printing:
[element.get_attribute('value') for element in driver.find_elements_by_xpath('//select[@id="agencies"]/option')]
Also, instead of finding options by value, use Select and get options by text:
from selenium.webdriver.support.ui import Select

# Wrap the <select> element so its options can be chosen by visible text
# rather than by value.
select = Select(driver.find_element_by_id('agencies'))
# print(...) with a single argument is valid on both Python 2 and Python 3.
print(select.options)
select.select_by_visible_text('Selma Police Dept')

Categories

Resources