Selenium not able to get some data on the web page - python

I am using Selenium with Python to get some data about Chrome extensions. I am trying to get the number of users of a particular extension on this page. I am using the code below:
from selenium import webdriver
from selenium.common.exceptions import ElementNotVisibleException, NoSuchElementException
import time

def create_browser(first_page=None):
    print "Starting"
    browser = webdriver.Chrome('/home/user/ChromeDriver/chromedriver')
    if first_page:
        browser.get(first_page)
    print "Done."
    return browser

def wait_find_element_by_xpath(driver, path):
    counter = 0
    while counter < 7:
        try:
            elem = driver.find_element_by_xpath(path)
            break
        except NoSuchElementException:
            time.sleep(1)
            counter += 1
            elem = None
    return elem
URL = 'https://chrome.google.com/webstore/detail/id-vault/jlljbiieciifehccmokcpnmlklpaimpa/details'
browser = create_browser()
browser.get(URL)
time.sleep(7)
#Get number of users
userStr = wait_find_element_by_xpath(browser, './/span[@class="webstore-f-g-He"]')
#print "\n\n\n No. of Users: "
#print userStr
#print userStr.text
#print "\n\n\n-----"
noOfUserStr = userStr.text.replace(" users", "")
noOfUsers = noOfUserStr.replace(",", "")
users = int(noOfUsers)
My problem is that I am not able to get the number of users on that particular page. Instead I get the error: ValueError: invalid literal for int() with base 10: ''
I find this strange because the code works well with other extensions. Also, even when you view the source (right click -> Inspect element), you can see the number of users in the source (just after the "from" field), but I am still not able to get the value. Can anyone help me fix the problem?

The problem is that, for this particular extension, the number of users is not visible due to the length of the "from" URL. Selenium generally only works on visible elements in the document.
I would recommend getting this value via JavaScript execution:
userStr = browser.execute_script("return document.getElementsByClassName('webstore-f-g-He')[0].textContent")
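Alternatively (a sketch, not part of the original answer), the element's textContent attribute is populated even when the element is hidden, so the same locator from the question can be reused with get_attribute instead of the .text property:

# textContent (unlike .text) is returned for elements that are in the DOM but not visible
elem = browser.find_element_by_xpath('.//span[@class="webstore-f-g-He"]')
userStr = elem.get_attribute('textContent')
users = int(userStr.replace(" users", "").replace(",", ""))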

Related

Can I restart my python code using a try/except?

I have a program where I go to the reddit.com website and grab an HTML element from it. However, about 1/10th of the time, the old reddit website shows up, and I have to restart the program. Is there any shorter way to handle this error (basically restart from the top again)? I couldn't seem to figure it out with a try/except.
browser = webdriver.Chrome(executable_path=r'C:\Users\jacka\Downloads\chromedriver_win32\chromedriver.exe')
browser.get("https://www.reddit.com/")
# grabs the html tag for the subreddit name
elem = browser.find_elements_by_css_selector("a[data-click-id='subreddit']")
# in the case that old reddit loads, it restarts the browser.
if len(elem) == 0:
    browser.close()
    browser = webdriver.Chrome(executable_path=r'C:\Users\jacka\Downloads\chromedriver_win32\chromedriver.exe')
    browser.get("https://www.reddit.com/")
    # grabs the html tag for the subreddit name
    elem = browser.find_elements_by_css_selector("a[data-click-id='subreddit']")
As @HSK mentioned in the comments, you can use an infinite while loop to keep trying until you get what you want without an exception. Do add a finally clause so the browser handle is closed before leaving.
while True:
    browser = webdriver.Chrome(executable_path=r'C:\Users\jacka\Downloads\chromedriver_win32\chromedriver.exe')
    try:
        browser.get("https://www.reddit.com/")
        elem = browser.find_elements_by_css_selector("a[data-click-id='subreddit']")
        break
    except Exception:
        pass
    finally:
        browser.close()
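Note that with this layout the finally clause also runs on the successful pass, so the browser is closed right after the break; any work that still needs the live browser (reading text from elem, further navigation) has to happen inside the try block before breaking out.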
Solved thanks to @HSK. I put the code in a while loop that ran until it got the right version of reddit.
# had to initialize elem so the loop would run
elem = ""
while len(elem) == 0:
    browser = webdriver.Chrome(executable_path=r'C:\Users\jacka\Downloads\chromedriver_win32\chromedriver.exe')
    browser.get("https://www.reddit.com/")
    # grabs the html tag for the subreddit name
    elem = browser.find_elements_by_css_selector("a[data-click-id='subreddit']")
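If recreating the driver on every attempt is too slow, a variation (only a sketch; the retry limit of 10 is an arbitrary choice) is to reuse a single browser instance and simply reload the page until the new layout shows up:

from selenium import webdriver

browser = webdriver.Chrome(executable_path=r'C:\Users\jacka\Downloads\chromedriver_win32\chromedriver.exe')
elem = []
for _ in range(10):  # give up after 10 tries instead of looping forever
    browser.get("https://www.reddit.com/")
    # grabs the html tag for the subreddit name
    elem = browser.find_elements_by_css_selector("a[data-click-id='subreddit']")
    if elem:  # new reddit loaded, stop retrying
        break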

Can't access some content on page using selenium and python

I am using Selenium in a Python script to log in to a website where I can get an authorization key to access their API. I am able to log in and navigate to the page where the authorization key is provided. I am using the Chrome driver for testing so I can see what's going on. When I get to the final page where the key is displayed, I can't find a way to access it. I can't see it in the page source, and when I try to access it via the page element's outer HTML, it doesn't print the value shown on the page. Here is a screenshot of what I see in the browser (I'm interested in accessing the content shown in the response body):
This is the code snippet I am using to try to access the content:
auth_key = WebDriverWait(sel_browser, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="responseBodyContent"]')))
print auth_key.get_attribute("outerHTML")
and this is what the print statement returns:
<pre id="responseBodyContent"></pre>
I've also tried:
print auth_key.text
which returns nothing. Is there a way I can extract this key from the page?
It looks like you need a custom wait that waits for the element and then for its text.
First, define a custom expected-condition class that finds the element, gets its innerHTML, and checks the length of that string.
See my example below.
class element_text_not_empty(object):
    def __init__(self, locator):
        self.locator = locator

    def __call__(self, driver):
        try:
            element = driver.find_element(*self.locator)
            if len(element.get_attribute('innerHTML').strip()) > 0:
                return element.get_attribute('innerHTML')
            else:
                return False
        except Exception as ex:
            print("Error while waiting: " + str(ex))
            return False

driver = webdriver.Chrome(chrome_path)
...
...
try:
    print("Start wait")
    result = WebDriverWait(driver, 20).until(element_text_not_empty((By.XPATH, '//*[@id="responseBodyContent"]')))
    print(result)
except Exception as ex:
    print("Error: " + str(ex))
Since the value of responseBodyContent is in JSON format, you can parse it once it is available:
import json

authkey_text = json.loads(auth_key.get_attribute('innerHTML'))
print str(authkey_text)
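A more compact variant (just a sketch reusing the locator from the question) is to pass a plain lambda to WebDriverWait; until() keeps polling while the callable returns an empty string and returns the innerHTML once it is non-empty:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

# NoSuchElementException is ignored by WebDriverWait by default,
# so this also covers the case where the element has not been rendered yet
body_text = WebDriverWait(sel_browser, 20).until(
    lambda d: d.find_element(By.ID, "responseBodyContent").get_attribute("innerHTML").strip()
)
print body_text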

Web scraping in Selenium (Python) throws Element Not Found

I'm trying to scrape Chinese economic data from an official website, but I keep getting an Element Not Found exception on the last line here. I've scoured Stack Overflow and have tried adding implicitly_wait and switching the problem line from XPath to ID, but nothing has worked. Any thoughts?
from selenium import webdriver

FAI = []
FAIinfra = []
FAIestate = []

path_to_chromedriver = '/Users/cargillsk/Downloads/chromedriver'
browser = webdriver.Chrome(executable_path=path_to_chromedriver)
browser.implicitly_wait(30)

url = 'http://www.cqdata.gov.cn/easyquery.htm?cn=A0101'
browser.get(url)

browser.find_element_by_id('treeZhiBiao_4').click()
browser.find_element_by_xpath('//*[@id="mySelect_sj"]/div[2]/div[1]').click()
browser.find_element_by_xpath('//*[@id="mySelect_sj"]/div[2]/div[2]/div[3]/input').clear()
browser.find_element_by_xpath('//*[@id="mySelect_sj"]/div[2]/div[2]/div[3]/input').send_keys('last100')
browser.find_element_by_xpath('//*[@id="mySelect_sj"]/div[2]/div[2]/div[3]/div[1]').click()

FAIinitial = browser.find_element_by_xpath('//*[@id="main-container"]/div[2]/div[2]/div[2]/div/div[2]/table/thead/tr/th[2]/strong').text

for i in range(2, 102):
    i = str(i)
    FAI.append(browser.find_element_by_xpath('//*[@id="table_main"]/tbody/tr[1]/td[%s]' % i).text)
    FAIinfra.append(browser.find_element_by_xpath('//*[@id="table_main"]/tbody/tr[4]/td[%s]' % i).text)
    FAIestate.append(browser.find_element_by_xpath('//*[@id="table_main"]/tbody/tr[55]/td[%s]' % i).text)

browser.find_element_by_id("treeZhiBiao_3").click()
browser.find_element_by_id("treeZhiBiao_14").click()
So... the implicit wait is not your issue. Looking through the website's code, I found that there is no "treeZhiBiao_14", so I'm not sure what you're trying to click here. Maybe try something like this instead, so you know what you're clicking:
browser.find_element_by_xpath("//*[contains(text(), '工业')]").click()
or
browser.find_element_by_xpath("//*[contains(text(), 'industry')]").click()

Unable to fetch all the necessary links during Iteration - Selenium Python

I am a newbie to Selenium with Python. I am trying to fetch profile URLs, 10 per page. Without using a while loop, I am able to fetch all 10 URLs, but only for the first page. When I use while, it iterates, but fetches only 3 or 4 URLs per page.
I need to fetch all 10 links and keep iterating through the pages. I think I must do something about StaleElementReferenceException.
Kindly help me solve this problem.
The code is given below.
def test_connect_fetch_profiles(self):
    driver = self.driver
    search_data = driver.find_element_by_id("main-search-box")
    search_data.clear()
    search_data.send_keys("Selenium Python")
    search_submit = driver.find_element_by_name("search")
    search_submit.click()
    noprofile = driver.find_elements_by_xpath("//*[text() = 'Sorry, no results containing all your search terms were found.']")
    self.assertFalse(noprofile)
    while True:
        wait = WebDriverWait(driver, 150)
        try:
            profile_links = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[contains(@href,'www.linkedin.com/profile/view?id=')][text()='LinkedIn Member' or contains(@href,'Type=NAME_SEARCH')][contains(@class,'main-headline')]")))
            for each_link in profile_links:
                page_links = each_link.get_attribute('href')
                print(page_links)
                driver.implicitly_wait(15)
                appendFile = open("C:\\Users\\jayaramb\\Documents\\profile-links.csv", 'a')
                appendFile.write(page_links + "\n")
                appendFile.close()
                driver.implicitly_wait(15)
            next = wait.until(EC.visibility_of(driver.find_element_by_partial_link_text("Next")))
            if next.is_displayed():
                next.click()
            else:
                print("End of Page")
                break
        except ValueError:
            print("It seems no values to fetch")
        except NoSuchElementException:
            print("No Elements to Fetch")
        except StaleElementReferenceException:
            print("No Change in Element Location")
        else:
            break
Please let me know if there are any other effective ways to fetch the required profile URLs and keep iterating through the pages.
I created a similar setup, which works alright for me. I've had some problems with Selenium trying to click on the next button and throwing a WebDriverException instead, likely because the next button is not in view. Hence, instead of clicking the next button, I get its href attribute and load the new page with driver.get(), thus avoiding an actual click and making the test more stable.
def test_fetch_google_links():
    links = []

    # Setup driver
    driver = webdriver.Firefox()
    driver.implicitly_wait(10)
    driver.maximize_window()

    # Visit google
    driver.get("https://www.google.com")

    # Enter search query
    search_data = driver.find_element_by_name("q")
    search_data.send_keys("test")

    # Submit search query
    search_button = driver.find_element_by_xpath("//button[@type='submit']")
    search_button.click()

    while True:
        # Find and collect all anchors
        anchors = driver.find_elements_by_xpath("//h3//a")
        links += [a.get_attribute("href") for a in anchors]
        try:
            # Find the next page button
            next_button = driver.find_element_by_xpath("//a[@id='pnnext']")
            location = next_button.get_attribute("href")
            driver.get(location)
        except NoSuchElementException:
            break

    # Do something with the links
    for l in links:
        print l
    print "Found {} links".format(len(links))

    driver.quit()
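Another way around the "not clickable because it is out of view" problem (an alternative sketch, not what the code above does) is to hand the click to the browser's JavaScript engine, which does not care whether the button is scrolled into the viewport:

# a JavaScript click bypasses the visibility/position checks of a native .click()
next_button = driver.find_element_by_xpath("//a[@id='pnnext']")
driver.execute_script("arguments[0].click();", next_button)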

Scrape data with Ghost

I want to scrape some data from the following link:
http://www.six-structured-products.com/en/search-find/new-search#search_type=profi&class_category=svsp
My target is simply to retrieve the table of all instruments (displayed in "search results" on page 1, 2, 3, etc.) into a data frame.
I can't simply use urllib and urllib2 to retrieve static data since I need to mimic a human by clicking on buttons: Ghost or Selenium are the way to go.
However, I really do not get how to translate into code "click on page 2", "click on page 3", and so on, nor how to get the total number of pages.
My code:
from ghost import Ghost
url = "http://www.six-structured-products.com/en/search-find/new-search#search_type=profi&class_category=svsp"
gh = Ghost()
page, resources = gh.open(url)
I am stuck there and do not know which identifier to put instead of XXX:
page, resources = ghost.evaluate(
    "document.getElementById(XXX).click();", expect_loading=True)
(I would also accept a solution using Selenium)
You can also use the next button this way:
import logging
import sys

from ghost import Ghost, TimeoutError

logging.basicConfig(level=logging.INFO)

url = "http://www.six-structured-products.com/en/search-find/new-search#search_type=profi&class_category=svsp"
ghost = Ghost(wait_timeout=20, log_level=logging.CRITICAL)
data = dict()

def extract_value(line, ntd):
    return line.findFirst('td.DataItem:nth-child(%d)' % ntd).toPlainText()

def extract(ghost):
    lines = ghost.main_frame.findAllElements(
        '.derivativeSearchResult > tbody:nth-child(2) tr'
    )
    for line in lines:
        symbol = extract_value(line, 2)
        name = extract_value(line, 5)
        logging.info("Found %s: %s" % (symbol, name))
        # Persist data here
    ghost.sleep(1)
    try:
        ghost.click('.pagination_next a', expect_loading=True)
    except TimeoutError:
        sys.exit(0)
    extract(ghost)

ghost.open(url)
extract(ghost)
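One thing to keep in mind with this approach: extract() calls itself once per result page, so a very long result set will eventually hit Python's recursion limit; rewriting the recursion as a loop around the click/extract pair avoids that.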
Make an endless loop incrementing the page index, and exit the loop when you don't find a button with the current index:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import time

driver = webdriver.Firefox()
driver.get('http://www.six-structured-products.com/en/search-find/new-search#search_type=profi&class_category=svsp')

page = 2  # starting page
while True:
    try:
        button = driver.find_element_by_xpath('//ul[@id="pagination_pages"]/li[@class="pagination_page" and . = "%d"]' % page)
    except NoSuchElementException:
        break

    time.sleep(1)
    button.click()
    page += 1

print page  # total number of pages
driver.close()
Note that instead of a time.sleep(), a more reliable approach would be to use Waits.
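For example (a sketch only, reusing the locator from the code above), element_to_be_clickable can replace the fixed sleep, with a TimeoutException marking the last page:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

wait = WebDriverWait(driver, 10)
page = 2
while True:
    try:
        # wait until the button for the current page number is clickable
        button = wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//ul[@id="pagination_pages"]/li[@class="pagination_page" and . = "%d"]' % page)))
    except TimeoutException:
        break  # no button for this page number appeared, we are done
    button.click()
    page += 1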
