How to speed up python selenium find_elements?

I am trying to scrape company info from kompass.com
However, as each company profile provides a different amount of detail, certain pages may be missing elements. For example, not all companies have info on 'Associations'. In such cases, my script takes extremely long searching for these missing elements. Is there any way I can speed up the search process?
Here's an excerpt of my script:
import time
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import ElementNotVisibleException
from lxml import html
def init_driver():
    driver = webdriver.Firefox()
    driver.wait = WebDriverWait(driver, 5)
    return driver

def convert2text(webElement):
    # return the text of the first match, or 'NA' if nothing was found
    if webElement != []:
        return webElement[0].text.encode('utf8')
    return 'NA'
link='http://sg.kompass.com/c/mizkan-asia-pacific-pte-ltd/sg050477/'
driver = init_driver()
driver.get(link)
driver.implicitly_wait(10)
name = driver.find_elements_by_xpath("//*[@id='productDetailUpdateable']/div[1]/div[2]/div/h1")
name = convert2text(name)
## Problem:
associations = driver.find_elements_by_xpath("//body//div[@class='item minHeight']/div[@id='associations']/div/ul/li/strong")
associations = convert2text(associations)
It takes more than a minute to scrape each page and I have more than 26,000 pages to scrape.

driver.implicitly_wait(10) tells the driver to wait up to 10 seconds for an element to exist in the DOM. That means that every time you look for a non-existent element, the driver waits the full 10 seconds. Reducing the timeout to 2-3 seconds will improve the run time.
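For example, a minimal sketch (assuming the page has already finished loading by the time you probe the optional fields):
driver.implicitly_wait(2)  # short timeout: missing sections fail fast
associations = driver.find_elements_by_xpath("//div[@id='associations']/div/ul/li/strong")
associations = convert2text(associations)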
In addition, XPath is the slowest selector strategy, and you are making it worse by giving an absolute path. Use find_elements_by_id and find_elements_by_class_name where you can. For example, you can improve
driver.find_elements_by_xpath("//body//div[@class='item minHeight']/div[@id='associations']/div/ul/li/strong")
simply by starting from the associations id:
driver.find_elements_by_xpath("//div[@id='associations']/div/ul/li/strong")
Or by changing it to a CSS selector:
driver.find_elements_by_css_selector("#associations > div > ul > li > strong")
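A sketch of the id-first approach (the exact structure under the associations id is an assumption based on the question's XPath):
# Anchor on the id, then search only within that element
blocks = driver.find_elements_by_id('associations')
details = blocks[0].find_elements_by_tag_name('strong') if blocks else []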

Since your XPaths are not using any attributes apart from class and id to find elements, you could migrate your searches to CSS Selectors. These may be faster on browsers like IE where native XPath searching is not supported.
For example:
//body//div[@class='item minHeight']/div[@id='associations']/div/ul/li/strong
Can become:
body .item.minHeight > #associations > div > ul > li > strong
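Used in the question's code, that would look like this (a sketch, reusing the convert2text helper from above):
associations = driver.find_elements_by_css_selector("body .item.minHeight > #associations > div > ul > li > strong")
associations = convert2text(associations)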

Related

How to use XPath to scrape javascript website values

I'm trying to scrape (in Python) the savings interest rate from this website using the value's XPath.
I've tried everything: beautifulsoup, selenium, etree, etc. I've been able to scrape a few other websites successfully. However, this site and many others are giving me fits. I'd love a solution that can scrape info from several sites regardless of their formatting using xpath variables.
My current attempt:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
service = Service(executable_path="/chromedriver")
options = Options()
options.add_argument('--incognito')
options.headless = True
driver = webdriver.Chrome(service=service, options=options)
url = 'https://www.americanexpress.com/en-us/banking/online-savings/account/'
driver.get(url)
element = driver.find_element(By.XPATH, '//*[@id="hysa-apy-2"]')
print(element.text)
if element.text == "":
    print("Error: Element text is empty")
driver.quit()
The interest rates are written inside span elements. All span elements which contain interest rates share the same class, heading-6. But bear in mind that the page contains two span elements for each interest rate, one for each viewport.
The xpath selector:
'//span[@class="heading-6"]'
You can also get elements by the text they contain, e.g. APY:
'//span[contains(., "APY")]'
But this selector matches every span element in the DOM that contains the word APY.
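To narrow that down, the two conditions can be combined (a sketch; the class name is taken from above):
# only heading-6 spans whose text mentions APY
rates = driver.find_elements(By.XPATH, '//span[@class="heading-6" and contains(., "APY")]')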
If a unique id is available, it should take priority, e.g. find_element(By.ID, 'hysa-apy-2'), as @John Gordon commented.
But sometimes the element is found before its text has loaded.
Use an XPath that also requires the text to be non-empty, text()!="":
element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//span[@id="hysa-apy-2" and text()!=""]')))
With the following imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
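Putting it together, a minimal runnable sketch (same URL and id as above):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.americanexpress.com/en-us/banking/online-savings/account/')
# wait until the span exists AND already has non-empty text
element = WebDriverWait(driver, 10).until(EC.presence_of_element_located(
    (By.XPATH, '//span[@id="hysa-apy-2" and text()!=""]')))
print(element.text)
driver.quit()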

Python Selenium find element from specific div [duplicate]

This question already has answers here: Python Beautifulsoup cannot get svg tags (2 answers). Closed 1 year ago.
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome(executable_path='C:/chromedriver/chromedriver.exe')
driver.get('https://ggl-maxim.com/')
driver.find_element_by_xpath('//*[@id="body"]/div/div[2]/div/div[2]/fieldset/input[1]').send_keys('tnrud3080')
driver.find_element_by_xpath('//*[@id="body"]/div/div[2]/div/div[2]/fieldset/input[2]').send_keys('tnrud3080')
driver.find_element_by_xpath('//*[@id="body"]/div/div[2]/div/div[2]/fieldset/button[1]').click()
time.sleep(2)
driver.get('https://ggl-maxim.com/api/popup/popup_menu.asp?mobile=0&lobby=EVOLUTION')
wait = WebDriverWait(driver, 20)
wait.until(EC.frame_to_be_available_and_switch_to_it("gameIframe"))
wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".svg--1nrnH")))
targets = driver.find_elements_by_css_selector(".svg--1nrnH")
res = []
for el in targets:
    res.append(el.get_attribute('innerHTML'))
print(*res, sep='\n')
This code gets the svg elements (the records of the game), as shown in the picture. However, if I click the button labelled "multi" in the picture at the bottom, the records also appear at the right of the page. I found out that this panel shows the records faster than before. To use it, I have to get the svg values only from that div. How can I? Please help me!
The second approach is not faster and is harder to implement, as each container is loaded separately and starts to reload after the first load is done. It looks like a nightmare to automate.
I tried Selenium's explicit waits and time.sleep(); neither approach worked.
The code below clicks the button, switches to a new iframe and tries to get the containers' content. But the content is almost always empty, for the reasons described above.
from selenium import webdriver
import time
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome(executable_path='/snap/bin/chromium.chromedriver')
driver.get('http://ggl-maxim.com/')
driver.find_element_by_xpath('//*[@id="body"]/div/div[2]/div/div[2]/fieldset/input[1]').send_keys('tnrud3080')
driver.find_element_by_xpath('//*[@id="body"]/div/div[2]/div/div[2]/fieldset/input[2]').send_keys('tnrud3080')
driver.find_element_by_xpath('//*[@id="body"]/div/div[2]/div/div[2]/fieldset/button[1]').click()
time.sleep(2)
driver.get('http://ggl-maxim.com/api/popup/popup_menu.asp?mobile=0&lobby=EVOLUTION')
wait = WebDriverWait(driver, 30)
wait.until(EC.frame_to_be_available_and_switch_to_it("gameIframe"))
wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".svg--1nrnH")))
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "span[data-role=button-label]"))).click()
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".sidebar-container>iframe")))
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".sidebar-container")))
iframe2 = driver.find_element_by_css_selector('iframe[src^="https://evo.kplaycasino.com/frontend/evo/r2/"]')
driver.switch_to.frame(iframe2)
# wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".list--BLuiJ .svg--1nrnH")))
time.sleep(20)
targets = driver.find_elements_by_css_selector(".list--BLuiJ .svg--1nrnH")
res = []
for el in targets:
    res.append(el.get_attribute('innerHTML'))
print(*res, sep='\n')
As you can see, even 20 seconds is not enough, because the content is reloading on the fly.
I left the explicit wait commented out so you can verify that it does not work either.
However, from my answer you can learn how to build a locator that matches an attribute starting with specific text:
iframe2 = driver.find_element_by_css_selector('iframe[src^="https://evo.kplaycasino.com/frontend/evo/r2/"]')
where src^= means that the src attribute starts with the specified text.
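For reference, a short sketch of the related CSS attribute operators (the selector values here are illustrative only):
driver.find_element_by_css_selector('iframe[src^="https://evo."]')   # src starts with the text
driver.find_element_by_css_selector('iframe[src$=".html"]')          # src ends with the text
driver.find_element_by_css_selector('iframe[src*="kplaycasino"]')    # src contains the text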

Access elements on the next page with Selenium

I have a problem with an automatic google search using Selenium. So, I need to paste into google search something like 'JFK annual passengers 2019' and find the answer. The answer is usually shown at the top of the page like that:
I wrote the code to do it, but I cannot use it because of a NoSuchElementException error.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
def search(link):
    driver = webdriver.Safari()
    driver.get("http://www.google.com")
    elem = driver.find_element_by_name("q")
    elem.clear()
    elem.send_keys(link)
    elem.submit()
    elem1 = driver.find_element_by_class_name('Z0LcW XcVN5d AZCkJd')
    nyam = elem1.get_attribute("data-tts-text")
    print(nyam)
    driver.close()
search('JFK annual passengers 2019')
I think the problem is that I am not looking at the new Google results page that opens after the search starts.
Help, please!
Two things. First, when an element has 2 or more classes, find_element_by_class_name won't accept them all; use find_element_by_css_selector instead. Second, you need to wait until the search results page has loaded its DOM. You can use Python's time library for that.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
import time
def search(link):
    driver = webdriver.Safari()
    driver.get("http://www.google.com")
    elem = driver.find_element_by_name("q")
    elem.clear()
    elem.send_keys(link)
    elem.submit()
    time.sleep(3)  # wait 3 seconds
    elem1 = driver.find_element_by_css_selector('.Z0LcW.XcVN5d.AZCkJd')
    nyam = elem1.get_attribute("data-tts-text")
    print(nyam)
    driver.close()
search('JFK annual passengers 2019')
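A fixed sleep is brittle; since WebDriverWait is already imported, the time.sleep(3) could also be replaced with an explicit wait (a sketch, same selector as above):
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the answer box instead of sleeping blindly
elem1 = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '.Z0LcW.XcVN5d.AZCkJd')))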

Element Not Clickable - even though it is there

Hoping you can help. I'm relatively new to Python and Selenium. I'm trying to pull together a simple script that will automate news searching on various websites. The primary focus was football and to go and get me the latest Manchester United news from a couple of places and save the list of link titles and URLs for me. I could then look through the links myself and choose anything I wanted to review.
In trying The Independent newspaper (https://www.independent.co.uk/) I seem to have come up against a problem, with an element not interactable error when using the following approaches:
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome('chromedriver')
driver.get('https://www.independent.co.uk')
time.sleep(3)
#accept the cookies/privacy bit
OK = driver.find_element_by_id('qcCmpButtons')
OK.click()
#wait a few seconds, just in case
time.sleep(5)
search_toggle = driver.find_element_by_class_name('icon-search.dropdown-toggle')
search_toggle.click()
This throws the selenium.common.exceptions.ElementNotInteractableException: Message: element not interactable error
I've also tried with XPATH
search_toggle = driver.find_element_by_xpath('//*[#id="quick-search-toggle"]')
and I also tried ID.
I did a lot of reading on here and then also tried using WebDriverWait and execute_script methods:
element = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, '//*[#id="quick-search-toggle"]')))
driver.execute_script("arguments[0].click();", element)
This didn't seem to error but the search box never appeared, i.e. the appropriate click didn't happen.
Any help you could give would be fantastic.
Thanks,
Pete
Your locator is //*[@id="quick-search-toggle"], and there are 2 matches on the page. The first is invisible and the second is visible. By default Selenium refers to the first element, but sadly the element you mean is the second one, so you need another, unique locator. Try this:
search_toggle = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//div[#class="row secondary"]//a[#id="quick-search-toggle"]')))
search_toggle.click()
First you need to open search box, then send search keys:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import os
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
browser = webdriver.Chrome(executable_path=os.path.abspath(os.getcwd()) + "/chromedriver", options=chrome_options)
link = 'https://www.independent.co.uk'
browser.get(link)
# accept privacy
browser.find_element_by_xpath('//*[@id="qcCmpButtons"]/button').click()
# open search box
li = browser.find_element_by_xpath('//*[@id="masthead"]/div[3]/nav[2]/ul/li[1]')
li.find_element_by_tag_name('a').click()
# send keys to search box
search = browser.find_element_by_xpath('//*[@id="gsc-i-id1"]')
search.send_keys("python")
search.send_keys(Keys.RETURN)
Can you try with the below steps?
search_toggle = driver.find_element_by_xpath('//*[@class="row secondary"]/nav[2]/ul/li[1]/a')
search_toggle.click()

Unable to wrap `driver.execute_script()` within `explicit wait` condition

I've created a Python script together with Selenium to parse specific content from a webpage. I can get the result AARONS INC, located under QUOTE, in many different ways, but the way I wish to scrape it is by using a pseudo selector, which unfortunately Selenium doesn't support. The commented-out line within the script below shows that Selenium doesn't support the pseudo selector.
However, when I use the pseudo selector within driver.execute_script() I can parse it flawlessly. To make this work I had to use a hardcoded delay for the element to be available. Now, I wish to do the same by wrapping this driver.execute_script() call within an explicit wait condition.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 20)
driver.get("https://www.nyse.com/quote/XNYS:AAN")
time.sleep(15)
# item = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "span:contains('AARONS')")))
item = driver.execute_script('''return $('span:contains("AARONS")')[0];''')
print(item.text)
How can I wrap driver.execute_script() within Explicit Wait condition?
This is one of the ways you can achieve that. Give it a shot.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
with webdriver.Chrome() as driver:
    wait = WebDriverWait(driver, 10)
    driver.get('https://www.nyse.com/quote/XNYS:AAN')
    item = wait.until(
        lambda driver: driver.execute_script('''return $('span:contains("AARONS")')[0];''')
    )
    print(item.text)
You could do the whole thing in the browser script, which is probably safer:
item = driver.execute_async_script("""
    var span, interval = setInterval(() => {
        if(span = $('span:contains("AARONS")')[0]){
            clearInterval(interval)
            arguments[0](span)  // resolve the async script with the element
        }
    }, 1000)
""")
Here is the simple approach.
url = 'https://www.nyse.com/quote/XNYS:AAN'
driver.get(url)
# wait for the element to be present
ele = WebDriverWait(driver, 30).until(lambda driver: driver.execute_script('''return $('span:contains("AARONS")')[0];'''))
# print the text of the element
print (ele.text)
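If the span may legitimately never appear (or the page does not expose jQuery's $), the wait can be guarded (a sketch):
from selenium.common.exceptions import TimeoutException

try:
    ele = WebDriverWait(driver, 30).until(lambda driver: driver.execute_script('''return $('span:contains("AARONS")')[0];'''))
    print(ele.text)
except TimeoutException:
    print('Element never appeared')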
