No data collected when I extract info from a website using xpath - python

I need to extract information from a website. The information sits inside the following markup:
<div class="accordion-block__question">
<div class="accordion-block__text">Server</div></div>
...
<div class="block__col"><b>Country</b></div>
Running
try:
    # Country
    c = driver.find_element_by_xpath("//div[contains(@class,'block__col') and contains(text(),'Country')]").get_attribute('textContent')
    country.append(c)
except:
    country.append("Error")
I end up with a df full of errors. I'm interested in all the fields (though for fixing this issue, just one would be great), including the Trustscore (a number), but I don't know if it's possible to get it. I'm using Selenium with the Chrome WebDriver.
The website is https://www.scamadviser.com/check-website.
CODE
This is the entire code:
def scam(df):
    chrome_options = webdriver.ChromeOptions()
    trust = []
    country = []
    isp_country = []
    query = df['URL'].unique().tolist()
    driver = webdriver.Chrome('mypath', chrome_options=chrome_options)
    for x in query:
        wait = WebDriverWait(driver, 10)
        response = driver.get('https://www.scamadviser.com/check-website/' + x)
        try:
            wait = WebDriverWait(driver, 30)
            # missing trustscore
            # Country
            c = driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", driver.find_element_by_xpath("//div[contains(@class,'block__col') and contains(text(),'Country')]")).get_attribute('innerText')
            country.append(c)
            # ISP country
            ic = driver.find_element_by_xpath("//div[contains(@class,'block__col') and contains(text(),'ISP')]").get_attribute('innerText')
            isp_country.append(ic)
        except:
            # missing trustscore
            country.append("Error")
            isp_country.append("Error")
    # Create dataframe
    dict = {'URL': query, 'Trustscore': trust, 'Country': country, 'ISP': isp_country}
    df = pd.DataFrame(dict)
    driver.quit()
    return df
You can try it, for example, with df['URL'] equal to
stackoverflow.com
gitHub.com

You are looking for innerText, not textContent.
Code:
try:
    # Country
    c = driver.find_element_by_xpath("//div[contains(@class,'block__col') and contains(text(),'Country')]").get_attribute('innerText')
    print(c)
    country.append(c)
except:
    country.append("Error")
Updated 1:
In case the locator you already used is correct, scroll the element into view first:
driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", driver.find_element_by_xpath("//div[contains(@class,'block__col') and contains(text(),'Country')]"))
Or maybe try both options with this XPath:
//div[contains(@class,'block__col')]/b[text()='Country']
Updated 2:
try:
    wait = WebDriverWait(driver, 30)
    # missing trustscore
    # Country
    time.sleep(2)
    ele = driver.find_element_by_xpath("//div[contains(@class,'block__col')]/b[text()='Country']")
    driver.execute_script("arguments[0].scrollIntoView(true);", ele)
    country.append(ele.get_attribute('innerText'))
    time.sleep(2)
    # ISP country
    ic = driver.find_element_by_xpath("//div[contains(@class,'block__col')]/b[text()='ISP']")
    driver.execute_script("arguments[0].scrollIntoView(true);", ic)  # scroll the ISP element, not ele again
    isp_country.append(ic.get_attribute('innerText'))
Updated 3:
To get the Country name under the Company data section, use this XPath:
//div[text()='Company data']/../following-sibling::div/descendant::b[text()='Country']/../following-sibling::div
Also, make sure of a few things before using this XPath:
Launch the browser in full-screen mode.
Scroll using JS, and then use scrollIntoView or an ActionChains move.
Code :-
driver.maximize_window()
time.sleep(2)
driver.execute_script("window.scrollTo(0, 1000)")
time.sleep(2)
driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, "//div[text()='Company data']"))))
# now use the mentioned xpath.
company_data_country_name = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, "//div[text()='Company data']/../following-sibling::div/descendant::b[text()='Country']/../following-sibling::div")))
print(company_data_country_name.text)
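Putting Updated 3 back into the question's loop, a minimal sketch (the Company data/Country XPath is from above; the ISP XPath mirrors that sibling pattern and is an assumption, as is the driver path; the Trustscore selector is still unknown, so trust is left out — note that in the question's code trust stays empty, which would also make pd.DataFrame fail with mismatched column lengths):
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scam(df):
    country, isp_country = [], []
    query = df['URL'].unique().tolist()
    driver = webdriver.Chrome('mypath')  # driver path assumed, as in the question
    driver.maximize_window()
    wait = WebDriverWait(driver, 20)
    for x in query:
        driver.get('https://www.scamadviser.com/check-website/' + x)
        try:
            # scroll the company block into view before reading from it
            block = wait.until(EC.visibility_of_element_located(
                (By.XPATH, "//div[text()='Company data']")))
            driver.execute_script("arguments[0].scrollIntoView(true);", block)
            country.append(driver.find_element(By.XPATH,
                "//div[text()='Company data']/../following-sibling::div/descendant::b[text()='Country']/../following-sibling::div").text)
            isp_country.append(driver.find_element(By.XPATH,
                "//b[text()='ISP']/../following-sibling::div").text)  # assumed sibling pattern
        except Exception:
            country.append("Error")
            isp_country.append("Error")
    driver.quit()
    return pd.DataFrame({'URL': query, 'Country': country, 'ISP': isp_country})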

Related

selenium stale element reference: element is not attached to the page document error

I have an e-commerce page and there are multiple products on a page. I need to click the link of a product, then return to the main page and click the link of the next product, but when I return, the elements can't be found anymore.
Path = "C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(Path)
driver.get("https://www.emag.ro/")
search_bar = driver.find_element_by_id("searchboxTrigger")
search_bar.send_keys("laptopuri")
search_bar.send_keys(Keys.RETURN)
main = None
try:
    main = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "main-container"))
    )
    print("Page loaded, main retrieved successfully")
    print(" ")
except:
    driver.quit()
products = main.find_elements_by_css_selector("div.card-item.js-product-data")
for product in products:
    raw_name = product.text
    raw_price = product.find_element_by_css_selector("p.product-new-price").text
    link = product.find_element_by_tag_name("a")
    # clicking the link
    link.click()
    spec_page = None
    try:
        spec_page = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "col-md-12"))
        )
    except:
        driver.quit()
    print(spec_page)
    driver.back()
After the first iteration, I get the following error:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document, on the line raw_name = product.text, basically at the beginning of the loop.
I assume the page is not loading properly or something like that; I tried using time.sleep before going through the loop, but nothing changed.
When you call driver.back(), it goes back to the previous page, and by the time it returns to the original page, all of the previously located elements have become stale. You need to re-locate them, like below:
This should handle the exception.
products = len(main.find_elements_by_css_selector("div.card-item.js-product-data"))
j = 0
for product in range(products):
    # re-locate from driver each iteration: after driver.back(), earlier references (including main) are stale
    elements = driver.find_elements_by_css_selector("div.card-item.js-product-data")
    raw_name = elements[j].text
    raw_price = elements[j].find_element_by_css_selector("p.product-new-price").text
    link = elements[j].find_element_by_tag_name("a")
    # clicking the link
    link.click()
    spec_page = None
    try:
        spec_page = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "col-md-12"))
        )
    except:
        driver.quit()
    print(spec_page)
    j = j + 1
    driver.back()
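An alternative sketch that sidesteps staleness entirely: harvest each product's href first, then visit the URLs directly instead of clicking and navigating back (selectors and waits reused from the question):
products = main.find_elements_by_css_selector("div.card-item.js-product-data")
# collect the links while the listing page is still live
links = [p.find_element_by_tag_name("a").get_attribute("href") for p in products]
for url in links:
    driver.get(url)  # no back-navigation, so no element goes stale
    spec_page = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "col-md-12"))
    )
    print(spec_page)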

Scroll on a specific DIV Element to the bottom in Python Selenium

I'm trying to do a simple Python Selenium automation on a website, but the site is blocked by a dialog that has to be scrolled to the bottom of its paragraph before you can get into the site.
I tried to use the code below to scroll the paragraph, but it was unsuccessful.
driver = webdriver.Chrome('chromedriver')
driver.maximize_window()
driver.implicitly_wait(30)
driver.get('https://www.fidelity.com.hk/en/our-funds/mpf')
wait = WebDriverWait(driver, 20)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'div[data-action="button68"]'))).click()
time.sleep(1)
ele = driver.find_element_by_css_selector('.content-scrolling-behavior')
driver.execute_script("return arguments[0].scrollIntoView(true);", ele)
I would appreciate any feedback on how to consistently select an option from the dropdown noted in the code provided. And here is the website I am looking at: https://www.fidelity.com.hk/en/our-funds/mpf
You can scroll using an ActionChains move like this:
Also, in that div there are 27 li tags, so I index them with XPath and move the driver focus to each li one by one.
Sample code :
driver.implicitly_wait(30)
driver.maximize_window()
driver.get("https://www.fidelity.com.hk/en/our-funds/mpf")
wait = WebDriverWait(driver, 20)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'div[data-action="button68"]'))).click()
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.container")))
list_size = len(wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//ul[@class='list']/li"))))
print(list_size)
j = 1
for i in range(list_size):
    ActionChains(driver).move_to_element(wait.until(EC.visibility_of_element_located((By.XPATH, f"(//ul[@class='list']/li)[{j}]")))).perform()
    j = j + 1
    time.sleep(1)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "div[class$='btn-confirm']"))).click()
This should work:
ele = driver.find_element_by_css_selector('div.content-scrolling-behavior')
driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", ele)
UPD
Try this instead:
ele = driver.find_element_by_css_selector('div.container')
driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", ele)

Scrolling a particular web element using Selenium python

I'm trying to scrape companies' job offers from LinkedIn. I need to scroll a section of the page (one with an inner scrollbar). I have been trying this:
1.
scroll_active = WebDriverWait(driver, 40).until(EC.presence_of_element_located((By.CSS_SELECTOR, "body > div.application-outlet > div.authentication-outlet > div.job-search-ext > div > div > section.jobs-search__left-rail > div > div > ul")))
scroll_active.location_once_scrolled_into_view
while driver.find_element_by_tag_name('div'):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    Divs = driver.find_element_by_tag_name('div').text
    if 'End of Results' in Divs:
        print('end')
        break
    else:
        continue
I need to extract the 'href' attributes.
If anyone is facing this, I hope this helps: you just have to choose the right element to scroll.
my_xpath = WebDriverWait(driver, 40).until(EC.presence_of_element_located((By.XPATH, "/html/body/div[8]/div[3]/div[3]/div/div/section[1]/div/div")))
driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', my_xpath)
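If the section keeps loading more results as you scroll (LinkedIn's left rail does), a single scrollTop assignment may not reach the true bottom. A sketch that repeats the scroll until the element's scrollHeight stops growing (reusing the my_xpath element handle from above):
import time

last_height = driver.execute_script('return arguments[0].scrollHeight', my_xpath)
while True:
    driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', my_xpath)
    time.sleep(2)  # give newly loaded results time to render
    new_height = driver.execute_script('return arguments[0].scrollHeight', my_xpath)
    if new_height == last_height:
        break  # height stable: nothing more to load
    last_height = new_height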
Why do you need to scroll here? It seems like you can get all of the elements with this command:
elements = driver.find_elements(By.XPATH, "//a[@class='result-card__full-card-link']")
and the full script looks like:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://www.linkedin.com/jobs/search/?f_C=1110%2C12800%2C5115950%2C3165553%2C603115%2C10916%2C8331%2C3297950%2C8238%2C5509188%2C3093%2C2625246%2C1112%2C947572%2C11018069%2C407323&geoId=92000000')
time.sleep(3)

def element_present():
    try:
        driver.find_element(By.XPATH, "//button[@class='infinite-scroller__show-more-button infinite-scroller__show-more-button--visible']")
    except Exception:
        return False
    return True

while not element_present():
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

elements = driver.find_elements(By.XPATH, "//a[@class='result-card__full-card-link']")
hrefs = [el.get_attribute('href') for el in elements]
print(hrefs)
print(len(hrefs))
driver.quit()
I might have missed something, but it seems to work well too.

Can't locate div element - Selenium

I am unable to locate a div element using the css_selector. Please find my code below.
driver = wb.Firefox()
driver.get("https://www.jumia.com.ng/")
driver.maximize_window()  # for maximizing the window
driver.implicitly_wait(20)  # gives an implicit wait of 20 seconds
# WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CLASS_NAME, 'star _s')))
# driver.switch_to.frame(driver.find_element_by_class_name('star _s'))
# selecting phones and tablets
clickObj = driver.find_element_by_xpath("/html/body/div[1]/main/div[1]/div[1]/div[1]/div/a[4]/span").click()
# selecting mobile phones only
driver.find_element_by_xpath("/html/body/div[1]/main/div[2]/div[1]/div/article[1]/a[2]").click()
# selecting smartphones only
driver.find_element_by_xpath("/html/body/div[1]/main/div[2]/div[1]/div/article[1]/a[2]").click()
# selecting android phones only
driver.find_element_by_xpath("/html/body/div[1]/main/div[2]/div[1]/div/article[1]/a[1]").click()
product_info = driver.find_elements_by_css_selector("div.info")
product_name = list()
price = list()
rating = list()
for info in product_info:
    # print(info.find_elements_by_class_name("rev"))
    product_name.append(info.find_element_by_css_selector("h3.name").text)
    rating.append(info.find_element_by_css_selector("div.rev").text)
    price.append(info.find_element_by_css_selector("div.prc").text)
    # rating.append(info.find_element_by_class_name("rev").text)
data = {"product_name": product_name, "rating": rating, "price": price}
df_product = pd.DataFrame.from_dict(data)
It returns the below error despite the presence of an element such as "rev":
NoSuchElementException: Message: Unable to locate element: div.rev
This is the link to the site https://www.jumia.com.ng/android-phones/
Please assist. What am I doing wrong?
The issue is due to the missing space between the tag name and the class name in the find_elements_by_css_selector call: div .info matches elements with class info anywhere inside a div, while div.info only matches a div that itself carries the class.
product_info = driver.find_elements_by_css_selector("div .info")
In addition, product_info is a <class 'list'>, so you cannot call Selenium element methods on it directly. Loop over it and read each element's .text to fill the other lists.
Refer to below code as the sample one:
from selenium import webdriver

driver = webdriver.Firefox()
driver.get("https://www.jumia.com.ng/android-phones/")
driver.maximize_window()  # for maximizing the window
# selecting android phones only
product_info = driver.find_elements_by_css_selector("div .info")
for info in product_info:
    try:
        print(info.text)
    except:
        break
driver.close()
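The NoSuchElementException on div.rev most likely means some cards simply have no rating block (that reading is my assumption). A sketch that tolerates missing fields, using the question's selectors:
import pandas as pd
from selenium.common.exceptions import NoSuchElementException

rows = []
for info in driver.find_elements_by_css_selector("div.info"):
    try:
        name = info.find_element_by_css_selector("h3.name").text
        price = info.find_element_by_css_selector("div.prc").text
    except NoSuchElementException:
        continue  # not a real product tile; skip it
    try:
        rating = info.find_element_by_css_selector("div.rev").text
    except NoSuchElementException:
        rating = None  # no reviews yet for this product
    rows.append({"product_name": name, "rating": rating, "price": price})
df_product = pd.DataFrame(rows)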

looping through a dropdown menu using Selenium and Python

I'm trying to loop through a dropdown menu at this url: https://www.accuform.com/safety-sign/danger-danger-authorized-personnel-only-MADM006
So, for example, the first dropdown menu - under options - lists out different materials and I want to select each one in turn and then gather some other information from the webpage before moving on to the next material. Here is my current code:
driver = webdriver.Firefox()
driver.get('https://www.accuform.com/safety-sign/danger-danger-authorized-personnel-only-MADM006')
time.sleep(3)
driver.find_element_by_id('x-mark-icon').click()
select = Select(driver.find_element_by_name('Wiqj7mb4rsAq9LB'))
options = select.options
optionsList = []
driver.find_elements_by_class_name('select-wrapper')[0].click()
element = driver.find_element_by_xpath("//select[@name='Wiqj7mb4rsAq9LB']")
actions = ActionChains(driver)
actions.move_to_element(element).perform()
# driver.execute_script("arguments[0].scrollIntoView();", element)
for option in options:  # iterate over the options, place attribute value in list
    optionsList.append(option.get_attribute("value"))
for optionValue in optionsList:
    print("starting loop on option %s" % optionValue)
    # select = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//select[@name='Wiqj7mb4rsAq9LB']")))
    # select = Select(select)
    select.select_by_value(optionValue)
I started with just the loop, but got this error:
ElementNotInteractableException: Message: Element <option> could not be scrolled into view
I then added the webdriverwait and get a TimeoutException error.
I then realized I should probably click on the wrapper in which the dropdown is held, so I added the click, which does pop up the menu, but I still got the TimeoutException.
So I thought, maybe I should move to the element, which I tried with the action chain lines and I got this error
WebDriverException: Message: TypeError: rect is undefined
I tried to avoid that error by using this code instead:
# driver.execute_script("arguments[0].scrollIntoView();", element)
Which just resulted in the timeoutexception again.
I'm pretty new to Python and Selenium and have basically just been modifying code from SO answers to similar questions, but nothing has worked.
I'm using python 3.6 and the current versions of Selenium and firefox webdriver.
If anything is unclear or if you need more info just let me know.
Thanks so much!
EDIT: Based on the answer and comments by Kajal Kunda, I've updated my code to the following:
material_dropdown = driver.find_element_by_xpath("//input[@class='select-dropdown']")
driver.execute_script("arguments[0].click();", material_dropdown)
materials = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
for material in materials:
    # material_dropdown = driver.find_element_by_xpath("//input[@class='select-dropdown']")
    # driver.execute_script("arguments[0].click();", material_dropdown)
    # materials = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
    material_ele = material.find_element_by_tag_name('span')
    if material_ele.text != '':
        material_ele.click()
        time.sleep(5)
        price = driver.find_element_by_class_name("dataPriceDisplay")
        print(price.text)
The result is that it successfully prints the price for the first type of material, but then it returns:
The result is that it successfully prints the price for the first type of material, but then it returns:
StaleElementReferenceException: Message: The element reference of <li class=""> is stale;...
I've tried variations with the commented-out lines inside and outside of the loop, but I always get a version of the StaleElementReferenceException error.
Any suggestions?
Thanks!
You could do the whole thing with requests. Grab the drop-down lists from the options on the start page, then concatenate the value attributes into a requests URL that retrieves JSON containing all the info on the page. The same principle applies for adding in other dropdown values: the id for each drop-down selection is the value attribute of the corresponding option, and they appear in the URL I show, separated by // for each drop-down selection.
import requests
from bs4 import BeautifulSoup as bs
url = 'https://www.accuform.com/product/getSku/danger-danger-authorized-personnel-only-MADM006/1/false/null//{}//WHFIw3xXmQx8zlz//6wr93DdrFo5JV//WdnO0RpwKpc4fGF'
startURL = 'https://www.accuform.com/safety-sign/danger-danger-authorized-personnel-only-MADM006'
res = requests.get(startURL)
soup = bs(res.content, 'lxml')
materials = [item['value'] for item in soup.select('#Wiqj7mb4rsAq9LB option')]
sizes = [item['value'] for item in soup.select('#WvXESrTyQjM3Ciw option')]
languages = [item['value'] for item in soup.select('#WUYWGMePtpmpmhy option')]
units = [item['value'] for item in soup.select('#W91eqaJ0WPXwe9b option')]
for material in materials:
    data = requests.get(url.format(material)).json()
    soup = bs(data['dataMaterialBullets'], 'lxml')
    lines = [item.text for item in soup.select('li')]
    print(lines)
    print(data['dataPriceDisplay'])
    # etc......
Try the code below; it should work. (The native select element appears to be hidden behind the theme's styled dropdown, which would explain why select_by_value could not scroll the <option> into view; clicking the styled input and the rendered li items avoids that.)
driver = webdriver.Firefox()
driver.get('https://www.accuform.com/safety-sign/danger-danger-authorized-personnel-only-MADM006')
time.sleep(3)
driver.find_element_by_id('x-mark-icon').click()
material_dropdown = driver.find_element_by_xpath("//input[@class='select-dropdown']")
driver.execute_script("arguments[0].click();", material_dropdown)
# Code for the material dropdown
materials = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
material_optionsList = []
for material in materials:
    material_ele = material.find_element_by_tag_name('span')
    if material_ele.text != '':
        material_optionsList.append(material_ele.text)
print(material_optionsList)
driver.execute_script("arguments[0].click();", material_dropdown)
size_dropdown = driver.find_element_by_xpath("(//input[@class='select-dropdown'])[2]")
driver.execute_script("arguments[0].click();", size_dropdown)
# Code for the size dropdown
Sizes = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
size_optionsList = []
for size in Sizes:
    size_ele = size.find_element_by_tag_name('span')
    if size_ele.text != '':
        size_optionsList.append(size_ele.text)
driver.execute_script("arguments[0].click();", size_dropdown)
Output :
[u'Adhesive Vinyl', u'Plastic', u'Adhesive Dura-Vinyl', u'Aluminum', u'Dura-Plastic\u2122', u'Aluma-Lite\u2122', u'Dura-Fiberglass\u2122', u'Accu-Shield\u2122']
Hope you can do the remaining ones. Let me know if it works for you.
EDIT: Code to loop through the materials and get each price value.
for material in range(len(materials)):
    material_ele = materials[material]
    if material_ele.text != '':
        # material_optionsList.append(material_ele.text)
        # material_ele.click()
        driver.execute_script("arguments[0].click();", material_ele)
        time.sleep(2)
        price = driver.find_element_by_id("priceDisplay")
        print(price.text)
        time.sleep(2)
        material_dropdown = driver.find_element_by_xpath("//input[@class='select-dropdown']")
        driver.execute_script("arguments[0].click();", material_dropdown)
        # re-locate the li elements after reopening the dropdown, so the next iteration's reference is not stale
        materials = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
Output :
$8.31
$9.06
$13.22
$15.91
$15.91
