I want to scrape data from this website (ignore the perfumes that load when you scroll down).
For each perfume I want to get its size. In order to see its size I need to click on the perfume, which leads me to another page.
Assuming I can get the size of a perfume when I'm on its URL, how can I make a program that will give me the URL of every perfume's page on the website?
This is the code that finds the perfume's size when I have the right URL:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By  # locator constants for find_element()

# Listing page of the shop (sorted by title, first page).
urlM = 'https://www.myperfume.co.il/155567-%D7%9B%D7%9C-%D7%94%D7%9E%D7%95%D7%AA%D7%92%D7%99%D7%9D-%D7%9C%D7%92%D7%91' \
       '%D7%A8?order=up_title&page=0'

# Authorise against Google Sheets with a service-account key file.
scope = [
    "https://spreadsheets.google.com/feeds",
    'https://www.googleapis.com/auth/spreadsheets',
    "https://www.googleapis.com/auth/drive.file",
    "https://www.googleapis.com/auth/drive",
]
creds = ServiceAccountCredentials.from_json_keyfile_name("credentials.json", scope)
client = gspread.authorize(creds)
spreadsheet = client.open("Perfumes")

# Headless Chrome; the command-line flag works regardless of Selenium version.
options = ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Placeholder: replace with the URL of a single perfume's page.
perfume_url = "[THE PERFUME'S URL]"
driver.get(perfume_url)
info = driver.find_element(By.XPATH, '//*[(@id = "item_current_sub_title")]//span').text

# Only the first line holds the size. split() is used instead of
# info[:info.find('\n')] because find() returns -1 when there is no newline,
# which would silently chop off the last character.
first_line = info.split('\n', 1)[0]
res = ''.join(
    ch for ch in first_line.replace('גודל', '')
    if ch.isdigit() or ch.isalpha()
)
print(res)
Here you will need the following:
Per each product hover over the product to make "more details" and "add to cart" buttons appear.
Click the "more details" button.
In the opened page get the product size (and any other details).
Get back to the main page.
In order to do that for many products you will have to get the list of products again on the main page. Otherwise you will get stale element exception.
So, your code can be something like this:
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time  # time.sleep() is used below

# XPath matching every product tile on the listing page.
PRODUCT_XPATH = "//div[contains(@class,'layout_list_item')]"

actions = ActionChains(driver)
wait = WebDriverWait(driver, 20)

wait.until(EC.visibility_of_element_located((By.XPATH, PRODUCT_XPATH)))
time.sleep(1)
products = driver.find_elements(By.XPATH, PRODUCT_XPATH)

# Iterate by index and re-locate the tiles on every pass: after navigating to
# a details page and back, the previously found elements are stale.
for i in range(len(products)):
    wait.until(EC.visibility_of_element_located((By.XPATH, PRODUCT_XPATH)))
    time.sleep(1)
    product = driver.find_elements(By.XPATH, PRODUCT_XPATH)[i]
    # hover over the product block so its buttons appear
    actions.move_to_element(product).perform()
    # click the "more details" button
    product.find_element(By.XPATH, ".//p[contains(@class,'extra_button')]").click()
    # in the details page get the product sub-title containing the product size
    product_size = wait.until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, "div#item_current_sub_title"))
    ).text
    # get back to the main page
    driver.execute_script("window.history.go(-1)")
UPD
This is exactly what I run:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
# Listing page of the shop (sorted by title, first page).
urlM = 'https://www.myperfume.co.il/155567-%D7%9B%D7%9C-%D7%94%D7%9E%D7%95%D7%AA%D7%92%D7%99%D7%9D-%D7%9C%D7%92%D7%91' \
       '%D7%A8?order=up_title&page=0'

# XPath matching every product tile on the listing page.
PRODUCT_XPATH = "//div[contains(@class,'layout_list_item')]"

# NOTE(review): executable_path= is the pre-Selenium-4 way of pointing at the
# driver binary; newer releases expect a Service object instead - confirm
# against the installed Selenium version.
driver = webdriver.Chrome(executable_path='chromedriver.exe')
wait = WebDriverWait(driver, 20)
actions = ActionChains(driver)

driver.maximize_window()
driver.get(urlM)

wait.until(EC.visibility_of_element_located((By.XPATH, PRODUCT_XPATH)))
time.sleep(1)
products = driver.find_elements(By.XPATH, PRODUCT_XPATH)

# Iterate by index and re-locate the tiles on every pass: after navigating to
# a details page and back, the previously found elements are stale.
for i in range(len(products)):
    wait.until(EC.visibility_of_element_located((By.XPATH, PRODUCT_XPATH)))
    time.sleep(1)
    product = driver.find_elements(By.XPATH, PRODUCT_XPATH)[i]
    # hover over the product block so its buttons appear
    actions.move_to_element(product).perform()
    # click the "more details" button
    product.find_element(By.XPATH, ".//p[contains(@class,'extra_button')]").click()
    # in the details page get the product sub-title containing the product size
    product_size = wait.until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, "div#item_current_sub_title"))
    ).text
    # only the first line of the sub-title carries the size
    product_size = product_size.split('\n')[0]
    print(product_size)
    # get back to the main page
    driver.execute_script("window.history.go(-1)")
And it prints me the products sizes like גודל: 100 ML
Related
How to navigate through each page without using driver.current_url? In my full code, I get a bunch of errors once I navigate through the page for a loop. Without it, it runs fine but can only go through one page. I want to navigate through as many pages. Any help appreciated, thanks.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from time import sleep  # sleep() is called in the loop below

# Raw string so the backslashes in the Windows path are taken literally.
driver_service = Service(executable_path=r"C:\Program Files (x86)\chromedriver.exe")
driver = webdriver.Chrome(service=driver_service)
driver.maximize_window()  # load web driver
wait = WebDriverWait(driver, 5)

url_test = driver.get('https://www.seek.com.au/data-jobs-in-information-communication-technology/in-All-Perth-WA')
url_template = driver.current_url
template = url_template + '?page={}'

# Each pass scrapes the results page that is currently loaded, then loads
# page number `page` for the next pass.
for page in range(2, 5):
    # Collect plain href strings first so nothing goes stale once we navigate away.
    link_job = [
        x.get_attribute('href')
        for x in driver.find_elements(By.XPATH, "//a[@data-automation='jobTitle']")
    ]
    for job in link_job:
        driver.get(job)
        try:
            quick_apply = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "(//a[@data-automation='job-detail-apply' and @target='_self'])")
                )
            )
            quick_apply.click()
        except Exception:
            # No quick-apply button appeared within the timeout (or the click failed).
            print("No records found " + job)
        sleep(3)
    driver.get(template.format(page))
If I understand you correctly you want to determine dynamically how many pages there are and loop over each of them.
I have managed to achieve this by using a while loop and look on each page if the "Next" button at the bottom is visible. If not, the last page was reached and you can exit the loop.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from time import sleep
driver_service = Service(executable_path="C:\\Users\\Stefan\\bin\\chromedriver.exe")
driver = webdriver.Chrome(service=driver_service)
driver.maximize_window()  # load web driver
wait = WebDriverWait(driver, 5)

url_test = driver.get('https://www.seek.com.au/data-jobs-in-information-communication-technology/in-All-Perth-WA')
url_template = driver.current_url
template = url_template + '?page={}'
page = 1

while True:
    # Gather everything we need from the results page before leaving it.
    link_job = [
        x.get_attribute('href')
        for x in driver.find_elements(By.XPATH, "//a[@data-automation='jobTitle']")
    ]
    # A missing "Next" link means this is the last page. The check is only
    # acted on AFTER the jobs of this page were handled, so the last page is
    # processed too instead of being skipped.
    has_next_page = bool(driver.find_elements(By.XPATH, "//a[@title='Next']"))

    for job in link_job:
        driver.get(job)
        try:
            quick_apply = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "(//a[@data-automation='job-detail-apply' and @target='_self'])")
                )
            )
            quick_apply.click()
        except Exception:
            # No quick-apply button appeared within the timeout (or the click failed).
            print("No records found " + job)
        sleep(3)

    if not has_next_page:
        # last page reached
        break
    page += 1
    driver.get(template.format(page))

driver.close()
It seems your problem is a StaleElementReferenceException raised when you go back from a job page to the job search results page.
The simplest approach to overcome this problem is to keep the URL of the job search results page.
Actually, this is the only point I changed in your code, and it works.
I also replaced driver.find_elements(By.XPATH, "//a[@data-automation='jobTitle']") with wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//a[@data-automation='jobTitle']"))) for better performance.
The code below works, but the web site itself responds badly.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")

# Raw string so the backslashes in the Windows path are taken literally.
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(service=webdriver_service, options=options)
wait = WebDriverWait(driver, 10)

# Results-page URL template; {p} is the page number.
url = 'https://www.seek.com.au/data-jobs-in-information-communication-technology/in-All-Perth-WA?page={p}'

for p in range(1, 20):
    # Fill in the page number - requesting the bare template would ask the
    # site for the literal text "{p}" on every iteration.
    page_url = url.format(p=p)
    driver.get(page_url)
    link_job = [
        x.get_attribute('href')
        for x in wait.until(
            EC.visibility_of_all_elements_located((By.XPATH, "//a[@data-automation='jobTitle']"))
        )
    ]
    for job in link_job:
        driver.get(job)
        try:
            wait.until(
                EC.element_to_be_clickable(
                    (By.XPATH, "(//a[@data-automation='job-detail-apply' and @target='_self'])")
                )
            ).click()
            print("applied")
        except Exception:
            # No quick-apply button appeared within the timeout (or the click failed).
            print("No records found " + job)
        # return to the results page this job came from
        driver.get(page_url)
I'm attempting to use Selenium to print out a list of users I'm subscribed to on a website, of which there are 3. The following code only prints out the first 2 of the three
from xml.dom.minidom import Element
from selenium import webdriver
from selenium.webdriver.support import wait
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService;
from webdriver_manager.chrome import ChromeDriverManager;
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from selenium.webdriver.common.actions.action_builder import ActionBuilder
from selenium.webdriver.common.actions.mouse_button import MouseButton
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
# Chrome client should now start up
driver.get("website-url")  # Go to desired website
title = driver.title

# Log in only when we actually landed on the expected page.
if title == "WebsiteTitle":
    driver.implicitly_wait(5)
    email_box = driver.find_element(By.NAME, "email")  # finds username block
    pass_box = driver.find_element(By.NAME, "password")  # finds password block
    email_box.send_keys('username')  # enter username
    pass_box.send_keys('password')  # enter password
    submit_button = driver.find_element(By.CSS_SELECTOR, "button[type='submit']")  # finds submit button
    submit_button.click()  # clicks it
    wait = WebDriverWait(driver, 15)
    # waits up to 15 sec until the "Home" button on the home page can be clicked
    wait.until(EC.element_to_be_clickable((By.XPATH, "//*[@id='content']/div[1]/div/div/div/div[1]/h1/span")))


def print_subs_list():
    """Open the subscriptions page and print the text of every matched entry."""
    driver.get("website-subs-url")
    subscriptions_list = driver.find_elements(By.XPATH, "//*[@id='content']/div[1]/div/div[4]/div/div/div[1]/div/div[2]/div/div[2]/div/div[1]/a/div")
    for value in subscriptions_list:
        print(value.text)


print_subs_list()
driver.quit()
Now, changing the XPath in subscriptions_list to //*[@id='content']/div[1]/div/div[4]/div/div/div[2]/div/div[2]/div/div[2]/div/div[1]/a/div will print out the 3rd result only.
However, my desired result would be to print all of the subscribed users, as there will definitely be more than 3.
How do I change it so that it will print out all of the subscribed users, regardless of the amount?
You can identify all three desired elements using the XPath:
//*[@id='content']/div[1]/div/div[4]/div/div//div/div/div[2]/div/div[2]/div/div[1]/a/div
Your effective line of code will be:
# The "//" step in the middle matches the entries under every intermediate container.
subscriptions_list = driver.find_elements(By.XPATH, "//*[@id='content']/div[1]/div/div[4]/div/div//div/div/div[2]/div/div[2]/div/div[1]/a/div")
If you visit this site,
https://www.premierleague.com/results
You will be able to see several match results. If you click on each match, you will be directed to another website.
My question is how can I get the href (link) of each match.
# The asker's attempt: read "href" from the matched container element(s).
# NOTE(review): `List` is not defined in this snippet - presumably a list created earlier by the asker.
links = driver.find_elements(By.XPATH, '//*[@id="mainContent"]/div[3]/div[1]')
for link in links:
    x = link.get_attribute("href")
    List.append(x)
This is what I have so far and it is not working.
I see elements like
<div data-href="//www.premierleague.com/match/66686" ...>
and you could search
//div[@data-href]
and later use get_attribute("data-href")
Full working code
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
#from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
#import time
url = 'https://www.premierleague.com/results'

#driver = webdriver.Chrome(executable_path=ChromeDriverManager().install())
# NOTE(review): executable_path= is the pre-Selenium-4 way of pointing at the
# driver binary; newer releases expect a Service object - confirm against the
# installed Selenium version.
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
driver.get(url)

wait = WebDriverWait(driver, 10)
#time.sleep(5)

# close popup window with "Accept All Cookies"
button = wait.until(EC.visibility_of_element_located((By.XPATH, '//button[text()="Accept All Cookies"]')))
button.click()

# Every match row is a <div> carrying its link in a data-href attribute.
all_items = driver.find_elements(By.XPATH, '//div[@data-href]')
print('len(all_items):', len(all_items))
for item in all_items:
    print(item.get_attribute('data-href'))
Result:
len(all_items): 40
//www.premierleague.com/match/66686
//www.premierleague.com/match/66682
//www.premierleague.com/match/66687
//www.premierleague.com/match/66689
//www.premierleague.com/match/66691
//www.premierleague.com/match/66684
//www.premierleague.com/match/66705
//www.premierleague.com/match/66677
//www.premierleague.com/match/66674
//www.premierleague.com/match/66675
//www.premierleague.com/match/66676
//www.premierleague.com/match/66679
//www.premierleague.com/match/66672
//www.premierleague.com/match/66678
//www.premierleague.com/match/66680
//www.premierleague.com/match/66681
//www.premierleague.com/match/66673
//www.premierleague.com/match/66633
//www.premierleague.com/match/66584
//www.premierleague.com/match/66513
//www.premierleague.com/match/66637
//www.premierleague.com/match/66636
//www.premierleague.com/match/66635
//www.premierleague.com/match/66666
//www.premierleague.com/match/66670
//www.premierleague.com/match/66668
//www.premierleague.com/match/66665
//www.premierleague.com/match/66667
//www.premierleague.com/match/66669
//www.premierleague.com/match/66654
//www.premierleague.com/match/66656
//www.premierleague.com/match/66659
//www.premierleague.com/match/66657
//www.premierleague.com/match/66655
//www.premierleague.com/match/66652
//www.premierleague.com/match/66660
//www.premierleague.com/match/66661
//www.premierleague.com/match/66653
//www.premierleague.com/match/66658
//www.premierleague.com/match/66524
I'm trying to pull some Diablo II trading prices of a trading page using Selenium.
So far I've managed to locate the object I'm interested in using classes, but I can't retrieve the actual text which is what I need.
I have the following code:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://traderie.com/diablo2resurrected/product/3382986705/buying?prop_Platform=PC&prop_Mode=softcore&makeOffer=false')
# Close cookie popup
driver.find_element(By.XPATH, '//*[@id="tyche_cmp_modal"]/div/div/div/div[5]/div[2]/a').click()
# Locate entire page of offers
All = driver.find_element(By.CLASS_NAME, "row")
# Locate individual offer
Offer = All.find_element(By.CLASS_NAME, "col-xs-12")
# Locate price in each offer
Price = Offer.find_element(By.CLASS_NAME, "sc-eCImPb")
# Print price
print(str(Price.text))
# Close page
driver.close()
The print turns out empty, but it should return something like 3x Vex, or similar. What am I doing wrong?
There are several issues here:
You should add waits. Preferably Expected Conditions explicit waits.
You are using a wrong locator for price element
Since there are multiple offers there you should iterate over the results in a loop.
Variable names should be lowercased according to accepted convention.
I think your code should be something like this:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
# XPath matching the price links of the offers.
PRICE_XPATH = "//a[contains(@style,'capitalize')]"

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)
driver.get('https://traderie.com/diablo2resurrected/product/3382986705/buying?prop_Platform=PC&prop_Mode=softcore&makeOffer=false')

# Close cookie popup
wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="tyche_cmp_modal"]/div/div/div/div[5]/div[2]/a'))).click()

# Wait for the first price to show up, then give the rest a moment to render.
wait.until(EC.visibility_of_element_located((By.XPATH, PRICE_XPATH)))
time.sleep(1)

prices = driver.find_elements(By.XPATH, PRICE_XPATH)
for price in prices:
    print(price.text)

# Close page
driver.close()
UPD
To iterate over offers and get each offer seller's name and the price you can do something like this:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
# XPath matching the price links of the offers.
PRICE_XPATH = "//a[contains(@style,'capitalize')]"

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)
driver.get('https://traderie.com/diablo2resurrected/product/3382986705/buying?prop_Platform=PC&prop_Mode=softcore&makeOffer=false')

# Close cookie popup
wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="tyche_cmp_modal"]/div/div/div/div[5]/div[2]/a'))).click()

# Wait for the first price to show up, then give the rest a moment to render.
wait.until(EC.visibility_of_element_located((By.XPATH, PRICE_XPATH)))
time.sleep(1)

offers = driver.find_elements(By.CLASS_NAME, "col-xs-12")
for offer in offers:
    # The leading "." keeps both searches scoped to this offer element.
    name = offer.find_element(By.XPATH, ".//a[contains(@href,'profile')]")
    prices = offer.find_elements(By.XPATH, "." + PRICE_XPATH)
    # now you can extract the seller's name with name.text
    # and iterate over the prices with
    for price in prices:
        price_text = price.text
        # You can put all these in a dictionary etc.

# Close page
driver.close()
I have a website like
www.abc.com. On its first page I need to enter an email and press Continue,
and on the second page I need to enter a password and tick the check box.
How can I enter the password on the second page?
I tried with:
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By  # locator constants for find_element()

uamId = "asas"
# NOTE(review): passing the driver path positionally is the pre-Selenium-4
# style; newer releases expect a Service object - confirm against the
# installed Selenium version.
driver = webdriver.Chrome("chromedriver")
# WebDriver needs a fully qualified URL, including the scheme.
driver.get("https://www.abc.com")
print(driver.title)
userid = driver.find_element(By.NAME, "P")
# fill UAM Number
userid.send_keys(uamId)
# 'Xpath' is a placeholder for the locator of the element to click.
elem = driver.find_element(By.XPATH, 'Xpath')
actions = ActionChains(driver)
actions.click(elem).perform()
Before getting the web element you have to validate the element is fully loaded and is clickable / visible.
Also in that specific site you have to click elements in order to insert the email and password.
Your code should look like this:
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
# Placeholder credentials - replace with your own (do not commit real secrets).
your_email = "you@example.com"
your_password = "your-password"

# NOTE(review): passing the driver path positionally is the pre-Selenium-4
# style; newer releases expect a Service object - confirm against the
# installed Selenium version.
driver = webdriver.Chrome("chromedriver")
wait = WebDriverWait(driver, 20)
# WebDriver needs a fully qualified URL, including the scheme.
driver.get("https://www.abc.com")
print(driver.title)

# open account menu
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".navigation__menu #account span"))).click()
wait.until(EC.visibility_of_element_located((By.XPATH, "//a[text()='Sign in']"))).click()

# now you will have to switch into the iframe
wait.until(EC.frame_to_be_available_and_switch_to_it((By.XPATH, "//iframe[@id='disneyid-iframe']")))

# now you can insert the credentials
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[type='email']"))).send_keys(your_email)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[type='password']"))).send_keys(your_password)