Difficulty Selecting Element using Selenium - Python

Glassdoor Job Search Link
Currently attempting to select the "sector" for a job posting. No luck with XPath or CSS selectors so far; any assistance would be appreciated! My code already iterates through each job posting successfully and pulls company name, location, job description, etc. My code is below.
Original code credit: Omer Sakarya and Ken Jee.
Here are some of my attempts:
'.//span[#class="css-1ff36h2 e1pvx6aw0"]'
'.//div[#id="EmpBasicInfo"]//div[#class="d-flex flex-wrap"]/div[5]/span[#class="css-1ff36h2 e1pvx6aw0"]'
'.//div[#class="EmpBasicInfo"]//span[text()="Sector"]//following-sibling::*'
Glassdoor Job Posting/Sector Element
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def get_jobs(keyword, num_jobs, verbose, path, slp_time):
    '''Gathers jobs as a dataframe, scraped from Glassdoor'''

    # Initializing the webdriver
    options = webdriver.ChromeOptions()

    # Uncomment the line below if you'd like to scrape without a new Chrome window every time.
    # options.add_argument('headless')

    # Change the path to where chromedriver is in your home folder.
    driver = webdriver.Chrome(executable_path=path, options=options)
    driver.set_window_size(1120, 1000)

    url = 'https://www.glassdoor.com/Job/' + keyword + '-jobs-SRCH_KO0,14.htm'
    driver.get(url)
    jobs = []

    while len(jobs) < num_jobs:  # If true, we should still be looking for new jobs.

        # Let the page load. Change this number based on your internet speed.
        # Or, wait until the webpage is loaded, instead of hardcoding it.
        time.sleep(slp_time)

        # Test for the "Sign Up" prompt and get rid of it.
        try:
            driver.find_element(By.CSS_SELECTOR, '[data-selected="true"]').click()
        except ElementClickInterceptedException:
            pass

        time.sleep(.1)

        try:
            driver.find_element(By.XPATH, './/div[@id="JAModal"]//span[@alt="Close"]').click()
        except NoSuchElementException:
            pass

        # Going through each job in this page
        job_buttons = driver.find_elements(By.CSS_SELECTOR, '[data-test="job-link"]')  # These are the buttons we're going to click.

        for job_button in job_buttons:

            print("Progress: {}".format(str(len(jobs)) + "/" + str(num_jobs)))
            if len(jobs) >= num_jobs:
                break

            job_button.click()
            time.sleep(1)
            collected_successfully = False

            while not collected_successfully:
                try:
                    company_name = driver.find_element(By.XPATH, './/div[@class="css-xuk5ye e1tk4kwz5"]').text
                    location = driver.find_element(By.XPATH, './/div[@class="css-56kyx5 e1tk4kwz1"]').text
                    job_title = driver.find_element(By.XPATH, './/div[contains(@class, "css-1j389vi e1tk4kwz2")]').text
                    job_description = driver.find_element(By.XPATH, './/div[@class="jobDescriptionContent desc"]').text
                    collected_successfully = True
                except:
                    time.sleep(5)

            try:
                salary_estimate = driver.find_element(By.XPATH, './/span[@class="css-1hbqxax e1wijj240"]').text
            except NoSuchElementException:
                salary_estimate = -1  # You need to set a "not found" value. It's important.

            try:
                rating = driver.find_element(By.CSS_SELECTOR, '[data-test="detailRating"]').text
            except NoSuchElementException:
                rating = -1  # You need to set a "not found" value. It's important.

            # Printing for debugging
            if verbose:
                print("Job Title: {}".format(job_title))
                print("Salary Estimate: {}".format(salary_estimate))
                print("Job Description: {}".format(job_description[:500]))
                print("Rating: {}".format(rating))
                print("Company Name: {}".format(company_name))
                print("Location: {}".format(location))

            # Going to the Company tab...
            # clicking on this:
            # <div class="tab" data-tab-type="overview"><span>Company</span></div>
            try:
                driver.find_element(By.XPATH, './/div[@class="tab" and @data-tab-type="overview"]').click()

                try:
                    # <div class="infoEntity">
                    #     <label>Headquarters</label>
                    #     <span class="value">San Francisco, CA</span>
                    # </div>
                    headquarters = driver.find_element(By.XPATH, './/div[@class="infoEntity"]//label[text()="Headquarters"]//following-sibling::*').text
                except NoSuchElementException:
                    headquarters = -1

                try:
                    size = driver.find_element(By.XPATH, './/div[@id="EmpBasicInfo"]//div[@class="d-flex flex-wrap"]/div[1]/span[@class="css-1ff36h2 e1pvx6aw0"]').text
                except NoSuchElementException:
                    size = -1

                try:
                    founded = driver.find_element(By.XPATH, './/div[@class="css-1pldt9b e1pvx6aw1"]//span[text()="Founded"]//following-sibling::*').text
                except NoSuchElementException:
                    founded = -1

                try:
                    type_of_ownership = driver.find_element(By.XPATH, './/div[@class="infoEntity"]//label[text()="Type"]//following-sibling::*').text
                except NoSuchElementException:
                    type_of_ownership = -1

                try:
                    industry = driver.find_element(By.XPATH, './/div[@id="EmpBasicInfo"]//div[@class="d-flex flex-wrap"]/div[4]/span[@class="css-1ff36h2 e1pvx6aw0"]').text
                except NoSuchElementException:
                    industry = -1

                try:
                    sector = driver.find_element(By.XPATH, ".//div[@id='EmpBasicInfo']//div[@class='d-flex flex-wrap']/div[5]/span[@class='css-1pldt9b e1pvx6aw1']//following-sibling::*").text
                except NoSuchElementException:
                    sector = -1

                try:
                    revenue = driver.find_element(By.XPATH, './/span[@class="css-1ff36h2 e1pvx6aw0"]').text
                except NoSuchElementException:
                    revenue = -1

                try:
                    competitors = driver.find_element(By.XPATH, './/div[@class="infoEntity"]//label[text()="Competitors"]//following-sibling::*').text
                except NoSuchElementException:
                    competitors = -1

            except NoSuchElementException:  # Rarely, some job postings do not have the "Company" tab.
                headquarters = -1
                size = -1
                founded = -1
                type_of_ownership = -1
                industry = -1
                sector = -1
                revenue = -1
                competitors = -1

            if verbose:
                print("Headquarters: {}".format(headquarters))
                print("Size: {}".format(size))
                print("Founded: {}".format(founded))
                print("Type of Ownership: {}".format(type_of_ownership))
                print("Industry: {}".format(industry))
                print("Sector: {}".format(sector))
                print("Revenue: {}".format(revenue))
                print("Competitors: {}".format(competitors))
                print("####################################################")

            jobs.append({"Job Title": job_title,
                         "Salary Estimate": salary_estimate,
                         "Job Description": job_description,
                         "Rating": rating,
                         "Company Name": company_name,
                         "Location": location,
                         "Headquarters": headquarters,
                         "Size": size,
                         "Founded": founded,
                         "Type of ownership": type_of_ownership,
                         "Industry": industry,
                         "Sector": sector,
                         "Revenue": revenue,
                         "Competitors": competitors})
            # add job to jobs

        # Clicking on the "next page" button
        try:
            driver.find_element(By.CSS_SELECTOR, "[alt='next-icon']").click()
        except NoSuchElementException:
            print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
            break

    return pd.DataFrame(jobs)  # This line converts the list of dictionaries into a pandas DataFrame.
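
As an aside, the "wait until the webpage is loaded, instead of hardcoding it" comment in the loop can be realized with an explicit wait in place of time.sleep(slp_time). A minimal sketch, assuming the job links are the right element to wait for (WebDriverWait and expected_conditions are already imported at the top of the script):

wait = WebDriverWait(driver, 15)
# block until at least one job link is in the DOM, instead of sleeping a fixed time
wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '[data-test="job-link"]')))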

Looks like I was missing the click through to the other tab on the page.
driver.find_element(By.XPATH, './/div[@class="css-r7fjfn ead8scz1"]').click()
Thanks all that attempted to assist!
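
For anyone landing here with the same problem, a minimal sketch of the fix, combining the tab click above with a label-anchored lookup. The sibling-span structure is an assumption on my part: Glassdoor's css-* class names are auto-generated and change between deployments, so anchoring on the visible "Sector" text should be more durable than hardcoding classes.

try:
    # open the company tab first, then read the value next to the "Sector" label
    driver.find_element(By.XPATH, './/div[@class="css-r7fjfn ead8scz1"]').click()
    sector = driver.find_element(By.XPATH, './/span[text()="Sector"]/following-sibling::span').text
except NoSuchElementException:
    sector = -1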

You have to use the XPath below for the job posting:
//a[contains(@data-test, 'post-jobs')]
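For example, paired with an explicit wait so the click doesn't race the page load (this assumes the link is clickable once located):

wait = WebDriverWait(driver, 10)
wait.until(EC.element_to_be_clickable((By.XPATH, "//a[contains(@data-test, 'post-jobs')]"))).click()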

Related

How can I select the Suburb/City dropdown on Amazon using Selenium and Python?

I'm trying to make a bot to order a product on Amazon using Python and Selenium. I have it working so that it enters text input details such as name; however, the way Amazon does dropdowns doesn't use normal select elements. I'm trying to click on the element which I believe to be the trigger with Selenium, but that just isn't working. If anyone has any ideas, that would be greatly appreciated.
The code is below; you will have to replace the details part so you can log in and test it.
from selenium import webdriver
from selenium.webdriver.common.by import By
import selenium.webdriver.support.ui as ui
from selenium.webdriver.support.select import Select
from selenium.webdriver.support import expected_conditions as EC
import logging

AMAZON_PRODUCT_LINK = "https://www.amazon.com.au/PlayStation-5-Console/dp/B08HHV8945/ref=sr_1_2?crid=1ZDXB7YRF1RXT&keywords=playstation+5&qid=1648454510&rnid=5367991051&s=videogames&sprefix=playstatio%2Caps%2C303&sr=1-2"
TEST_LINK = "https://www.amazon.com.au/PlayStation-4-500GB-Console-Black/dp/B0773RV962/ref=sr_1_3?crid=1O9CWY87HOD3T&keywords=playstation&qid=1648454640&s=videogames&sprefix=playstation+%2Cvideogames%2C238&sr=1-3"

# AMAZON XPATHs
UNAVAILABLE_XPATH = "/html/body/div[2]/div[2]/div[6]/div[4]/div[4]/div[19]/div[1]/span"
AVAILABLE_XPATH = "/html/body/div[2]/div[2]/div[6]/div[4]/div[1]/div[2]/div/div/div/div/div/div/form/div/div/div/div/div[3]/div/div[4]/div/div[1]/span"

# FORM INFO
BUY_BUTTON_XPATH = '//*[@id="buy-now-button"]'
EMAIL_INPUT_XPATH = '//*[@id="ap_email"]'
CONTINUE_BUTTON_XPATH = '//*[@id="continue"]'
PASSWORD_INPUT_XPATH = '//*[@id="ap_password"]'
SUBMIT_BUTTON_XPATH = '//*[@id="signInSubmit"]'

# BILLING INFO
FULL_NAME_XPATH = '//*[@id="address-ui-widgets-enterAddressFullName"]'
PHONE_XPATH = '//*[@id="address-ui-widgets-enterAddressPhoneNumber"]'
COUNTRY_XPATH = '//*[@id="address-ui-widgets-countryCode-dropdown-nativeId"]'
ADDRESS_1_XPATH = '//*[@id="address-ui-widgets-enterAddressLine1"]'
ADDRESS_2_XPATH = '//*[@id="address-ui-widgets-enterAddressLine2"]'
POST_CODE_XPATH = '//*[@id="address-ui-widgets-enterAddressPostalCode"]'
CITY_XPATH = '/html/body/div[5]/div[2]/div[3]/div[1]/div/div[1]/form/div/div[1]/div/div[14]/span/span/span/span'
CONTINUE_XPATH = '/html/body/div[5]/div[2]/div[2]/div[1]/div/div[1]/form/div/span/span/span/input'

# DETAILS
ACCOUNT_EMAIL = "EMAIL@email.com"
ACCOUNT_PASSWORD = "password123"
FULL_NAME = "FULL NAME"
PHONE_NUMBER = "PHONE"
COUNTRY = 'COUNTRY'
ADDRESS_1 = "ADDRESS 1"
ADDRESS_2 = "ADDRESS 2"
POST_CODE = "CODE"
CITY = 'SUBURB'

driver = webdriver.Firefox()
wait = ui.WebDriverWait(driver, 10)

try:
    # driver.get(AMAZON_PRODUCT_LINK)
    driver.get(TEST_LINK)
    try:
        try:
            print(driver.find_element(by=By.XPATH, value=UNAVAILABLE_XPATH).text)
        except:
            value = wait.until(EC.presence_of_element_located((By.XPATH, AVAILABLE_XPATH)))
            print(value.text)
            buy_button = wait.until(EC.presence_of_element_located((By.XPATH, BUY_BUTTON_XPATH)))
            buy_button.click()
            email_input = wait.until(EC.presence_of_element_located((By.XPATH, EMAIL_INPUT_XPATH)))
            email_input.send_keys(ACCOUNT_EMAIL)
            continue_button = wait.until(EC.presence_of_element_located((By.XPATH, CONTINUE_BUTTON_XPATH)))
            continue_button.click()
            password_input = wait.until(EC.presence_of_element_located((By.XPATH, PASSWORD_INPUT_XPATH)))
            password_input.send_keys(ACCOUNT_PASSWORD)
            continue_button = wait.until(EC.presence_of_element_located((By.XPATH, SUBMIT_BUTTON_XPATH)))
            continue_button.click()
            try:
                full_name = wait.until(EC.presence_of_element_located((By.XPATH, FULL_NAME_XPATH)))
                full_name.send_keys(FULL_NAME)
                phone = wait.until(EC.presence_of_element_located((By.XPATH, PHONE_XPATH)))
                phone.send_keys(PHONE_NUMBER)
                address_1 = wait.until(EC.presence_of_element_located((By.XPATH, ADDRESS_1_XPATH)))
                address_1.send_keys(ADDRESS_1)
                address_2 = wait.until(EC.presence_of_element_located((By.XPATH, ADDRESS_2_XPATH)))
                address_2.send_keys(ADDRESS_2)
                post_code = wait.until(EC.presence_of_element_located((By.XPATH, POST_CODE_XPATH)))
                post_code.clear()
                post_code.send_keys(POST_CODE)
                country = wait.until(EC.presence_of_element_located((By.XPATH, COUNTRY_XPATH)))
                country.click()
                city = wait.until(EC.presence_of_element_located((By.XPATH, CITY_XPATH)))
                # city.set_attribute('aria-pressed', 'true')
                city.click()
                city_chosen = wait.until(EC.presence_of_element_located((By.XPATH, CITY)))
                city_chosen.click()
                continue_button = wait.until(EC.presence_of_element_located((By.XPATH, CONTINUE_XPATH)))
                continue_button.click()
            except Exception as Argument:
                logging.exception("Something went wrong when logging in.")
    except Exception as Argument:
        logging.exception("Something went wrong when checking the availability of this product.")
except Exception as Argument:
    logging.exception("Something went wrong with the URL you provided.")
# driver.quit()
P.S. I don't want any comments about my use of try and except. Right now I just need something that's working.
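
Not a full answer, but one pattern that often works for these non-native dropdowns: presence_of_element_located can hand back an element that is not yet interactable, so use element_to_be_clickable for anything you click, then pick the option by its visible text once the popover opens. A rough sketch; the a-dropdown-link locator below is an assumption, so inspect the expanded list in the live DOM to confirm it:

country = wait.until(EC.element_to_be_clickable((By.XPATH, COUNTRY_XPATH)))
country.click()
# after the trigger opens, the options render as anchors rather than <option> tags
option = wait.until(EC.element_to_be_clickable(
    (By.XPATH, "//a[contains(@class, 'a-dropdown-link') and normalize-space()='" + COUNTRY + "']")))
option.click()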

Scraping with Selenium - can't click on clickable text

I am trying to scrape some data from Yahoo Finance. For each stock, I want to get the historical data. Taking the Apple stock as an example, I should go to https://finance.yahoo.com/quote/AAPL/history?p=AAPL and choose "MAX" from "Time Period".
I believe the script I wrote so far is getting the date element, but somehow clicking on it so I can choose "MAX" is not working.
here is my whole script:
# using linux here
import os

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException

project_path = os.getcwd()
driver_path = project_path + "/" + "chromedriver"
yahoo_finance = "https://finance.yahoo.com/quote/"
driver = webdriver.Chrome(driver_path)

def get_data(symbol='AAPL'):
    stock_history_link = yahoo_finance + symbol + '/history?p=' + symbol
    driver.get(stock_history_link)
    date_picker = '//div[contains(@class, "D(ib)") and contains(@class, "Pos(r)") and contains(@class, "Cur(p)")' \
                  ' and contains(@class, "O(n):f")]'
    try:
        print("I am inside")
        date_picker_2 = "//div[@class='Pos(r) D(ib) O(n):f Cur(p)']"
        date_picker_element = driver.find_element_by_xpath(date_picker_2)
        print("date_picker_element: ", date_picker_element)
        date_picker_element.click()
        try:
            print("I will be waiting for the date")
            my_dropdown = WebDriverWait(driver, 100).until(
                EC.presence_of_element_located((By.ID, 'dropdown-menu'))
            )
            print(my_dropdown)
            print("I am not waiting anymore")
        except TimeoutException as e:
            print("wait timed out")
            print(e)
    except WebDriverException:
        print("Something went wrong while trying to pick the max date")

if __name__ == '__main__':
    try:
        get_data()
    except:
        pass
    # finally:
    #     driver.quit()
To click the "Max" button, just open the time-period picker and target it:
driver.get("https://finance.yahoo.com/quote/AAPL/history?p=AAPL")
wait = WebDriverWait(driver, 10)
wait.until(EC.element_to_be_clickable((By.XPATH, "//span[#class='C($linkColor) Fz(14px)']"))).click()
wait.until(EC.element_to_be_clickable((By.XPATH, "//button[#data-value='MAX']"))).click()
Element:
<button class="Py(5px) W(45px) Fz(s) C($tertiaryColor) Cur(p) Bd Bdc($seperatorColor) Bgc($lv4BgColor) Bdc($linkColor):h Bdrs(3px)" data-value="MAX"><span>Max</span></button>
Imports:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
You have the wrong XPath for date_picker_2; it should be:
date_picker_2 = '//*[#id="Col1-1-HistoricalDataTable-Proxy"]/section/div[1]/div[1]/div[1]/div/div/div/span'
Using requests:
import requests
import datetime
end = int(datetime.datetime.strptime(datetime.date.today().isoformat(), "%Y-%m-%d").timestamp())
url = f"https://finance.yahoo.com/quote/AAPL/history?period1=345427200&period2={end}&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true"
requests.get(url)
Gets you to the same end page.
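
One caveat if you take the requests route: Yahoo sometimes serves an error page to the default python-requests User-Agent, so if the response comes back empty it is worth sending a browser-like header (the UA string below is only an example):

headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"}
response = requests.get(url, headers=headers)
print(response.status_code)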

trying to close popover - python - selenium - Glassdoor

Trying to close a popover while scraping Glassdoor for jobs. It keeps popping up from time to time, and I need to close it every time. I've tried quite a few things.
Tried closing it by looking for the close button. Please help!
driver.find_element_by_class_name("SVG_Inline modal_closeIcon").click()
Tried looking for an ElementClickInterceptedException when the bot couldn't click on the next company, and everywhere else there was a click:
element = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.CLASS_NAME, "SVG_Inline-svg modal_closeIcon-svg")))
element.click()
This is the website:
https://www.glassdoor.co.uk/Job/web-developer-jobs-SRCH_KO0,13.htm
This is the complete code:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def get_jobs(keyword, num_jobs, verbose, place):
    '''Gathers jobs as a dataframe, scraped from Glassdoor'''

    # Initializing the webdriver
    options = webdriver.ChromeOptions()

    # Uncomment the line below if you'd like to scrape without a new Chrome window every time.
    # options.add_argument('headless')

    # Change the path to where chromedriver is in your home folder.
    # driver = webdriver.Chrome(executable_path="/Users/omersakarya/Documents/GitHub/scraping-glassdoor-selenium/chromedriver", options=options)
    driver = webdriver.Chrome()
    driver.set_window_size(1120, 1000)

    url = "https://www.glassdoor.co.uk/Job/web-developer-jobs-SRCH_KO0,13.htm"
    driver.get(url)
    jobs = []

    time.sleep(3)
    driver.find_element_by_id("onetrust-accept-btn-handler").click()
    time.sleep(3)

    while len(jobs) < num_jobs:  # If true, we should still be looking for new jobs.
        job_buttons = driver.find_elements_by_class_name("jl")  # jl for Job Listing. These are the buttons we're going to click.
        try:
            for job_button in job_buttons:
                if len(jobs) >= num_jobs:
                    break
                print("Progress: {}".format(str(len(jobs)) + "/" + str(num_jobs)))
                job_button.click()

                collected_successfully = False
                while not collected_successfully:
                    try:
                        company_name = driver.find_element_by_xpath('.//div[@class="employerName"]').text
                        location = driver.find_element_by_xpath('.//div[@class="location"]').text
                        job_title = driver.find_element_by_xpath('.//div[contains(@class, "title")]').text
                        job_description = driver.find_element_by_xpath('.//div[@class="jobDescriptionContent desc"]').text
                        collected_successfully = True
                    except:
                        time.sleep(5)

                try:
                    # salary_estimate = driver.find_element_by_xpath('.//span[@class="gray salary"]').text
                    salary_estimate = driver.find_element_by_xpath('//*[@id="HeroHeaderModule"]/div[3]/div[1]/div[4]/span').text
                except NoSuchElementException:
                    salary_estimate = -1  # You need to set a "not found" value. It's important.

                try:
                    rating = driver.find_element_by_xpath('.//span[@class="rating"]').text
                except NoSuchElementException:
                    rating = -1  # You need to set a "not found" value. It's important.

                # Printing for debugging
                if verbose:
                    print("Job Title: {}".format(job_title))
                    print("Salary Estimate: {}".format(salary_estimate))
                    print("Job Description: {}".format(job_description[:500]))
                    print("Rating: {}".format(rating))
                    print("Company Name: {}".format(company_name))
                    print("Location: {}".format(location))

                # Going to the Company tab...
                # clicking on this:
                # <div class="tab" data-tab-type="overview"><span>Company</span></div>
                try:
                    driver.find_element_by_xpath('.//div[@class="tab" and @data-tab-type="overview"]').click()

                    try:
                        # <div class="infoEntity">
                        #     <label>Headquarters</label>
                        #     <span class="value">San Francisco, CA</span>
                        # </div>
                        headquarters = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Headquarters"]//following-sibling::*').text
                    except NoSuchElementException:
                        headquarters = -1

                    try:
                        size = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Size"]//following-sibling::*').text
                    except NoSuchElementException:
                        size = -1

                    try:
                        founded = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Founded"]//following-sibling::*').text
                    except (NoSuchElementException, StaleElementReferenceException):
                        founded = -1

                    try:
                        type_of_ownership = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Type"]//following-sibling::*').text
                    except NoSuchElementException:
                        type_of_ownership = -1

                    try:
                        industry = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Industry"]//following-sibling::*').text
                    except NoSuchElementException:
                        industry = -1

                    try:
                        sector = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Sector"]//following-sibling::*').text
                    except NoSuchElementException:
                        sector = -1

                    try:
                        revenue = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Revenue"]//following-sibling::*').text
                    except NoSuchElementException:
                        revenue = -1

                    try:
                        competitors = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Competitors"]//following-sibling::*').text
                    except NoSuchElementException:
                        competitors = -1

                except (NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException):  # Rarely, some job postings do not have the "Company" tab.
                    if NoSuchElementException:
                        time.sleep(1)
                        headquarters = -1
                        size = -1
                        founded = -1
                        type_of_ownership = -1
                        industry = -1
                        sector = -1
                        revenue = -1
                        competitors = -1
                    else:
                        driver.find_element_by_class_name("selected").click()
                        driver.find_element_by_class_name("SVG_Inline modal_closeIcon").click()
                        element = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.CLASS_NAME, "SVG_Inline-svg modal_closeIcon-svg")))
                        element.click()

                if verbose:
                    print("Headquarters: {}".format(headquarters))
                    print("Size: {}".format(size))
                    print("Founded: {}".format(founded))
                    print("Type of Ownership: {}".format(type_of_ownership))
                    print("Industry: {}".format(industry))
                    print("Sector: {}".format(sector))
                    print("Revenue: {}".format(revenue))
                    print("Competitors: {}".format(competitors))
                    print("####################################################")

                jobs.append({"Job Title": job_title,
                             "Salary Estimate": salary_estimate,
                             "Job Description": job_description,
                             "Rating": rating,
                             "Company Name": company_name,
                             "Location": location,
                             "Headquarters": headquarters,
                             "Size": size,
                             "Founded": founded,
                             "Type of ownership": type_of_ownership,
                             "Industry": industry,
                             "Sector": sector,
                             "Revenue": revenue,
                             "Competitors": competitors})
                # add job to jobs
                # time.sleep(0.5)

        except (ElementClickInterceptedException, StaleElementReferenceException):
            alertObj = driver.switch_to.alert
            alertObj.accept()
            alertObj.dismiss()
            driver.find_element_by_class_name("selected").click()
            driver.find_element_by_class_name("SVG_Inline modal_closeIcon").click()
            element = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.CLASS_NAME, "SVG_Inline-svg modal_closeIcon-svg")))
            element.click()

        # Clicking on the "next page" button
        # try:
        #     driver.find_element_by_xpath('.//li[@class="page"]//a').click()
        # except NoSuchElementException:
        #     print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
        #     break
        # time.sleep(5)
        try:
            driver.find_element_by_xpath('.//li[@class="next"]//a').click()
        except ElementClickInterceptedException:
            # print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
            driver.find_element_by_class_name("selected").click()
            driver.find_element_by_class_name("SVG_Inline modal_closeIcon").click()
            element = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.CLASS_NAME, "SVG_Inline-svg modal_closeIcon-svg")))
            element.click()
            element.text
        # break

    return pd.DataFrame(jobs)  # This line converts the list of dictionaries into a pandas DataFrame.
df = gs.get_jobs(keyword, num_jobs, False, place)
Trying to get rid of this:
[Screenshot of the popover element I need to close so the loop can continue]
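
One likely culprit in the attempts above: By.CLASS_NAME (and find_element_by_class_name) accepts a single class name only, so compound values like "SVG_Inline modal_closeIcon" can never match. A CSS selector can chain both classes. A sketch, assuming those two classes really are on the close icon:

# CLASS_NAME cannot take two space-separated classes; chain them in a CSS selector instead
close_icon = WebDriverWait(driver, 3).until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, ".SVG_Inline.modal_closeIcon")))
close_icon.click()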

In Selenium, how to find out the exact number of XPath links with different ids?

With Python 3 and Selenium I want to automate searching on a public information site. On this site you enter a person's name, then select the chosen spelling of that name (with or without accents, or name variations), open a page with the list of lawsuits found, and from that list you can open each case's page.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.keys import Keys
import time
import re
Name that will be searched
name = 'JOSE ROBERTO ARRUDA'
Create path, search start link, and empty list to store information
firefoxPath="/home/abraji/Documentos/Code/geckodriver"
link = 'https://ww2.stj.jus.br/processo/pesquisa/?aplicacao=processos.ea'
processos = []
Call driver and go to first search page
driver = webdriver.Firefox(executable_path=firefoxPath)
driver.get(link)
Position cursor, fill and click
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idParteNome'))).click()
time.sleep(1)
driver.find_element_by_xpath('//*[#id="idParteNome"]').send_keys(name)
time.sleep(6)
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idBotaoPesquisarFormularioExtendido'))).click()
Mark all spelling possibilities for searching
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idBotaoMarcarTodos'))).click()
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idBotaoPesquisarMarcados'))).click()
time.sleep(1)
Check how many pages of data there are - to be used in "for range"
capta = driver.find_element_by_xpath('//*[@id="idDivBlocoPaginacaoTopo"]/div/span/span[2]').text
print(capta)
paginas = int(re.search(r'\d+', capta).group(0))
paginas = int(paginas) + 1
print(paginas)
Capture routine
for acumula in range(1, paginas):
    # Fill the field with the page number and press enter
    driver.find_element_by_xpath('//*[@id="idDivBlocoPaginacaoTopo"]/div/span/span[2]/input').send_keys(acumula)
    driver.find_element_by_xpath('//*[@id="idDivBlocoPaginacaoTopo"]/div/span/span[2]/input').send_keys(Keys.RETURN)
    time.sleep(2)

    # Captures the number of processes found on the current page - qt
    qt = driver.find_element_by_xpath('//*[@id="idDivBlocoMensagem"]/div/b').text
    qt = int(qt) + 2
    print(qt)

    # Iterate over the found number of processes
    for item in range(2, qt):
        # Build the XPATH of each process link - start at number 2
        vez = '//*[@id="idBlocoInternoLinhasProcesso"]/div[' + str(item) + ']/span[1]/span[1]/span[1]/span[2]/a'
        print(vez)

        # Access the direct link and click
        element = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, vez)))
        element.click()

        # Run tests to get data
        try:
            num_unico = driver.find_element_by_xpath('//*[@id="idProcessoDetalhesBloco1"]/div[6]/span[2]/a').text
        except NoSuchElementException:
            num_unico = "sem_numero_unico"
        try:
            nome_proc = driver.find_element_by_xpath('//*[@id="idSpanClasseDescricao"]').text
        except NoSuchElementException:
            nome_proc = "sem_nome_encontrado"
        try:
            data_autu = driver.find_element_by_xpath('//*[@id="idProcessoDetalhesBloco1"]/div[5]/span[2]').text
        except NoSuchElementException:
            data_autu = "sem_data_encontrada"

        # Fills dictionary and list
        dicionario = {"num_unico": num_unico,
                      "nome_proc": nome_proc,
                      "data_autu": data_autu}
        processos.append(dicionario)

        # Return a page to click on next process
        driver.execute_script("window.history.go(-1)")
# Close driver
driver.quit()
In this case I captured the number of link pages (3) and the total number of links (84). So my initial idea was to run the "for" three times and, within each run, work through the 84 links.
The direct address of each link is in the XPath (//*[@id="idBlocoInternoLinhasProcesso"]/div[41]/span[1]/span[1]/span[1]/span[2]/a), where I substitute "item" for the div index to click.
For example, when it arrives at number 42 I get an error, because the first page only goes up to 41.
My problem is how to go to the second page and then restart only the inner "for".
I think the ideal would be to know the exact number of links on each of the three pages
Anyone have any ideas?
The code below is the "Capture routine":
wait = WebDriverWait(driver, 20)
# ...
while True:
    links = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//span[contains(@class,'classSpanNumeroRegistro')]")))
    print("links len", len(links))
    for i in range(1, len(links) + 1):
        # Access the direct link and click
        wait.until(EC.element_to_be_clickable((By.XPATH, f"(//span[contains(@class,'classSpanNumeroRegistro')])[{i}]//a"))).click()
        # Run tests to get data
        try:
            num_unico = driver.find_element_by_xpath('//*[@id="idProcessoDetalhesBloco1"]/div[6]/span[2]/a').text
        except NoSuchElementException:
            num_unico = "sem_numero_unico"
        try:
            nome_proc = driver.find_element_by_xpath('//*[@id="idSpanClasseDescricao"]').text
        except NoSuchElementException:
            nome_proc = "sem_nome_encontrado"
        try:
            data_autu = driver.find_element_by_xpath('//*[@id="idProcessoDetalhesBloco1"]/div[5]/span[2]').text
        except NoSuchElementException:
            data_autu = "sem_data_encontrada"
        # Fills dictionary and list
        dicionario = {"num_unico": num_unico,
                      "nome_proc": nome_proc,
                      "data_autu": data_autu}
        processos.append(dicionario)
        # Return a page to click on next process
        driver.execute_script("window.history.go(-1)")
    # wait.until(EC.presence_of_element_located((By.CLASS_NAME, "classSpanPaginacaoImagensDireita")))
    next_page = driver.find_elements_by_css_selector(".classSpanPaginacaoProximaPagina")
    if len(next_page) == 0:
        break
    next_page[0].click()
You can try running the loop for as long as the next button is present on the screen. The logic will look like this:
try:
    next_page = driver.find_element_by_class_name('classSpanPaginacaoProximaPagina')
    if next_page.is_displayed():
        next_page.click()
except NoSuchElementException:
    print('next page does not exist')
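
Putting that check together with the capture routine, the outer loop could look roughly like this (a sketch reusing the names from the code above):

while True:
    # ... capture every process link on the current page ...
    try:
        next_page = driver.find_element_by_class_name('classSpanPaginacaoProximaPagina')
        if next_page.is_displayed():
            next_page.click()
        else:
            break
    except NoSuchElementException:
        break  # no more pages, stop paginating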

Python Selenium + Datepicker Click

I have been banging my head trying to get the price of a room, like this one for example, by clicking the first available (green) datepicker check-in input and then clicking the first available datepicker check-out input, so the price for the minimum period is generated.
My code is a mess, so I would really appreciate it if someone could post cleaner code to achieve that.
I am using Python Selenium + Scrapy, although something in Java, for example, would still help.
UPDATE:
here is the code:
def availability(self, doc):
    url = doc['url'] + '#calendar'
    self.driver.get(url)
    is_active = True

    # We want the availability/price for each day in a month.
    availabilities = []

    # wait for the check-in input to load
    wait = WebDriverWait(self.driver, 10)
    try:
        elem = wait.until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR, ".dates-group input[name=startDateInput]")
            )
        )
    except TimeoutException:
        pass
    else:
        elem.click()  # open calendar
        # wait for datepicker to load
        wait.until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR, '.ui-datepicker:not(.loading)'))
        )
        days = self.driver.find_elements_by_css_selector(
            "#ui-datepicker-div tr td"
        )
        for cell in days:
            day = cell.text.strip()
            if not day:
                continue
            if "full-changeover" not in cell.get_attribute("class"):
                available = False
            else:
                available = True
            self.logger.warning('CELL "%s"', cell)
            self.logger.warning('DAY "%s"', day)
            self.logger.warning('available "%s"', available)

        # The first iteration was to list the availability, now we want to
        # click the first available element to get the price
        for cell in days:
            day = cell.text.strip()
            if not day:
                continue
            if "full-changeover" in cell.get_attribute("class"):
                self.logger.warning('CLICK IT "%s"', day)
                self.driver.implicitly_wait(10)
                x = self.driver.find_element_by_xpath("//table/tbody/tr/td/a[text()=" + day + "]")
                self.driver.implicitly_wait(10)
                x.click()  # Element not found in the cache issue here
                # import ipdb; ipdb.set_trace()
                # self.logger.warning('CELL "%s"', cell)
                # self.logger.warning('DAY "%s"', day)
                # self.logger.warning('available "%s"', available)
        # elem.click()  # close check-in calendar

    # Now let's click on the checkout input to get the price and minimum
    # number of days. We probably don't have to wait for the checkout
    # because it's already loaded, but you never know.
    try:
        elem = wait.until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR,
                 ".dates-group input[name=endDateInput]")
            )
        )
    except TimeoutException:
        pass
    else:
        # elem.click()  # open calendar in checkout input
        # wait for datepicker to load
        wait.until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR, '.ui-datepicker:not(.loading)'))
        )
        days = self.driver.find_elements_by_css_selector(
            "#ui-datepicker-div tr td"
        )
        for cell in days:
            day = cell.text.strip()
            if not day:
                continue
            # This is the first available date to checkout
            if "full-changeover" in cell.get_attribute("class"):
                self.logger.warning('CLICK IT "%s"', available)
                import ipdb; ipdb.set_trace()
                # Here we would get the generated price
                self.logger.warning('CELL "%s"', cell)
                self.logger.warning('DAY "%s"', day)
                self.logger.warning('available "%s"', available)

    import ipdb; ipdb.set_trace()
    return {'availabilities': availabilities, 'is_active': is_active}
Thanks
One tricky thing about this calendar is that you first need to hover over a particular day, then relocate the now-active day and click it. Here is a working implementation that selects the first available start and end dates and prints the calculated price:
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
driver.maximize_window()
wait = WebDriverWait(driver, 10)
url = 'https://www.homeaway.pt/arrendamento-ferias/p1418427a?uni_id=1590648'
driver.get(url)
# pick start date
start_date = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".quotebar-container input[name=startDateInput]")))
start_date.click()
first_available_date = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#ui-datepicker-div td.full-changeover > a")))
ActionChains(driver).move_to_element(first_available_date).perform()
driver.find_element_by_css_selector("#ui-datepicker-div td.full-selected.full-changeover > a").click()
# pick end date (TODO: violates DRY principle, refactor!)
end_date = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".quotebar-container input[name=endDateInput]")))
end_date.click()
first_available_date = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#ui-datepicker-div td.full-changeover > a")))
ActionChains(driver).move_to_element(first_available_date).perform()
driver.find_element_by_css_selector("#ui-datepicker-div td.full-selected.full-changeover > a").click()
# get the calculated price
price = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".price-quote .price-total")))
print(price.text)
driver.close()
At the moment, it selects 20/04/2016 and 23/04/2016 and prints 180€.
Hope that helps.
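
Since the two picks differ only in the input name, the TODO in the code above could be resolved with a small helper; a sketch under the same selector assumptions:

def pick_first_available(input_name):
    # open the given date input and click the first available (changeover) day
    date_input = wait.until(EC.visibility_of_element_located(
        (By.CSS_SELECTOR, ".quotebar-container input[name=%s]" % input_name)))
    date_input.click()
    first_available = wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, "#ui-datepicker-div td.full-changeover > a")))
    ActionChains(driver).move_to_element(first_available).perform()
    driver.find_element_by_css_selector("#ui-datepicker-div td.full-selected.full-changeover > a").click()

pick_first_available("startDateInput")
pick_first_available("endDateInput")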
