I am very new to scraping please bear with me and this is my 1st project. I am trying to scrape a site using selenium - python

"problem lines"
for_tariff_loop = driver.find_elements_by_xpath("//span[#class='phx-radio__element']")
radio_label_list = for_tariff_loop[i].find_element_by_css_selector('span[class="phx-radio__label"]')
print(radio_label_list)
time.sleep(1)
website I'm scraping https://www.telekom.de/unterwegs/apple/apple-iphone-13-pro/graphit-512gb
label image
I was not able to print the radio buttons label according to checked button. I don't know what is the mistake and where I did it. could anyone help on this. It will be helpful for me to learn. Change tariff links given below links,
import xlwt
from selenium import webdriver
import re
import time
from datetime import date
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
class telekommobiles:
def __init__(self):
self.url="https://www.telekom.de/mobilfunk/geraete/smartphone?page=1&pageFilter=promotion"
self.country='DE'
self.currency='GBP'
self.VAT='Included'
self.shipping = 'free shipping within 3-4 weeks'
self.Pre_PromotionPrice ='N/A'
self.color ='N/A'
def telekom(self):
#try:
driver=webdriver.Chrome()
driver.maximize_window()
driver.get(self.url)
today = date.today()
time.sleep(5)
cookies = driver.find_element_by_css_selector('button.cl-btn.cl-btn--accept-all').click()
print("cookies accepted")
links_prod_check = []
prod_models = []
prod_manufacturer =[]
prod_memorys = []
product_colors =[]
product_price_monthly_payments = []
product_price_one_time_payments =[]
product_links = []
containers = driver.find_elements_by_css_selector('div[class="styles_item__12Aw4"]')
i = 1
for container in containers:
p_links =container.find_element_by_tag_name('a').get_attribute('href')
i = i + 1
product_links.append(p_links)
#print(p_links)
for links in product_links:
driver.get(links)
#time.sleep(5)
#print(driver.current_url)
#links_prod_check.append(driver.current_url)
coloroptions = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH,"//li[#data-qa='list_ColorVariant']")))
#print(coloroptions)
for i in range(len(coloroptions)):
coloroption = driver.find_elements_by_xpath("//li[#data-qa='list_ColorVariant']")
coloroption[i].click()
#print(coloroption[i])
time.sleep(3)
memoryoptions = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH,"//span[#class='phx-radio__element']")))
for i in range(len(memoryoptions)):
memoryoption = driver.find_elements_by_xpath("//span[#class='phx-radio__element']")
try:
memoryoption[i].click()
except:
pass
time.sleep(5)
change_traiff = driver.find_element_by_css_selector('button[class="phx-link phx-list-of-links__link js-mod tracking-added"]').click()
time.sleep(3)
#looping for each section
section_loops = driver.find_elements_by_css_selector('section[class="tariff-catalog--layer"]')
#print(len(section_loops))
for section_loop in section_loops:
#print(section_loop)
time.sleep(5)
#Headings
heading_1 = section_loop.find_element_by_css_selector('h2[class="page-title page-title--lowercase"]').text
print(heading_1)
# looping for each separate boxes
each_box_subcontainers = section_loop.find_elements_by_css_selector('.phx-tariff-box__section')
#print(len(each_box_subcontainers))
for subcontainer in each_box_subcontainers:
#print(subcontainer)
looping_for_tariff = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH,"//span[#class='phx-radio__element']")))
#print(looping_for_tariff)
for i in range(len(looping_for_tariff)):
#print(i)
try:
for_tariff_loop = driver.find_elements_by_xpath("//span[#class='phx-radio__element']")
for_tariff_loop[i].click()
time.sleep(3)
except:
pass
for_tariff_loop = driver.find_elements_by_xpath("//span[#class='phx-radio__element']")
radio_label_list = for_tariff_loop[i].find_element_by_css_selector('span[class="phx-radio__label"]')
print(radio_label_list)
time.sleep(1)
change_traiff_close_button = driver.find_element_by_css_selector('span[class="icon-after-yellow-close right close popup-close-tr js-popup-close"]').click()
telekom_de=telekommobiles()
telekom_de.telekom()

You are trying to find element within an element. Finding radio_label_list using for_tariff_loop[i], xpath for radio_label_list will become like below:
//span[#class='phx-radio__element']//span[#class="phx-radio__label"]
Which does not exist in the DOM.
I tried the last part of the code. And was able to print the Memory size like below. Do try and confirm:
Replaced css-selector for radio_label_list with this xpath ./following-sibling::span
looping_for_tariff = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//span[#class='phx-radio__element']")))
# print(looping_for_tariff)
for i in range(len(looping_for_tariff)):
# print(i)
try:
for_tariff_loop = driver.find_elements_by_xpath("//span[#class='phx-radio__element']")
for_tariff_loop[i].click()
time.sleep(3)
except:
pass
for_tariff_loop = driver.find_elements_by_xpath("//span[#class='phx-radio__element']")
radio_label_list = for_tariff_loop[i].find_element_by_xpath("./following-sibling::span").text
print(radio_label_list)
time.sleep(1)
As per the comments, check this code:
driver.get("https://www.telekom.de/unterwegs/apple/apple-iphone-13-pro/graphit-512gb")
wait = WebDriverWait(driver,30)
wait.until(EC.element_to_be_clickable((By.XPATH,"//button[text()='Accept All']"))).click()
wait.until(EC.element_to_be_clickable((By.XPATH,"//ul[contains(#class,'phx-tariff-notification-box-new__element--desktop-tablet')]/li[2]/button"))).click()
length = len(driver.find_elements_by_class_name("phx-tariff-box__section"))
for i in range(length):
print("----------------------------------------------------------------------------------------------------------")
options = driver.find_elements_by_class_name("phx-tariff-box__section")
datas = options[i].find_element_by_xpath(".//div[contains(#class,'phx-tariff-box__volume')]").get_attribute("innerText")
print("data: {}".format(datas))
len_types = len(options[i].find_elements_by_xpath(".//div[#class='phx-tariff-box__radios-inner']//label"))
types = options[i].find_elements_by_xpath(".//div[#class='phx-tariff-box__radios-inner']//label")
if len(types) == 0:
price = options[i].find_element_by_xpath(".//p[#data-qa='block_TariffPrice']").get_attribute("innerText")
print(price)
else:
for j in range(len_types):
types[j].click()
time.sleep(2)
options = driver.find_elements_by_class_name("phx-tariff-box__section")
types = options[i].find_elements_by_xpath(".//div[#class='phx-tariff-box__radios-inner']//label")
try:
types[j].find_element_by_xpath("./input[#checked]")
type = types[j].find_element_by_xpath("./span[2]").get_attribute("innerText")
price = options[i].find_element_by_xpath(".//p[#data-qa='block_TariffPrice']").get_attribute("innerText")
print(f"{type}: {price}")
except:
pass

Related

Scraped data is not saving to csv file as it keeps returning a blank csv file

My scraper is calling the website and hitting each of the 44 pages and creating a csv file but the csv file is empty. I am returning after each of the functions and saving the data to a csv at the end of the scraper.
Can anyone see what is wrong with my code?
Code:
import pandas,requests,bs4,time
from seleniumwire import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import datetime
TODAY = datetime.datetime.today().strftime("%Y%m%d")
SAVE_FILENAME = "/Users/180284/jupyter-1.0.0/pssi_jobs-"+TODAY+".csv"
driver = webdriver.Chrome('~/Desktop/chromedriver_mac64')
driver.implicitly_wait(30)
URL_BASE = "https://jobs.pssi.com/us/en/search-resultskeywords=%22food%20safety%20team%20member%22&s=1"
MAX_PAGE = 44
HEADERS = {
'From': 'myemail'
}
def interceptor(request):
del request.headers['From']
request.headers['From'] = HEADERS["From"]
driver.request_interceptor = interceptor
def parse_job_post_div(div_html):
soup = bs4.BeautifulSoup(div_html)
job_ls = soup.findAll("div",{"class":"information"})
job_data = []
for job in job_ls:
job_listing = job.find("div",{"class":"information"}).get_text(separator=", ").strip()
title = job.find("span",{"role":"heading"}).get_text(separator=", ").strip()
job_location = job.find("p",{"class":"job-info"}).get_text(separator=", ").strip()
new_row = {"job_listing":job,"title":title,"job_location":job_location}
job_data.append(new_row)
return job_data
def get_data(wd):
job_postings = driver.find_element(By.CLASS_NAME, "information")
html = job_postings.get_attribute("innerHTML")
parsed = parse_job_post_div(html)
return pandas.DataFrame(parsed)
def process_page(url):
driver.get(url)
master_data = []
i = 0
while True:
df = get_data(driver)
master_data.append(df)
if i == (MAX_PAGE - 1):
break
driver.find_element(By.XPATH, "//span[#class='icon icon-arrow-right']").click()
time.sleep(10)
print(i)
i+=1
return pandas.concat(master_data,ignore_index=True)
data = process_page(URL_BASE)
data.to_csv(SAVE_FILENAME)
`
I have tried the above code.
The first problem I found in your code is that the job_ls is an empty list, i.e. soup.findAll("div",{"class":"information"}) doesn't find anything.
Moreover, job_postings contains only one webelement (i.e. the first job of the list) instead of all 10 jobs shown in the page, that's because you used .find_element instead of .find_elements. As a result of these and other problems, process_page(URL_BASE) returns an empty dataframe.
In this case you can speed up the process and use less code using directly selenium instead of bs4
driver.get(URL_BASE)
driver.implicitly_wait(30)
MAX_PAGE = 4
titles, locations, descriptions = [], [], []
for i in range(MAX_PAGE):
print('current page:',i+1,end='\r')
titles += [title.text for title in driver.find_elements(By.CSS_SELECTOR, '.information > span[role=heading]')]
locations += [loc.text.replace('\n',', ') for loc in driver.find_elements(By.CSS_SELECTOR, '.information > p[class=job-info]')]
descriptions += [title.text for title in driver.find_elements(By.CSS_SELECTOR, '.information > p[data-ph-at-id=jobdescription-text')]
if i < MAX_PAGE-1:
driver.find_element(By.XPATH, "//span[#class='icon icon-arrow-right']").click()
else:
break
df = pandas.DataFrame({'title':titles,'location':locations,'description':descriptions})
df.to_csv(SAVE_FILENAME, index=False)
and df will be something like

Already complete scraping scrapes everything on the page. I would like to limit the scraping to only a certain section

I placed the code of a complete and properly functioning scraping that I own. Successfully scrapes all elements on the page.
However, I would like to scrape only a small limited section of the page with the same elements as scraping. This limited section is already scraped correctly along with all elements of the page, but I would like to scrape only it and not "all + it". The link is here
There are 4 tables on the page, but I would like to scrape just one, that is the table called "Programma", ie the html section "event-summary event" or "leagues-static event-summary-leagues ". But of this section only the elements of the last round (Matchday 14). Matchday 14 only. No round 15. So obviously that with each update of the page rounds, the last round is always scraped as well.
So I would need to insert something that makes scraping understand to download only the elements (which it already owns and scrapes) of of that section and the last round.
The code is already complete and works fine, so I'm not looking for code services, but for a little hint to tell me how to limit the scraping to just the section mentioned above. Scraping is in Selenium. I would like to stick with Selenium and my code as it is already functional and complete. Thanks
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
driver.get("url")
driver.implicitly_wait(12)
#driver.minimize_window()
wait = WebDriverWait(driver, 10)
all_rows = driver.find_elements(By.CSS_SELECTOR, "div[class^='event__round'],div[class^='event__match']")
current_round = '?'
for bundesliga in all_rows:
classes = bundesliga.get_attribute('class')
#print(classes)
if 'event__round' in classes:
#round = row.find_elements(By.CSS_SELECTOR, "[class^='event__round event__round--static']")
#current_round = row.text # full text `Round 20`
current_round = bundesliga.text.split(" ")[-1] # only `20` without `Round`
else:
datetime = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__time']")
#Divide la data e l'ora
date, time = datetime.text.split(" ")
date = date.rstrip('.') # right-strip to remove `.` at the end of date
team_home = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__participant event__participant--home']")
team_away = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__participant event__participant--away']")
score_home = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--home']")
score_away = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--away']")
bundesliga = [current_round, date, time, team_home.text, team_away.text, score_home.text, score_away.text]
bundesliga.append(bundesliga)
print(bundesliga)
I think all you need to do is limit all_rows variable. One way to do this is finding the tab you are looking for with text and then getting the parent elements.
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
driver = webdriver.Firefox()
driver.get("https://www.someurl/some/other/page")
driver.implicitly_wait(12)
#driver.minimize_window()
wait = WebDriverWait(driver, 10)
# all_rows = driver.find_elements(By.CSS_SELECTOR, "div[class^='event__round'],div[class^='event__match']")
############### UPDATE ####################
def parent_element(element):
return element.find_element(By.XPATH, './..')
programma_element = WebDriverWait(driver, 10).until(
EC.visibility_of_element_located((By.XPATH, "//div[text()='Programma']")))
programma_element_p1 = parent_element(programma_element)
programma_element_p2 = parent_element(programma_element_p1)
programma_element_p3 = parent_element(programma_element_p2)
all_rows = programma_element_p3.find_elements(By.CSS_SELECTOR, "div[class^='event__round'],div[class^='event__match']")
filter_rows = []
for row in all_rows:
if "event__match--last" in row.get_attribute('class'):
filter_rows.append(row)
break
else:
filter_rows.append(row)
############### UPDATE ####################
current_round = '?'
for bundesliga in filter_rows:
classes = bundesliga.get_attribute('class')
#print(classes)
if 'event__round' in classes:
#round = row.find_elements(By.CSS_SELECTOR, "[class^='event__round event__round--static']")
#current_round = row.text # full text `Round 20`
current_round = bundesliga.text.split(" ")[-1] # only `20` without `Round`
else:
datetime = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__time']")
#Divide la data e l'ora
date, time = datetime.text.split(" ")
date = date.rstrip('.') # right-strip to remove `.` at the end of date
team_home = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__participant event__participant--home']")
team_away = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__participant event__participant--away']")
# score_home = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--home']")
# score_away = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--away']")
try:
score_home = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--home']")
except (TimeoutException, NoSuchElementException):
MyObject = type('MyObject', (object,), {})
score_home = MyObject()
score_home.text = "-"
try:
score_away = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--away']")
except (TimeoutException, NoSuchElementException):
MyObject = type('MyObject', (object,), {})
score_away = MyObject()
score_away.text = "-"
bundesliga = [current_round, date, time, team_home.text, team_away.text, score_home.text, score_away.text]
bundesliga.append(bundesliga)
print(bundesliga)

Why my Python code is extracting the same data for all the elements in my list?

My project consists of making a competitive watch table for hotel rates for an agency. It is a painful action that I wanted to automate, the code extract correctly the name of hotels and the prices I want to extract but it's working correctly only for the first hotel and I don't know where is the problem. I provide you with the code and the output, if any of you can help me and thank you in advance.
NB : the code 2 works correctly but when i've added more operations the problem appeared
code 1
#!/usr/bin/env python
# coding: utf-8
import time
from time import sleep
import ast
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
driver = webdriver.Chrome("C:\\Users\\marketing2\\Documents\\chromedriver.exe")
driver.get('https://tn.tunisiebooking.com/')
# params to select
params = {
'destination': 'Tozeur',
'date_from': '11/09/2021',
'date_to': '12/09/2021',
'bedroom': '1'
}
# select destination
destination_select = Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, 'ville_des'))))
destination_select.select_by_value(params['destination'])
# select bedroom
bedroom_select = Select(WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'select_ch'))))
bedroom_select.select_by_value(params['bedroom'])
# select dates
script = f"document.getElementById('checkin').value ='{params['date_from']}';"
script += f"document.getElementById('checkout').value ='{params['date_to']}';"
script += f"document.getElementById('depart').value ='{params['date_from']}';"
script += f"document.getElementById('arrivee').value ='{params['date_to']}';"
driver.execute_script(script)
# submit form
btn_rechercher = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="boutonr"]')))
btn_rechercher.click()
urls = []
hotels = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//div[starts-with(#id,'produit_affair')]")))
for hotel in hotels:
link = hotel.find_element_by_xpath(".//span[#class='tittre_hotel']/a").get_attribute("href")
urls.append(link)
for url in urls:
driver.get(url)
def existsElement(xpath):
try:
driver.find_element_by_id(xpath);
except NoSuchElementException:
return "false"
else:
return "true"
if (existsElement('result_par_arrangement')=="false"):
btn_t = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="moteur_rech"]/form/div/div[3]/div')))
btn_t.click()
sleep(10)
else :
pass
try:
name = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//div[#class='bloc_titre_hotels']/h2"))).text
arropt = driver.find_element_by_xpath("//div[contains(#class,'line_result')][1]")
opt = arropt.find_element_by_tag_name("b").text
num = len(arropt.find_elements_by_tag_name("option"))
optiondata = {}
achats = {}
marges= {}
selection = Select(driver.find_element_by_id("arrangement"))
for i in range(num):
try:
selection = Select(driver.find_element_by_id("arrangement"))
selection.select_by_index(i)
time.sleep(2)
arr = driver.find_element_by_xpath("//select[#id='arrangement']/option[#selected='selected']").text
prize = driver.find_element_by_id("prix_total").text
optiondata[arr] = (int(prize))
btn_passe = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="resultat"]/div/form/div/div[2]/div[1]/div[2]/div[2]/div')))
btn_passe.click()
# params to select
params = {
'civilite_acheteur': 'Mlle',
'prenom_acheteur': 'test',
'nom_acheteur': 'test',
'e_mail_acheteur': 'test#gmail.com',
'portable_acheteur': '22222222',
'ville_acheteur': 'Test',
}
# select civilite
civilite_acheteur = Select(WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.NAME, 'civilite_acheteur'))))
civilite_acheteur.select_by_value(params['civilite_acheteur'])
# saisir prenom
script = f"document.getElementsByName('prenom_acheteur')[0].value ='{params['prenom_acheteur']}';"
script += f"document.getElementsByName('nom_acheteur')[0].value ='{params['nom_acheteur']}';"
script += f"document.getElementsByName('e_mail_acheteur')[0].value ='{params['e_mail_acheteur']}';"
script += f"document.getElementsByName('portable_acheteur')[0].value ='{params['portable_acheteur']}';"
script += f"document.getElementsByName('ville_acheteur')[0].value ='{params['ville_acheteur']}';"
driver.execute_script(script)
# submit form
btn_agence = driver.find_element_by_id('titre_Nabeul')
btn_agence.click()
btn_continuez = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'boutonr')))
btn_continuez.click()
achat = int(driver.find_element_by_xpath('/html/body/header/div[2]/div[1]/div[1]/div[4]/div[2]/div[2]').text.replace(' TND', ''))
achats[arr]=achat
marge =int(((float(prize) - float(achat)) / float(achat)) * 100);
marges[arr]=marge
optiondata[arr]=prize,achat,marge
driver.get(url)
btn_display = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="moteur_rech"]/form/div/div[3]/div')))
btn_display.click()
sleep(10)
except StaleElementReferenceException:
pass
except NoSuchElementException:
pass
s="- {} | {} : {}".format(name, opt, optiondata)
print(s)
ds = []
for l in s.splitlines():
d = l.split('-')
if len(d) > 1:
df = pd.DataFrame(ast.literal_eval(d[1].strip()))
ds.append(df)
for df in ds:
df.reset_index(drop=True, inplace=True)
df = pd.concat(ds, axis= 1)
cols = df.columns
cols = [((col.split('.')[0], col)) for col in df.columns]
df.columns=pd.MultiIndex.from_tuples(cols)
print(df.T)
#print("{} : {} - {}".format(name, opt, optiondata))
code 2
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import StaleElementReferenceException,NoSuchElementException
urls = []
hotels = driver.find_elements_by_xpath("//div[starts-with(#id,'produit_affair')]")
for hotel in hotels:
link = hotel.find_element_by_xpath(".//span[#class='tittre_hotel']/a").get_attribute("href")
urls.append(link)
for url in urls:
driver.get(url)
try:
name = driver.find_element_by_xpath("//div[#class='bloc_titre_hotels']/h2").text
arropt = driver.find_element_by_xpath("//div[contains(#class,'line_result')][1]")
opt = arropt.find_element_by_tag_name("b").text
num = len(arropt.find_elements_by_tag_name("option"))
optiondata = {}
selection = Select(driver.find_element_by_id("arrangement"))
for i in range(num):
try:
selection = Select(driver.find_element_by_id("arrangement"))
selection.select_by_index(i)
time.sleep(2)
arr = driver.find_element_by_xpath("//select[#id='arrangement']/option[#selected='selected']").text
prize = driver.find_element_by_id("prix_total").text
optiondata[arr]=prize
except StaleElementReferenceException:
pass
except NoSuchElementException:
pass
print("{} : {} - {} - {}".format(name,opt,num,optiondata))
Your code is outdated. The HTML has been changed/updated and elements such as the one with identity boutonr doesn't exist on the page anymore.
Your loop and order of execution is wrong so this makes the code evaluating still the same fields.
You should not use or at least minimise the usage of time.sleep() to an absolute minimum as it is a waste of time for your code execution. Use WebDriverWait(...) instead
I don't speak French so I could not understand what you are after in your code, but this minimised example below should help you to understand the principle.
#!/usr/bin/env python
# coding: utf-8
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
driver = webdriver.Chrome("C:\chromedriver.exe")
driver.get('https://tn.tunisiebooking.com/')
# params to select
params = { 'destination': 'Nabeul',
'date_from': '25/08/2021',
'date_to': '26/08/2021',
'bedroom': '1' }
# select destination
destination_select = Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, 'ville_des'))))
destination_select.select_by_value(params['destination'])
# select bedroom
bedroom_select = Select(WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'select_ch'))))
bedroom_select.select_by_value(params['bedroom'])
# select dates
script = f"document.getElementById('checkin').value ='{params['date_from']}';"
script += f"document.getElementById('checkout').value ='{params['date_to']}';"
script += f"document.getElementById('depart').value ='{params['date_from']}';"
script += f"document.getElementById('arrivee').value ='{params['date_to']}';"
driver.execute_script(script)
# submit form
btn_rechercher = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//div[#onclick="return submit_hotel_recherche()"]')))
btn_rechercher.click()
urls = []
hotels = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//div[starts-with(#id,'produit_affair')]")))
for hotel in hotels:
link = hotel.find_element_by_xpath(".//span[#class='tittre_hotel']/a").get_attribute("href")
urls.append(link)
for url in urls:
driver.get(url)
try:
name = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//div[#class='bloc_titre_hotels']/h2"))).text
arropt = driver.find_element_by_xpath("//div[contains(#class,'line_result')][1]")
opt = arropt.find_element_by_tag_name("b").text
num = len(arropt.find_elements_by_tag_name("option"))
optiondata = {}
achats = {}
marges= {}
for i in range(num):
try:
selection = Select(WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'arrangement')))).select_by_index(i)
time.sleep(0.5)
arr = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//select[#id='arrangement']/option[#selected='selected']"))).text
prize = driver.find_element_by_id("prix_total").text
optiondata[arr] = int(prize)
except StaleElementReferenceException:
pass
print("{} : {} - {}".format(name, opt, optiondata))
except NoSuchElementException:
pass
driver.quit()
Result:
Byzance Nabeul : Chambre Double - {'All Inclusive soft': 93, 'Demi Pension': 38, 'Petit Dejeuner': 28, 'Pension Complete': 78}
Palmyra Club Nabeul Nabeul : Double Standard - {'All Inclusive soft': 92}
The following code goes to the payment page and extracts all the info there:
#!/usr/bin/env python
# coding: utf-8
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
driver = webdriver.Chrome("/usr/local/bin/chromedriver")
driver.get('https://tn.tunisiebooking.com/')
# params to select
params = {
'destination': 'Nabeul',
'date_from': '29/08/2021',
'date_to': '30/08/2021',
'bedroom': '1'
}
# select destination
destination_select = Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, 'ville_des'))))
destination_select.select_by_value(params['destination'])
# select bedroom
bedroom_select = Select(WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'select_ch'))))
bedroom_select.select_by_value(params['bedroom'])
# select dates
script = f"document.getElementById('checkin').value ='{params['date_from']}';"
script += f"document.getElementById('checkout').value ='{params['date_to']}';"
script += f"document.getElementById('depart').value ='{params['date_from']}';"
script += f"document.getElementById('arrivee').value ='{params['date_to']}';"
driver.execute_script(script)
# submit form
btn_rechercher = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//div[#onclick="return submit_hotel_recherche()"]')))
btn_rechercher.click()
urls = []
hotels = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//div[starts-with(#id,'produit_affair')]")))
for hotel in hotels:
link = hotel.find_element_by_xpath(".//span[#class='tittre_hotel']/a").get_attribute("href")
urls.append(link)
for url in urls:
driver.get(url)
try:
name = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//div[#class='bloc_titre_hotels']/h2"))).text
arropt = driver.find_element_by_xpath("//div[contains(#class,'line_result')][1]")
opt = arropt.find_element_by_tag_name("b").text
num = len(arropt.find_elements_by_tag_name("option"))
optiondata = {}
achats = {}
marges= {}
try:
selection = Select(WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'arrangement'))))
time.sleep(0.5)
arr = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//select[#id='arrangement']/option[#selected='selected']"))).text
prize = driver.find_element_by_id("prix_total").text
optiondata[arr] = (int(prize))
btn_passe = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'resa')))
btn_passe.click()
tot = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'montant_total_apres_code')))
total = int(tot.text.replace(' €', ''))
# params to select
params = {
'civilite_acheteur': 'Mlle',
'prenom_acheteur': 'test',
'nom_acheteur': 'test',
'e_mail_acheteur': 'test#gmail.com',
'portable_acheteur': '22222222',
'ville_acheteur': 'Test',
}
# select civilite
civilite_acheteur = Select(WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.NAME, 'civilite_acheteur'))))
civilite_acheteur.select_by_value(params['civilite_acheteur'])
# saisir prenom
script = f"document.getElementsByName('prenom_acheteur')[0].value ='{params['prenom_acheteur']}';"
script += f"document.getElementsByName('nom_acheteur')[0].value ='{params['nom_acheteur']}';"
script += f"document.getElementsByName('e_mail_acheteur')[0].value ='{params['e_mail_acheteur']}';"
script += f"document.getElementsByName('portable_acheteur')[0].value ='{params['portable_acheteur']}';"
script += f"document.getElementsByName('ville_acheteur')[0].value ='{params['ville_acheteur']}';"
driver.execute_script(script)
# submit form
btn_agence = driver.find_element_by_class_name('continuez_resa')
btn_agence.click()
achat1 = int(driver.find_element_by_id('montant_a_payer').text.replace(' €', ''))
achat = int(driver.find_element_by_id('montant_restant').text.replace(' €', ''))
achat3 = float(driver.find_element_by_xpath('//div[#class="ligne_interne_total"]/div[3]/div[#class="prix_total1 text_shadow"]').text.replace(' TND', ''))
achats[arr]=achat
marge =int(((float(prize) - float(achat)) / float(achat)) * 100);
marges[arr]=marge
optiondata[arr]=prize,total,achat1,achat,achat3,marge
except StaleElementReferenceException:
pass
print("{} : {} - {}".format(name, opt, optiondata))
except NoSuchElementException:
pass
driver.quit()
Output:
Byzance Nabeul : Chambre Double - {'Petit Dejeuner': (36, 41, 12, 29, 4.0, 24)}
Where:
36 = Prix Total
41 = Montant Total
12 = Montant de l'acompte
29 = Vous payerez le reste à votre arrivée à l'hôtel
4.0 = Total taxe de séjour à payer sur place à l'hôtel est
24 = Marges
Hotel page:
You are using sleeps to load the pages in your first example but not in your second one (the one that you state works just fine).
This is typically not the way you want to actually use selenium and leads me to believe that your timing is off.
This SO answer shows you how to use "Explicit Waits" on "expected_conditions" to not have "specific timings" which can/will fail.
You even create a wait object but never use it.
Use it in conjunction with expected_conditions and remove the specific timed sleeps and things will get better.
expected_conditions docs are here
The problem was that it can't access to the element listing arrangements for the rest of the hotels in the list i've added a function that tests the presence of the data and it workod
for url in urls:
driver.get(url)
def existsElement(xpath):
try:
driver.find_element_by_id(xpath);
except NoSuchElementException:
return "false"
else:
return "true"
if (existsElement('result_par_arrangement')=="false"):
btn_t = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="moteur_rech"]/form/div/div[3]/div')))
btn_t.click()
else :
pass

How to stop selenium scraper from redirecting to another internal weblink of the scraped website?

Was wondering if anyone knows of a way for instructing a selenium script to avoid visiting/redirecting to an internal page that wasn't part of the code. Essentially, my code opens up this page:
https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20
keeps clicking on show more button until there's none (at end of page) - which by then - it should have collected the links of all the products listed on the page it scrolled through till the end, then visit each one respectively.
What happens instead, it successfully clicks on show more till the end of the page, but then it visits this weird promotion page of the same website instead of following each of the gathered links respectively and then scraping further data points located off each of those newly opened ones.
In a nutshell, would incredibly appreciate it if someone can explain how to avoid this automated redirection on its own! And this is the code in case someone can gratefully nudge me in the right direction :)
from selenium.webdriver import Chrome
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException
import json
import selenium.common.exceptions as exception
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
webdriver = '/Users/karimnabil/projects/selenium_js/chromedriver-1'
driver = Chrome(webdriver)
driver.implicitly_wait(5)
url = 'https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20'
driver.get(url)
links_list = []
coins_list = []
all_names = []
all_cryptos = []
all_links = []
all_twitter = []
all_locations = []
all_categories = []
all_categories2 = []
wait = WebDriverWait(driver, 2)
sign_in = driver.find_element_by_xpath("//li[#class='nav-item nav-guest']/a")
sign_in.click()
time.sleep(2)
user_name = wait.until(EC.presence_of_element_located((By.XPATH, "//input[#name='login']")))
user_name.send_keys("karimnsaber95#gmail.com")
password = wait.until(EC.presence_of_element_located((By.XPATH, "//input[#name='password']")))
password.send_keys("PleomaxCW#2")
signIn_Leave = driver.find_element_by_xpath("//div[#class='form-group text-center']/button")
signIn_Leave.click()
time.sleep(3)
while True:
try:
loadMoreButton = driver.find_element_by_xpath("//button[#class='btn btn-outline-primary']")
time.sleep(2)
loadMoreButton.click()
time.sleep(2)
except exception.StaleElementReferenceException:
print('stale element')
break
print('no more elements to show')
try:
company_links = driver.find_elements_by_xpath("//div[#class='companies-list items-infinity']/div[position() > 3]/div[#class='media-body']/div[#class='title']/a")
for link in company_links:
links_list.append(link.get_attribute('href'))
except:
pass
try:
with open("links_list.json", "w") as f:
json.dump(links_list, f)
with open("links_list.json", "r") as f:
links_list = json.load(f)
except:
pass
try:
for link in links_list:
driver.get(link)
name = driver.find_element_by_xpath("//div[#class='title']/h1").text
try:
show_more_coins = driver.find_element_by_xpath("//a[#data-original-title='Show more']")
show_more_coins.click()
time.sleep(1)
except:
pass
try:
categories = driver.find_elements_by_xpath("//div[contains(#class, 'categories-list')]/a")
categories_list = []
for category in categories:
categories_list.append(category.text)
except:
pass
try:
top_page_categories = driver.find_elements_by_xpath("//ol[#class='breadcrumb']/li/a")
top_page_categories_list = []
for category in top_page_categories:
top_page_categories_list.append(category.text)
except:
pass
coins_links = driver.find_elements_by_xpath("//div[contains(#class, 'company-coins')]/a")
all_coins = []
for coin in coins_links:
all_coins.append(coin.get_attribute('href'))
try:
location = driver.find_element_by_xpath("//div[#class='addresses mt-3']/div/div/div/div/a").text
except:
pass
try:
twitter = driver.find_element_by_xpath("//div[#class='links mt-2']/a[2]").get_attribute('href')
except:
pass
try:
print('-----------')
print('Company name is: {}'.format(name))
print('Potential Categories are: {}'.format(categories_list))
print('Potential top page categories are: {}'.format(top_page_categories_list))
print('Supporting Crypto is:{}'.format(all_coins))
print('Registered location is: {}'.format(location))
print('Company twitter profile is: {}'.format(twitter))
time.sleep(1)
except:
pass
all_names.append(name)
all_categories.append(categories_list)
all_categories2.append(top_page_categories_list)
all_cryptos.append(all_coins)
all_twitter.append(twitter)
all_locations.append(location)
except:
pass
df = pd.DataFrame(list(zip(all_names, all_categories, all_categories2, all_cryptos, all_twitter, all_locations)), columns=['Company name', 'Categories1', 'Categories2', 'Supporting Crypto', 'Twitter Handle', 'Registered Location'])
CryptoWerk_Data = df.to_csv('CryptoWerk4.csv', index=False)
Redirect calls happen for two reasons, in your case either by executing some javascript code when clicking the last time on the load more button or by receiving an HTTP 3xx code, which is the least likely in your case.
So you need to identify when this javascript code is executed and send an ESC_KEY before it loads and then executing the rest of your script.
You could also scrape the links and append them to your list before clicking the load more button and each time it is clicked, make an if statement the verify the link of the page you're in, if it is that of the promotion page then execute the rest of your code, else click load more.
while page_is_same:
scrape_elements_add_to_list()
click_load_more()
verify_current_page_link()
if current_link_is_same != link_of_scraped_page:
page_is_same = False
# rest of the code here

Absolute Path: Selenium Xpath Can't Identify Element

I am attempting to identify an HTML Button with Xpath and have attempted both the relative and absolute Xpath without success. I am attempting to click the button.
The relative path:
click = webdriver.find.element_by_xpath("//onboarding-mobile-fixed-bottom-container/div[1]/div/sprout-button/button").click()
Absolute path: /html/body/cfapp-root/main/cfapp-spa-host/main/onboarding-root/div/div[1]/main/onboarding-business-phone/section/form/onboarding-next-button/onboarding-mobile-fixed-bottom-container/div[2]/sprout-button/button
absolute = webdriver.find.element_by_xpath("/html/body/cfapp-root/main/cfapp-spa-host/main/onboarding-root/div/div[1]/main/onboarding-business-phone/section/form/onboarding-next-button/onboarding-mobile-fixed-bottom-container/div[2]/sprout-button/button").click()
Even when using the absolute xpath (I know, frowned upon practice) I can't get the button to click.
For reference, I am automating: site: https://account.kabbage.com/onboarding/data/number-of-employees; Username: testingoverflow#aol.com; Pw: Kabbage123
(click finish applying; finish applying; working on the continue box)
Any help is much appreciated!!
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep, strftime
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
import csv
import xlrd
info = "info.xlsx"
openwb = xlrd.open_workbook(info)
inputws = openwb.sheet_by_index(0)
print(inputws.nrows)
print(inputws.ncols)
print(inputws.cell_value(1,0))
email_log = inputws.cell_value(2,0)
businesslog = inputws.cell_value(2,1)
firstname = inputws.cell_value(2,2)
lastname = inputws.cell_value(2,3)
phone = int(inputws.cell_value(2,4))
employees = int(inputws.cell_value(2,5))
business_types = inputws.cell_value(2,6)
print(email_log)
print(businesslog)
print(firstname)
print(lastname)
print(phone)
sleep(1)
chromedriver_path = 'C:/Users/Documents/Scale/Programs/chromedriver.exe'
webdriver = webdriver.Chrome(executable_path=chromedriver_path)
webdriver.get('https://app.kabbage.com/signup/create_account')
sleep(1)
#input email
input_emails = webdriver.find_element_by_xpath('//*[#id="CreateAccount.EmailAddress_inner"]').send_keys(email_log)
sleep(1)
#re-input email
reinput = webdriver.find_element_by_xpath('//*[#id="CreateAccount.ConfirmEmail_inner"]').send_keys(email_log)
# Password
passwrd = webdriver.find_element_by_xpath('//*[#id="CreateAccount.CreatePassword"]')
sleep(1)
passwrd.send_keys('Paycheck11!!')
sleep(1)
button_started = webdriver.find_element_by_class_name("btn-full-width").click()
sleep(5)
#ApplyNow
#apply = webdriver.find_element_by_class_name('spr-btn spr-btn-primary')
#apply = webdriver.find_elements_by_class_name("spr-btn-primary").click()
#xpath("//div[#class='fc-day-content' and text()='15']")
applynow = webdriver.find_element_by_xpath("//sprout-button/button[contains(#class, 'spr-btn-primary')]").click()
sleep(5)
applyfinal = webdriver.find_element_by_xpath("//sprout-button/button[contains(#class, 'spr-btn-primary')]").click()
sleep(5)
business_name = webdriver.find_element_by_xpath('//*[#id="businessName-input"]').send_keys(businesslog)
business_send = webdriver.find_element_by_xpath("/html/body/cfapp-root/main/cfapp-spa-host/main/onboarding-root/div/div[1]/main/onboarding-business-name/section/form/onboarding-next-button/onboarding-mobile-fixed-bottom-container/div[2]/sprout-button/button").click()
sleep(5)
first_name = webdriver.find_element_by_xpath('//*[#id="lastName-input"]').send_keys(lastname)
last_name = webdriver.find_element_by_xpath('//*[#id="firstName-input"]').send_keys(firstname)
names_send = webdriver.find_element_by_xpath("/html/body/cfapp-root/main/cfapp-spa-host/main/onboarding-root/div/div[1]/main/onboarding-personal-name/section/form/onboarding-next-button/onboarding-mobile-fixed-bottom-container/div[2]/sprout-button/button").click()
sleep(5)
phone_num = webdriver.find_element_by_xpath('//*[#id="businessPhone-input"]').send_keys(phone)
phone_check = webdriver.find_element_by_xpath('//html/body/cfapp-root/main/cfapp-spa-host/main/onboarding-root/div/div[1]/main/onboarding-business-phone/section/form/kbg-consent-box/div/sprout-checkbox/div/label').click()
#phone_send = names_send = webdriver.find_element_by_xpath("/html/body/cfapp-root/main/cfapp-spa-host/main/onboarding-root/div/div[1]/main/onboarding-personal-name/section/form/onboarding-next-button/onboarding-mobile-fixed-bottom-container/div[2]/sprout-button/button").click()
phone_submits = webdriver.find_element_by_xpath("/html/body/cfapp-root/main/cfapp-spa-host/main/onboarding-root/div/div[1]/main/onboarding-business-phone/section/form/onboarding-next-button/onboarding-mobile-fixed-bottom-container/div[2]/sprout-button/button").click()
sleep(5)
num_empl = webdriver.find_element_by_xpath('//*[#id="numberOfEmployees-input"]').send_keys(employees)
#emp_submit = webdriver.find_element_by_xpath("//sprout-button/button[contains(#class, 'spr-btn-block')][2]").click()
sending = webdriver.find.element_by_xpath("//button[#class='spr-btn spr-btn-primary' and contains(text(),'Continue')]").click()
You can use any of these Xpaths:
Correct relative XPath for Continue button
Xpath 1:
*//onboarding-next-button//onboarding-mobile-fixed-bottom-container//div[2]//sprout-button//button[contains(text(),Continue)]
Xpath 2:
*//sprout-button[#class='desktop-button']//button[contains(text(),Continue)]
Give this a go:
webdriver.find_element_by_xpath("//button[#class="spr-btn spr-btn-primary" and contains(text(),'Continue')]").click()

Categories

Resources