Selenium Looping Dynamic Drop Downs with Submit in Python

I have been banging away at this for about a day and a half now, but I cannot seem to make any headway. I am new to Selenium and am currently trying to collect data from a dynamic web app where BeautifulSoup (my typical go-to) is not an option, as there are no static URLs. The app I am scraping is an auto parts catalog: I need to select the first available year, which loads the available makes; select the first make, which loads the available models; select the first model; then click submit, scrape the resulting data, go back, and do the next model. When all models are done, move on to the next make and repeat its models; when all makes are done, move on to the next year, then work through its makes and models. My nested for-loops are as follows:
import time

from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import StaleElementReferenceException

def make_waitfor_elem_updated_predicate(driver, waitfor_elem_xpath_select):
    elem = driver.find_element_by_xpath(waitfor_elem_xpath_select)

    def elem_updated(driver):
        try:
            elem.text
        except StaleElementReferenceException:
            return True
        except:
            pass
        return False

    return lambda driver: elem_updated(driver)

class Partbot:
    def __init__(self):
        self.home_page = 'http://monroe-px.rtrk.com/en-US/e-catalog'
        self.driver = webdriver.Chrome()
        self.year_xpath = '//select[@id="widget-ymm-year-desktop"]'
        self.make_xpath = '//select[@id="widget-ymm-make-desktop"]'
        self.model_xpath = '//select[@id="widget-ymm-model-desktop"]'

    def get_select(self, xpath_select):
        select_elem = self.driver.find_element_by_xpath(xpath_select)
        select = Select(select_elem)
        return select

    def select_option(self, xpath_select, value, waitfor_elem_xpath_select=None):
        if waitfor_elem_xpath_select:
            func = make_waitfor_elem_updated_predicate(
                self.driver,
                waitfor_elem_xpath_select
            )
        select = self.get_select(xpath_select)
        select.select_by_value(value)
        return self.get_select(xpath_select)

    def make_select_option_iterator(self, xpath_select, waitfor_elem_xpath_select):
        def next_option(xpath_select, waitfor_elem_xpath_select):
            select = self.get_select(xpath_select)
            select_option_values = [
                '{}'.format(op.get_attribute('value'))
                for op in select.options[1:]
            ]
            for v in select_option_values:
                select = self.select_option(xpath_select, v,
                                            waitfor_elem_xpath_select)
                yield select.first_selected_option.text

        return lambda: next_option(xpath_select, waitfor_elem_xpath_select)

    def load_page(self):
        self.driver.get(self.home_page)

        def page_loaded(driver):
            path = '//select[@id="widget-ymm-year-desktop"]'
            return driver.find_element_by_xpath(path)

        wait = WebDriverWait(self.driver, 10)
        wait.until(page_loaded)

    def are_extras_present(self):
        extras = self.driver.find_elements_by_xpath(
            "//*[contains(@id, 'widget-ymm-moreinfo')]")
        return len(extras) >= 1

    def scrape_items(self):
        years = self.make_select_option_iterator(
            self.year_xpath,
            self.make_xpath
        )
        makes = self.make_select_option_iterator(
            self.make_xpath,
            self.model_xpath
        )
        models = self.make_select_option_iterator(
            self.model_xpath,
            None
        )

        self.load_page()

        try:
            for year in years():
                print(year)
                for make in makes():
                    print(2 * ' ', make)
                    for model in models():
                        print(4 * ' ', model)
                        subm = self.driver.find_element_by_id('lookup-form-desktop')
                        subm.find_element_by_tag_name("button").click()
                        time.sleep(2)
        except:
            self.driver.quit()

if __name__ == '__main__':
    pb = Partbot()
    pb.scrape_items()
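A minimal sketch of the waiting pattern the predicate above is aiming for: select a value, wait for the dependent select element to go stale, then re-find it. The XPaths come from the code above; that this catalog detaches and replaces the dependent element after each choice is an assumption.

# Sketch: pick a year, then wait for the dependent "make" <select> to be replaced.
# Assumes the catalog swaps the element in the DOM after each selection.
from selenium import webdriver
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('http://monroe-px.rtrk.com/en-US/e-catalog')

year_xpath = '//select[@id="widget-ymm-year-desktop"]'
make_xpath = '//select[@id="widget-ymm-make-desktop"]'

old_make_elem = driver.find_element_by_xpath(make_xpath)
Select(driver.find_element_by_xpath(year_xpath)).select_by_index(1)

# staleness_of fires once the old element has been detached from the DOM
WebDriverWait(driver, 10).until(EC.staleness_of(old_make_elem))
makes = Select(driver.find_element_by_xpath(make_xpath))
print([opt.text for opt in makes.options[1:]])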

Related

I am very new to scraping, please bear with me: this is my first project. I am trying to scrape a site using Selenium

"problem lines"
for_tariff_loop = driver.find_elements_by_xpath("//span[#class='phx-radio__element']")
radio_label_list = for_tariff_loop[i].find_element_by_css_selector('span[class="phx-radio__label"]')
print(radio_label_list)
time.sleep(1)
website I'm scraping https://www.telekom.de/unterwegs/apple/apple-iphone-13-pro/graphit-512gb
label image
I was not able to print the radio buttons label according to checked button. I don't know what is the mistake and where I did it. could anyone help on this. It will be helpful for me to learn. Change tariff links given below links,
import xlwt
from selenium import webdriver
import re
import time
from datetime import date
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

class telekommobiles:
    def __init__(self):
        self.url = "https://www.telekom.de/mobilfunk/geraete/smartphone?page=1&pageFilter=promotion"
        self.country = 'DE'
        self.currency = 'GBP'
        self.VAT = 'Included'
        self.shipping = 'free shipping within 3-4 weeks'
        self.Pre_PromotionPrice = 'N/A'
        self.color = 'N/A'

    def telekom(self):
        #try:
        driver = webdriver.Chrome()
        driver.maximize_window()
        driver.get(self.url)
        today = date.today()
        time.sleep(5)
        cookies = driver.find_element_by_css_selector('button.cl-btn.cl-btn--accept-all').click()
        print("cookies accepted")
        links_prod_check = []
        prod_models = []
        prod_manufacturer = []
        prod_memorys = []
        product_colors = []
        product_price_monthly_payments = []
        product_price_one_time_payments = []
        product_links = []
        containers = driver.find_elements_by_css_selector('div[class="styles_item__12Aw4"]')
        i = 1
        for container in containers:
            p_links = container.find_element_by_tag_name('a').get_attribute('href')
            i = i + 1
            product_links.append(p_links)
            #print(p_links)
        for links in product_links:
            driver.get(links)
            #time.sleep(5)
            #print(driver.current_url)
            #links_prod_check.append(driver.current_url)
            coloroptions = WebDriverWait(driver, 30).until(
                EC.presence_of_all_elements_located((By.XPATH, "//li[@data-qa='list_ColorVariant']")))
            #print(coloroptions)
            for i in range(len(coloroptions)):
                coloroption = driver.find_elements_by_xpath("//li[@data-qa='list_ColorVariant']")
                coloroption[i].click()
                #print(coloroption[i])
                time.sleep(3)
                memoryoptions = WebDriverWait(driver, 30).until(
                    EC.presence_of_all_elements_located((By.XPATH, "//span[@class='phx-radio__element']")))
                for i in range(len(memoryoptions)):
                    memoryoption = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
                    try:
                        memoryoption[i].click()
                    except:
                        pass
                    time.sleep(5)
                    change_tariff = driver.find_element_by_css_selector('button[class="phx-link phx-list-of-links__link js-mod tracking-added"]').click()
                    time.sleep(3)
                    # looping for each section
                    section_loops = driver.find_elements_by_css_selector('section[class="tariff-catalog--layer"]')
                    #print(len(section_loops))
                    for section_loop in section_loops:
                        #print(section_loop)
                        time.sleep(5)
                        # Headings
                        heading_1 = section_loop.find_element_by_css_selector('h2[class="page-title page-title--lowercase"]').text
                        print(heading_1)
                        # looping for each separate box
                        each_box_subcontainers = section_loop.find_elements_by_css_selector('.phx-tariff-box__section')
                        #print(len(each_box_subcontainers))
                        for subcontainer in each_box_subcontainers:
                            #print(subcontainer)
                            looping_for_tariff = WebDriverWait(driver, 10).until(
                                EC.presence_of_all_elements_located((By.XPATH, "//span[@class='phx-radio__element']")))
                            #print(looping_for_tariff)
                            for i in range(len(looping_for_tariff)):
                                #print(i)
                                try:
                                    for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
                                    for_tariff_loop[i].click()
                                    time.sleep(3)
                                except:
                                    pass
                                for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
                                radio_label_list = for_tariff_loop[i].find_element_by_css_selector('span[class="phx-radio__label"]')
                                print(radio_label_list)
                                time.sleep(1)
                    change_tariff_close_button = driver.find_element_by_css_selector('span[class="icon-after-yellow-close right close popup-close-tr js-popup-close"]').click()

telekom_de = telekommobiles()
telekom_de.telekom()
You are trying to find an element within an element. Finding radio_label_list via for_tariff_loop[i] makes the effective XPath:

//span[@class='phx-radio__element']//span[@class='phx-radio__label']

which does not exist in the DOM.
I tried the last part of the code and was able to print the memory size as below. Do try and confirm; the CSS selector for radio_label_list is replaced with the XPath ./following-sibling::span:
looping_for_tariff = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.XPATH, "//span[@class='phx-radio__element']")))
# print(looping_for_tariff)
for i in range(len(looping_for_tariff)):
    # print(i)
    try:
        for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
        for_tariff_loop[i].click()
        time.sleep(3)
    except:
        pass
    for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
    radio_label_list = for_tariff_loop[i].find_element_by_xpath("./following-sibling::span").text
    print(radio_label_list)
    time.sleep(1)
As per the comments, check this code:
driver.get("https://www.telekom.de/unterwegs/apple/apple-iphone-13-pro/graphit-512gb")
wait = WebDriverWait(driver, 30)
wait.until(EC.element_to_be_clickable((By.XPATH, "//button[text()='Accept All']"))).click()
wait.until(EC.element_to_be_clickable((By.XPATH, "//ul[contains(@class,'phx-tariff-notification-box-new__element--desktop-tablet')]/li[2]/button"))).click()
length = len(driver.find_elements_by_class_name("phx-tariff-box__section"))
for i in range(length):
    print("----------------------------------------------------------------------------------------------------------")
    options = driver.find_elements_by_class_name("phx-tariff-box__section")
    datas = options[i].find_element_by_xpath(".//div[contains(@class,'phx-tariff-box__volume')]").get_attribute("innerText")
    print("data: {}".format(datas))
    len_types = len(options[i].find_elements_by_xpath(".//div[@class='phx-tariff-box__radios-inner']//label"))
    types = options[i].find_elements_by_xpath(".//div[@class='phx-tariff-box__radios-inner']//label")
    if len(types) == 0:
        price = options[i].find_element_by_xpath(".//p[@data-qa='block_TariffPrice']").get_attribute("innerText")
        print(price)
    else:
        for j in range(len_types):
            types[j].click()
            time.sleep(2)
            options = driver.find_elements_by_class_name("phx-tariff-box__section")
            types = options[i].find_elements_by_xpath(".//div[@class='phx-tariff-box__radios-inner']//label")
            try:
                types[j].find_element_by_xpath("./input[@checked]")
                type = types[j].find_element_by_xpath("./span[2]").get_attribute("innerText")
                price = options[i].find_element_by_xpath(".//p[@data-qa='block_TariffPrice']").get_attribute("innerText")
                print(f"{type}: {price}")
            except:
                pass

How to stop selenium scraper from redirecting to another internal weblink of the scraped website?

I was wondering if anyone knows a way to instruct a Selenium script to avoid visiting/redirecting to an internal page that wasn't part of the code. Essentially, my code opens this page:
https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20
and keeps clicking the show-more button until there is none (at the end of the page), by which point it should have collected the links of all the products listed on the page, and then visits each one respectively.
What happens instead: it successfully clicks show more until the end of the page, but then it visits a weird promotion page of the same website instead of following each of the gathered links and scraping further data points from each of those newly opened pages.
In a nutshell, I would incredibly appreciate it if someone could explain how to avoid this automated redirection! And this is the code, in case someone can gratefully nudge me in the right direction :)
from selenium.webdriver import Chrome
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException
import json
import selenium.common.exceptions as exception
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium import webdriver

webdriver = '/Users/karimnabil/projects/selenium_js/chromedriver-1'
driver = Chrome(webdriver)
driver.implicitly_wait(5)
url = 'https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20'
driver.get(url)

links_list = []
coins_list = []
all_names = []
all_cryptos = []
all_links = []
all_twitter = []
all_locations = []
all_categories = []
all_categories2 = []

wait = WebDriverWait(driver, 2)
sign_in = driver.find_element_by_xpath("//li[@class='nav-item nav-guest']/a")
sign_in.click()
time.sleep(2)
user_name = wait.until(EC.presence_of_element_located((By.XPATH, "//input[@name='login']")))
user_name.send_keys("karimnsaber95@gmail.com")
password = wait.until(EC.presence_of_element_located((By.XPATH, "//input[@name='password']")))
password.send_keys("PleomaxCW@2")
signIn_Leave = driver.find_element_by_xpath("//div[@class='form-group text-center']/button")
signIn_Leave.click()
time.sleep(3)

while True:
    try:
        loadMoreButton = driver.find_element_by_xpath("//button[@class='btn btn-outline-primary']")
        time.sleep(2)
        loadMoreButton.click()
        time.sleep(2)
    except exception.StaleElementReferenceException:
        print('stale element')
        break
print('no more elements to show')

try:
    company_links = driver.find_elements_by_xpath("//div[@class='companies-list items-infinity']/div[position() > 3]/div[@class='media-body']/div[@class='title']/a")
    for link in company_links:
        links_list.append(link.get_attribute('href'))
except:
    pass

try:
    with open("links_list.json", "w") as f:
        json.dump(links_list, f)
    with open("links_list.json", "r") as f:
        links_list = json.load(f)
except:
    pass

try:
    for link in links_list:
        driver.get(link)
        name = driver.find_element_by_xpath("//div[@class='title']/h1").text
        try:
            show_more_coins = driver.find_element_by_xpath("//a[@data-original-title='Show more']")
            show_more_coins.click()
            time.sleep(1)
        except:
            pass
        try:
            categories = driver.find_elements_by_xpath("//div[contains(@class, 'categories-list')]/a")
            categories_list = []
            for category in categories:
                categories_list.append(category.text)
        except:
            pass
        try:
            top_page_categories = driver.find_elements_by_xpath("//ol[@class='breadcrumb']/li/a")
            top_page_categories_list = []
            for category in top_page_categories:
                top_page_categories_list.append(category.text)
        except:
            pass
        coins_links = driver.find_elements_by_xpath("//div[contains(@class, 'company-coins')]/a")
        all_coins = []
        for coin in coins_links:
            all_coins.append(coin.get_attribute('href'))
        try:
            location = driver.find_element_by_xpath("//div[@class='addresses mt-3']/div/div/div/div/a").text
        except:
            pass
        try:
            twitter = driver.find_element_by_xpath("//div[@class='links mt-2']/a[2]").get_attribute('href')
        except:
            pass
        try:
            print('-----------')
            print('Company name is: {}'.format(name))
            print('Potential Categories are: {}'.format(categories_list))
            print('Potential top page categories are: {}'.format(top_page_categories_list))
            print('Supporting Crypto is: {}'.format(all_coins))
            print('Registered location is: {}'.format(location))
            print('Company twitter profile is: {}'.format(twitter))
            time.sleep(1)
        except:
            pass
        all_names.append(name)
        all_categories.append(categories_list)
        all_categories2.append(top_page_categories_list)
        all_cryptos.append(all_coins)
        all_twitter.append(twitter)
        all_locations.append(location)
except:
    pass

df = pd.DataFrame(list(zip(all_names, all_categories, all_categories2, all_cryptos, all_twitter, all_locations)), columns=['Company name', 'Categories1', 'Categories2', 'Supporting Crypto', 'Twitter Handle', 'Registered Location'])
CryptoWerk_Data = df.to_csv('CryptoWerk4.csv', index=False)
Redirect calls happen for two reasons: in your case, either some JavaScript code is executed when you click the load-more button for the last time, or an HTTP 3xx code is received, which is the least likely in your case.
So you need to identify when this JavaScript code is executed, send an ESC key before it loads, and then execute the rest of your script.
You could also scrape the links and append them to your list before clicking the load-more button, and each time it is clicked, verify the link of the page you are on: if it is that of the promotion page, execute the rest of your code; otherwise click load more.
page_is_same = True
while page_is_same:
    scrape_elements_add_to_list()
    click_load_more()
    current_link = verify_current_page_link()
    if current_link != link_of_scraped_page:
        page_is_same = False
# rest of the code here

How to periodically fetch records from a website using selenium?

I have a small script that fetches company data from a website. This website is regularly updated with new company information. How can I update my CSV with new records on a periodic basis? Also, as you can see in the code, I have used an explicit range for the pages; what other solutions are possible?
The following is the code:
from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from time import sleep
import csv

# navigate to the ystory companies page
# start collecting data from ystory
START_URL = 'https://yourstory.com/companies/search?page=1&hitsPerPage=30'
# when the collection populates 30 elements then click on next page

class CompDeetz():
    def __init__(self):
        self.browser = Firefox()
        self.browser.get(START_URL)
        sleep(20)
        self.browser.find_element_by_xpath('/html/body/div[12]/div/div/button').click()
        sleep(5)
        self.browser.find_element_by_xpath('/html/body/div[1]/div[4]').click()
        self.database = []

    def write_row(self, record):
        with open('test.csv', 'a') as t:
            writer = csv.writer(t)
            writer.writerows(record)

    def get_everything(self):
        all_list = [a.text for a in self.browser.find_elements_by_xpath('//tr[@class="hit"]')]
        all_records = []
        for company in all_list:
            record = company.split('\n')
            all_records.append(record)
        self.write_row(all_records)

    def next_page(self):
        self.browser.find_element_by_xpath('//ul[@class="ais-Pagination-list"]/li[7]/a').click()
        sleep(20)

def main():
    t = CompDeetz()
    t.get_everything()
    for i in range(33):
        t.next_page()
        t.get_everything()

if __name__ == "__main__":
    main()
Instead of having two different methods, get_everything and next_page, and calling them multiple times, you can have one method, get_everything, and call it once:
# additional imports needed for the explicit wait below:
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.common.by import By
# from selenium.common.exceptions import TimeoutException

def get_everything(self):
    all_records = []
    nextPage = True
    while nextPage:
        all_list = [a.text for a in self.browser.find_elements_by_xpath('//tr[@class="hit"]')]
        for company in all_list:
            record = company.split('\n')
            all_records.append(record)
        try:
            nextPagelink = WebDriverWait(self.browser, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//a[@aria-label='Next page']")))
            self.browser.execute_script("arguments[0].scrollIntoView();", nextPagelink)
            self.browser.execute_script("arguments[0].click();", nextPagelink)
            sleep(5)  # wait for the next page to load
        # On the last page the next-page link is not available, so the
        # explicit wait times out and we stop.
        except TimeoutException:
            nextPage = False
    self.write_row(all_records)
Note: take care of the pop-up that comes up on the page. I hope you already have a mechanism to handle it.
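If such a mechanism is still missing, here is a minimal sketch: wait briefly for the pop-up's close button and dismiss it. The XPath below is a hypothetical placeholder you would adapt to the actual pop-up.

# Sketch: dismiss a pop-up if one appears; the XPath is a hypothetical placeholder.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

def dismiss_popup(browser, timeout=5):
    try:
        close_btn = WebDriverWait(browser, timeout).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'close')]")))
        close_btn.click()
    except TimeoutException:
        pass  # no pop-up appeared within the timeout

Calling dismiss_popup(self.browser) right after loading START_URL could then replace the hard-coded sleep-and-click in __init__.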

Webdriver/BeautifulSoup: getting my program to check if part of a string exists on the webpage

I am writing a bot that purchases items automatically. The way I am currently going about this is by putting the product info in a dictionary titled INFO and referencing it whenever I need a specific product/color/etc.
Currently my code (specifically in findProduct()) checks whether the entry in temp_tuple is exactly the same as INFO['product'], for instance.
In my case, I look for a product and my code returns an error because there is a space at the end of some of the names, which my code cannot handle.
I want to modify it to check whether the string is on the webpage, so that my code still runs even with that extra space.
Enough of my code that works as it is:
#!/usr/bin/env python3
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
import time
import requests
import bs4 as bs
from splinter import Browser
import helpers
from selenium.common.exceptions import ElementNotInteractableException
from config import INFO

class supremeBot(object):
    def __init__(self, **info):
        self.base_url = 'http://www.supremenewyork.com/'
        self.shop = 'shop/all/'
        self.checkout = 'checkout/'
        self.info = info

    def initializeBrowser(self):
        driverz = self.info["driver"]
        path = helpers.get_driver_path(driverz)
        if driverz == "geckodriver":
            self.b = Browser()
        elif driverz == "chromedriver":
            executable_path = {"executable_path": path}
            self.b = Browser('chrome', **executable_path)

    # This looks for the product based on what the category is
    def findProduct(self):
        category = str(INFO['category'])
        source = requests.get("http://www.supremenewyork.com/shop/all/" + category).text
        soup = bs.BeautifulSoup(source, 'lxml')
        temp_link = []
        temp_tuple = []
        for link in soup.find_all('a', href=True):
            temp_tuple.append((link['href'], link.text))
        for i in temp_tuple:
            if i[1] == INFO['product'] or i[1] == INFO['color']:  # <-- I want this to recognize a partial string
                temp_link.append(i[0])
        # print(temp_link)
        # This creates the end of the final link
        self.final_link = list(
            set([x for x in temp_link if temp_link.count(x) == 2]))[0]
        # Concatenates the previous link w/ the website
        link = 'http://www.supremenewyork.com' + str(self.final_link)
        driver.get(link)

if __name__ == "__main__":
    driver = webdriver.Chrome('./chromedriver')
    '''
    BOT = supremeBot(**INFO)
    BOT.findProduct()
    order()
    '''
    BOT = supremeBot(**INFO)
    found_product = False
    counter = 1
    max_iter = 5
    while not found_product and counter < max_iter:
        found_product = BOT.findProduct()
        print("We tried", counter, "times.")
        counter += 1
    if found_product:
        print('found it')
        order()
    else:
        print("Couldn't find it")

INFO = {
    "driver": "chromedriver",
    "product": "Supreme®/MLB New Era®",  # "Big Duffle Bag " is an example of a product that has the space after it
    "color": "Navy",
    "category": "hats",
    "size": "Medium",
    "namefield": "Bucky McNuts",
    "emailfield": "email@email.com",
    "phonefield": "(555)555-5555",
    "addressfield": "321 St",
}
In this case, if you were to replace Supreme®/MLB New Era® with "Big Duffle Bag ", you'd see the code doesn't run if you removed the space after the word bag.
If anybody could help, I would really appreciate it!
You can do this check for a partial string:

if "part" in "partstring":
    print("the word 'part' is within 'partstring'")

Possible use here:

if INFO['product'].lower() in i[1].lower() or INFO['color'].lower() in i[1].lower():
    # do something

The .lower() calls make sure both sides of the comparison are lower case.
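Since the root cause in the question was a trailing space in the product name, it may also be safer to normalize whitespace and case on both sides before comparing. A small sketch, reusing the INFO dictionary and temp_tuple from the question:

# Sketch: strip stray whitespace and ignore case before comparing.
def matches(wanted, actual):
    return wanted.strip().lower() in actual.strip().lower()

for href, text in temp_tuple:
    if matches(INFO['product'], text) or matches(INFO['color'], text):
        temp_link.append(href)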

Using Enum to build locators for Selenium tests

A question came up for me about Selenium web tests. Since every element requires its own XPath or CSS selector to be actioned by the Selenium WebDriver, I have tried to use a Python Enum and create something like:
file: elementEnum.py
from enum import Enum

class PageA(Enum):
    pElementA = '//div[{}]'
    pElementB = '//a[.="{}"]'
    pElementC = '//div[@class={}]'

class PageB(Enum):
    pElementA = '//button[.="{}"]'
    pElementB = '//table/tr[{}]/td[{}]'
but it turns out that much of the time I have to build the string with Python's format function, and it does not look Pythonic:
from elementEnum import *

driver.find_element_by_xpath('//div[@class="aaaaaa"]/{}'.format(PageA.pElementA.value.format(1)))
driver.find_element_by_xpath('{}/{}'.format(PageA.pElementA.value.format(1), PageA.pElementB.value.format(2)))
driver.find_element_by_xpath('{}/{}'.format(PageB.pElementB.value.format(1, 3), PageA.pElementA.value.format(2)))
What is the best way for me to list out all the corresponding elements and their locators?
You could use EC.visibility_of_element_located to locate the element; see http://selenium-python.readthedocs.io/waits.html
Sample code:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class SeleniumBaseClass(object):
    def __init__(self, driver):
        self.driver = driver

    def open(self, URL):
        self.driver.get(URL)

    def driverURLChange(self, URL):
        print("change URL " + URL)
        self.driver.get(URL)

    def currentUrl(self):
        print("URL " + self.driver.current_url)
        return self.driver.current_url

    def switchNewWindow(self):
        self.driver.switch_to_window(self.driver.window_handles[1])
        return self.driver.title

    def locateElement(self, loc):
        try:
            print(loc)
            element = WebDriverWait(self.driver, 10).until(EC.visibility_of_element_located(loc))
            return element
        except:
            print("cannot find {0} element".format(loc))
            return None
and you could:

from selenium.webdriver.common.by import By

password_loc = (By.NAME, 'password')
webdriver = SeleniumBaseClass(driver)
webdriver.locateElement(password_loc)

This lets you pass a locator tuple into locateElement to find the element.
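To tie this back to the original Enum question: one way to keep the Enum but hide the format calls is to give it a small helper that returns a ready-made locator tuple. A sketch, assuming the PageA templates from the question:

# Sketch: Enum members that format themselves into (By, xpath) locator tuples.
from enum import Enum
from selenium.webdriver.common.by import By

class PageA(Enum):
    pElementA = '//div[{}]'
    pElementB = '//a[.="{}"]'

    def locator(self, *args):
        return (By.XPATH, self.value.format(*args))

# usage with the locateElement helper above:
# webdriver.locateElement(PageA.pElementB.locator('Login'))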
