I have built a script using Selenium that loops through a page, prints the data, goes to the next page, and does the same there.
Now I am trying to save the data to a CSV file, and so I need to create a proper loop instead of repeating the same block multiple times (as below).
How do I create that loop and then save to the CSV file?
Also, will the script fail if it gets to the last page and there isn't a next button?
Thanks - this is the code I am using:
from selenium import webdriver
import time

browser = webdriver.Firefox(executable_path="/Users/path/geckodriver")
browser.get('https://www.tripadvisor.co.uk/Restaurants-g186338-zfn29367-London_England.html#EATERY_OVERVIEW_BOX')

meci = browser.find_elements_by_class_name('property_title')
for items in meci:
    title = items.text
    href = items.get_attribute('href')
    print(title)
    print(href)

time.sleep(3)
browser.find_element_by_css_selector('.next').click()
time.sleep(3)

meci = browser.find_elements_by_class_name('property_title')
for items in meci:
    title = items.text
    href = items.get_attribute('href')
    print(title)
    print(href)

time.sleep(3)
browser.find_element_by_css_selector('.next').click()
time.sleep(3)

meci = browser.find_elements_by_class_name('property_title')
for items in meci:
    title = items.text
    href = items.get_attribute('href')
    print(title)
    print(href)

browser.quit()
I have used try-except so the program will exit when there isn't a next button.
Instead of printing, you can write the result to a CSV file.
from selenium.common.exceptions import NoSuchElementException

while True:
    try:
        meci = browser.find_elements_by_class_name('property_title')
        for items in meci:
            title = items.text
            href = items.get_attribute('href')
            print(title)
            print(href)
        time.sleep(3)
        # Raises NoSuchElementException on the last page, where there is no next button
        browser.find_element_by_css_selector('.next').click()
        time.sleep(3)
    except NoSuchElementException:
        break
browser.quit()
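For example, a minimal sketch of the CSV part using Python's built-in csv module (the file name results.csv and the header names are arbitrary choices for this sketch):

import csv
from selenium.common.exceptions import NoSuchElementException

rows = []
while True:
    try:
        meci = browser.find_elements_by_class_name('property_title')
        for items in meci:
            # Collect one (title, href) pair per listing instead of printing
            rows.append([items.text, items.get_attribute('href')])
        time.sleep(3)
        browser.find_element_by_css_selector('.next').click()
        time.sleep(3)
    except NoSuchElementException:
        break
browser.quit()

# Write everything collected to the CSV file
with open('results.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['title', 'href'])  # header row
    writer.writerows(rows)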
Related
I have a question. I find an element on each page by its class and read its text, then take it apart with split(), but when the element is not present an error is raised and the parsing stops.
Code:
spans = driver.find_elements(By.XPATH, "//span[@class='ipsContained ipsType_break']")
for span in spans:
    atag = span.find_element(By.XPATH, ".//a")
    print(atag.get_attribute('href'))
    urlik = atag.get_attribute('href')
    driver.get(url=urlik)
    time.sleep(2)
    urla = driver.find_element(By.CLASS_NAME, "ipsPagination_pageJump").text
    for page_number in range(int(urla.split()[3])):
        page_number = page_number + 1
        driver.get(url=urlik + f"page/{page_number}")
        time.sleep(2)
        imgs = driver.find_elements(By.CLASS_NAME, "cGalleryPatchwork_image")
        for i in imgs:
            driver.execute_script("arguments[0].scrollIntoView(true);", i)
            time.sleep(0.2)
            print(i.get_attribute("src"))
I need to check this line:
urla = driver.find_element(By.CLASS_NAME, "ipsPagination_pageJump").text
To attempt to find an element on the pages using the class and display the text from there, irrespective of whether the element is present or not, you can wrap the code in a try-except block handling the NoSuchElementException as follows:
from selenium.common.exceptions import NoSuchElementException

driver.get(url=urlik)
time.sleep(2)
try:
    urla = driver.find_element(By.CLASS_NAME, "ipsPagination_pageJump").text
    for page_number in range(int(urla.split()[3])):
        page_number = page_number + 1
        driver.get(url=urlik + f"page/{page_number}")
        time.sleep(2)
        imgs = driver.find_elements(By.CLASS_NAME, "cGalleryPatchwork_image")
        for i in imgs:
            driver.execute_script("arguments[0].scrollIntoView(true);", i)
            time.sleep(0.2)
            print(i.get_attribute("src"))
except NoSuchElementException:
    print("Element is not present")
Instead of:
urla = driver.find_element(By.CLASS_NAME, "ipsPagination_pageJump")
use:
urla = driver.find_elements(By.CLASS_NAME, "ipsPagination_pageJump")
if urla:
    urla[0].text
The find_elements method returns a list of web elements matching the passed locator.
So if such element(s) exist, urla will be a non-empty list, and a non-empty list is interpreted in Python as boolean True.
If no matching elements are found, urla will be an empty list, and an empty list is interpreted in Python as boolean False.
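Putting the two together, a minimal sketch of the page-count check (the "Page 1 of 7" text format is inferred from the split()[3] indexing above, and the one-page fallback is an assumption of this sketch):

# find_elements returns [] instead of raising, so we can branch on truthiness
urla = driver.find_elements(By.CLASS_NAME, "ipsPagination_pageJump")
if urla:
    # Paginator present: e.g. "Page 1 of 7" -> total_pages = 7
    total_pages = int(urla[0].text.split()[3])
else:
    # Paginator absent: assume the thread has a single page
    total_pages = 1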
I scrape data from this URL, then click the Next button and wait 10 seconds before using requests and bs4 to scrape the next page, but the URL doesn't change, so I just end up scraping the original page's data twice. I've tried WebDriverWait until elements on the first page become stale, and I've also tried using requests to hit the XHR API call directly (I am not well-versed in AJAX, however), but I can't find a solution. Here is the code as it stands:
loop = True
while loop:
    try:
        current_url = driver.current_url
        next_btn = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//button[text()="Next"]')))
        actions = ActionChains(driver)
        actions.move_to_element(next_btn).perform()
        if next_btn:
            next_btn.click()
    except Exception as e:
        current_url = driver.current_url
        loop = False
        print(e, f"somewhere in {current_url} while loop")
    else:
        time.sleep(10)
        next_page = driver.current_url
        get_page_content(next_page)
        break
Here is the URL of the first page: https://www.hunterdouglas.com/locator/results?address=San%20Ramon&country=US&source=
Any direction would be appreciated! Thank you!
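For reference, a staleness wait of the kind mentioned in the question usually looks something like this (a sketch, not necessarily a fix for this particular page; the loc-results locator is illustrative, borrowed from the answer below): grab an element from the current results, click Next, then wait for that element to detach from the DOM before scraping again.

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Remember an element from the current results panel (illustrative locator)
old_results = driver.find_element(By.ID, 'loc-results')
next_btn.click()
# Block until the old element is detached from the DOM, i.e. the results re-rendered
WebDriverWait(driver, 10).until(EC.staleness_of(old_results))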
For anyone who is interested, I got this to work using just Selenium. Here is the code; the argument data is just the name of the city I'm submitting to master_function(data):
def get_links(page):
    for p in page:
        for a in p.find_elements_by_tag_name("a"):
            link = a.get_attribute('href')
            if link is not None:
                # An href can contain several newline-separated values; keep the http ones
                for part in link.split('\n'):
                    if "http" in part:
                        test_list.append(part)

def master_function(data):
    for d in data:
        base_url = "https://www.hunterdouglas.com/locator"
        driver.get(base_url)
        url = pop_up_one(driver)   # pop_up_one and submit are the author's own helpers (not shown)
        submit(url, driver, d)
        loop = True
        while loop:
            try:
                current_url = driver.current_url
                next_btn = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//button[text()="Next"]')))
                actions = ActionChains(driver)
                actions.move_to_element(next_btn).perform()
                if next_btn:
                    next_btn.click()
            except Exception as e:
                current_url = driver.current_url
                loop = False
                print(e, f"somewhere in {current_url} while loop")
            else:
                time.sleep(1)
                page = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.XPATH, '//div[@id="loc-results"]')))
                get_links(page)
[Image: the CSS selector and XPath for the pagination element.]
I also wanted to use a regex to separate values like "Apple, iPhone 12, Neo Galactic Silver" and print each part on a new line.
After finishing the product list of the current page, I want to be able to click next and perform the same procedure with the products on the next page.
This is the problem: when it reaches the 10 items of the current page, I have no idea how to move to the next page and start all over again.
import xlwt
from selenium import webdriver
import re
import time

class cometmobiles:
    def __init__(self):
        self.url = 'https://www.mediaworld.it/catalogo/telefonia/smartphone-e-cellulari/smartphone'

    def comet(self):
        try:
            driver = webdriver.Chrome()
            driver.get(self.url)
            time.sleep(5)
            cookies = driver.find_element_by_id("onetrust-accept-btn-handler")
            cookies.click()
            print("accepted cookies")
            driver.maximize_window()
            print("window maximized")
            mylist = []
            hasNextPage = True
            while hasNextPage:
                containers = driver.find_elements_by_css_selector('article[class="product clearfix p-list-js"]')
                for container in containers:
                    # Title
                    try:
                        title = container.find_element_by_css_selector('h3[class="product-name"]').text
                        print(title)
                    except:
                        pass
                    # Price
                    try:
                        price = container.find_element_by_css_selector('span[class="price mw-price enhanced"]').text
                        print(price)
                    except:
                        pass
                try:
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(5)
                    nxt = driver.find_elements_by_css_selector('span[class="pages"] a')
                    time.sleep(5)
                    nxt.click()
                except:
                    break
        except:
            pass

comets = cometmobiles()
comets.comet()
Instead of this part:
try:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)
    nxt = driver.find_elements_by_css_selector('span[class="pages"] a')
    time.sleep(5)
    nxt.click()
except:
    break
you can use the following. Also note that if the page number doesn't exist, the website returns the main page, so you should break at that point:
try:
    x = 0
    while True:
        x += 1
        driver.get(url + "?pageNumber=" + str(x))  # Get the next page
        if driver.current_url == url:  # No next page: the site falls back to the main page, so stop here
            break
except:
    pass
I'm a beginner with Python & Selenium. I am trying to get all the LinkedIn profile hrefs on a specific page with Selenium and add them to a list, but I don't know why it returns the same URL 10 times:
This is my code:
try:
    browser.find_element_by_id("username").send_keys(email_address)
    sleep(1)
    browser.find_element_by_id("password").send_keys(password)
    sleep(1)
    browser.find_element_by_xpath("//button[@class='btn__primary--large from__button--floating']").click()
    sleep(1)
    element = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.ID, "global-nav")))
    sleep(0.5)
    browser.get('https://www.linkedin.com/search/results/people/?geoUrn=%5B%22104246759%22%2C%2290009659%22%2C%22106383538%22%5D&keywords=mark%20johnson&origin=GLOBAL_SEARCH_HEADER')
    user = []
    url = browser.find_elements_by_xpath("//a[@class='app-aware-link']")
    for i in range(10):
        href = url[i].get_attribute('href')
        user.append(href)
        print(user)
except Exception as e:
    traceback.print_exc()
It looks like the XPath matches multiple elements with the same href. You could build a list of unique hrefs:
user = []
url = browser.find_elements_by_xpath("//a[@class='app-aware-link']")
unique_hrefs = []
for x in url:
    href = x.get_attribute('href')
    if href not in unique_hrefs:
        unique_hrefs.append(href)
for i in unique_hrefs:
    print(i)
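A more compact variant of the same dedup (a sketch; dict.fromkeys keeps the first occurrence of each href and preserves insertion order on Python 3.7+):

urls = browser.find_elements_by_xpath("//a[@class='app-aware-link']")
unique_hrefs = list(dict.fromkeys(a.get_attribute('href') for a in urls))
for href in unique_hrefs:
    print(href)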
I'm trying to loop through a dropdown menu at this URL: https://www.accuform.com/safety-sign/danger-danger-authorized-personnel-only-MADM006
So, for example, the first dropdown menu - under options - lists out different materials and I want to select each one in turn and then gather some other information from the webpage before moving on to the next material. Here is my current code:
driver = webdriver.Firefox()
driver.get('https://www.accuform.com/safety-sign/danger-danger-authorized-personnel-only-MADM006')
time.sleep(3)
driver.find_element_by_id('x-mark-icon').click()
select = Select(driver.find_element_by_name('Wiqj7mb4rsAq9LB'))
options = select.options
optionsList = []
driver.find_elements_by_class_name('select-wrapper')[0].click()
element = driver.find_element_by_xpath("//select[@name='Wiqj7mb4rsAq9LB']")
actions = ActionChains(driver)
actions.move_to_element(element).perform()
# driver.execute_script("arguments[0].scrollIntoView();", element)
for option in options:  # iterate over the options, place attribute value in list
    optionsList.append(option.get_attribute("value"))
for optionValue in optionsList:
    print("starting loop on option %s" % optionValue)
    # select = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//select[@name='Wiqj7mb4rsAq9LB']")))
    # select = Select(select)
    select.select_by_value(optionValue)
I started with just the loop, but got this error:
ElementNotInteractableException: Message: Element <option> could not be scrolled into view
I then added the WebDriverWait and got a TimeoutException error.
I then realized I should probably click on the wrapper in which the dropdown is held, so I added the click, which does pop up the menu, but I still got the TimeoutException.
So I thought maybe I should move to the element, which I tried with the action chain lines, and I got this error:
WebDriverException: Message: TypeError: rect is undefined
I tried to avoid that error by using this code instead:
# driver.execute_script("arguments[0].scrollIntoView();", element)
which just resulted in the TimeoutException again.
I'm pretty new to Python and Selenium and have basically just been modifying code from SO answers to similar questions, but nothing has worked.
I'm using python 3.6 and the current versions of Selenium and firefox webdriver.
If anything is unclear or if you need more info just let me know.
Thanks so much!
EDIT: Based on the answer and comments by Kajal Kunda, I've updated my code to the following:
material_dropdown = driver.find_element_by_xpath("//input[@class='select-dropdown']")
driver.execute_script("arguments[0].click();", material_dropdown)
materials = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
for material in materials:
    # material_dropdown = driver.find_element_by_xpath("//input[@class='select-dropdown']")
    # driver.execute_script("arguments[0].click();", material_dropdown)
    # materials = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
    material_ele = material.find_element_by_tag_name('span')
    if material_ele.text != '':
        material_ele.click()
        time.sleep(5)
        price = driver.find_element_by_class_name("dataPriceDisplay")
        print(price.text)
The result is that it successfully prints the price for the first type of material, but then it returns:
StaleElementReferenceException: Message: The element reference of <li class=""> is stale;...
I've tried variations of having the hashed out lines in and outside of the loop, but always get a version of the StaleElementReferenceException error.
Any suggestions?
Thanks!
You could do the whole thing with requests. Grab the drop-down list from the options listed in the drop-down, then concatenate the value attributes into a requests URL that retrieves JSON containing all the info on the page. The same principle applies for adding in other drop-down values. The ids for each drop-down selection are the value attributes of the options in the drop-down, and they appear in the URL I show separated by // for each drop-down selection.
import requests
from bs4 import BeautifulSoup as bs

url = 'https://www.accuform.com/product/getSku/danger-danger-authorized-personnel-only-MADM006/1/false/null//{}//WHFIw3xXmQx8zlz//6wr93DdrFo5JV//WdnO0RpwKpc4fGF'
startURL = 'https://www.accuform.com/safety-sign/danger-danger-authorized-personnel-only-MADM006'
res = requests.get(startURL)
soup = bs(res.content, 'lxml')

materials = [item['value'] for item in soup.select('#Wiqj7mb4rsAq9LB option')]
sizes = [item['value'] for item in soup.select('#WvXESrTyQjM3Ciw option')]
languages = [item['value'] for item in soup.select('#WUYWGMePtpmpmhy option')]
units = [item['value'] for item in soup.select('#W91eqaJ0WPXwe9b option')]

for material in materials:
    data = requests.get(url.format(material)).json()
    soup = bs(data['dataMaterialBullets'], 'lxml')
    lines = [item.text for item in soup.select('li')]
    print(lines)
    print(data['dataPriceDisplay'])
    # etc......
Sample of JSON:
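The sample itself did not survive extraction; judging from the keys the code above reads, the response looks roughly like this (illustrative values only):

{
    "dataPriceDisplay": "$8.31",
    "dataMaterialBullets": "<ul><li>...</li></ul>"
}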
Try the code below. It should work.
driver = webdriver.Firefox()
driver.get('https://www.accuform.com/safety-sign/danger-danger-authorized-personnel-only-MADM006')
time.sleep(3)
driver.find_element_by_id('x-mark-icon').click()

material_dropdown = driver.find_element_by_xpath("//input[@class='select-dropdown']")
driver.execute_script("arguments[0].click();", material_dropdown)

# Code for the material dropdown
materials = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
material_optionsList = []
for material in materials:
    material_ele = material.find_element_by_tag_name('span')
    if material_ele.text != '':
        material_optionsList.append(material_ele.text)
print(material_optionsList)
driver.execute_script("arguments[0].click();", material_dropdown)

size_dropdown = driver.find_element_by_xpath("(//input[@class='select-dropdown'])[2]")
driver.execute_script("arguments[0].click();", size_dropdown)

# Code for the size dropdown
Sizes = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
size_optionsList = []
for size in Sizes:
    size_ele = size.find_element_by_tag_name('span')
    if size_ele.text != '':
        size_optionsList.append(size_ele.text)
driver.execute_script("arguments[0].click();", size_dropdown)
Output:
[u'Adhesive Vinyl', u'Plastic', u'Adhesive Dura-Vinyl', u'Aluminum', u'Dura-Plastic\u2122', u'Aluma-Lite\u2122', u'Dura-Fiberglass\u2122', u'Accu-Shield\u2122']
Hope you can do the remaining yourself. Let me know if it works for you.
EDIT: Code to loop through the materials and get the price value for each.
for material in range(len(materials)):
    material_ele = materials[material]
    if material_ele.text != '':
        # material_optionsList.append(material_ele.text)
        # material_ele.click()
        driver.execute_script("arguments[0].click();", material_ele)
        time.sleep(2)
        price = driver.find_element_by_id("priceDisplay")
        print(price.text)
        time.sleep(2)
        material_dropdown = driver.find_element_by_xpath("//input[@class='select-dropdown']")
        driver.execute_script("arguments[0].click();", material_dropdown)
        # Re-find the list items after reopening the dropdown, so the next
        # iteration does not hit a StaleElementReferenceException
        materials = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
Output:
$8.31
$9.06
$13.22
$15.91
$15.91