Multiple clicking loop on URL issue with Python Selenium

How do I fix this code?
I'm trying to build a loop that clicks through multiple URLs, but it just keeps opening the same link over and over.
I want to skip a URL if it contains dr.macio
and contains this div class ('_3ao649').
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import csv
import time

url = 'https://shopee.com.my/search?keyword=mattress'

driver = webdriver.Chrome(executable_path=r'E:/users/Francabicon/Desktop/Bots/others/chromedriver.exe')
driver.get(url)
time.sleep(0.8)

# select language
driver.find_element_by_xpath('//div[@class="language-selection__list"]/button').click()
time.sleep(3)

# scroll few times to load all items
for x in range(10):
    driver.execute_script("window.scrollBy(0,300)")
    time.sleep(0.1)

# get all links (without clicking)
all_items = driver.find_elements_by_xpath('//a[@data-sqe="link"]')
print('len:', len(all_items))

all_urls = []
for item in all_items:
    url = item.get_attribute('href')
    all_urls.append(url)
    print(url)

# now use links
for item in all_urls:
    a = item.splitlines("\n")
    if url.contains("dr.macio"):
        continue
    else:
        driver.get(chr(a))
        driver.back()

If I understood your use case correctly, you would like to visit each product URL except the ones that contain dr.macio.
Induce WebDriverWait with visibility_of_all_elements_located() to grab all the links' href values, then verify during iteration whether each link contains that substring.
Try the code below.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time

url = 'https://shopee.com.my/search?keyword=mattress'

driver = webdriver.Chrome(executable_path=r'E:/users/Francabicon/Desktop/Bots/others/chromedriver.exe')
driver.get(url)

# select language
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//div[@class="language-selection__list"]/button'))).click()

# scroll few times to load all items
for x in range(10):
    driver.execute_script("window.scrollBy(0,300)")
    time.sleep(0.1)

# get all links (without clicking)
all_items = [item.get_attribute('href') for item in WebDriverWait(driver, 15).until(
    EC.visibility_of_all_elements_located((By.XPATH, '//a[@data-sqe="link"]')))]
print(all_items)

for item in all_items:
    # skip any link containing `dr.macio`
    if "dr.macio" in item:
        continue
    else:
        driver.get(item)
        driver.back()


Iterating and adding links in selenium WebDriver

Hello, I have an issue getting this to work.
I was able to get it working as a single automation, but now I want a for loop that adds multiple links into the form.
Basically, I would like to read the links from a .csv file: after the first link is added, the second row should be read and passed to search.send_keys(element), and so on.
Many thanks for any suggestions.
The first snippet works for a single input.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
options = Options()
options.add_argument("--user-data-dir=/users/sb/Documents/UserData")
options.page_load_strategy = 'normal'
driver = webdriver.Chrome(options=options)
driver.get("database for links")
time.sleep(5)
search = driver.find_element("id", "link box")
search.send_keys("link to be added")
search.send_keys(Keys.RETURN)
#waiting for adding the link
time.sleep(20)
#End
search = driver.find_element("id", "modalclose").click()
The iteration implementation, which is not working:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
import xlrd

def data_from_excel(path):
    data_list = []
    data = xlrd.open_workbook(path)
    sheet = data.sheet_by_index(0)
    for i in range(sheet.nrows):
        data_list.append(sheet.cell_value(i, 0))
    return data_list

data_list = data_from_excel('/users/user/Downloads/links.csv.xls')

for element in data_list:
    options = Options()
    options.add_argument("--user-data-dir=/users/sb/Documents/UserData")
    options.page_load_strategy = 'normal'
    driver = webdriver.Chrome(options=options)
    driver.get("link to the form")
    time.sleep(5)
    search = driver.find_element("id", "link box")
    search.send_keys("element")
    search.send_keys(Keys.RETURN)
    # waiting for adding the link
    time.sleep(10)
    # end
    search = driver.find_element("id", "modalclose").click()
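A minimal sketch of how the loop could be restructured, assuming the file is a plain .csv with one link per row (which lets the built-in csv module replace xlrd), that the "link box" and "modalclose" IDs from the single-input version are correct, and that the loop variable itself is passed to send_keys rather than the literal string "element":

import csv
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

# read one link per row from the csv file
with open('/users/user/Downloads/links.csv', newline='') as f:
    data_list = [row[0] for row in csv.reader(f) if row]

# create the driver once, outside the loop
options = Options()
options.add_argument("--user-data-dir=/users/sb/Documents/UserData")
options.page_load_strategy = 'normal'
driver = webdriver.Chrome(options=options)

for element in data_list:
    driver.get("link to the form")  # placeholder URL from the question
    time.sleep(5)
    search = driver.find_element("id", "link box")
    search.send_keys(element)  # the variable, not the string "element"
    search.send_keys(Keys.RETURN)
    time.sleep(10)  # wait for the link to be added
    driver.find_element("id", "modalclose").click()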

can't get all image urls right in python selenium

For a personal project, I am trying to scrape this webpage:
https://www.ebay.com/b/Jordan-11-Retro-Cool-Grey-2001/15709/bn_7117643306
I am trying to get all img URLs using Selenium.
Here is the code:
url = 'https://www.ebay.com/b/Jordan-11-Retro-Cool-Grey-2001/15709/bn_7117643306'

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

# open url
browser = webdriver.Chrome('/Users/mreznik/V5/chromedriver')
browser.implicitly_wait(2)
browser.get(url)

elems = browser.find_elements_by_tag_name("img")
for elem in elems:
    print(elem.get_attribute('src'))
and it gets me a list of results:
...
https://i.ebayimg.com/thumbs/images/g/M-sAAOSwahdgrd0x/s-l300.webp
https://i.ebayimg.com/thumbs/images/g/bpUAAOSwoa9gtlWw/s-l300.webp
https://ir.ebaystatic.com/cr/v/c1/s_1x2.gif
...
As one can see by running this, there are listings on the page whose URLs are not in the list, and, stranger yet, URLs in the list for images that are not on the page!
How can I get this right?
You should get only the elements containing product images.
Please try this:
product_img_xpath = '//div[contains(@class,"s-item")]//img'
elems = browser.find_elements_by_xpath(product_img_xpath)
for elem in elems:
    print(elem.get_attribute('src'))
Don't forget some delay / wait before getting the elements list, something like this:
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(browser, 20)
product_img_xpath = '//div[contains(@class,"s-item")]//img'
wait.until(EC.visibility_of_element_located((By.XPATH, product_img_xpath)))
time.sleep(1)

imgs = browser.find_elements_by_xpath(product_img_xpath)
for img in imgs:
    print(img.get_attribute('src'))
UPDATE
In case you are still not getting all the elements in the list, try scrolling to each element before accessing its properties.
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

wait = WebDriverWait(browser, 20)
actions = ActionChains(browser)
product_img_xpath = '//div[contains(@class,"s-item")]//img'
wait.until(EC.visibility_of_element_located((By.XPATH, product_img_xpath)))
time.sleep(1)

imgs = browser.find_elements_by_xpath(product_img_xpath)
for img in imgs:
    actions.move_to_element(img).perform()
    print(img.get_attribute('src'))

Clicking each link under a specific div - python selenium

I am trying to click each link in the ListNews div on the website below (chinalaborwatch).
I have done a bit of research and the following should have worked, but instead it only clicks one link and then stops.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(executable_path=r"C:\webdrivers\chromedriver.exe")
driver.get("http://www.chinalaborwatch.org/news")
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '/html/body/form/div[5]/div/div[2]'))).click()
What am I missing?
thanks!
You could get the URL list first, then visit each one and scrape the data you want:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("http://www.chinalaborwatch.org/news")

element_list = driver.find_elements_by_css_selector('#form1 > div:nth-child(5) > div > div.ListNews > div')
url_list = [element.find_element_by_tag_name('a').get_attribute('href') for element in element_list]  # get all the urls
for i in url_list:
    driver.get(i)  # switch to the url
    # then it is your work: scrape the text you want.
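For instance, a minimal sketch of that scraping step; the body tag is used as a placeholder locator here, since the real article container on each page isn't specified in the question:

for i in url_list:
    driver.get(i)
    # wait for the page to render; <body> is a placeholder, swap in the real article container
    article = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, 'body')))
    print(article.text[:200])  # first 200 characters of the visible text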

Value of CSS Property Selenium returning None for all images

I'm trying to scrape all of the images on this site. However, when I run my script and try to get the 'background-image' CSS property to extract the URL of each web element, the result prints "None". I have no idea why it returns None, since when I print out the web element the attribute clearly exists. Any help would be greatly appreciated!
import re
import selenium
import io
import pandas as pd
import urllib.request
import urllib.parse
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
import time
from datetime import datetime
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def parse_style_attribute(style_string):
    if 'background-image' in style_string:
        style_string = style_string.split(' url("')[1].replace('");', '')
        return style_string
    return None

# set up the browser window for the website to be scraped
options = webdriver.ChromeOptions()
options.headless = False
prefs = {"profile.default_content_setting_values.notifications": 2}
options.add_experimental_option("prefs", prefs)

driver = webdriver.Chrome("/Users/rishi/Downloads/chromedriver 3")
driver.maximize_window()
driver.get("https://www.tripadvisor.com/Hotel_Review-g28970-d84078-Reviews-Hyatt_Regency_Washington_on_Capitol_Hill-Washington_DC_District_of_Columbia.html#/media/84078/?albumid=101&type=2&category=101")
time.sleep(1)
# waits for that amount of time
driver.implicitly_wait(12)

# find the searchbar and then plug in the key
# driver.find_element_by_xpath('//*[@class="typeahead_input"]').send_keys("Washington D.C.", Keys.ENTER)

# wait
time.sleep(1)

# list all of the image tiles on the page
images = driver.find_elements_by_xpath('//*[@class="media-viewer-tile-gallery-v2-TileGallery__entryInner--JaADY "]')
image_url = []
for i in range(len(images)):
    image_url.append(images[i].value_of_css_property("background-image"))

print("Total Number of images: ", len(images))
# print(images)
firstimage = images[0].get_attribute("innerHTML")
print(firstimage)
for i in range(len(image_url)):
    print(image_url[i])
Try this, it works for me.
# keep your browser option setup code from above here
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

driver.get(
    "https://www.tripadvisor.com/Hotel_Review-g28970-d84078-Reviews-Hyatt_Regency_Washington_on_Capitol_Hill-Washington_DC_District_of_Columbia.html#/media/84078/?albumid=101&type=2&category=101")

images = WebDriverWait(driver, 20).until(
    EC.presence_of_all_elements_located(
        (By.XPATH, '//*[@class="media-viewer-dt-root-GalleryImageWithOverlay__galleryImage--1Drp0"]')))

image_url = []
for index, image in enumerate(images):
    image_url.append(images[index].value_of_css_property("background-image"))
print(image_url)
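The background-image values come back as strings of the form url("https://…"), so to reduce them to bare URLs you can run each value through the parse_style_attribute helper from the question, or a small regex; a minimal sketch, assuming the values follow that CSS url(...) form:

import re

clean_urls = []
for value in image_url:
    # background-image values look like: url("https://...jpg")
    match = re.search(r'url\("?([^")]+)"?\)', value)
    if match:
        clean_urls.append(match.group(1))
print(clean_urls)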

Element not found on page while scrolling down (StaleElementReferenceException)

I am currently scrolling through a TripAdvisor page to scrape images and need to scroll to the bottom of the page, but I keep getting this error:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document.
I am assuming it is because the script moves through the page very fast, but even when I make the implicit wait time larger, it does not solve the issue. I also tried making sure the new location is visible before parsing the URLs, but that did not do any good either. Any help would be greatly appreciated!
# import dependencies
import re
import selenium
import io
import pandas as pd
import urllib.request
import urllib.parse
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
import time
from datetime import datetime
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.headless = False
driver = webdriver.Chrome("/Users/rishi/Downloads/chromedriver 3")
driver.maximize_window()
prefs = {"profile.default_content_setting_values.notifications": 2}
options.add_experimental_option("prefs", prefs)

# open up the website
driver.get(
    "https://www.tripadvisor.com/Hotel_Review-g28970-d84078-Reviews-Hyatt_Regency_Washington_on_Capitol_Hill-Washington_DC_District_of_Columbia.html#/media/84078/?albumid=101&type=2&category=101")

image_url = []
end = False
while not end:
    # wait until the elements are found, then store all web elements in a list
    images = WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located(
            (By.XPATH, '//*[@class="media-viewer-dt-root-GalleryImageWithOverlay__galleryImage--1Drp0"]')))
    # iterate through the visible images and acquire their urls from the background-image style
    for index, image in enumerate(images):
        image_url.append(images[index].value_of_css_property("background-image"))
    # if you are at the end of the page then leave the loop
    # if(length == end_length):
    #     end = True
    # move to the next visible images in the array
    driver.execute_script("arguments[0].scrollIntoView();", images[-1])
    # time.sleep(1)
    # wait until the new web element is visible
    driver.implicitly_wait(10)
    # WebDriverWait(driver, 20).until(EC.visibility_of_element_located(images[-1]))

# clean the list to provide clear links
for i in range(len(image_url)):
    start = image_url[i].find('url("') + len('url("')
    end = image_url[i].find('")')
    print(image_url[i][start:end])
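Since no element reference survives a re-render, the usual way around the staleness is to re-find the elements after every scroll and read their properties immediately, guarding each read with try/except so a re-rendered tile is simply picked up on the next pass. A minimal sketch of that pattern, reusing the XPath above; the stopping condition here (a pass that adds nothing new) is one common choice, not the only one:

from selenium.common.exceptions import StaleElementReferenceException

image_url = []
prev_count = -1
while len(image_url) != prev_count:  # stop once a full pass adds nothing new
    prev_count = len(image_url)
    # re-locate the elements on every pass instead of reusing old references
    images = WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located(
            (By.XPATH, '//*[@class="media-viewer-dt-root-GalleryImageWithOverlay__galleryImage--1Drp0"]')))
    for image in images:
        try:
            value = image.value_of_css_property("background-image")
        except StaleElementReferenceException:
            continue  # the tile re-rendered mid-read; the next pass will catch it
        if value and value not in image_url:
            image_url.append(value)
    driver.execute_script("arguments[0].scrollIntoView();", images[-1])
    time.sleep(1)  # give lazily loaded tiles time to appear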
