Can't get all image URLs right in Python Selenium

For a personal project, I am trying to scrape this webpage:
https://www.ebay.com/b/Jordan-11-Retro-Cool-Grey-2001/15709/bn_7117643306
and get all img URLs, using Selenium.
Here is the code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

url = 'https://www.ebay.com/b/Jordan-11-Retro-Cool-Grey-2001/15709/bn_7117643306'

# open url
browser = webdriver.Chrome('/Users/mreznik/V5/chromedriver')
browser.implicitly_wait(2)
browser.get(url)

elems = browser.find_elements_by_tag_name("img")
for elem in elems:
    print(elem.get_attribute('src'))
and it gets me a list of results:
...
https://i.ebayimg.com/thumbs/images/g/M-sAAOSwahdgrd0x/s-l300.webp
https://i.ebayimg.com/thumbs/images/g/bpUAAOSwoa9gtlWw/s-l300.webp
https://ir.ebaystatic.com/cr/v/c1/s_1x2.gif
...
As one can see by running this, there are listings on the page whose URLs are not in the list - and, stranger yet, the list contains images that are not on the page!
How can I get this right?

You should get only the elements containing product images.
Please try this:
product_img_xpath = '//div[contains(@class,"s-item")]//img'
elems = browser.find_elements_by_xpath(product_img_xpath)
for elem in elems:
    print(elem.get_attribute('src'))
Don't forget some delay / wait before getting the elements list, something like this:
import time

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(browser, 20)
product_img_xpath = '//div[contains(@class,"s-item")]//img'
wait.until(EC.visibility_of_element_located((By.XPATH, product_img_xpath)))
time.sleep(1)
imgs = browser.find_elements_by_xpath(product_img_xpath)
for img in imgs:
    print(img.get_attribute('src'))
UPD
In case you are still not getting all the elements in the list, please try scrolling to each element before accessing its properties.
import time

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

wait = WebDriverWait(browser, 20)
actions = ActionChains(browser)
product_img_xpath = '//div[contains(@class,"s-item")]//img'
wait.until(EC.visibility_of_element_located((By.XPATH, product_img_xpath)))
time.sleep(1)
imgs = browser.find_elements_by_xpath(product_img_xpath)
for img in imgs:
    actions.move_to_element(img).perform()
    print(img.get_attribute('src'))
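If some src values still come back as placeholders (like the s_1x2.gif above), eBay may be lazy-loading the thumbnails. Here is a minimal defensive sketch, assuming the real URL sits in a data-src style attribute (the attribute name is an assumption to verify in the page's DOM):
for img in imgs:
    src = img.get_attribute('src')
    # Assumption: lazy-loaded images keep the real URL in 'data-src';
    # inspect the page to confirm the actual attribute name.
    if not src or src.endswith('.gif'):
        src = img.get_attribute('data-src') or src
    print(src)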

Related

Selenium can't find all elements by xpath

I use Selenium to crawl data from a website with the code below:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

chromeDriverPath = r'C:\Program Files (x86)\chromedriver.exe'
url = 'https://shopee.vn/Th%E1%BB%9Di-Trang-Nam-cat.11035567?page=0'
driver = webdriver.Chrome(chromeDriverPath)
driver.get(url)
try:
    main_xpath = '/html/body/div[1]/div/div[3]/div/div[4]/div[2]/div/div[2]'
    main = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, main_xpath))
    )
    product_list = main.find_elements(By.XPATH, './div[@data-sqe="item"]')
    for i in range(0, len(product_list)):
        print(i)
        print(product_list[i].text)
finally:
    driver.close()
but it returns only the first 15 elements with text and the rest come back empty, although the length of product_list looks correct.
So, how can I get the text of all elements of product_list?
import time

for i in range(1, int(len(product_list) / 15)):
    driver.execute_script("arguments[0].scrollIntoView();", product_list[i * 15])
    time.sleep(5)
Here's a hack I made: scroll down 15 elements at a time and then wait a bit until they load.
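A more general variant of the same idea (not from the original answer) is to keep scrolling to the bottom until the page height stops growing, so every lazy-loaded item renders before you read it:
import time

# Scroll until the document height stops changing.
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give lazy-loaded content time to render
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height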

Selenium not printing inner text of div

I am using selenium to try to scrape data from a website (https://www.mergentarchives.com/), and I am attempting to get the innerText from this element:
<div class="x-paging-info" id="ext-gen200">Displaying reports 1 - 15 of 15</div>
This is my code so far:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Firefox()
driver.maximize_window()

search_url = 'https://www.mergentarchives.com/search.php'
driver.get(search_url)
assert 'Mergent' in driver.title

company_name_input = '//*[@id="ext-comp-1009"]'
search_button = '//*[@id="ext-gen287"]'

driver.implicitly_wait(10)
driver.find_element_by_xpath(company_name_input).send_keys('3com corp')
driver.find_element_by_xpath(search_button).click()
driver.implicitly_wait(20)
print(driver.find_element_by_css_selector('#ext-gen200').text)
Basically I am just filling out a search form, which works and takes me to a search results page, where the number of results is listed in a div element. When I attempt to print the text of this element, I simply get a blank space; nothing is written and there is no error.
[Finished in 21.1s]
What am I doing wrong?
I think you may need an explicit wait:
wait = WebDriverWait(driver, 10)
info = wait.until(EC.visibility_of_element_located((By.XPATH, "//div[@class='x-paging-info' and @id='ext-gen200']"))).get_attribute('innerHTML')
print(info)
Imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
You may need to add a condition verifying whether the search results have loaded, and once they have, you can use the code below:
print(driver.find_element_by_id('ext-gen200').text)
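One way to express that condition, assuming the counter div keeps the id ext-gen200 and its text starts with "Displaying" once the results are in (both assumptions taken from the snippet above):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 20)
# Wait until the paging div actually contains the results text.
wait.until(EC.text_to_be_present_in_element((By.ID, 'ext-gen200'), 'Displaying'))
print(driver.find_element_by_id('ext-gen200').text)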

selenium more result xpath

I'm trying to print all links, but I get an error: Element is not clickable at point (781,748) because another element obscures it.
The updated code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time

#driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver = webdriver.Firefox(executable_path='geckodriver')
wait = WebDriverWait(driver, 20)
actions = ActionChains(driver)

driver.get("https://www.architectes-pour-tous.fr/")
driver.find_element_by_xpath("//button[contains(@class,'decline-button')]").click()
driver.find_element_by_xpath(".//a[@id='pager']").click()

wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.image-projet img")))
time.sleep(1)
for img in driver.find_elements_by_css_selector('div.image-projet img'):
    print(img.get_attribute('href'))

driver.find_element_by_css_selector('button.agree-button').click()
pager = driver.find_element_by_xpath('//*[@id="pager"]')
actions.move_to_element(pager).perform()
time.sleep(0.5)
pager.click()
You have to accept/decline the cookies before accessing any element on the page.
driver.find_element_by_xpath("//button[contains(@class,'decline-button')]").click()
driver.find_element_by_xpath(".//a[@id='pager']").click()
The element you are trying to access is initially out of the visible screen, so you have to scroll to it before clicking it.
You may also have to close the accept-cookies pop-up prior to clicking this element.
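A small sketch of that guard, reusing the decline-button locator from your code; since the banner may or may not be present, the click is wrapped defensively:
from selenium.common.exceptions import NoSuchElementException

# Dismiss the cookie banner if it is present; ignore it otherwise.
try:
    driver.find_element_by_xpath("//button[contains(@class,'decline-button')]").click()
except NoSuchElementException:
    pass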
Also, I'm quite sure you are getting no links with
for a in driver.find_elements_by_xpath('.//a'):
    print(a.get_attribute('href'))
since you are trying to do that before the page is loaded.
Also, if you are trying to get the search result links, you have to use another locator.
So I would suggest changing your code as follows:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time

#driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver = webdriver.Firefox(executable_path='geckodriver')
wait = WebDriverWait(driver, 20)
actions = ActionChains(driver)

driver.get("https://www.architectes-pour-tous.fr/")
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.image-projet img")))
time.sleep(1)
for img in driver.find_elements_by_css_selector('div.image-projet img'):
    print(img.get_attribute('src'))

driver.find_element_by_css_selector('button.agree-button').click()
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
pager = driver.find_element_by_xpath('//*[@id="pager"]')
actions.move_to_element(pager).perform()
time.sleep(0.5)
pager.click()
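Since the images themselves carry no href, here is a sketch for collecting the actual project links instead, assuming each div.image-projet image is wrapped in an anchor element (the ancestor::a step is an assumption about the markup):
from selenium.webdriver.common.by import By

for img in driver.find_elements_by_css_selector('div.image-projet img'):
    # Assumption: each project image sits inside an <a> tag carrying the link.
    link = img.find_element(By.XPATH, './ancestor::a[1]')
    print(link.get_attribute('href'))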

Clicking each link under a specific div - python selenium

I am trying to click each link in the ListNews div in the below website (chinalaborwatch).
I have done a bit of research and the following should have worked, but instead, it only clicks on one link and then it stops.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(executable_path=r"C:\webdrivers\chromedriver.exe")
driver.get("http://www.chinalaborwatch.org/news")
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '/html/body/form/div[5]/div/div[2]'))).click()
What am I missing?
Thanks!
You could get the URL list first, then visit each one and scrape the data you want:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("http://www.chinalaborwatch.org/news")
element_list = driver.find_elements_by_css_selector('#form1 > div:nth-child(5) > div > div.ListNews > div')
url_list = [element.find_element_by_tag_name('a').get_attribute('href') for element in element_list]  # get all the urls
for i in url_list:
    driver.get(i)  # switch to the url
    # then it is your work: scrape the text you want.
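As a sketch of that last step, collecting each article's title into a list; the h1 locator is a placeholder to adapt to the actual article pages:
results = []
for i in url_list:
    driver.get(i)
    # Placeholder locator: inspect the article page and adjust.
    title = driver.find_element_by_tag_name('h1').text
    results.append({'url': i, 'title': title})
print(results)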

Multiple clicking loop on URL issue with Python Selenium

How do I fix this code?
I'm trying to loop over multiple URLs and click through each one, but it just stays at the same link over and over.
I want to skip a URL if it contains dr.macio
and contains this div class ('_3ao649').
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import csv
import time

url = 'https://shopee.com.my/search?keyword=mattress'
driver = webdriver.Chrome(executable_path=r'E:/users/Francabicon/Desktop/Bots/others/chromedriver.exe')
driver.get(url)
time.sleep(0.8)

# select language
driver.find_element_by_xpath('//div[@class="language-selection__list"]/button').click()
time.sleep(3)

# scroll few times to load all items
for x in range(10):
    driver.execute_script("window.scrollBy(0,300)")
    time.sleep(0.1)

# get all links (without clicking)
all_items = driver.find_elements_by_xpath('//a[@data-sqe="link"]')
print('len:', len(all_items))

all_urls = []
for item in all_items:
    url = item.get_attribute('href')
    all_urls.append(url)
    print(url)

# now use links
for item in all_urls:
    a = item.splitlines("\n")
    if url.contains("dr.macio"):
        continue
    else:
        driver.get(chr(a))
        driver.back()
If I understood your use case correctly, you would like to visit each product URL except those containing dr.macio.
Induce WebDriverWait with visibility_of_all_elements_located() to get all the links' href values, and then during iteration verify what each link contains.
Try the code below.
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import csv
import time

url = 'https://shopee.com.my/search?keyword=mattress'
driver = webdriver.Chrome(executable_path=r'E:/users/Francabicon/Desktop/Bots/others/chromedriver.exe')
driver.get(url)

# select language
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//div[@class="language-selection__list"]/button'))).click()

# scroll few times to load all items
for x in range(10):
    driver.execute_script("window.scrollBy(0,300)")
    time.sleep(0.1)

# get all links (without clicking)
all_items = [item.get_attribute('href') for item in WebDriverWait(driver, 15).until(EC.visibility_of_all_elements_located((By.XPATH, '//a[@data-sqe="link"]')))]
print(all_items)

for item in all_items:
    # check here whether the link contains `dr.macio`
    if "dr.macio" in item:
        continue
    else:
        driver.get(item)
        driver.back()
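One note on the design: since each iteration navigates directly with driver.get(item), the trailing driver.back() is redundant here; the next driver.get() replaces the page anyway, so the loop body only needs the driver.get(item) call plus whatever scraping you do on that page.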
