I am using Selenium to crawl data from a website with the code below:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
chromeDriverPath = r'C:\Program Files (x86)\chromedriver.exe'
url = 'https://shopee.vn/Th%E1%BB%9Di-Trang-Nam-cat.11035567?page=0'
driver = webdriver.Chrome(chromeDriverPath)
driver.get(url)
try:
    main_xpath = '/html/body/div[1]/div/div[3]/div/div[4]/div[2]/div/div[2]'
    main = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, main_xpath))
    )
    product_list = main.find_elements(By.XPATH, './div[@data-sqe="item"]')
    for i in range(0, len(product_list)):
        print(i)
        print(product_list[i].text)
finally:
    driver.close()
but it returns only the first 15 elements with text and the rest are empty, even though the length of product_list looks correct.
So how can I get the text of all the elements in product_list?
import time

for i in range(1, int(len(product_list) / 15)):
    driver.execute_script("arguments[0].scrollIntoView();", product_list[i * 15])
    time.sleep(5)
Here's a hack I made: scroll down 15 elements at a time, then wait a bit for them to load.
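Putting it together, a minimal end-to-end sketch could look like this. It assumes the same XPath and the 15-item chunk size from the hack above, and it only reads .text after everything has been scrolled into view; tune the sleep to the page's loading speed.
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(r'C:\Program Files (x86)\chromedriver.exe')
driver.get('https://shopee.vn/Th%E1%BB%9Di-Trang-Nam-cat.11035567?page=0')
try:
    main_xpath = '/html/body/div[1]/div/div[3]/div/div[4]/div[2]/div/div[2]'
    main = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, main_xpath))
    )
    product_list = main.find_elements(By.XPATH, './div[@data-sqe="item"]')
    # Scroll through the list in chunks of 15 so the lazy-loaded items render.
    for i in range(1, len(product_list) // 15 + 1):
        index = min(i * 15, len(product_list) - 1)
        driver.execute_script("arguments[0].scrollIntoView();", product_list[index])
        time.sleep(5)
    # Read the text only after everything has been scrolled into view.
    for i, product in enumerate(product_list):
        print(i)
        print(product.text)
finally:
    driver.close()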
I am using selenium to try to scrape data from a website (https://www.mergentarchives.com/), and I am attempting to get the innerText from this element:
<div class="x-paging-info" id="ext-gen200">Displaying reports 1 - 15 of 15</div>
This is my code so far:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
driver = webdriver.Firefox()
driver.maximize_window()
search_url = 'https://www.mergentarchives.com/search.php'
driver.get(search_url)
assert 'Mergent' in driver.title
company_name_input = '//*[@id="ext-comp-1009"]'
search_button = '//*[@id="ext-gen287"]'
driver.implicitly_wait(10)
driver.find_element_by_xpath(company_name_input).send_keys('3com corp')
driver.find_element_by_xpath(search_button).click()
driver.implicitly_wait(20)
print(driver.find_element_by_css_selector('#ext-gen200').text)
Basically I am just filling out a search form, which works, and it takes me to a search results page where the number of results is listed in a div element. When I attempt to print the text of that element, I simply get a blank space; nothing is printed and there is no error.
[Finished in 21.1s]
What am I doing wrong?
I think you may need an explicit wait:
wait = WebDriverWait(driver, 10)
info = wait.until(EC.visibility_of_element_located((By.XPATH, "//div[@class='x-paging-info' and @id='ext-gen200']"))).get_attribute('innerHTML')
print(info)
Imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
You may need to add a condition that verifies whether the search results have loaded, and once they have, you can use the code below:
print(driver.find_element_by_id('ext-gen200').text)
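For example, that condition could be expressed by waiting until the paging label actually contains text before reading it. This is only a sketch built from the code in the question; the 'Displaying' substring is taken from the element shown there.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.maximize_window()
driver.get('https://www.mergentarchives.com/search.php')
driver.implicitly_wait(10)
driver.find_element_by_xpath('//*[@id="ext-comp-1009"]').send_keys('3com corp')
driver.find_element_by_xpath('//*[@id="ext-gen287"]').click()

# Wait until the paging label reports results before reading its text.
wait = WebDriverWait(driver, 20)
wait.until(EC.text_to_be_present_in_element((By.ID, 'ext-gen200'), 'Displaying'))
print(driver.find_element_by_id('ext-gen200').text)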
For a personal project, I am trying to scrape this webpage:
https://www.ebay.com/b/Jordan-11-Retro-Cool-Grey-2001/15709/bn_7117643306
I am trying to get all the img URLs using Selenium.
Here is the code:
url = 'https://www.ebay.com/b/Jordan-11-Retro-Cool-Grey-2001/15709/bn_7117643306'
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
# open url
browser = webdriver.Chrome('/Users/mreznik/V5/chromedriver')
browser.implicitly_wait(2)
browser.get(url)
elems = browser.find_elements_by_tag_name("img")
for elem in elems:
    print(elem.get_attribute('src'))
and it gets me a list of results:
...
https://i.ebayimg.com/thumbs/images/g/M-sAAOSwahdgrd0x/s-l300.webp
https://i.ebayimg.com/thumbs/images/g/bpUAAOSwoa9gtlWw/s-l300.webp
https://ir.ebaystatic.com/cr/v/c1/s_1x2.gif
...
As one can see by running this, there are listings on the page whose image URLs are not in the list, and, stranger still, the list contains images that are not on the page!
How can I get this right?
You should select only the elements containing product images.
Please try this:
product_img_xpath = '//div[contains(@class,"s-item")]//img'
elems = browser.find_elements_by_xpath(product_img_xpath)
for elem in elems:
    print(elem.get_attribute('src'))
Don't forget to add a delay / wait before getting the element list, something like this:
import time

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(browser, 20)
product_img_xpath = '//div[contains(@class,"s-item")]//img'
wait.until(EC.visibility_of_element_located((By.XPATH, product_img_xpath)))
time.sleep(1)
imgs = browser.find_elements_by_xpath(product_img_xpath)
for img in imgs:
    print(img.get_attribute('src'))
UPD
In case you are still not getting all the elements in the list, try scrolling to each element before accessing its properties.
import time

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

wait = WebDriverWait(browser, 20)
actions = ActionChains(browser)
product_img_xpath = '//div[contains(@class,"s-item")]//img'
wait.until(EC.visibility_of_element_located((By.XPATH, product_img_xpath)))
time.sleep(1)
imgs = browser.find_elements_by_xpath(product_img_xpath)
for img in imgs:
    actions.move_to_element(img).perform()
    print(img.get_attribute('src'))
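If the list still contains URLs that are clearly not product photos, such as the s_1x2.gif spacer from ir.ebaystatic.com in the question's output, a simple post-filter on the src values can drop them. This is just a heuristic sketch that continues from the imgs list collected above:
# Keep only srcs that look like listing images; skip empty srcs and
# ebaystatic.com assets (spacers, sprites). Purely a heuristic filter.
product_srcs = []
for img in imgs:
    src = img.get_attribute('src')
    if src and 'ebaystatic.com' not in src:
        product_srcs.append(src)
for src in product_srcs:
    print(src)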
I am trying to create a Python function that can scrape the article titles from a search result on Popular Science's website.
I have written this code, which has worked for a similar science-related website, but when I run it for Popular Science specifically, it returns an empty list.
Code:
from selenium import webdriver
import pandas as pd
def scraper(text):
    driver = webdriver.Chrome(executable_path='chromedriver.exe')
    wired_dict = []
    driver.get("https://www.popsci.com/search-results/" + text + "/")
    search = driver.find_elements_by_class_name("siq-partner-result")
    for words in search:
        wired_dict.append(words.text)
    return wired_dict
print(scraper("science"))
You can use driver.implicitly_wait(10) to wait while the page loads.
from selenium import webdriver
def scrapper(text):
    driver = webdriver.Chrome('./chromedriver')
    driver.get(f"https://www.popsci.com/search-results/{text}/")
    driver.implicitly_wait(10)
    search = driver.find_elements_by_class_name("siq-partner-result")
    wired_dict = [word.text for word in search]
    print(wired_dict)

scrapper('sample')
This page takes a while to load. You are using driver.find_elements_by_class_name before the page has finished loading, so it's not finding those elements.
You can test this theory by adding import time and a time.sleep(5) just before the search code.
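As a quick diagnostic, that test could look like this (a throwaway sketch using the URL from the question, not a fix):
import time

from selenium import webdriver

driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get("https://www.popsci.com/search-results/science/")
time.sleep(5)  # crude pause; if results appear now, timing was the problem
search = driver.find_elements_by_class_name("siq-partner-result")
print(len(search))  # a non-zero count confirms the page just needed time
driver.quit()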
The better solution is to use WebDriverWait() to keep checking until the elements have loaded.
from selenium import webdriver
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
def scraper(text):
    driver = webdriver.Chrome(executable_path='chromedriver.exe')
    wired_dict = []
    driver.get("https://www.popsci.com/search-results/" + text + "/")
    delay = 3
    WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.CLASS_NAME, 'siq-partner-result')))
    search = driver.find_elements_by_class_name("siq-partner-result")
    for words in search:
        wired_dict.append(words.text)
    return wired_dict
You can use WebDriverWait to wait for the desired element to be visible and then try to find the elements.
Using XPATH:
WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.XPATH, "//*[@class='siq-partner-result']")))
search = driver.find_elements_by_class_name("siq-partner-result")
Note: you have to add the following imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
I've written a Python 3 script which uses Selenium to extract data from a table within an iframe on Rooster Resource. This table contains the MLB schedule for 2018.
However, when the script is executed it raises the following error when it reaches the line that switches to the iframe:
selenium.common.exceptions.TimeoutException
Why is this the case?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get("link above")
wait = WebDriverWait(driver, 10)
wait.until(EC.frame_to_be_available_and_switch_to_it(wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "iframe#pageswitcher-content")))))
for items in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "table.waffle tr"))):
    data = [item.text for item in items.find_element_by_css_selector("td")]
    print(data)
driver.quit()
Btw, if you browse the above link you can see the table containing different colorful logos and text.
FYI, I don't wish to reuse the link within that iframe; rather, I want to switch to it to get the data.
There are two nested iframes in that page to reach the content. Try this instead:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get("above link")
wait = WebDriverWait(driver, 10)
wait.until(EC.frame_to_be_available_and_switch_to_it(wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "iframe")))))
wait.until(EC.frame_to_be_available_and_switch_to_it(wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "iframe#pageswitcher-content")))))
for items in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "table.waffle tr"))):
    data = [item.text for item in items.find_elements_by_css_selector("td")]
    print(data)
driver.quit()
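One small follow-up, in case the script is later extended: after reading the table inside the nested frames, switch back to the top-level document before touching anything on the outer page.
# Return to the top-level document once the iframe content has been read.
driver.switch_to.default_content()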
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def wait(dr, x):
    element = WebDriverWait(dr, 50).until(
        EC.presence_of_all_elements_located((By.XPATH, x))
    )
    return element
from selenium import webdriver
browser = webdriver.Firefox()
browser.get("http://www.dinamalar.com/user_comments.asp? uid=14701&name=%E0%AE%A4%E0%AE%AE%E0%AE%BF%E0%AE%B4%E0%AF%8D%E0%AE%9A%E0%AF%86%E0%AE%B2%E0%AF%8D%E0%AE%B5%E0%AE%A9%E0%AF%8D")
for elem in wait(browser, '//*[@id="commsec"]/div[2]/div[1]'):
    print(elem.text)
This is the link from which I need to extract all the comments: http://www.dinamalar.com/user_comments.asp?uid=14701&name=%E0%AE%A4%E0%AE%AE%E0%AE%BF%E0%AE%B4%E0%AF%8D%E0%AE%9A%E0%AF%86%E0%AE%B2%E0%AF%8D%E0%AE%B5%E0%AE%A9%E0%AF%8D
But my code extracts only the first 10 comments. After clicking the button, the next 10 comments are loaded dynamically. How can I extract all of these comments using Python and Selenium?
The idea would be to look for how many "more ideas" elements are present on the page. Every time you click the button and load more comments, one more "more ideas" red button becomes present. Implementation:
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
browser = webdriver.Firefox()
wait = WebDriverWait(browser, 10)
browser.get("http://www.dinamalar.com/user_comments.asp?uid=14701&name=%E0%AE%A4%E0%AE%AE%E0%AE%BF%E0%AE%B4%E0%AF%8D%E0%AE%9A%E0%AF%86%E0%AE%B2%E0%AF%8D%E0%AE%B5%E0%AE%A9%E0%AF%8D")
# initial wait for the page to load
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".morered")))
pages = 1
while True:
    browser.find_elements_by_css_selector(".morered")[-1].click()
    # wait for more "load more" buttons to be present
    try:
        wait.until(lambda browser: len(browser.find_elements_by_css_selector(".morered")) > pages)
    except TimeoutException:
        break  # no more data loaded, exit the loop
    print("Comments loaded: %d" % len(browser.find_elements_by_css_selector(".dateg")))
    pages += 1
browser.close()
Note that I've also removed that extra space inside the URL.
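Once the loop exits, all the comments are present in the DOM, so their text can be collected in a single pass placed just before browser.close(), for example by reusing the XPath from the question:
# Collect every loaded comment using the question's original locator.
comments = browser.find_elements_by_xpath('//*[@id="commsec"]/div[2]/div[1]')
for comment in comments:
    print(comment.text)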