This code, given a list of cities, searches Google for each one, extracts the pincode, and converts the results into a DataFrame.
In some cases I have to use different XPaths to extract the data; there are three XPaths in total.
I'm trying to do this:
if XPath 1 doesn't work, go to XPath 2;
if XPath 2 doesn't work, go to XPath 3;
if XPath 3 doesn't work, call driver.quit().
I tried this code, using NoSuchElementException:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import pandas as pd

df_output = pd.DataFrame(columns=["City", "pincode"])

url = "https://www.google.com/"
chromedriver = '/home/me/chromedriver/chromedriver.exe'
driver = webdriver.Chrome(chromedriver)
driver.implicitly_wait(30)
driver.get(url)
search = driver.find_element_by_name('q')

mlist1 = ['polasa']
for i in mlist1:
    try:
        search.send_keys(i, ' pincode')
        search.send_keys(Keys.RETURN)
        WebDriverWait(driver, 10).until(expected_conditions.visibility_of_element_located(
            (By.XPATH, '//div[@class="IAznY"]//div[@class="title"]')))
        elmts = driver.find_elements_by_xpath('//div[@class="IAznY"]//div[@class="title"]')
        df_output = df_output.append(pd.DataFrame(columns=["City", "pincode"], data=[[i, elmts[0].text]]))
        driver.quit()
    except NoSuchElementException:
        try:
            elements = driver.find_element_by_xpath("//div[@class='Z0LcW']")
            df_output = df_output.append(pd.DataFrame(columns=["City", "pincode"], data=[[i, elements.text]]))
            driver.quit()
        except NoSuchElementException:
            try:
                elements = driver.find_element_by_xpath("//div[@class='Z0LcW AZCkJd']")
                df_output = df_output.append(pd.DataFrame(columns=["City", "pincode"], data=[[i, elements.text]]))
                driver.quit()
            except:
                driver.quit()
This code works when I use just one of the three locators; I need to combine all three in a single script:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.google.com/"
chromedriver = '/home/me/chromedriver/chromedriver.exe'
driver = webdriver.Chrome(chromedriver)
driver.implicitly_wait(30)
driver.get(url)

search = driver.find_element_by_name('q')
search.send_keys('polasa', ' pincode')
search.send_keys(Keys.RETURN)

elements = driver.find_element_by_xpath("//div[@class='Z0LcW']")
elements.text
You don't really need three try/excepts. You can do this without relying on exceptions by locating elements (plural) with a given locator and then checking the length of the returned collection; if the length is 0, no elements were found.
The locators you are using don't require XPath, so you can use CSS selectors instead and combine all three with an OR, avoiding the three separate checks. (Note: you can do the same thing with XPath, but the result is messier and harder to read.)
Here are your three locators combined into one using OR (the comma) in CSS selector syntax:
div.IAznY div.title, div.Z0LcW, div.Z0LcW.AZCkJd
...and the updated code using the combined locator, without the nested try/excepts.
...
locator = (By.CSS_SELECTOR, 'div.IAznY div.title, div.Z0LcW, div.Z0LcW.AZCkJd')

for i in mlist1:
    search.send_keys(i, ' pincode')
    search.send_keys(Keys.RETURN)
    WebDriverWait(driver, 10).until(expected_conditions.visibility_of_element_located(locator))
    elements = driver.find_elements(*locator)
    df_output = df_output.append(pd.DataFrame(columns=["City", "pincode"], data=[[i, elements[0].text]]))

driver.quit()
NOTE: I used your original locators and wasn't returning any results with any of the three. Are you sure they are correct?
Also note... I pulled the driver.quit() out of the loop. I'm not sure if you intended it to be inside or not but from the code provided, if the try succeeds in the first iteration, the browser will quit. You only have one item so you probably didn't notice this yet but would have been confused when you added another item to the iteration.
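One more caveat the loop above inherits from your original: search is located once on the home page, but pressing RETURN navigates to the results page, so that element reference goes stale on the second iteration. A minimal sketch of one way around it (assuming the results page also exposes a search box named q, which you should verify):

for i in mlist1:
    # re-locate the search box each iteration so we never hold a stale reference
    search = driver.find_element_by_name('q')
    search.clear()  # remove the previous query before typing the next one
    search.send_keys(i, ' pincode')
    search.send_keys(Keys.RETURN)
    WebDriverWait(driver, 10).until(expected_conditions.visibility_of_element_located(locator))
    elements = driver.find_elements(*locator)
    df_output = df_output.append(pd.DataFrame(columns=["City", "pincode"], data=[[i, elements[0].text]]))

driver.quit()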
I am trying to web-scrape multiple pages. My code works really well for just page one, but when I use a loop to scrape, for example, the first 5 pages, I get the error below: TimeoutException: Message:
Stacktrace:
Backtrace:
My code is below
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import pandas as pd
from bs4 import BeautifulSoup
import requests as r
import time
from selenium.webdriver.support.ui import Select
PATH="chromedriver.exe"
driver=webdriver.Chrome(PATH)
_list=[]
for page_num in range(1,3):
#print("----")
url=f"https://valuebuds.com/pages/search-results-page?tab=products&page={page_num}"
driver.get(url)
Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "select#year_field")))).select_by_visible_text('1999')
driver.find_element_by_class_name("agree").click()
title=driver.find_elements_by_class_name("snize-overhidden")
for j in title:
Pro=j.find_element_by_class_name("snize-title").text
Price=j.find_element_by_class_name("snize-price-list").text
Desc=j.find_element_by_class_name("snize-description").text
prec_item={
"Product":Pro,
"Price":Price,
"Description":Desc
}
_list.append(prec_item)
df = pd.DataFrame(_list)
df.to_csv("Value Buds HTML Pricing.csv")
print("saved to file.")
Please advise! Thanks in advance.
The code block
Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "select#year_field")))).select_by_visible_text('1999')
driver.find_element_by_class_name("agree").click()
is relevant only when you land on the home page for the first time.
Once you have selected the year and clicked the Agree button, you can page through all the presented results with no need to select the year again.
So, your code could be something like this:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import pandas as pd
from bs4 import BeautifulSoup
import requests as r
import time
from selenium.webdriver.support.ui import Select
PATH="chromedriver.exe"
driver=webdriver.Chrome(PATH)
_list=[]
for page_num in range(1,3):
#print("----")
url=f"https://valuebuds.com/pages/search-results-page?tab=products&page={page_num}"
driver.get(url)
if page_num == 1:
Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "select#year_field")))).select_by_visible_text('1999')
driver.find_element_by_class_name("agree").click()
else:
time.sleep(2)
title=driver.find_elements_by_class_name("snize-overhidden")
for j in title:
Pro=j.find_element_by_class_name("snize-title").text
Price=j.find_element_by_class_name("snize-price-list").text
Desc=j.find_element_by_class_name("snize-description").text
prec_item={
"Product":Pro,
"Price":Price,
"Description":Desc
}
_list.append(prec_item)
df = pd.DataFrame(_list)
df.to_csv("Value Buds HTML Pricing.csv")
print("saved to file.")
I have added a delay for the non-first iterations so the pages can load before you scrape their data.
It would be better to use an Expected Conditions explicit wait there; I don't know exactly what condition to use, so I have left that to your judgment.
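For instance, a minimal sketch of one plausible condition, replacing the else branch above (it assumes the product tiles keep the snize-overhidden class on every page, which you should verify):

    else:
        # wait up to 20s for at least one product tile instead of sleeping a fixed 2s
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CLASS_NAME, "snize-overhidden")))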
I am using selenium to try to scrape data from a website (https://www.mergentarchives.com/), and I am attempting to get the innerText from this element:
<div class="x-paging-info" id="ext-gen200">Displaying reports 1 - 15 of 15</div>
This is my code so far:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
driver = webdriver.Firefox()
driver.maximize_window()
search_url = 'https://www.mergentarchives.com/search.php'
driver.get(search_url)
assert 'Mergent' in driver.title
company_name_input = '//*[@id="ext-comp-1009"]'
search_button = '//*[@id="ext-gen287"]'
driver.implicitly_wait(10)
driver.find_element_by_xpath(company_name_input).send_keys('3com corp')
driver.find_element_by_xpath(search_button).click()
driver.implicitly_wait(20)
print(driver.find_element_by_css_selector('#ext-gen200').text)
Basically I am just filling out a search form, which works, and it takes me to a search results page where the number of results is listed in a div element. When I attempt to print the text of this element, I simply get a blank space; nothing is written and there is no error.
[Finished in 21.1s]
What am I doing wrong?
I think you may need an explicit wait:
wait = WebDriverWait(driver, 10)
info = wait.until(EC.visibility_of_element_located((By.XPATH, "//div[@class='x-paging-info' and @id='ext-gen200']"))).get_attribute('innerHTML')
print(info)
Imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
You may also need a condition that verifies whether the search results have loaded; once they have, you can use the code below:
print(driver.find_element_by_id('ext-gen200').text)
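One way to express that check (a sketch, assuming the counter text always starts with the word "Displaying", as in your example) is to wait for that text before reading the element:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# wait until the paging counter actually contains its text before reading it
WebDriverWait(driver, 10).until(
    EC.text_to_be_present_in_element((By.ID, 'ext-gen200'), 'Displaying'))
print(driver.find_element_by_id('ext-gen200').text)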
I am trying to create a Python function that can scrape the article titles from a search result on Popular Science's website.
I have written this code, which has worked for a similar science-related website, but when I run it for Popular Science it returns an empty list.
Code:
from selenium import webdriver
import pandas as pd
def scraper(text):
    driver = webdriver.Chrome(executable_path='chromedriver.exe')
    wired_dict = []
    driver.get("https://www.popsci.com/search-results/" + text + "/")
    search = driver.find_elements_by_class_name("siq-partner-result")
    for words in search:
        wired_dict.append(words.text)
    return wired_dict

print(scraper("science"))
You can use driver.implicitly_wait(10) to wait while the page loads:
from selenium import webdriver
def scrapper(text):
    driver = webdriver.Chrome('./chromedriver')
    driver.get(f"https://www.popsci.com/search-results/{text}/")
    driver.implicitly_wait(10)
    search = driver.find_elements_by_class_name("siq-partner-result")
    wired_dict = [word.text for word in search]
    print(wired_dict)

scrapper('sample')
This page takes a while to load. You are calling driver.find_elements_by_class_name before the page has finished loading, so it's not finding those elements.
You can test this theory with import time and a time.sleep(5) just before the search code:
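Something like this, purely as a diagnostic rather than a fix:

import time

time.sleep(5)  # crude check: give the page time to load before searching
search = driver.find_elements_by_class_name("siq-partner-result")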
The better solution is to keep checking until the elements are loaded, using WebDriverWait() to wait until they appear:
from selenium import webdriver
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
def scraper(text):
    driver = webdriver.Chrome(executable_path='chromedriver.exe')
    wired_dict = []
    driver.get("https://www.popsci.com/search-results/" + text + "/")
    delay = 3
    WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.CLASS_NAME, 'siq-partner-result')))
    search = driver.find_elements_by_class_name("siq-partner-result")
    for words in search:
        wired_dict.append(words.text)
    return wired_dict
You can use WebDriverWait to wait for the desired element to become visible and then find the elements. Using XPath:
WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.XPATH, "//*[@class='siq-partner-result']")))
search = driver.find_elements_by_class_name("siq-partner-result")
Note: you have to add the following imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
I've written a script in Python using Selenium to scrape some text from a webpage. The text I want to scrape is generated upon filling in an input box. My script can fill the box in the right way and populate the value; however, it can't parse the generated text. How can I do it?
This is what I've tried so far:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get("http://dev.delshlearning.com.au/test.php")
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR,"#AM"))).send_keys("`(2(3^5-sqrt(3)))/2`",Keys.RETURN)
item = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR,"#MQ"))).text
print(item)
driver.quit()
The send_keys() argument is already filled in within the script, for your reference.
The page is storing the text in the element's value attribute. This should work:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get("http://dev.delshlearning.com.au/test.php")
# I changed your locator to ID since it's a little clearer
wait.until(EC.visibility_of_element_located((By.ID,"AM"))).send_keys("`(2(3^5-sqrt(3)))/2`",Keys.RETURN)
item = wait.until(EC.visibility_of_element_located((By.ID,"MQ"))).get_attribute('value')
print(item)
driver.quit()
I found it by inspecting the element's properties in the browser's developer tools.
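As a rule of thumb, .text returns the rendered text between an element's tags, while form controls expose what was typed through the value attribute. A hypothetical illustration (the demo id is made up, not from the page above):

# suppose the page contains: <input id="demo" value="hello">
el = driver.find_element_by_id('demo')
print(el.text)                    # "" - an <input> has no inner text node
print(el.get_attribute('value'))  # "hello"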
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
chrome_path=r"C:\Users\Priyanshu\Downloads\Compressed\chromedriver_win32\chromedriver.exe"
driver=webdriver.Chrome(chrome_path)
driver.get("https://www.flipkart.com/?")
search = driver.find_element_by_name('q')
search.send_keys("laptop")
search.send_keys(Keys.RETURN)
driver.find_element_by_xpath("""//*[@id="container"]/div/div[2]/div[2]/div/div[2]/div[2]/div/section/ul/li[2]""").click()
I am getting a "no such element" error on the last line of the code.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
chrome_path=r"C:\Users\Priyanshu\Downloads\Compressed\chromedriver_win32\chromedriver.exe"
driver=webdriver.Chrome(chrome_path)
driver.get("https://www.flipkart.com/search?q=laptop&otracker=start&as-show=off&as=off")
driver.find_element_by_xpath("""//*[@id="container"]/div/div[2]/div[2]/div/div[2]/div[2]/div/section/ul/li[2]""").click()
If I do it like this, it works fine.
The element is not immediately available; wait for it to be present first:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
wait = WebDriverWait(driver, 10)
search = wait.until(EC.presence_of_element_located((By.NAME, 'q')))
search.send_keys("laptop")
search.send_keys(Keys.RETURN)
element = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="container"]/div/div[2]/div[2]/div/div[2]/div[2]/div/section/ul/li[2]')))
element.click()
Note that, assuming you want to get to the "Popularity" menu header, you could simplify the XPath expression by matching on the element's text:
//li[. = "Popularity"]