I am trying to scrape the name and email of agents from this site. The code first captures a link to every profile on the first page and then visits each profile to get the name and email. But the problem is that it is taking a lot of time to get the anchor tag that has the agent's name in it. Here's the code:
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
class MessageIndividual(webdriver.Chrome):
    """Chrome driver that scrapes agent names and emails from bhhs.com.

    Subclasses webdriver.Chrome, so the instance itself is the driver and
    can be used as a context manager (teardown controls quitting).
    """

    def __init__(self, driver_path=r";C:/SeleniumDriver", teardown=False):
        # The leading ';' lets the path be appended verbatim to the Windows
        # PATH (';' is the PATH separator on Windows).
        self.driver_path = driver_path
        self.teardown = teardown  # quit the browser on context-manager exit?
        os.environ['PATH'] += self.driver_path
        # options = webdriver.ChromeOptions()
        # options.headless = True
        super(MessageIndividual, self).__init__()
        self.implicitly_wait(5)
        self.maximize_window()

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Only close the browser when teardown was requested.
        if self.teardown:
            self.quit()

    def goToSite(self):
        """Open the agent search-results landing page."""
        url = 'https://www.bhhs.com/agent-search-results'
        self.get(url)

    def getDetails(self):
        """Collect every profile link on the page, then visit each profile in
        a new tab and print the agent's name and email.

        XPath fix: attribute tests use '@' ('#' was an extraction artifact
        and made the expressions invalid).
        """
        mylist = [my_elem.get_attribute("href") for my_elem in WebDriverWait(self, 1000).until(
            EC.visibility_of_all_elements_located((By.XPATH, "//section[@class='cmp-agent-results-list-view']/div[@class='cmp-agent-results-list-view__content container ']/div[@class='row associate pt-3 pb-3 ']/div[@class='col-6 col-sm-4 col-lg-3 order-lg-3 associate__btn-group']/section[2]/a[@href]")))]
        for i in mylist:
            # Open the profile in a fresh tab, scrape it, close the tab and
            # return to the results tab.
            self.execute_script("window.open('');")
            self.switch_to.window(self.window_handles[1])
            self.get(i)
            name = WebDriverWait(self, 5).until(
                EC.presence_of_element_located((By.XPATH, '//h1[@class="cmp-agent__name"]/a[1]'))
            )
            print(name.text)
            email = WebDriverWait(self, 1).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'cmp-agent-details__mail')))
            print(email.text)
            self.close()
            self.switch_to.window(self.window_handles[0])
if __name__ == '__main__':
    # Launch the browser, open the results page and scrape the first page.
    inst = MessageIndividual(teardown=False)
    inst.goToSite()
    inst.getDetails()
Is there any way I can scrape the names in less time?
I have changed the XPath to identify the anchor tag directly and removed the new-window open in each iteration. Hopefully this will reduce some time.
def getDetails(self):
    """Faster variant: match the 'agent details' anchors with one short XPath
    and reuse the current tab instead of opening a new window per profile.

    XPath fix: attribute tests use '@' ('#' was an extraction artifact).
    """
    mylist = [my_elem.get_attribute("href") for my_elem in WebDriverWait(self, 1000).until(
        EC.visibility_of_all_elements_located((By.XPATH, "//a[.//span[normalize-space(.)='agent details']]")))]
    for i in mylist:
        # self.execute_script("window.open('');")
        # self.switch_to.window(self.window_handles[1])
        self.get(i)
        name = WebDriverWait(self, 5).until(
            EC.presence_of_element_located((By.XPATH, '//h1[@class="cmp-agent__name"]/a[1]'))
        )
        print(name.text)
        email = WebDriverWait(self, 1).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'cmp-agent-details__mail')))
        print(email.text)
        # self.close()
        # self.switch_to.window(self.window_handles[0])
Related
I'm working on a project that can scrape comments off posts on instagram and write them into an excel file.
Here's my code:
from selenium.webdriver.common.by import By
from selenium import webdriver
import time
import sys
import pandas as pd
from pandas import ExcelWriter
import os.path
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
# Posts whose comments will be scraped.
url = [
    "https://www.instagram.com/p/CcVTqRtJ2gj/",
    "https://www.instagram.com/p/CcXpLHepve-/",
]
user_names = []
user_comments = []

# Raw string so '\c' is not treated as an escape sequence; also removed the
# duplicated 'driver = driver =' assignment.
driver = webdriver.Chrome(r"C:\chromedriver.exe")
driver.get(url[0])
time.sleep(3)

# --- log in ---
username = WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='username']")))
password = WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='password']")))
username.clear()
username.send_keys("username")
password.clear()
password.send_keys("pwd")
Login_button = (
    WebDriverWait(driver, 2)
    .until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']")))
    .click()
)
time.sleep(4)
# Dismiss the "save login info" prompt.
not_now = (
    WebDriverWait(driver, 30)
    .until(
        EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Not Now")]'))
    )
    .click()
)

for n in url:
    try:
        driver.get(n)
        time.sleep(3)
        # Bug fix: XPath attribute tests need '@'. "[class='wpO6b ']" matches
        # a *child element* named 'class' (never present), so the button was
        # never found.
        load_more_comment = driver.find_element_by_xpath("//button[@class='wpO6b ']")
        print("Found {}".format(str(load_more_comment)))
        i = 0
        # Click "load more comments" at most 10 times per post.
        while load_more_comment.is_displayed() and i < 10:
            load_more_comment.click()
            time.sleep(1.5)
            load_more_comment = driver.find_element_by_xpath(
                "//button[@class='wpO6b ']"
            )
            print("Found {}".format(str(load_more_comment)))
            i += 1
        user_names.pop(0)
        user_comments.pop(0)
    except Exception as e:
        print(e)
        pass
    # NOTE(review): class names with trailing spaces ("gElp9 ") come straight
    # from the original; Instagram's obfuscated classes change often — verify.
    comment = driver.find_elements_by_class_name("gElp9 ")
    for c in comment:
        container = c.find_element_by_class_name("C4VMK")
        name = container.find_element_by_class_name("_6lAjh ").text
        content = container.find_element_by_class_name("MOdxS ").text
        content = content.replace("\n", " ").strip().rstrip()
        user_names.append(name)
        user_comments.append(content)
        print(content)
    user_names.pop(0)
    user_comments.pop(0)
    # export(user_names, user_comments)

driver.close()
df = pd.DataFrame(list(zip(user_names, user_comments)), columns=["Name", "Comments"])
# df.to_excel("Anime Content Engagement.xlsx")
print(df)
And the load-more-comments part, doesn't seem to work.
Since there is more than one button with the same class name, I'm not able to choose the right button to click on. And I'm a beginner, so if anyone has a solution for how I can solve this, it would be great.
you can select by aria-label text:
driver.find_element_by_css_selector("svg._8-yf5[aria-label='TEXT']")
i believe the text inside changes according to instagram language, put it according to what appears on your
"problem lines"
# XPath fix: attribute tests use '@' ('#' was an extraction artifact).
# NOTE(review): print() on a WebElement prints the element object, not its
# label text — use .text to see the label.
for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
radio_label_list = for_tariff_loop[i].find_element_by_css_selector('span[class="phx-radio__label"]')
print(radio_label_list)
time.sleep(1)
website I'm scraping https://www.telekom.de/unterwegs/apple/apple-iphone-13-pro/graphit-512gb
label image
I was not able to print the radio button's label for the checked button. I don't know what the mistake is or where I made it — could anyone help with this? It would be helpful for me to learn. The change-tariff links are given below.
import xlwt
from selenium import webdriver
import re
import time
from datetime import date
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
class telekommobiles:
    """Scrapes Telekom smartphone listings: per-product colors, memory sizes
    and the tariff options behind the change-tariff popup.

    NOTE(review): the source was flattened, so the loop nesting below was
    reconstructed from the data flow — confirm against the original intent.
    XPath fix throughout: attribute tests use '@' ('#' was an artifact).
    """

    def __init__(self):
        self.url = "https://www.telekom.de/mobilfunk/geraete/smartphone?page=1&pageFilter=promotion"
        self.country = 'DE'
        self.currency = 'GBP'
        self.VAT = 'Included'
        self.shipping = 'free shipping within 3-4 weeks'
        self.Pre_PromotionPrice = 'N/A'
        self.color = 'N/A'

    def telekom(self):
        # try:
        driver = webdriver.Chrome()
        driver.maximize_window()
        driver.get(self.url)
        today = date.today()
        time.sleep(5)
        cookies = driver.find_element_by_css_selector('button.cl-btn.cl-btn--accept-all').click()
        print("cookies accepted")
        links_prod_check = []
        prod_models = []
        prod_manufacturer = []
        prod_memorys = []
        product_colors = []
        product_price_monthly_payments = []
        product_price_one_time_payments = []
        product_links = []
        # Collect the product detail-page links from the listing.
        containers = driver.find_elements_by_css_selector('div[class="styles_item__12Aw4"]')
        i = 1
        for container in containers:
            p_links = container.find_element_by_tag_name('a').get_attribute('href')
            i = i + 1
            product_links.append(p_links)
            # print(p_links)
        for links in product_links:
            driver.get(links)
            # time.sleep(5)
            # print(driver.current_url)
            # links_prod_check.append(driver.current_url)
            coloroptions = WebDriverWait(driver, 30).until(
                EC.presence_of_all_elements_located((By.XPATH, "//li[@data-qa='list_ColorVariant']")))
            # print(coloroptions)
            # Distinct index names per nesting level (the original reused 'i').
            for color_idx in range(len(coloroptions)):
                # Re-find on every pass: clicking re-renders the option list.
                coloroption = driver.find_elements_by_xpath("//li[@data-qa='list_ColorVariant']")
                coloroption[color_idx].click()
                # print(coloroption[color_idx])
                time.sleep(3)
                memoryoptions = WebDriverWait(driver, 30).until(
                    EC.presence_of_all_elements_located((By.XPATH, "//span[@class='phx-radio__element']")))
                for mem_idx in range(len(memoryoptions)):
                    memoryoption = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
                    try:
                        memoryoption[mem_idx].click()
                    except:
                        pass
                    time.sleep(5)
                    # Open the change-tariff popup.
                    change_traiff = driver.find_element_by_css_selector('button[class="phx-link phx-list-of-links__link js-mod tracking-added"]').click()
                    time.sleep(3)
                    # looping for each section
                    section_loops = driver.find_elements_by_css_selector('section[class="tariff-catalog--layer"]')
                    # print(len(section_loops))
                    for section_loop in section_loops:
                        # print(section_loop)
                        time.sleep(5)
                        # Headings
                        heading_1 = section_loop.find_element_by_css_selector('h2[class="page-title page-title--lowercase"]').text
                        print(heading_1)
                        # looping for each separate box
                        each_box_subcontainers = section_loop.find_elements_by_css_selector('.phx-tariff-box__section')
                        # print(len(each_box_subcontainers))
                        for subcontainer in each_box_subcontainers:
                            # print(subcontainer)
                            looping_for_tariff = WebDriverWait(driver, 10).until(
                                EC.presence_of_all_elements_located((By.XPATH, "//span[@class='phx-radio__element']")))
                            # print(looping_for_tariff)
                            for tariff_idx in range(len(looping_for_tariff)):
                                # print(tariff_idx)
                                try:
                                    for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
                                    for_tariff_loop[tariff_idx].click()
                                    time.sleep(3)
                                except:
                                    pass
                                for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
                                radio_label_list = for_tariff_loop[tariff_idx].find_element_by_css_selector('span[class="phx-radio__label"]')
                                print(radio_label_list)
                                time.sleep(1)
                    # Close the change-tariff popup before the next memory option.
                    change_traiff_close_button = driver.find_element_by_css_selector('span[class="icon-after-yellow-close right close popup-close-tr js-popup-close"]').click()
# Build the scraper and kick off the crawl.
telekom_de = telekommobiles()
telekom_de.telekom()
You are trying to find element within an element. Finding radio_label_list using for_tariff_loop[i], xpath for radio_label_list will become like below:
//span[@class='phx-radio__element']//span[@class="phx-radio__label"]
Which does not exist in the DOM.
I tried the last part of the code. And was able to print the Memory size like below. Do try and confirm:
Replaced css-selector for radio_label_list with this xpath ./following-sibling::span
# XPath fix: attribute tests use '@' ('#' was an extraction artifact).
looping_for_tariff = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.XPATH, "//span[@class='phx-radio__element']")))
# print(looping_for_tariff)
for i in range(len(looping_for_tariff)):
    # print(i)
    try:
        for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
        for_tariff_loop[i].click()
        time.sleep(3)
    except:
        pass
    # The label lives in the radio element's sibling <span>, so descend via
    # following-sibling instead of searching inside the radio span.
    for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
    radio_label_list = for_tariff_loop[i].find_element_by_xpath("./following-sibling::span").text
    print(radio_label_list)
    time.sleep(1)
As per the comments, check this code:
# XPath fix throughout: attribute tests use '@' ('#' was an artifact).
driver.get("https://www.telekom.de/unterwegs/apple/apple-iphone-13-pro/graphit-512gb")
wait = WebDriverWait(driver, 30)
wait.until(EC.element_to_be_clickable((By.XPATH, "//button[text()='Accept All']"))).click()
wait.until(EC.element_to_be_clickable((By.XPATH, "//ul[contains(@class,'phx-tariff-notification-box-new__element--desktop-tablet')]/li[2]/button"))).click()
length = len(driver.find_elements_by_class_name("phx-tariff-box__section"))
for i in range(length):
    print("----------------------------------------------------------------------------------------------------------")
    # Re-find the boxes each pass; clicking re-renders the DOM.
    options = driver.find_elements_by_class_name("phx-tariff-box__section")
    datas = options[i].find_element_by_xpath(".//div[contains(@class,'phx-tariff-box__volume')]").get_attribute("innerText")
    print("data: {}".format(datas))
    len_types = len(options[i].find_elements_by_xpath(".//div[@class='phx-tariff-box__radios-inner']//label"))
    types = options[i].find_elements_by_xpath(".//div[@class='phx-tariff-box__radios-inner']//label")
    if len(types) == 0:
        # No radio variants: single price for the box.
        price = options[i].find_element_by_xpath(".//p[@data-qa='block_TariffPrice']").get_attribute("innerText")
        print(price)
    else:
        for j in range(len_types):
            types[j].click()
            time.sleep(2)
            options = driver.find_elements_by_class_name("phx-tariff-box__section")
            types = options[i].find_elements_by_xpath(".//div[@class='phx-tariff-box__radios-inner']//label")
            try:
                # Only the selected radio carries the 'checked' attribute.
                types[j].find_element_by_xpath("./input[@checked]")
                # Renamed from 'type' to avoid shadowing the builtin.
                tariff_type = types[j].find_element_by_xpath("./span[2]").get_attribute("innerText")
                price = options[i].find_element_by_xpath(".//p[@data-qa='block_TariffPrice']").get_attribute("innerText")
                print(f"{tariff_type}: {price}")
            except:
                pass
I am attempting to identify an HTML Button with Xpath and have attempted both the relative and absolute Xpath without success. I am attempting to click the button.
The relative path:
click = webdriver.find.element_by_xpath("//onboarding-mobile-fixed-bottom-container/div[1]/div/sprout-button/button").click()
Absolute path: /html/body/cfapp-root/main/cfapp-spa-host/main/onboarding-root/div/div[1]/main/onboarding-business-phone/section/form/onboarding-next-button/onboarding-mobile-fixed-bottom-container/div[2]/sprout-button/button
absolute = webdriver.find.element_by_xpath("/html/body/cfapp-root/main/cfapp-spa-host/main/onboarding-root/div/div[1]/main/onboarding-business-phone/section/form/onboarding-next-button/onboarding-mobile-fixed-bottom-container/div[2]/sprout-button/button").click()
Even when using the absolute xpath (I know, frowned upon practice) I can't get the button to click.
For reference, I am automating: site: https://account.kabbage.com/onboarding/data/number-of-employees; Username: testingoverflow@aol.com; Pw: Kabbage123
(click finish applying; finish applying; working on the continue box)
Any help is much appreciated!!
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep, strftime
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
import csv
import xlrd
# Read the signup data from the first worksheet of info.xlsx.
info = "info.xlsx"
openwb = xlrd.open_workbook(info)
inputws = openwb.sheet_by_index(0)
print(inputws.nrows)
print(inputws.ncols)
print(inputws.cell_value(1, 0))
email_log = inputws.cell_value(2, 0)
businesslog = inputws.cell_value(2, 1)
firstname = inputws.cell_value(2, 2)
lastname = inputws.cell_value(2, 3)
phone = int(inputws.cell_value(2, 4))
employees = int(inputws.cell_value(2, 5))
business_types = inputws.cell_value(2, 6)
print(email_log)
print(businesslog)
print(firstname)
print(lastname)
print(phone)
sleep(1)
chromedriver_path = 'C:/Users/Documents/Scale/Programs/chromedriver.exe'
# NOTE(review): this rebinding shadows the imported selenium 'webdriver'
# module for the rest of the script (it works, but is fragile).
webdriver = webdriver.Chrome(executable_path=chromedriver_path)
webdriver.get('https://app.kabbage.com/signup/create_account')
sleep(1)
# XPath fix throughout: attribute tests use '@' ('#' was an artifact).
# input email
input_emails = webdriver.find_element_by_xpath('//*[@id="CreateAccount.EmailAddress_inner"]').send_keys(email_log)
sleep(1)
# re-input email
reinput = webdriver.find_element_by_xpath('//*[@id="CreateAccount.ConfirmEmail_inner"]').send_keys(email_log)
# Password
passwrd = webdriver.find_element_by_xpath('//*[@id="CreateAccount.CreatePassword"]')
sleep(1)
passwrd.send_keys('Paycheck11!!')
sleep(1)
button_started = webdriver.find_element_by_class_name("btn-full-width").click()
sleep(5)
# ApplyNow
# apply = webdriver.find_element_by_class_name('spr-btn spr-btn-primary')
# apply = webdriver.find_elements_by_class_name("spr-btn-primary").click()
# xpath("//div[@class='fc-day-content' and text()='15']")
applynow = webdriver.find_element_by_xpath("//sprout-button/button[contains(@class, 'spr-btn-primary')]").click()
sleep(5)
applyfinal = webdriver.find_element_by_xpath("//sprout-button/button[contains(@class, 'spr-btn-primary')]").click()
sleep(5)
business_name = webdriver.find_element_by_xpath('//*[@id="businessName-input"]').send_keys(businesslog)
business_send = webdriver.find_element_by_xpath("/html/body/cfapp-root/main/cfapp-spa-host/main/onboarding-root/div/div[1]/main/onboarding-business-name/section/form/onboarding-next-button/onboarding-mobile-fixed-bottom-container/div[2]/sprout-button/button").click()
sleep(5)
# NOTE(review): first_name is sent to the lastName field and vice versa in
# the original — looks swapped; confirm intent.
first_name = webdriver.find_element_by_xpath('//*[@id="lastName-input"]').send_keys(lastname)
last_name = webdriver.find_element_by_xpath('//*[@id="firstName-input"]').send_keys(firstname)
names_send = webdriver.find_element_by_xpath("/html/body/cfapp-root/main/cfapp-spa-host/main/onboarding-root/div/div[1]/main/onboarding-personal-name/section/form/onboarding-next-button/onboarding-mobile-fixed-bottom-container/div[2]/sprout-button/button").click()
sleep(5)
phone_num = webdriver.find_element_by_xpath('//*[@id="businessPhone-input"]').send_keys(phone)
phone_check = webdriver.find_element_by_xpath('//html/body/cfapp-root/main/cfapp-spa-host/main/onboarding-root/div/div[1]/main/onboarding-business-phone/section/form/kbg-consent-box/div/sprout-checkbox/div/label').click()
# phone_send = names_send = webdriver.find_element_by_xpath("/html/body/cfapp-root/main/cfapp-spa-host/main/onboarding-root/div/div[1]/main/onboarding-personal-name/section/form/onboarding-next-button/onboarding-mobile-fixed-bottom-container/div[2]/sprout-button/button").click()
phone_submits = webdriver.find_element_by_xpath("/html/body/cfapp-root/main/cfapp-spa-host/main/onboarding-root/div/div[1]/main/onboarding-business-phone/section/form/onboarding-next-button/onboarding-mobile-fixed-bottom-container/div[2]/sprout-button/button").click()
sleep(5)
num_empl = webdriver.find_element_by_xpath('//*[@id="numberOfEmployees-input"]').send_keys(employees)
# emp_submit = webdriver.find_element_by_xpath("//sprout-button/button[contains(@class, 'spr-btn-block')][2]").click()
# Bug fix: 'webdriver.find.element_by_xpath' is not a valid attribute chain;
# it raises AttributeError. The method is find_element_by_xpath.
sending = webdriver.find_element_by_xpath("//button[@class='spr-btn spr-btn-primary' and contains(text(),'Continue')]").click()
You can use any of these Xpaths:
Correct relative XPath for Continue button
Xpath 1:
*//onboarding-next-button//onboarding-mobile-fixed-bottom-container//div[2]//sprout-button//button[contains(text(),'Continue')]
Xpath 2:
*//sprout-button[@class='desktop-button']//button[contains(text(),'Continue')]
Give this a go:
# Fixed: the attribute value must not reuse the string's outer double quotes
# (syntax error), and XPath attribute tests use '@'.
webdriver.find_element_by_xpath("//button[@class='spr-btn spr-btn-primary' and contains(text(),'Continue')]").click()
Random timeout exceptions occur for the code below, and I'm not sure of the best approach to address them. The timeout does not happen every time — the code does find the elements some or most of the time.
We appreciate your comments and suggestions. Apparently the explicit wait is not holding until the elements are loaded into the browser; the elements take a different amount of time to appear each time a new page is loaded.
"""
"""
import platform , logging
import os,re
from time import sleep
import selenium.webdriver.support.ui as ui
import selenium.webdriver.support.expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
class Cloud(object):
    """
    Amazon "cloud" page driver: logs in, reads the first Alexa voice-history
    query/response pair, and signs out.

    XPath fix throughout: attribute tests use '@' ('#' was an extraction
    artifact that made every locator invalid).
    """

    def __init__(self, username='xxxxxx', password='xxxx'):
        logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        self.url = "https://www.amazon.com"
        self.username = username
        self.password = password
        self.timeout = 100  # seconds, used by every explicit wait below
        self.driver = None
        self.get_chrome_driver()

    def get_chrome_driver(self):
        """
        Create (once) and return the Chrome driver.

        Windows-only: chromedriver.exe is resolved from the first PYTHONPATH
        entry.
        """
        if platform.system().lower() == 'windows':
            if self.driver is None:
                chrome_options = Options()
                # chrome_options.add_argument('--disable-extensions')
                chrome_options.add_argument('--start-maximized')
                chrome_options.add_argument('--disable-popup-blocking')
                chrome_options.add_argument('--ignore-certificate-errors')
                chrome_options.add_argument('--allow-insecure-localhost')
                chrome_options.add_argument('--disable-infobars')
                chrome_options.add_argument("--log-level=3")
                chrome_driver_path = os.path.join(str(os.environ['PYTHONPATH'].split(';')[0]), "bin", "chromedriver", "chromedriver.exe")
                self.driver = WebDriver(executable_path=chrome_driver_path, chrome_options=chrome_options)
        return self.driver

    def login(self, username='xxxxxxx', password='xxxxx'):
        """
        Log in to amazon.com with the credentials given at construction time
        (the method parameters are unused by the body — kept for
        compatibility).
        """
        # Password is masked in the log line.
        self.logger.info("logging in amazon cloud username: %s and password: %s" % (self.username, re.sub(r".", "*", self.password)))
        self.driver.get(self.url)
        # wait for login username textbox
        self.wait_visibility_element(By.XPATH, "//div[@id='nav-signin-tooltip']//span[@class='nav-action-inner'][contains(text(),'Sign in')]")
        self.driver.find_element_by_xpath(" //div[@id='nav-signin-tooltip']//span[@class='nav-action-inner'][contains(text(),'Sign in')]").click()
        self.wait_visibility_element(By.XPATH, "//label[@class='a-form-label']")
        self.wait_visibility_element(By.XPATH, "//input[@id='ap_email']")
        username_textbox = self.driver.find_element_by_xpath("//input[@id='ap_email']")
        username_textbox.clear()
        username_textbox.send_keys(self.username)
        self.driver.find_element_by_xpath("//input[@id='continue']").click()
        self.wait_visibility_element(By.XPATH, "//input[@id='ap_password']")  # //label[@class='a-form-label']
        password_textbox = self.driver.find_element_by_xpath("//input[@id='ap_password']")
        password_textbox.clear()
        password_textbox.send_keys(self.password)
        # click on submit button
        self.driver.find_element_by_xpath("//input[@id='signInSubmit']").click()

    def wait_visibility_element(self, by_type, element_name):
        """
        Wait (up to self.timeout seconds) for an element to become visible.

        :param by_type: locator strategy (e.g. By.XPATH)
        :param element_name: locator value
        """
        ui.WebDriverWait(self.driver, self.timeout).until(
            EC.visibility_of_element_located((by_type, element_name)))

    def get_audio_text(self, multi_turn_count=1):
        """Navigate to Alexa voice history and return (question, answer) of
        the newest entry, as UTF-8 bytes with the 3-char quote marks trimmed."""
        self.login()
        # Arrow in the top menu
        self.wait_visibility_element(By.XPATH, "//a[@id='nav-link-accountList']//span[@class='nav-icon nav-arrow']")
        ui.WebDriverWait(self.driver, self.timeout).until(EC.element_to_be_clickable((By.XPATH, "//a[@id='nav-link-accountList']//span[@class='nav-icon nav-arrow']")))
        self.driver.find_element_by_xpath("//a[@id='nav-link-accountList']//span[@class='nav-icon nav-arrow']").click()
        # Hover first to avoid click-event ambiguity on the flyout menu.
        firstLevelMenu = self.driver.find_element_by_xpath("//span[contains(@class,'nav-line-2')][contains(text(),'Account & Lists')]")
        action = ActionChains(self.driver)
        action.move_to_element(firstLevelMenu).perform()
        # Sub-menu: select and click
        self.wait_visibility_element(By.XPATH, "//span[contains(text(),'Your Content and Devices')]")
        self.driver.find_element_by_xpath("//span[contains(text(),'Your Content and Devices')]").click()
        # Alexa Privacy
        self.wait_visibility_element(By.XPATH, "//div[@id='ng-app']//div[2]//div[1]//div[1]//div[1]//div[1]//div[1]//div[2]//div[6]//div[1]//div[1]")
        self.driver.find_element_by_xpath("//div[@id='ng-app']//div[2]//div[1]//div[1]//div[1]//div[1]//div[1]//div[2]//div[6]//div[1]//div[1]").click()
        self.wait_visibility_element(By.XPATH, '//div[@class="navAlexaOptionTitle_alexaNavHeader_myx ng-binding"][contains(text(),"Review Voice History")]')
        ui.WebDriverWait(self.driver, self.timeout).until(
            EC.element_to_be_clickable((By.XPATH, '//div[@class="navAlexaOptionTitle_alexaNavHeader_myx ng-binding"][contains(text(),"Review Voice History")]')))
        ui.WebDriverWait(self.driver, self.timeout).until(EC.text_to_be_present_in_element((By.XPATH, '//span[@class="overviewHeadingString_myx ng-binding"]'), 'Alexa Privacy'))
        self.driver.find_element_by_xpath('//div[@class="navAlexaOptionTitle_alexaNavHeader_myx ng-binding"][contains(text(),"Overview")]').click()
        self.driver.find_element_by_xpath("//div[@class='navAlexaOptionTitle_alexaNavHeader_myx ng-binding'][contains(text(),'Review Voice History')]").click()
        # Open the date-range dropdown box
        self.wait_visibility_element(By.XPATH, "//span[@id='timePickerDesktop']//span[@class='a-button-text a-declarative']")
        ui.WebDriverWait(self.driver, self.timeout).until(EC.element_to_be_clickable((By.XPATH, "//span[@id='timePickerDesktop']//span[@class='a-button-text a-declarative']")))
        self.driver.find_element_by_xpath("//span[@id='timePickerDesktop']//span[@class='a-button-text a-declarative']").click()
        # Select "All history"
        self.wait_visibility_element(By.XPATH, "//a[@id='timePickerDesktop_4']")
        ui.WebDriverWait(self.driver, self.timeout).until(EC.element_to_be_clickable((By.XPATH, "//a[@id='timePickerDesktop_4']")))
        self.driver.find_element_by_xpath("//a[@id='timePickerDesktop_4']").click()
        # Read the first entry's query text
        self.wait_visibility_element(By.XPATH, "//span[@id='mainInfo-0']//div[contains(@class,'summaryCss')]")
        txt = self.driver.find_element_by_xpath("//span[@id='mainInfo-0']//div[contains(@class,'summaryCss')]").text
        question_text = txt.encode("utf-8")[3:-3]
        # Expand the entry
        self.driver.find_element_by_xpath("//div[@id='arrowExpand-0']//i[@class='fa fa-angle-down caretAlignment']").click()
        # Read the Alexa (AVS) response
        self.wait_visibility_element(By.XPATH, "//div[@id='activityItemsInner-0']//div[@class='ttsInfo']")
        ui.WebDriverWait(self.driver, self.timeout).until(EC.element_to_be_clickable((By.XPATH, "//div[@id='activityItemsInner-0']//div[@class='ttsInfo']")))
        txt = self.driver.find_element_by_xpath("//div[@id='activityItemsInner-0']//div[@class='ttsInfo']").text
        answer_text = txt.encode("utf-8")[3:-3]
        self.sign_out_direct()
        return question_text, answer_text

    def sign_out(self):
        """Sign out via the hamburger menu, then close the tab."""
        self.driver.find_element_by_xpath("//i[@class='hm-icon nav-sprite']").click()
        self.wait_visibility_element(By.XPATH, "//div[contains(text(),'SHOP BY CATEGORY')]")
        # Scroll the sign-out entry into view before clicking.
        sign_out_element = self.driver.find_element_by_xpath("//li[44]//a[1]")
        self.driver.execute_script("arguments[0].scrollIntoView();", sign_out_element)
        # self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        ui.WebDriverWait(self.driver, self.timeout).until(EC.element_to_be_clickable((By.XPATH, "//li[44]//a[1]")))
        self.driver.find_element_by_xpath("//li[44]//a[1]").click()
        self.sign_out_direct()
        # Close current tab
        self.driver.close()

    def sign_out_direct(self):
        """Sign out via the account flyout menu, then close the tab."""
        # Arrow in the top menu
        self.wait_visibility_element(By.XPATH, "//a[@id='nav-link-accountList']//span[@class='nav-icon nav-arrow']")
        ui.WebDriverWait(self.driver, self.timeout).until(EC.element_to_be_clickable((By.XPATH, "//a[@id='nav-link-accountList']//span[@class='nav-icon nav-arrow']")))
        self.driver.find_element_by_xpath("//a[@id='nav-link-accountList']//span[@class='nav-icon nav-arrow']").click()
        # Hover first to avoid click-event ambiguity.
        firstLevelMenu = self.driver.find_element_by_xpath("//span[contains(@class,'nav-line-2')][contains(text(),'Account & Lists')]")
        action = ActionChains(self.driver)
        action.move_to_element(firstLevelMenu).perform()
        # Sub-menu: select and click
        self.wait_visibility_element(By.XPATH, " //span[contains(text(),'Sign Out')]")
        self.driver.find_element_by_xpath("//span[contains(text(),'Sign Out')]").click()
        # Close current tab
        self.driver.close()
if __name__ == '__main__':
    # Run 20 scrape cycles, printing the captured question/answer pair each time.
    for loop in range(20):
        PAGE = Cloud()
        # PAGE.login()
        OUTPUT = PAGE.get_audio_text()
        question, answer = OUTPUT
        print("\n\nQuestion:: %s" % str(question).upper())
        print("Answer:: %s" % str(answer).upper())
        # PAGE.sign_out()
        # PAGE.sign_out_direct()
        sleep(2)
If you post the lines of code in particular that are throwing the timeout exceptions, this will help track down the issues easier.
I noticed most of your waits are for visibility_of_element_located. I would recommend trying to change some of those to element_to_be_clickable instead, because some elements will appear on the DOM before they are fully rendered.
I've written a scraper in Python scrapy in combination with selenium to scrape some titles from a website. The css selectors defined within my scraper is flawless. I wish my scraper to keep on clicking on the next page and parse the information embedded in each page. It is doing fine for the first page but when it comes to play the role for selenium part the scraper keeps clicking on the same link over and over again.
As this is my first time to work with selenium along with scrapy, I don't have any idea to move on successfully. Any fix will be highly appreciated.
If I try like this then it works smoothly (there is nothing wrong with selectors):
class IncomeTaxSpider(scrapy.Spider):
    """Drives Selenium over the exempted-institutions pages, printing every
    institution title and clicking Next until the button times out."""
    name = "taxspider"
    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

    def parse(self, response):
        self.driver.get(response.url)
        while True:
            headings = self.wait.until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "h1.faqsno-heading")))
            for elem in headings:
                name = elem.find_element_by_css_selector("div[id^='arrowex']").text
                print(name)
            try:
                next_btn = self.wait.until(
                    EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']")))
                next_btn.click()
                # Wait for the old page content to go stale before re-reading.
                self.wait.until(EC.staleness_of(elem))
            except TimeoutException:
                break
But my intention is to make my script run this way:
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
class IncomeTaxSpider(scrapy.Spider):
    """Intended flow: yield the titles from the scrapy response, then click
    through to the next page via Selenium.

    NOTE(review): this is the broken version being asked about —
    click_nextpage() re-issues driver.get(link) with the same URL every call,
    which resets pagination back to page 1 (see the answer below).
    """
    name = "taxspider"
    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]

    def __init__(self):
        # One shared driver plus a 10-second explicit wait.
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

    def click_nextpage(self, link):
        # Re-navigating here is the bug: it reloads page 1 before clicking.
        self.driver.get(link)
        elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))
        # It keeps clicking on the same link over and over again.
        self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
        # Wait until the old content goes stale, i.e. the next page rendered.
        self.wait.until(EC.staleness_of(elem))

    def parse(self, response):
        while True:
            # NOTE(review): 'response' is never refreshed, so the same titles
            # are yielded on every pass.
            for item in response.css("h1.faqsno-heading"):
                name = item.css("div[id^='arrowex']::text").extract_first()
                yield {"Name": name}
            try:
                self.click_nextpage(response.url)  # initiate the method to do the clicking
            except TimeoutException:
                break
These are the titles visible on that landing page (to let you know what I'm after):
INDIA INCLUSION FOUNDATION
INDIAN WILDLIFE CONSERVATION TRUST
VATSALYA URBAN AND RURAL DEVELOPMENT TRUST
I'm not willing to get the data from that site, so any alternative approach other than what I've tried above is useless to me. My only intention is to have any solution related to the way I tried in my second approach.
Your initial code was almost correct with one key piece missing from it. You were using the same response object always. The response object needs to be from the latest page source.
Also you were browsing the link again and again in click next page which was resetting it to page 1 every time. That is why you get page 1 and 2 (max). You need to get the url only once in the parse stage and then let the next page click to happen
Below is final code working fine
class IncomeTaxSpider(scrapy.Spider):
    """Working version: navigate once, then rebuild the scrapy response from
    the live Selenium page source after every Next click."""
    name = "taxspider"
    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]

    def __init__(self):
        # One shared driver plus a 10-second explicit wait.
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

    def click_nextpage(self, link):
        # The re-navigation is disabled: getting the URL again reset to page 1.
        # self.driver.get(link)
        elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))
        self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
        # Wait until the old content goes stale, i.e. the next page rendered.
        self.wait.until(EC.staleness_of(elem))

    def parse(self, response):
        # Navigate only once; subsequent pages come from Next clicks.
        self.driver.get(response.url)
        while True:
            for item in response.css("h1.faqsno-heading"):
                name = item.css("div[id^='arrowex']::text").extract_first()
                yield {"Name": name}
            try:
                self.click_nextpage(response.url)  # initiate the method to do the clicking
                # Refresh the response from the driver's current page source.
                response = response.replace(body=self.driver.page_source)
            except TimeoutException:
                break
After that change it works perfectly.
In case you need pure Selenium solution:
# Pure-Selenium pagination: print every name on the current page, then
# click 'Next' until the button no longer matches (disabled on last page).
driver.get("https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx")
while True:
    for item in wait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[id^='arrowex']"))):
        print(item.text)
    try:
        # Fixed: XPath attribute tests use '@', not '#' — the original
        # string was mangled ("#text"/"#class") and would raise an
        # InvalidSelectorException.
        driver.find_element_by_xpath("//input[@text='Next' and not(contains(@class, 'disabledImageButton'))]").click()
    except NoSuchElementException:
        break
import scrapy
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from scrapy.crawler import CrawlerProcess
class IncomeTaxSpider(scrapy.Spider):
    """Selenium-driven spider: the browser is opened and pointed at the
    listing once in ``__init__``; pagination happens purely via clicks.

    Bug fixed: ``parse`` never refreshed ``response`` after clicking
    'Next', so it yielded the page-1 names on every iteration. The body is
    now rebuilt from the browser's live page source after each click.
    """
    name = "taxspider"
    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)
        link = 'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx'
        self.driver.get(link)

    def click_nextpage(self):
        """Click the 'Next' arrow and block until the old rows go stale."""
        elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))
        self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
        self.wait.until(EC.staleness_of(elem))
        # Extra settling time kept from the original; the page re-renders
        # asynchronously after the staleness signal. TODO: replace with an
        # explicit wait if a reliable condition can be found.
        time.sleep(4)

    def parse(self, response):
        while True:
            for item in response.css("h1.faqsno-heading"):
                name = item.css("div[id^='arrowex']::text").extract_first()
                yield {"Name": name}
            try:
                self.click_nextpage()  # initiate the method to do the clicking
                # Parse the NEW page on the next iteration, not the stale one.
                response = response.replace(body=self.driver.page_source)
            except TimeoutException:
                break
# Run the spider in-process (no `scrapy crawl` CLI needed).
process = CrawlerProcess()
process.crawl(IncomeTaxSpider)
process.start()  # blocks until the crawl finishes
Whenever the next page is loaded via the 'Next Page' arrow (using Selenium), the listing gets reset back to page 1. I'm not sure of the reason for this (maybe the site's JavaScript).
Hence I changed the approach: enter the desired page number into the page-number input field and press the ENTER key to navigate.
Here is the modified code. I hope it is useful for you.
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
class IncomeTaxSpider(scrapy.Spider):
    """Paginate by typing the target page number into the site's
    page-number box and pressing ENTER, because clicking the 'Next'
    arrow resets the listing back to page 1.

    Fixes: the XPath attribute test was mangled to ``#id`` (must be
    ``@id``), and ``parse`` never refreshed ``response`` after navigating,
    so it re-yielded page-1 names on every iteration.
    """
    name = "taxspider"
    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]

    def __init__(self):
        self.driver = webdriver.Firefox()
        self.wait = WebDriverWait(self.driver, 10)

    def click_nextpage(self, link, number):
        """Reload `link`, then jump to page `number` via the page-number input."""
        self.driver.get(link)
        elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))
        # Fixed: XPath attribute syntax is @id, not #id.
        inputElement = self.driver.find_element_by_xpath("//input[@id='ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_txtPageNumber']")
        inputElement.clear()
        inputElement.send_keys(number)
        inputElement.send_keys(Keys.ENTER)
        # Old rows going stale means the requested page has rendered.
        self.wait.until(EC.staleness_of(elem))

    def parse(self, response):
        number = 1
        while number < 10412:  # Website shows it has 10411 pages.
            for item in response.css("h1.faqsno-heading"):
                name = item.css("div[id^='arrowex']::text").extract_first()
                yield {"Name": name}
                print(name)
            try:
                number += 1
                self.click_nextpage(response.url, number)  # initiate the method to do the clicking
                # Refresh the response from the browser so the next
                # iteration parses the newly loaded page.
                response = response.replace(body=self.driver.page_source)
            except TimeoutException:
                break
Create a `self.page_num` counter (or similar) on the spider to track the current page:
def parse(self, response):
    """Drive pagination by explicit page-number links, stopping at the
    total page count read from the results footer.

    Bug fixed: ``find_element_by_css_selector`` returns a WebElement,
    which has no ``.split()`` — the original raised AttributeError. The
    count must be parsed from the element's ``.text``.
    """
    footer = self.driver.find_element_by_css_selector("#ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_totalRecordsDiv.act_search_footer span")
    # Footer text looks like "... of N]"; extract N as the page total.
    self.pages = int(footer.text.split('of ')[1].split(']')[0])
    self.page_num = 1
    while self.page_num <= self.pages:
        for item in response.css("h1.faqsno-heading"):
            name = item.css("div[id^='arrowex']::text").extract_first()
            yield {"Name": name}
        try:
            self.click_nextpage(response.url)  # also advances self.page_num
        except TimeoutException:
            break
def click_nextpage(self, link):
    """Navigate to the next results page by clicking its numbered page
    link (``lnkBtn_<n>``) and advance ``self.page_num``.

    Bug fixed: the original built ``page_link`` but never used it — it
    clicked the 'Next' arrow instead, which resets the listing to page 1
    (the very problem this page-number approach exists to avoid).
    """
    self.driver.get(link)
    elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))
    page_link = 'ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_lnkBtn_' + str(self.page_num)
    self.page_num = self.page_num + 1
    # Click the numbered page link built above, not the 'Next' arrow.
    self.wait.until(EC.element_to_be_clickable((By.ID, page_link))).click()
    self.wait.until(EC.staleness_of(elem))