I have an application that is almost working as intended. The problem arises after it runs through the loop on the 5th instance. The search states there are two results which results in the same end result. When this occurs I'd like to select the first of the two.
The popup messages looks like the following:
I'm using the following code to create the list and then loop:
from selenium import webdriver
import pandas as pd
import random
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
#service = Service('C:\Program Files\Chrome Driver\chromedriver.exe')
URL = "https://mor.nlm.nih.gov/RxClass/search?query=ALIMENTARY TRACT AND METABOLISM"
driver = webdriver.Chrome('C:\Program Files\Chrome Driver\chromedriver.exe')
driver.get(URL)
category = [my_elem.text for my_elem in WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div.drug_class img+a")))]
classid = [my_elem.text for my_elem in WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div.propText")))]
dfObj = pd.DataFrame(category)
dfObj.columns =['Category']
dfObj.dropna(inplace = True)
new = dfObj["Category"].str.split("(", n = 1, expand = True)
dfObj["New Category"]= new[0]
dfObj["Count"]= new[1]
dfObj.drop(columns =["Category"], inplace = True)
dfObj['Count'] = dfObj['Count'].str.rstrip(')')
dfObj['IsNumber'] = dfObj['Count'].str.isnumeric()
dfObj = dfObj[(dfObj['IsNumber'] == True)]
searchcat = dfObj['New Category'].tolist()
print(searchcat)
dfObj.to_csv('tabledf.csv',index=False)
time.sleep(8)
driver.quit()
for search in searchcat:
page = f"https://mor.nlm.nih.gov/RxClass/search?query={search}"
driver = webdriver.Chrome('C:\Program Files\Chrome Driver\chromedriver.exe')
driver.get(page)
time.sleep(4)
table = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'tr.dbsearch')))
time.sleep(4)
filename = search[0:30]+'table.csv'
pd.read_html(driver.page_source)[1].iloc[:,:-1].to_csv(filename,index=False)
time.sleep(4)
driver.quit()
The loop will continue to run if I manually click each search result. However, I would like for selenium to always select the first option. How would I go about this?
Updated Code:
from selenium import webdriver
import pandas as pd
import random
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait, TimeoutException
import time
with webdriver.Chrome('C:\Program Files\Chrome Driver\chromedriver.exe') as driver:
URL = "https://mor.nlm.nih.gov/RxClass/search?query=ALIMENTARY TRACT AND METABOLISM"
driver.get(URL)
category = [my_elem.text for my_elem in WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div.drug_class img+a")))]
dfObj = pd.DataFrame(category)
dfObj.columns =["Category"]
dfObj.dropna(inplace = True)
new = dfObj["Category"].str.split("(", n = 1, expand = True)
dfObj["New Category"]= new[0]
dfObj["Count"]= new[1]
dfObj.drop(columns =["Category"], inplace = True)
dfObj["Count"] = dfObj["Count"].str.rstrip(')')
dfObj["IsNumber"] = dfObj["Count"].str.isnumeric()
dfObj = dfObj[(dfObj["IsNumber"] == True)]
searchcat = dfObj["New Category"].tolist()
dfObj.to_csv('tabledf.csv',index=False)
time.sleep(3)
for search in searchcat:
page = f"https://mor.nlm.nih.gov/RxClass/search?query={search}"
driver = webdriver.Chrome('C:\Program Files\Chrome Driver\chromedriver.exe')
driver.get(page)
table = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'tr.dbsearch')))
modal_wait = WebDriverWait(driver, 1)
try:
modal_el = modal_wait.until(EC.visibility_of_element_located((By.ID, 'optionModal')))
modal_el.find_element(By.CSS_SELECTOR, '.uloption').click()
except TimeoutException:
pass
filename = search[0:30]+'table.csv'
classid = [my_elem.text for my_elem in WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div.table-responsive div.propText strong:nth-child(2)")))]
classname = [my_elem.text for my_elem in WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div.table-responsive div.propText strong:nth-child(1)")))]
classtype = [my_elem.text for my_elem in WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div.table-responsive div.propText strong:nth-child(3)")))]
df = pd.read_html(driver.page_source)[1].iloc[:,:-1]
df["ClassID"] = pd.Series(classid)
df["ClassName"] = pd.Series(classname)
df["ClassType"] = pd.Series(classtype)
df.to_csv(filename,index=False)
time.sleep(4)
driver.quit()
First of, I will suggest that you use the with context manager. It will handle opening/closing the driver (Chrome) by itself. This ensure if any exception is raised that it will still be closed.
To do so, use:
with webdriver.Chrome() as driver:
...
In your code I see you close/open a new browser for each URL. This is not needed and not doing so will speed up your script. Just use driver.get() to change the URL.
For your main issue, just add a portion of code that will detect the modal and chose the first option. Something along those lines
modal_wait = WebDriverWait(driver, 1)
try:
modal_el = modal_wait.until(EC.element_to_be_clickable((By.ID, 'optionModal')))
modal_el.find_element(By.CSS_SELECTOR, '.uloption').click()
except TimeoutException:
pass
You must include the following imports:
from selenium.webdriver.support.wait import WebDriverWait, TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
Related
I am scraping an instagramm page where I need to get the user's:
number of posts
Number of followers
I managed to login on instagram then search for the user(in this example 'leonardodicaprio') then go to his page. I am not able to select the text though.
Can someone help please?
Thanks!
# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
import logging
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import time
class InstatestSpider(scrapy.Spider):
name = 'instatest'
allowed_domains = ['www.instagram.com']
start_urls = ['https://www.instagram.com/accounts/login']
def __init__(self):
chrome_option = Options()
#chrome_option.add_argument("--headless")
chrome_path = which("chromedriver")
driver = webdriver.Chrome(executable_path=chrome_path, options = chrome_option)
driver.set_window_size(1920, 1080)
driver.get("https://www.instagram.com/accounts/login")
logging.info('Website opened...')
# username = driver.find_element_by_name("username")
# username = driver.find_element(By.XPATH, '//input[#name="username"]')
username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//input[#name="username"]')))
password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//input[#name="password"]')))
username.clear()
username.send_keys("username")
logging.info('Typing Username...')
password.clear()
password.send_keys("password")
logging.info('Typing Password...')
Login_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//button[#type="submit"]'))).click()
alert_1 = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Not Now")]'))).click()
logging.info('Do NOT save password...')
alert_2 = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Not Now")]'))).click() #search for a text="Not Now"
logging.info('Do NOT turn notifications on...')
logging.info('Logging Successful...')
influencer = "leonardodicaprio"
driver.get("https://www.instagram.com/" + influencer + "/")
time.sleep(5)
driver.save_screenshot('Influencer_Home_Page.png')
P.S: For the number of followers I want to get the exact number to the nearest digit as found in the title attribute in the selector. Please see picture below:
insta
Getting this error when running:
error in jupyterlab
wait = WebDriverWait(driver, 20)
number_of_post = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "a[href$='profile_posts'] span"))).text
print(number_of_post)
number_of_follower = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "a[href$='followed_by_list'] span"))).get_attribute('title')
print(number_of_follower)
error
Updated code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
import logging
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import time
class InstatestSpider(scrapy.Spider):
name = 'instatest'
allowed_domains = ['www.instagram.com']
start_urls = ['https://www.instagram.com/accounts/login']
def __init__(self):
chrome_option = Options()
#chrome_option.add_argument("--headless")
chrome_path = which("chromedriver")
driver = webdriver.Chrome(executable_path=chrome_path, options = chrome_option)
driver.set_window_size(1920, 1080)
driver.get("https://www.instagram.com/accounts/login")
logging.info('Website opened...')
# username = driver.find_element_by_name("username")
# username = driver.find_element(By.XPATH, '//input[#name="username"]')
username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//input[#name="username"]')))
password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//input[#name="password"]')))
username.clear()
username.send_keys("username")
logging.info('Typing Username...')
password.clear()
password.send_keys("password")
logging.info('Typing Password...')
Login_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//button[#type="submit"]'))).click()
alert_1 = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Not Now")]'))).click()
logging.info('Do NOT save password...')
alert_2 = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Not Now")]'))).click() #search for a text="Not Now"
logging.info('Do NOT turn notifications on...')
logging.info('Logging Successful...')
influencer = "leonardodicaprio"
driver.get("https://www.instagram.com/" + influencer + "/")
time.sleep(5)
wait = WebDriverWait(driver, 20)
number_of_post = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "a[href$='profile_posts'] span"))).text
print(number_of_post)
number_of_follower = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "a[href$='followed_by_list'] span"))).get_attribute('title')
print(number_of_follower)
driver.save_screenshot('Influencer_Home_Page.png')
You can use the below CSS_SELECTOR, to get number of posts, and Number of followers. to get title, you can use .get_attribute()
wait = WebDriverWait(driver, 20)
number_of_post = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "a[href$='profile_posts'] span"))).text
print(number_of_post)
number_of_follower = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "a[href$='followed_by_list'] span"))).get_attribute('title')
print(number_of_follower)
Imports :
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
I want to get the text of the sender on my console, I tried beautiful soup for scraping but it didn't work. I had used several other features like XPath and different class names on selenium but not able to resolve this issue.
Here, is my code,
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
myemail = "<username>"
mypassword = "<password>"
friendusernames = ["<>sender username"]
PATH = "C:/Chromedriver.exe"
driver = webdriver.Chrome(PATH)
url = "https://www.instagram.com/"
driver.get(url)
usernamebox = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.NAME, 'username')))
usernamebox.send_keys(myemail)
passwordbox = driver.find_element_by_name('password')
passwordbox.send_keys(mypassword)
loginbutton = driver.find_element_by_css_selector('.Igw0E')
loginbutton.click()
print("Logging in")
dmbtn = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, '.xWeGp')))
dmbtn.click()
notificationsnotnow = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, '.HoLwm')))
notificationsnotnow.click()
for friendusername in friendusernames:
searchuser = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, '.EQ1Mr')))
searchuser.click()
searchuserbox = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, '.j_2Hd')))
searchuserbox.send_keys(friendusername)
time.sleep(3)
firstuser = driver.find_element_by_xpath(
'/html/body/div[5]/div/div/div[2]/div[2]/div[1]/div')
firstuser.click()
pressingnext = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, '.rIacr')))
pressingnext.click()
names = driver.find_element_by_class_name(
'_7UhW9 > span').text
# names = driver.find_element_by_class_name(
# '.xLCgt').text
# names = driver.find_element_by_class_name(
# '.MMzanKV-D4').text
# names = driver.find_element_by_class_name(
# '.p1tLr').text
# names = driver.find_element_by_class_name(
# '.hjZTB').text
print(names)
time.sleep(1)
I want this text on my console
How, can I do so??
Since every page on the internet has HTML in it, I would inspect the page with right click and find out which tag belongs to the message in the chat, then I'd find the tag's XPath or class and finally get its innerText for retrieving the string.
i try to scrape the contact data from companies from this website:
https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=4
I can do this with the following Code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
company_list= [] #create empty list
driver = webdriver.Chrome('/Users/rieder/Anaconda3/chromedriver_win32/chromedriver.exe') #define driver
driver.get('https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1') # open Website
driver.find_element_by_id("cookiesNotificationConfirm").click(); #accept cookies
driver.find_element_by_xpath("//*[#id='content']/section[3]/div/div/form/div/div[2]/div[2]/table/tr[2]/td[1]/a").click(); #click on the first company namelink
contact_data = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "/html/body/div[3]/div[4]/section[6]/div/div[2]/div[2]/div/div"))) #get the contactdata from the company you chose before
for cn in contact_data:
company_list.append(cn.text) # this stores the text in the list
driver.back() #navigate to previous site
time.sleep(5) #wait for the pop-up window to appear
driver.find_element_by_xpath("/html/body/div[15]/div[3]/div[3]/div[1]/button[1]").click(), #deny the websites popup
time.sleep(5) #wait for the popup to vanish
driver.find_element_by_xpath("//*[#id='content']/section[3]/div/div/form/div/div[2]/div[2]/table/tr[3]/td[1]/a").click(); #click on the next company namelink
contact_data2 = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "/html/body/div[3]/div[4]/section[6]/div/div[2]/div[2]/div/div"))) #get the contactdata from the company you chose before
for cn in contact_data2:
company_list.append(cn.text) # this stores the text in the list
print(company_list) #show the list
My Output is this:
['GUTex GmbH\nGerhard-Unland-Str. 1\n26683\nSaterland\nDeutschland', 'Robert Bosch GmbH\nRobert-Bosch-Platz 1\n70839\nGerlingen\nDeutschland']
Problem:
I want, that my code does this to the whole list on page 1 and then goes on on the next page and do it again. This shall go on until I have for example 100 adresses in the list. I would do this with a "while loop" but my xpaths for finding the adress are too specified, so it would always loop the same companies.
Thanks a lot inbefore
Try below code for one page data extract. Update the code for iterating over the next page records.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
company_list= [] #create empty list
driver = webdriver.Chrome() #define driver
driver.get('https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1') # open Website
if len(driver.find_elements_by_id("cookiesNotificationConfirm")) > 0:
driver.find_element_by_id("cookiesNotificationConfirm").click(); # accept cookies
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//table[#class="zebraTable zebraTable--companies"]//td[1]')))
elementsSize = len(driver.find_elements_by_xpath('//table[#class="zebraTable zebraTable--companies"]//td[1]'))
# To iterate over the company list and click on the company name then capture the address on navigated page
# come back to previous page and repeat the same.
for i in range(elementsSize):
WebDriverWait(driver, 20).until(
EC.element_to_be_clickable((By.XPATH, '//table[#class="zebraTable zebraTable--companies"]//td[1]')))
elements = driver.find_elements_by_xpath('//table[#class="zebraTable zebraTable--companies"]//td[1]/a')
company_name = elements[i].text
elements[i].click() # click on the first company namelink
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH,
'//*[#id="contactInformation"]//div[#class="companyContactBox"]'))) # get the contactdata from the company you chose before
contact_data = driver.execute_script("return document.getElementsByClassName('companyContactBox')[0].innerText")
# print(contact_data)
company_list.append(company_name + " : " + contact_data)
driver.back() # navigate to previous site
print(company_list)
Thanks to Dilip Meghwals comment above i could finish my Code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
company_list= [] #create empty list
count = 25
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications" : 2}
chrome_options.add_experimental_option("prefs",prefs)
driver = webdriver.Chrome('/Users/rieder/Anaconda3/chromedriver_win32/chromedriver.exe', chrome_options=chrome_options) #define driver
driver.get('https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1') # open Website
if len(driver.find_elements_by_id("cookiesNotificationConfirm")) > 0:
driver.find_element_by_id("cookiesNotificationConfirm").click(); # accept cookies
while len(company_list) < 1000:
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//table[#class="zebraTable zebraTable--companies"]//td[1]')))
elementsSize = len(driver.find_elements_by_xpath('//table[#class="zebraTable zebraTable--companies"]//td[1]'))
# To iterate over the company list and click on the company name then capture the address on navigated page
# come back to previous page and repeat the same.
for i in range(elementsSize):
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//table[#class="zebraTable zebraTable--companies"]//td[1]')))
elements = driver.find_elements_by_xpath('//table[#class="zebraTable zebraTable--companies"]//td[1]/a')
company_name = elements[i].text
elements[i].click() # click on the first company namelink
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH,'//*[#id="contactInformation"]//div[#class="companyContactBox"]'))) # get the contactdata from the company you chose before
contact_data = driver.execute_script("return document.getElementsByClassName('companyContactBox')[0].innerText")
# print(contact_data)
company_list.append(contact_data)
driver.back() # navigate to previous site
time.sleep(5)
driver.find_element_by_xpath("//*[#id='content']/section[3]/div/div/form/div/div[2]/div[2]/div[2]/div/button[2]").click();
company_list = [w.replace('\n', ', ') for w in company_list]
print(company_list)
df_company_name = pd.DataFrame(company_list, columns =['Name'])
df_company_name.to_excel("company_name.xlsx")
I am having some trouble scrolling down to the end of the second web page. The first scroll works fine, the second won't run.
This issue seems to be happening on line 33 and 34
Please see lines of code below:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.webdriver.common.keys import Keys
def main():
n = 1
# LMS -> PSYC1101 -> Quiz
driver = webdriver.Chrome("/usr/local/bin/chromedriver")
driver.get("https://lms.uwa.edu.au/webapps/portal/execute/tabs/tabAction?tab_tab_group_id=_1_1")
window_before = driver.window_handles[0]
username = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID,"login-user"))).send_keys("username")
password = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID,"login-pass"))).send_keys("password")
login = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID,"form_button_0"))).click()
time.sleep(10)
units = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH,"/html/body/div[4]/table/tbody/tr/td/div/div[2]/table/tbody/tr/td[2]/a/span"))).click()
psych = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH,"/html/body/div[5]/div/div/div/div/div/div/div/div[2]/div/div[2]/div/div[2]/ul/li[4]/a"))).click()
page = driver.find_element_by_tag_name("html")
page.send_keys(Keys.END)
time.sleep(1)
quizzes = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH,"/html/body/div[5]/div[2]/nav/div/div[2]/div[1]/div[2]/ul/li[22]/a/span"))).click()
# Quiz One -> Begin -> Scroll -> Continue -> Sumbit -> Submit2 -> View Results
quizOne = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH,"/html/body/div[5]/div[2]/div/div/div/div/div[2]/ul/li[2]/div[1]/h3/a/span"))).click()
time.sleep(1)
begin = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH,"/html/body/div[1]/div/div/div/div[4]/div/div[2]/div/div[1]/div[17]/button"))).click()
window_after = driver.window_handles[0]
driver.switch_to.window(window_after)
page = driver.find_element_by_tag_name("html")
page.send_keys(Keys.END)
#driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
Continue = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH,"/html/body/div/div/main/div[5]/button[1]"))).click()
submit = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH,"/html/body/div/header/div/div/button[2]"))).click()
submit2 = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH,"/html/body/div/ic-modal[2]/ic-modal-main/div[2]/button[2]"))).click()
viewResults = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH,"/html/body/div/ic-modal[3]/ic-modal-main/div/button"))).click()
main()
Hi Here I am trying to scrape all the teacher jobs from https://www.naukri.com/ this url I want all the pages data but I am getting only one page data and getting this error
Traceback (most recent call last):
File "naukri.py", line 48, in <module>
driver.execute_script("arguments.click();", next_page)
File "/home/nyros/Documents/mypython/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 636, in execute_script
'args': converted_args})['value']
File "/home/nyros/Documents/mypython/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 321, in execute
self.error_handler.check_response(response)
File "/home/nyros/Documents/mypython/lib/python3.6/site-packages/selenium/webdriver/remote/errorhandler.py", line 242, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.JavascriptException: Message: javascript error: arguments.click is not a function
(Session info: chrome=80.0.3987.116)
The code which I wrote is:
import selenium.webdriver
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
url ='https://www.naukri.com/'
driver = webdriver.Chrome(r"mypython/bin/chromedriver_linux64/chromedriver")
driver.get(url)
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#qsbClick > span.blueBtn'))).click()
driver.find_element_by_xpath('//*[#id="skill"]/div[1]/div[2]/input').send_keys("teacher")
driver.find_element_by_xpath('//*[#id="qsbFormBtn"]').click()
data = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CLASS_NAME, "srp_container.fl")))
result = WebDriverWait(data, 10).until(
EC.presence_of_all_elements_located((By.CLASS_NAME, "row")))
for r in result:
data = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CLASS_NAME, "srp_container.fl")))
result = WebDriverWait(data, 10).until(
EC.presence_of_all_elements_located((By.CLASS_NAME, "row")))
for r in result:
try:
title=r.find_element_by_class_name("desig").text
print('title:',title)
school=r.find_element_by_class_name("org").text
print('school:',school)
location=r.find_element_by_class_name("loc").text
print("location:",location)
salary=r.find_element_by_class_name("salary").text
print("salary:",salary)
except:
pass
print('-------')
next_page = r.find_elements_by_xpath("/html/body/div[5]/div/div[3]/div[1]/div[59]/a/button")
driver.execute_script("arguments.click();", next_page)
Please help me anyone Thanks in advance!
Since the element index of the 'next' button changes from 59 in the first page to 60 in the next pages, you can just find all elements on the page which have class "grayBtn", and click on index [-1] of the list returned, as this will always provide the next button. I removed some unnecessary parts of your code too, like repeated importations as well as unnecessary button clicks. I instantly directed to the page containing the list of results for teachers, instead of entering "teacher" into the search field on the home page. I was left with the following:
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
Category = input("Category?")
Category = re.sub(" ", "%20", Category)
Type = re.sub(" ", "-", Category.lower())
url ='https://www.naukri.com/' + Type + '-jobs?k=' + Category
driver = webdriver.Chrome(r"mypython/bin/chromedriver_linux64/chromedriver")
driver.get(url)
data = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CLASS_NAME, "srp_container.fl")))
result = WebDriverWait(data, 10).until(
EC.presence_of_all_elements_located((By.CLASS_NAME, "row")))
for res in result:
data = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "srp_container.fl")))
jobs = WebDriverWait(data, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "row")))
for job in jobs:
try:
title=job.find_element_by_class_name("desig").text
print('title:',title)
school=job.find_element_by_class_name("org").text
print('school:',school)
location=job.find_element_by_class_name("loc").text
print("location:",location)
salary=job.find_element_by_class_name("salary").text
print("salary:",salary)
except:
pass
print('-------')
Button = driver.find_elements_by_class_name("grayBtn")[-1]
time.sleep(1)
driver.execute_script("window.scrollTo(0,document.body.scrollHeight - 1300)")
Button.click()
As requested, here is the modified code to append data to a pandas dataframe and convert the dataframe to excel:
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import pandas as pd
df = pd.DataFrame(columns = ['Title', 'School', 'Location', 'Salary'])
Category = input("Category?")
Category = re.sub(" ", "%20", Category)
Type = re.sub(" ", "-", Category.lower())
url ='https://www.naukri.com/' + Type + '-jobs?k=' + Category
driver = webdriver.Chrome(r"mypython/bin/chromedriver_linux64/chromedriver")
driver.get(url)
data = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CLASS_NAME, "srp_container.fl")))
result = WebDriverWait(data, 10).until(
EC.presence_of_all_elements_located((By.CLASS_NAME, "row")))
i = 0
for res in result:
data = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "srp_container.fl")))
jobs = WebDriverWait(data, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "row")))
for job in jobs:
try:
title=job.find_element_by_class_name("desig").text
print('title:',title)
school=job.find_element_by_class_name("org").text
print('school:',school)
location=job.find_element_by_class_name("loc").text
print("location:",location)
salary=job.find_element_by_class_name("salary").text
print("salary:",salary)
df.loc[i] = [title, school, location, salary]
i += 1
except:
pass
print('-------')
Button = driver.find_elements_by_class_name("grayBtn")[-1]
time.sleep(1)
driver.execute_script("window.scrollTo(0,document.body.scrollHeight - 1300)")
Button.click()
df.to_excel("all_results.xlsx")