I have to go to the IP India Public Search page (https://ipindiaservices.gov.in/PublicSearch/).
There I have to choose applicant name = "ltd".
But before submitting the page, I have to solve a captcha. How can I fetch the next page's information (application number, application title, date, application status, etc.) into an Excel file using web scraping?
Running the following script, I am getting an error:
import csv
import json
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def save_to_csv(data: list) -> None:
    with open(file='ipindiaservices.csv', mode='a', encoding="utf-8") as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow([*data])

def start_from_page(page_number: int, driver: WebDriver) -> None:
    driver.execute_script(
        f"""
        document.querySelector('button.next').value = {page_number};
        document.querySelector('button.next').click();
        """
    )

def titles_validation(driver: WebDriver) -> None:
    """replace empty title name with '_'"""
    driver.execute_script(
        """
        let titles = document.querySelectorAll('input+.tab-pane tr:not(:first-child)>td:last-child')
        Array.from(titles).forEach((e) => {
            if (!e.textContent.trim()) {
                e.textContent = '_';
            }
        });
        """
    )

def get_network_data(log: dict, driver: WebDriver) -> dict:
    log = json.loads(log["message"])["message"]
    if all([
        "Network.responseReceived" in log["method"],
        "params" in log.keys(),
        'CaptchaAudio' in str(log["params"].values())
    ]):
        return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': log["params"]["requestId"]})

def get_captcha_text(driver: WebDriver, timeout: float) -> str:
    """Return captcha text
    Arguments:
    - driver: WebDriver
    - timeout: pause before receiving data from the web driver log
    """
    driver.execute_script(
        """
        // document.querySelector('img[title="Captcha"]').click()
        document.querySelector('img[title="Captcha Audio"]').click()
        """
    )
    sleep(timeout)
    logs = driver.get_log('performance')
    responses = [get_network_data(log, driver) for log in logs if get_network_data(log, driver)]
    if responses:
        return json.loads(responses[0]['body'])['CaptchaImageText']
    else:
        get_captcha_text(driver)

def submit_captcha(captcha_text: str, btn_name: str) -> None:
    """Submit captcha
    Arguments:
    - btn_name: captcha send button name ["submit" or "search"]
    """
    if btn_name == 'search':
        captcha_locator = (By.CSS_SELECTOR, 'input[name="submit"]')
    elif btn_name == 'submit':
        captcha_locator = (By.ID, 'btnSubmit')
    wait.until(EC.visibility_of_element_located((By.ID, 'CaptchaText'))).send_keys(captcha_text)
    wait.until(EC.visibility_of_element_located(captcha_locator)).click()

# options = webdriver.ChromeOptions()
# options.add_argument('--headless')
# options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
# capabilities = DesiredCapabilities.CHROME
# capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
# service = Service(executable_path="path/to/your/chromedriver.exe")
# driver = webdriver.Chrome(service=service, options=options, desired_capabilities=capabilities)
wait = WebDriverWait(driver, 15)
table_values_locator = (By.CSS_SELECTOR, 'input+.tab-pane tr:not(:first-child)>td:last-child')
applicant_name_locator = (By.ID, 'TextField6')
page_number_locator = (By.CSS_SELECTOR, 'span.Selected')
app_status_locator = (By.CSS_SELECTOR, 'button.btn')
next_btn_locator = (By.CSS_SELECTOR, 'button.next')

driver.get('https://ipindiaservices.gov.in/PublicSearch/')
# sometimes an alert with an error message("") may appear, so a small pause is used
sleep(1)
wait.until(EC.visibility_of_element_located(applicant_name_locator)).send_keys('ltd')
# on the start page and the page with the table, the names of the buttons are different
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "search")
# the page where the search starts
start_from_page(1, driver)

while True:
    start = time()
    # get current page number
    current_page = wait.until(EC.visibility_of_element_located(page_number_locator)).text
    print(f"Current page: {current_page}")
    # get all application status WebElements
    app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_status_locator))
    for element in range(len(app_status_elements)):
        print(f"App number: {element}")
        # update application status WebElements
        app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_status_locator))
        # click on application status
        wait.until(EC.visibility_of(app_status_elements[element])).click()
        # wait 2 seconds for the captcha to change
        sleep(2)
        # get text and submit captcha
        captcha_text = get_captcha_text(driver, 1)
        submit_captcha(captcha_text, "submit")
        try:
            # get all table data value (without titles) WebElements
            table_data_values = wait.until(EC.visibility_of_all_elements_located(table_values_locator))
            # if there are empty rows in the table, replace them with "_"
            titles_validation(driver)
            # save data to csv
            save_to_csv([val.text.replace('\n', ' ') for val in table_data_values])
        except TimeoutException:
            print("Application Number does not exist")
        finally:
            driver.back()
    # print the time spent per page to the console
    print(f"Time per page: {round(time()-start, 3)}")
    # if the current page is equal to the specified one, then stop the search and close the driver
    if current_page == '3776':
        break
    # click next page
    wait.until(EC.visibility_of_element_located(next_btn_locator)).click()
driver.quit()
On this site, the captcha can be solved without resorting to third-party services. When you click the "Captcha Audio" button, a GET request is sent to the endpoint https://ipindiaservices.gov.in/PublicSearch/Captcha/CaptchaAudio. The response is a dictionary {"CaptchaImageText":"hnnxd"}, which you can get from Selenium via the Chrome DevTools Protocol using the Network.getResponseBody method, or you can fetch it with the requests library.
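For illustration, here is a minimal sketch of the requests variant; it assumes (unverified) that the endpoint answers a plain GET as long as the Selenium session's cookies are attached, and the helper name is made up for this example:

import json
import requests

def fetch_captcha_text(driver) -> str:
    """Hypothetical helper: fetch the captcha text through the scraping session.
    Assumes the CaptchaAudio endpoint returns {"CaptchaImageText": "..."}
    when the Selenium session cookies are sent along."""
    session = requests.Session()
    # reuse Selenium's cookies so the captcha is tied to the same session
    for cookie in driver.get_cookies():
        session.cookies.set(cookie['name'], cookie['value'])
    response = session.get('https://ipindiaservices.gov.in/PublicSearch/Captcha/CaptchaAudio')
    return json.loads(response.text)['CaptchaImageText']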
To save the data to CSV, you can use, for example, the csv module included in the standard library.
Here is one possible solution:
import re
import csv
import json
from time import sleep
from selenium import webdriver
from typing import Generator, Tuple
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC

def get_captcha_text(log: dict, driver: WebDriver) -> dict:
    log = json.loads(log["message"])["message"]
    if all([
        "Network.responseReceived" in log["method"],
        "params" in log.keys(),
        'CaptchaAudio' in str(log["params"].values())
    ]):
        return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': log["params"]["requestId"]})

def save_to_csv(table_data: Generator[Tuple[str, str, str, str], None, None]) -> None:
    for app_num, title, app_date, status in zip(*list(zip(*table_data))):
        data = {
            'Application Number': app_num,
            'Title': title,
            'Application Date': app_date,
            'Status': status
        }
        with open(file='ipindiaservices.csv', mode='a', encoding="utf-8") as f:
            writer = csv.writer(f, lineterminator='\n')
            writer.writerow([data['Application Number'], data['Title'], data['Application Date'], data['Status']])

def start_from_page(page_number: int, driver: WebDriver) -> None:
    driver.execute_script(
        f"""
        document.querySelector('button.next').value = {page_number};
        document.querySelector('button.next').click();
        """
    )

def titles_validation(driver: WebDriver) -> None:
    """replace empty title name with '_'"""
    driver.execute_script(
        """
        let titles = document.querySelectorAll('#tableData>tbody>tr>td.title')
        Array.from(titles).forEach((e) => {
            if (!e.textContent.trim()) {
                e.textContent = '_';
            }
        });
        """
    )

options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options, desired_capabilities=capabilities)
wait = WebDriverWait(driver, 10)
# regular expression to search for data in a table
pattern = r'^([0-9A-Z\/\-,]+) (.+)? ([0-9\/]+) (\w+)'

driver.get('https://ipindiaservices.gov.in/PublicSearch/')
# sometimes an alert with an error message("") may appear, so a small pause is used
sleep(1)
driver.find_element(By.CSS_SELECTOR, 'img[title="Captcha Audio"]').click()
driver.find_element(By.ID, 'TextField6').send_keys('ltd')
# a short pause is needed here to write the log, otherwise we will get an empty list
sleep(1)
logs = driver.get_log('performance')
# get the request data generated when clicking the button to listen to the captcha text
responses = [get_captcha_text(log, driver) for log in logs if get_captcha_text(log, driver)]
# get captcha text
captcha_text = json.loads(responses[0]['body'])['CaptchaImageText']
# enter the captcha text and click the "Search" button
driver.find_element(By.ID, 'CaptchaText').send_keys(captcha_text)
driver.find_element(By.CSS_SELECTOR, 'input[name="submit"]').click()
# the page where the search starts
start_from_page(1, driver)

while True:
    # get current page number
    current_page = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.Selected'))).text
    # print the current page number to the console
    print(f"Current page: {current_page}")
    # get all fields of the table
    table_web_elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#tableData>tbody>tr')))
    # check title name
    titles_validation(driver)
    # get all table data on the current page
    table_data = (re.findall(pattern, data.text)[0] for data in table_web_elements)
    # save table data to csv
    save_to_csv(table_data)
    # if the current page is equal to the specified one, then stop the search and close the driver
    if current_page == '3768':
        break
    # click next page
    driver.find_element(By.CSS_SELECTOR, 'button.next').click()
driver.quit()
The performance of this solution is about 280 seconds per 100 pages.
It will take about 2.5-3 hours to collect all the data.
Therefore, the ability to stop data collection on a specific page has been added (by default, this is the last page):
if current_page == '3768':
    break
And to start collecting data from a specified page (by default, this is the first page):
start_from_page(1, driver)
Output is ipindiaservices.csv
202247057786,NON-AQUEOUS ELECTROLYTE SECONDARY BATTERY,10/10/2022,Published
202247057932,"COMMUNICATION METHOD, APPARATUS AND SYSTEM",10/10/2022,Published
202247057855,POLYOLEFIN RESIN FILM,10/10/2022,Published
202247057853,CEMENT COMPOSITION AND CURED PRODUCT THEREOF,10/10/2022,Published
...
To use this solution in Google Colab, follow these steps:
Install Selenium and ChromeDriver
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
Make the necessary imports
import re
import csv
import json
from time import sleep
from typing import Generator, List, Tuple
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
Set options
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
driver = webdriver.Chrome('chromedriver', chrome_options=options, desired_capabilities=capabilities)
Everything else remains unchanged.
And don't put all the code in one cell.
Update:
This is one possible solution to collect all the information from the table of each application:
import csv
import json
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def save_to_csv(data: list) -> None:
    with open(file='ipindiaservices.csv', mode='a', encoding="utf-8") as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow([*data])

def start_from_page(page_number: int, driver: WebDriver) -> None:
    driver.execute_script(
        f"""
        document.querySelector('button.next').value = {page_number};
        document.querySelector('button.next').click();
        """
    )

def titles_validation(driver: WebDriver) -> None:
    """replace empty title name with '_'"""
    driver.execute_script(
        """
        let titles = document.querySelectorAll('input+.tab-pane tr:not(:first-child)>td:last-child')
        Array.from(titles).forEach((e) => {
            if (!e.textContent.trim()) {
                e.textContent = '_';
            }
        });
        """
    )

def get_network_data(log: dict, driver: WebDriver) -> dict:
    log = json.loads(log["message"])["message"]
    if all([
        "Network.responseReceived" in log["method"],
        "params" in log.keys(),
        'CaptchaAudio' in str(log["params"].values())
    ]):
        return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': log["params"]["requestId"]})

def get_captcha_text(driver: WebDriver, timeout: float) -> str:
    """Return captcha text
    Arguments:
    - driver: WebDriver
    - timeout: pause before receiving data from the web driver log
    """
    driver.execute_script(
        """
        // document.querySelector('img[title="Captcha"]').click()
        document.querySelector('img[title="Captcha Audio"]').click()
        """
    )
    sleep(timeout)
    logs = driver.get_log('performance')
    if responses := [get_network_data(log, driver) for log in logs if get_network_data(log, driver)]:
        return json.loads(responses[0]['body'])['CaptchaImageText']
    else:
        return get_captcha_text(driver, timeout)

def submit_captcha(captcha_text: str, btn_name: str) -> None:
    """Submit captcha
    Arguments:
    - btn_name: captcha send button name ["submit" or "search"]
    """
    if btn_name == 'search':
        captcha_locator = (By.CSS_SELECTOR, 'input[name="submit"]')
    elif btn_name == 'submit':
        captcha_locator = (By.ID, 'btnSubmit')
    wait.until(EC.visibility_of_element_located((By.ID, 'CaptchaText'))).send_keys(captcha_text)
    wait.until(EC.visibility_of_element_located(captcha_locator)).click()

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options, desired_capabilities=capabilities)
wait = WebDriverWait(driver, 15)
table_values_locator = (By.CSS_SELECTOR, 'input+.tab-pane tr:not(:first-child)>td:last-child')
applicant_name_locator = (By.ID, 'TextField6')
page_number_locator = (By.CSS_SELECTOR, 'span.Selected')
app_status_locator = (By.CSS_SELECTOR, 'button.btn')
next_btn_locator = (By.CSS_SELECTOR, 'button.next')

driver.get('https://ipindiaservices.gov.in/PublicSearch/')
# sometimes an alert with an error message("") may appear, so a small pause is used
sleep(1)
wait.until(EC.visibility_of_element_located(applicant_name_locator)).send_keys('ltd')
# on the start page and the page with the table, the names of the buttons are different
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "search")
# the page where the search starts
start_from_page(1, driver)

while True:
    start = time()
    # get current page number
    current_page = wait.until(EC.visibility_of_element_located(page_number_locator)).text
    print(f"Current page: {current_page}")
    # get all application status WebElements
    app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_status_locator))
    for element in range(len(app_status_elements)):
        print(f"App number: {element}")
        # update application status WebElements
        app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_status_locator))
        # click on application status
        wait.until(EC.visibility_of(app_status_elements[element])).click()
        # wait 2 seconds for the captcha to change
        sleep(2)
        # get text and submit captcha
        captcha_text = get_captcha_text(driver, 1)
        submit_captcha(captcha_text, "submit")
        try:
            # get all table data values (without titles) WebElements
            table_data_values = wait.until(EC.visibility_of_all_elements_located(table_values_locator))
            # if there are empty rows in the table, replace them with "_"
            titles_validation(driver)
            # save data to csv
            save_to_csv([val.text.replace('\n', ' ') for val in table_data_values])
        except TimeoutException:
            print("Application Number does not exist")
        finally:
            driver.back()
    # print the time spent per page to the console
    print(f"Time per page: {round(time()-start, 3)}")
    # if the current page is equal to the specified one, then stop the search and close the driver
    if current_page == '3776':
        break
    # click next page
    wait.until(EC.visibility_of_element_located(next_btn_locator)).click()
driver.quit()
Output is ipindiaservices.csv
202247059447,PCT NATIONAL PHASE APPLICATION,18/10/2022,"PANASONIC INTELLECTUAL PROPERTY MANAGEMENT CO., LTD.",SOLID ELECTROLYTE MATERIAL AND BATTERY USING SAME,ELECTRICAL,patent@depenning.com,_,_,PCT/JP2021/004427,05/02/2021,31/03/2020,--,21/10/2022
202247059470,PCT NATIONAL PHASE APPLICATION,18/10/2022,"SHENZHEN SKYWORTH-RGB ELECTRONIC CO., LTD.","ATTACHMENT FORCE ADJUSTMENT METHOD AND APPARATUS, DEVICE, AND STORAGE MEDIUM",COMPUTER SCIENCE,ipo@knspartners.com,_,_,PCT/CN2020/125555,30/10/2020,02/04/2020,18/10/2022,21/10/2022
202247058733,PCT NATIONAL PHASE APPLICATION,14/10/2022,"SUMITOMO ELECTRIC OPTIFRONTIER CO., LTD.","FUSION SPLICING SYSTEM, FUSION SPLICING DEVICE, AND DETERIORATION DETERMINATION METHOD",COMPUTER SCIENCE,patent@depenning.com,_,_,PCT/JP2021/017016,28/04/2021,30/04/2020,--,21/10/2022
The performance of this solution is about 230 seconds per page.
Sometimes there may be no data on the application status page (for example, for the number "00054/CAL/1998" we get "Application Number does not exist"), so the script simply ignores such applications.
The 2-second pause before receiving the captcha text is due to the fact that after clicking on "Application Status" one captcha is shown, and after about a second it changes to another one, which is the one we must enter.
Fix:
Since the captcha that appeared after clicking on Application Status was removed from the site, its handling was removed from the script as well. Also, the developers of this resource gave the Application Number and Application Status buttons the same classes, so the CSS selector for Application Status had to be changed.
Solution for Google Colab:
import csv
import json
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def save_to_csv(data: list) -> None:
    with open(file='ipindiaservices.csv', mode='a', encoding="utf-8") as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow([*data])

def start_from_page(page_number: int, driver: WebDriver) -> None:
    driver.execute_script(
        f"""
        document.querySelector('button.next').value = {page_number};
        document.querySelector('button.next').click();
        """
    )

def titles_validation(driver: WebDriver) -> None:
    """replace empty title name with '_'"""
    driver.execute_script(
        """
        let titles = document.querySelectorAll('input+.tab-pane tr:not(:first-child)>td:last-child')
        Array.from(titles).forEach((e) => {
            if (!e.textContent.trim()) {
                e.textContent = '_';
            }
        });
        """
    )

def get_network_data(log: dict, driver: WebDriver) -> dict:
    log = json.loads(log["message"])["message"]
    if all([
        "Network.responseReceived" in log["method"],
        "params" in log.keys(),
        'CaptchaAudio' in str(log["params"].values())
    ]):
        return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': log["params"]["requestId"]})

def get_captcha_text(driver: WebDriver, timeout: float) -> str:
    """Return captcha text
    Arguments:
    - driver: WebDriver
    - timeout: pause before receiving data from the web driver log
    """
    driver.execute_script(
        """
        // document.querySelector('img[title="Captcha"]').click()
        document.querySelector('img[title="Captcha Audio"]').click()
        """
    )
    sleep(timeout)
    logs = driver.get_log('performance')
    responses = [get_network_data(log, driver) for log in logs if get_network_data(log, driver)]
    if responses:
        return json.loads(responses[0]['body'])['CaptchaImageText']
    else:
        return get_captcha_text(driver, timeout)

def submit_captcha(captcha_text: str, btn_name: str) -> None:
    """Submit captcha
    Arguments:
    - btn_name: captcha send button name ["submit" or "search"]
    """
    if btn_name == 'search':
        captcha_locator = (By.CSS_SELECTOR, 'input[name="submit"]')
    elif btn_name == 'submit':
        captcha_locator = (By.ID, 'btnSubmit')
    wait.until(EC.visibility_of_element_located((By.ID, 'CaptchaText'))).send_keys(captcha_text)
    wait.until(EC.visibility_of_element_located(captcha_locator)).click()

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
driver = webdriver.Chrome(executable_path='chromedriver', options=options, desired_capabilities=capabilities)
wait = WebDriverWait(driver, 10)
table_values_locator = (By.CSS_SELECTOR, 'input+.tab-pane tr:not(:first-child)>td:last-child')
applicant_name_locator = (By.ID, 'TextField6')
page_number_locator = (By.CSS_SELECTOR, 'span.Selected')
app_status_locator = (By.CSS_SELECTOR, 'button[name="ApplicationSatus"]')
next_btn_locator = (By.CSS_SELECTOR, 'button.next')

driver.get('https://ipindiaservices.gov.in/PublicSearch/')
# sometimes an alert with an error message("") may appear, so a small pause is used
sleep(1)
wait.until(EC.visibility_of_element_located(applicant_name_locator)).send_keys('ltd')
# on the start page and the page with the table, the names of the buttons are different
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "search")
# the page where the search starts
start_from_page(1, driver)

while True:
    start = time()
    # get current page number
    current_page = wait.until(EC.visibility_of_element_located(page_number_locator)).text
    print(f"Current page: {current_page}")
    # get all application status WebElements
    app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_status_locator))
    for element in range(len(app_status_elements)):
        print(f"App number: {element}")
        # update application status WebElements
        app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_status_locator))
        # click on application status
        wait.until(EC.visibility_of(app_status_elements[element])).click()
        try:
            # switch to new tab
            driver.switch_to.window(driver.window_handles[-1])
            # get all table data values (without titles) WebElements
            table_data_values = wait.until(EC.visibility_of_all_elements_located(table_values_locator))
            # if there are empty rows in the table, replace them with "_"
            titles_validation(driver)
            # save data to csv
            save_to_csv([val.text.replace('\n', ' ') for val in table_data_values])
        except TimeoutException:
            print("Application Number does not exist")
        finally:
            # close new tab
            driver.close()
            # switch to main tab
            driver.switch_to.window(driver.window_handles[0])
    # print the time spent per page to the console
    print(f"Time per page: {round(time()-start, 3)}")
    # if the current page is equal to the specified one, then stop the search and close the driver
    if current_page == '100':
        break
    # click next page
    wait.until(EC.visibility_of_element_located(next_btn_locator)).click()
driver.quit()
These fixes have been tested on Google Colab (Python 3.7.15).
The performance of this solution is about 55 seconds per page.
Previous versions of script are left for visual comparison.
You can use any of the captcha-solving services. On these sites the captchas are usually solved by human workers, so they can be slow, but they do the job.
Sample website (I'm not advertising it).
You can use Selenium to pull the information; locating the elements on the site by their "id" attribute will be enough.
For reading/writing Excel files in Python there are libraries such as openpyxl.
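Since the question asks for Excel output, here is a minimal sketch of converting the scraped CSV to .xlsx; it assumes the openpyxl package (my choice, the original answer does not name a specific library) is installed:

import csv
from openpyxl import Workbook  # assumed library choice; any Excel writer works

wb = Workbook()
ws = wb.active
# header row matching the columns collected by the scraper
ws.append(['Application Number', 'Title', 'Application Date', 'Status'])
with open('ipindiaservices.csv', encoding='utf-8') as f:
    for row in csv.reader(f):
        ws.append(row)  # copy each scraped CSV row into the worksheet
wb.save('ipindiaservices.xlsx')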
I am trying to write a unit test for my website that runs through all the links and returns an "A-OK" or "no go" depending on whether the site is working. But I am having trouble with the program: it is not able to consistently click the link in the site's navigation bar. I've tried multiple waits (implicit, explicit, expected conditions), but the page loads and half the time it clicks the link and goes to that part of the site, and the other half the program just stops and nothing is clicked.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
PATH = r"C:\Program Files (x86)\chromedriver.exe"
drive = webdriver.Chrome(PATH)
drive.get("https://www.blackhempfamily.com/")
wait = WebDriverWait(drive, 10)
link = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "Why Black Hemp?")))
link.click()
This would be a better locator to use:
wait.until(EC.element_to_be_clickable((By.XPATH, "//p[text()='Why Black Hemp?']")))
The element you're searching for is not a link; it's a paragraph (p). I added a sleep call to give the page more time to load.
Try this code:
time.sleep(3)
wait = WebDriverWait(drive, 10)
#link = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "Why Black Hemp?")))
link = drive.find_element_by_xpath('//*[@id="idh09fqo2label"]')
link.click()
So, it took a while, but I think I was able to figure this out. The actions that you need to do are:
Click "Why Black Hemp?"
Wait until the page stops scrolling
Scroll to the top of the page
Wait until the page stops scrolling
Attempt to scroll down so you can get the nav bar to display
Repeat until your heart's content / the test passes with "A-OK"
In order for this to be achieved, you need to have the following imports
from selenium import webdriver
from selenium.webdriver.chrome.webdriver import WebDriver as ChromeWebDriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as DriverWait
from selenium.webdriver.support import expected_conditions as DriverConditions
from selenium.common.exceptions import WebDriverException
import time
Step 1 - Click your "Why Black Hemp?" nav bar element
chrome_driver.find_element(By.XPATH, "//nav[contains(@id, 'navContainer')]//p[text()='Why Black Hemp?']/../../..").click()
Step 2 - Check to see if our page is still scrolling
# Checks to see if our page is still scrolling
while is_same_position == False:
    windowPosition1 = chrome_driver.execute_script("return document.body.scrollHeight;")
    time.sleep(2)
    windowPosition2 = chrome_driver.execute_script("return document.body.scrollHeight;")
    if windowPosition1 == windowPosition2:
        is_same_position = True
        final_window_position = windowPosition1
Step 3 - Scroll to the top of the page
chrome_driver.execute_script("window.scrollTo(0, {0})".format((0 - final_window_position)))
Step 4 - Check to see if our page is still scrolling
# Checks to see if our page is still scrolling
while is_same_position == False:
    windowPosition1 = chrome_driver.execute_script("return document.body.scrollHeight;")
    time.sleep(2)
    windowPosition2 = chrome_driver.execute_script("return document.body.scrollHeight;")
    if windowPosition1 == windowPosition2:
        is_same_position = True
Step 5 - Attempt to scroll down until our header tag does not have the style of visibility: hidden
# Scrolls down until our nav bar is displayed
for scrollNum in range(10):
    chrome_driver.execute_script("window.scrollTo(0, {0})".format(scrollNum * 100 + 200))
    time.sleep(2)
    if is_displayed(chrome_driver, "//header[contains(@style, 'visibility: hidden')]") == False:
        break
Step 6 - Repeat until your heart is content
MAIN CODE - For Reference
from selenium import webdriver
from selenium.webdriver.chrome.webdriver import WebDriver as ChromeWebDriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as DriverWait
from selenium.webdriver.support import expected_conditions as DriverConditions
from selenium.common.exceptions import WebDriverException
import time
def get_chrome_driver():
    """This sets up our Chrome Driver and returns it as an object"""
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("window-size=1500,1000")
    # Removes the "This is being controlled by automation" alert / notification
    chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])
    path_to_chrome = r"F:\Selenium_Drivers\Windows_Chrome85_Driver\chromedriver.exe"
    return webdriver.Chrome(executable_path=path_to_chrome,
                            options=chrome_options)

def wait_displayed(driver: ChromeWebDriver, xpath: str, timeout: int = 3):
    try:
        DriverWait(driver, timeout).until(
            DriverConditions.presence_of_element_located(locator=(By.XPATH, xpath))
        )
    except:
        raise WebDriverException(f'Timeout: Failed to find {xpath}')

def is_displayed(driver: ChromeWebDriver, xpath: str, timeout: int = 3):
    try:
        webElement = DriverWait(driver, timeout).until(
            DriverConditions.presence_of_element_located(locator=(By.XPATH, xpath))
        )
        return webElement is not None
    except:
        return False

# Gets our chrome driver and opens our site
chrome_driver = get_chrome_driver()
chrome_driver.get("https://www.blackhempfamily.com/")

# Repeats this 5 times
for repeat in range(5):
    print("Attempt to click our link. Try #{0}".format(repeat + 1))
    is_same_position = False
    final_window_position = 0

    # Checks to see if our website's elements display
    wait_displayed(chrome_driver, "//nav[contains(@id, 'navContainer')]")
    wait_displayed(chrome_driver, "//nav[contains(@id, 'navContainer')]//p[text()='Why Black Hemp?']")
    wait_displayed(chrome_driver, "//nav[contains(@id, 'navContainer')]//p[text()='Shop Black Hemp']")

    # Clicks our "Why Black Hemp?" tab
    chrome_driver.find_element(By.XPATH, "//nav[contains(@id, 'navContainer')]//p[text()='Why Black Hemp?']/../../..").click()

    # Checks to see if our page is still scrolling
    while is_same_position == False:
        windowPosition1 = chrome_driver.execute_script("return document.body.scrollHeight;")
        time.sleep(2)
        windowPosition2 = chrome_driver.execute_script("return document.body.scrollHeight;")
        if windowPosition1 == windowPosition2:
            is_same_position = True
            final_window_position = windowPosition1

    # Checks to see if our "Natural Moisture" text displays
    wait_displayed(chrome_driver, "(//h2//span[contains(., 'Natural Moisture')]/../..)[1]")

    # Scrolls back to the top of the page
    chrome_driver.execute_script("window.scrollTo(0, {0})".format((0 - final_window_position)))
    is_same_position = False

    # Checks to see if our page is still scrolling
    while is_same_position == False:
        windowPosition1 = chrome_driver.execute_script("return document.body.scrollHeight;")
        time.sleep(2)
        windowPosition2 = chrome_driver.execute_script("return document.body.scrollHeight;")
        if windowPosition1 == windowPosition2:
            is_same_position = True

    # Scrolls down until our nav bar is displayed
    for scrollNum in range(10):
        chrome_driver.execute_script("window.scrollTo(0, {0})".format(scrollNum * 100 + 200))
        time.sleep(2)
        if is_displayed(chrome_driver, "//header[contains(@style, 'visibility: hidden')]") == False:
            break

chrome_driver.quit()
chrome_driver.stop_client()
print('Congratulations! You clicked your link multiple times!')
Try it with XPath instead, and wait for the element to be present (not clickable), as it is a paragraph. This worked for me:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
PATH = r"C:\Program Files (x86)\chromedriver.exe"
drive = webdriver.Chrome(PATH)
drive.get("https://www.blackhempfamily.com/")
linkWait = EC.presence_of_element_located((By.XPATH, "//div/p[contains(., 'Why Black Hemp?')]"))
WebDriverWait(drive, 10).until(linkWait)
link = drive.find_element_by_xpath("//div/p[contains(., 'Why Black Hemp?')]")
link.click()
I've written a scraper in Python Scrapy in combination with Selenium to scrape some titles from a website. The CSS selectors defined within my scraper are flawless. I wish my scraper to keep clicking on the next page and parse the information embedded in each page. It does fine for the first page, but when it comes time for the Selenium part to play its role, the scraper keeps clicking on the same link over and over again.
As this is my first time working with Selenium along with Scrapy, I don't have any idea how to move on successfully. Any fix will be highly appreciated.
If I try like this then it works smoothly (there is nothing wrong with selectors):
class IncomeTaxSpider(scrapy.Spider):
    name = "taxspider"

    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

    def parse(self, response):
        self.driver.get(response.url)
        while True:
            for elem in self.wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "h1.faqsno-heading"))):
                name = elem.find_element_by_css_selector("div[id^='arrowex']").text
                print(name)
            try:
                self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
                self.wait.until(EC.staleness_of(elem))
            except TimeoutException:
                break
But my intention is to make my script run this way:
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
class IncomeTaxSpider(scrapy.Spider):
    name = "taxspider"

    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

    def click_nextpage(self, link):
        self.driver.get(link)
        elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))
        # It keeps clicking on the same link over and over again
        self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
        self.wait.until(EC.staleness_of(elem))

    def parse(self, response):
        while True:
            for item in response.css("h1.faqsno-heading"):
                name = item.css("div[id^='arrowex']::text").extract_first()
                yield {"Name": name}
            try:
                self.click_nextpage(response.url)  # initiate the method to do the clicking
            except TimeoutException:
                break
These are the titles visible on that landing page (to let you know what I'm after):
INDIA INCLUSION FOUNDATION
INDIAN WILDLIFE CONSERVATION TRUST
VATSALYA URBAN AND RURAL DEVELOPMENT TRUST
I'm not after the data from that site itself, so any alternative approach other than what I've tried above is useless to me. My only intention is to find a solution related to the way I tried in my second approach.
Your initial code was almost correct, with one key piece missing from it: you were always using the same response object. The response object needs to be built from the latest page source.
Also, you were loading the link again and again in click_nextpage, which reset it to page 1 every time. That is why you got pages 1 and 2 at most. You need to load the URL only once, in the parse stage, and then let the next-page clicks happen.
Below is the final code, working fine:
class IncomeTaxSpider(scrapy.Spider):
    name = "taxspider"

    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

    def click_nextpage(self, link):
        # self.driver.get(link)  # no longer needed: the page is loaded once in parse()
        elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))
        self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
        self.wait.until(EC.staleness_of(elem))

    def parse(self, response):
        self.driver.get(response.url)
        while True:
            for item in response.css("h1.faqsno-heading"):
                name = item.css("div[id^='arrowex']::text").extract_first()
                yield {"Name": name}
            try:
                self.click_nextpage(response.url)  # initiate the method to do the clicking
                response = response.replace(body=self.driver.page_source)
            except TimeoutException:
                break
After that change, it works perfectly.
In case you need a pure Selenium solution:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Chrome()
driver.get("https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx")

while True:
    for item in wait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[id^='arrowex']"))):
        print(item.text)
    try:
        driver.find_element_by_xpath("//input[@text='Next' and not(contains(@class, 'disabledImageButton'))]").click()
    except NoSuchElementException:
        break
import scrapy
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from scrapy.crawler import CrawlerProcess
class IncomeTaxSpider(scrapy.Spider):
    name = "taxspider"

    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)
        link = 'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx'
        self.driver.get(link)

    def click_nextpage(self):
        elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))
        self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
        self.wait.until(EC.staleness_of(elem))
        time.sleep(4)

    def parse(self, response):
        while True:
            for item in response.css("h1.faqsno-heading"):
                name = item.css("div[id^='arrowex']::text").extract_first()
                yield {"Name": name}
            try:
                self.click_nextpage()  # initiate the method to do the clicking
            except TimeoutException:
                break

process = CrawlerProcess()
process.crawl(IncomeTaxSpider)
process.start()
Whenever the page was loaded using the 'Next Page' arrow (using Selenium), it got reset back to page 1. I am not sure about the reason for this (maybe the JavaScript).
Hence I changed the approach to use the input field to enter the needed page number and hit the ENTER key to navigate.
Here is the modified code. I hope this is useful for you.
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
class IncomeTaxSpider(scrapy.Spider):
    name = "taxspider"

    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]

    def __init__(self):
        self.driver = webdriver.Firefox()
        self.wait = WebDriverWait(self.driver, 10)

    def click_nextpage(self, link, number):
        self.driver.get(link)
        elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))
        # enter the needed page number in the input field and hit ENTER to navigate
        inputElement = self.driver.find_element_by_xpath("//input[@id='ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_txtPageNumber']")
        inputElement.clear()
        inputElement.send_keys(number)
        inputElement.send_keys(Keys.ENTER)
        self.wait.until(EC.staleness_of(elem))

    def parse(self, response):
        number = 1
        while number < 10412:  # website shows it has 10411 pages
            for item in response.css("h1.faqsno-heading"):
                name = item.css("div[id^='arrowex']::text").extract_first()
                yield {"Name": name}
                print(name)
            try:
                number += 1
                self.click_nextpage(response.url, number)  # initiate the method to do the clicking
            except TimeoutException:
                break
Create a self.page_num or something.
def parse(self, response):
    self.pages = self.driver.find_element_by_css_selector("#ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_totalRecordsDiv.act_search_footer span")
    self.pages = int(self.pages.text.split('of ')[1].split(']')[0])
    self.page_num = 1
    while self.page_num <= self.pages:
        for item in response.css("h1.faqsno-heading"):
            name = item.css("div[id^='arrowex']::text").extract_first()
            yield {"Name": name}
        try:
            self.click_nextpage(response.url)  # initiate the method to do the clicking
        except TimeoutException:
            break

def click_nextpage(self, link):
    self.driver.get(link)
    elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))
    page_link = 'ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_lnkBtn_' + str(self.page_num)
    self.page_num = self.page_num + 1
    self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
    self.wait.until(EC.staleness_of(elem))