How to fetch information from a web page chain-wise (solving captcha)? - python

I have to go to here
Here I have to choose applicant name = “ltd”
But before submitting the page, I have to solve a captcha. How do I fetch the next page's information (application number, application title, date, application status, etc.) in Excel format using web scraping?
Running the following script, I am getting an error:
import csv
import json
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
def save_to_csv(data: list) -> None:
with open(file='ipindiaservices.csv', mode='a', encoding="utf-8") as f:
writer = csv.writer(f, lineterminator='\n')
writer.writerow([*data])
def start_from_page(page_number: int, driver: WebDriver) -> None:
driver.execute_script(
f"""
document.querySelector('button.next').value = {page_number};
document.querySelector('button.next').click();
"""
)
def titles_validation(driver: WebDriver) -> None:
"""replace empty title name with '_'"""
driver.execute_script(
"""
let titles = document.querySelectorAll('input+.tab-pane tr:not(:first-child)>td:last-child')
Array.from(titles).forEach((e) => {
if (!e.textContent.trim()) {
e.textContent = '_';
}
});
"""
)
def get_network_data(log: dict, driver: WebDriver) -> dict:
log = json.loads(log["message"])["message"]
if all([
"Network.responseReceived" in log["method"],
"params" in log.keys(),
'CaptchaAudio' in str(log["params"].values())
]):
return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': log["params"]["requestId"]})
def get_captcha_text(driver: WebDriver, timeout: float) -> str:
"""Return captcha text
Arguments:
- driver: WebDriver
- timeout: pause before receiving data from the web driver log
"""
driver.execute_script(
"""
// document.querySelector('img[title="Captcha"]').click()
document.querySelector('img[title="Captcha Audio"]').click()
"""
)
sleep(timeout)
logs = driver.get_log('performance')
responses = [get_network_data(log, driver) for log in logs if get_network_data(log, driver)]
if responses:
return json.loads(responses[0]['body'])['CaptchaImageText']
else:
return get_captcha_text(driver, timeout)
def submit_captcha(captcha_text: str, btn_name: str) -> None:
"""Submit captcha
Arguments:
- btn_name: captcha send button name["submit" or "search"]
"""
if btn_name == 'search':
captcha_locator = (By.CSS_SELECTOR, 'input[name="submit"]')
elif btn_name == 'submit':
captcha_locator = (By.ID, 'btnSubmit')
wait.until(EC.visibility_of_element_located((By.ID, 'CaptchaText'))).send_keys(captcha_text)
wait.until(EC.visibility_of_element_located(captcha_locator)).click()
# options = webdriver.ChromeOptions()
# options.add_argument('--headless')
# options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
# capabilities = DesiredCapabilities.CHROME
# capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
# service = Service(executable_path="path/to/your/chromedriver.exe")
# # driver = webdriver.Chrome(service=service, options=options, desired_capabilities=capabilities)
wait = WebDriverWait(driver, 15)
table_values_locator = (By.CSS_SELECTOR, 'input+.tab-pane tr:not(:first-child)>td:last-child')
applicant_name_locator = (By.ID, 'TextField6')
page_number_locator = (By.CSS_SELECTOR, 'span.Selected')
app_satus_locator = (By.CSS_SELECTOR, 'button.btn')
next_btn_locator = (By.CSS_SELECTOR, 'button.next')
driver.get('https://ipindiaservices.gov.in/PublicSearch/')
# sometimes an alert with an error message("") may appear, so a small pause is used
sleep(1)
wait.until(EC.visibility_of_element_located(applicant_name_locator)).send_keys('ltd')
# on the start page and the page with the table, the names of the buttons are different
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "search")
# the page where the search starts
start_from_page(1, driver)
while True:
start = time()
# get current page number
current_page = wait.until(EC.visibility_of_element_located(page_number_locator)).text
print(f"Current page: {current_page}")
# get all application status WebElements
app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
for element in range(len(app_status_elements)):
print(f"App number: {element}")
# update application status WebElements
app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
# click on application status
wait.until(EC.visibility_of(app_status_elements[element])).click()
# wait 2 seconds for the captcha to change
sleep(2)
# get text and submit captcha
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "submit")
try:
# get all table data values(without titles) WebElements
table_data_values = wait.until(EC.visibility_of_all_elements_located(table_values_locator))
# if there are empty rows in the table replace them with "_"
titles_validation(driver)
# save data to csv
save_to_csv([val.text.replace('\n', ' ') for val in table_data_values])
except TimeoutException:
print("Application Number does not exist")
finally:
driver.back()
# print the time spent on the page to the console
print(f"Time per page: {round(time()-start, 3)}")
# if the current page is equal to the specified one, then stop the search and close the driver
if current_page == '3776':
break
# click next page
wait.until(EC.visibility_of_element_located(next_btn_locator)).click()
driver.quit()

On this site, the captcha can be solved without resorting to third-party services. When you click the "Captcha Audio" button, a GET request is sent to the endpoint https://ipindiaservices.gov.in/PublicSearch/Captcha/CaptchaAudio. The response is a dictionary {"CaptchaImageText":"hnnxd"}, which you can get from Selenium via the Chrome DevTools Protocol using the Network.getResponseBody method, or you can use the requests library.
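For illustration, a minimal sketch of the requests-based variant, assuming the endpoint accepts the same session cookies that the Selenium-driven browser already holds (not verified against the live site):
import requests

def captcha_text_via_requests(driver) -> str:
    # reuse the browser's cookies so the captcha belongs to the same session (assumption)
    session = requests.Session()
    for cookie in driver.get_cookies():
        session.cookies.set(cookie['name'], cookie['value'])
    response = session.get('https://ipindiaservices.gov.in/PublicSearch/Captcha/CaptchaAudio')
    return response.json()['CaptchaImageText']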
To save the data to a CSV file, you can use, for example, the csv module from the standard library.
Here is one possible solution:
import re
import csv
import json
from time import sleep
from selenium import webdriver
from typing import Generator, List, Tuple
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
def get_captcha_text(log: dict, driver: WebDriver) -> dict:
log = json.loads(log["message"])["message"]
if all([
"Network.responseReceived" in log["method"],
"params" in log.keys(),
'CaptchaAudio' in str(log["params"].values())
]):
return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': log["params"]["requestId"]})
def save_to_csv(table_data: Generator[Tuple[str, str, str, str], None, None]) -> None:
for app_num, title, app_date, status in zip(*list(zip(*table_data))):
data = {
'Application Number': app_num,
'Title': title,
'Application Date': app_date,
'Status': status
}
with open(file='ipindiaservices.csv', mode='a', encoding="utf-8") as f:
writer = csv.writer(f, lineterminator='\n')
writer.writerow([data['Application Number'], data['Title'], data['Application Date'], data['Status']])
def start_from_page(page_number: int, driver: WebDriver) -> None:
driver.execute_script(
f"""
document.querySelector('button.next').value = {page_number};
document.querySelector('button.next').click();
"""
)
def titles_validation(driver: WebDriver) -> None:
"""replace empty title name with '_'"""
driver.execute_script(
"""
let titles = document.querySelectorAll('#tableData>tbody>tr>td.title')
Array.from(titles).forEach((e) => {
if (!e.textContent.trim()) {
e.textContent = '_';
}
});
"""
)
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options, desired_capabilities=capabilities)
wait = WebDriverWait(driver, 10)
# regular expression to search for data in a table
pattern = r'^([0-9A-Z\/\-,]+) (.+)? ([0-9\/]+) (\w+)'
driver.get('https://ipindiaservices.gov.in/PublicSearch/')
# sometimes an alert with an error message("") may appear, so a small pause is used
sleep(1)
driver.find_element(By.CSS_SELECTOR, 'img[title="Captcha Audio"]').click()
driver.find_element(By.ID, 'TextField6').send_keys('ltd')
# short pause is needed here to write the log, otherwise we will get an empty list
sleep(1)
logs = driver.get_log('performance')
# get the request data generated when clicking the button to listen to the captcha audio
responses = [get_captcha_text(log, driver) for log in logs if get_captcha_text(log, driver)]
# get captcha text
captcha_text = json.loads(responses[0]['body'])['CaptchaImageText']
# enter the captcha text and click "Search" button
driver.find_element(By.ID, 'CaptchaText').send_keys(captcha_text)
driver.find_element(By.CSS_SELECTOR, 'input[name="submit"]').click()
# the page where the search starts
start_from_page(1, driver)
while True:
# get current page number
current_page = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.Selected'))).text
# print the current page number to the console
print(f"Current page: {current_page}")
# get all fields of the table
table_web_elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#tableData>tbody>tr')))
# check title name
titles_validation(driver)
# get all table data on current page
table_data = (re.findall(pattern, data.text)[0] for data in table_web_elements)
# save table data to csv
save_to_csv(table_data)
# if the current page is equal to the specified one, then stop the search and close the driver
if current_page == '3768':
break
# click next page
driver.find_element(By.CSS_SELECTOR, 'button.next').click()
driver.quit()
The performance of this solution is about 280 sec per 100 pages.
It will take about 2.5-3 hours to collect all the data.
Therefore, the ability to stop data collection at a specific page has been added (by default, this is the last page):
if current_page == '3768':
break
And to start collecting data from a specified page (by default, this is the first page):
start_from_page(1, driver)
Output is ipindiaservices.csv
202247057786,NON-AQUEOUS ELECTROLYTE SECONDARY BATTERY,10/10/2022,Published
202247057932,"COMMUNICATION METHOD, APPARATUS AND SYSTEM",10/10/2022,Published
202247057855,POLYOLEFIN RESIN FILM,10/10/2022,Published
202247057853,CEMENT COMPOSITION AND CURED PRODUCT THEREOF,10/10/2022,Published
...
To use this solution in Google Colab, follow these steps:
Install Selenium and ChromeDriver
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
Make the necessary imports
import re
import csv
import json
from time import sleep
from typing import Generator, List, Tuple
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
Set options
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
driver = webdriver.Chrome('chromedriver', chrome_options=options, desired_capabilities=capabilities)
Everything else remains unchanged.
And don't put all the code in one cell.
Update:
This is one possible solution to collect all the information from the table of each application:
import csv
import json
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
def save_to_csv(data: list) -> None:
with open(file='ipindiaservices.csv', mode='a', encoding="utf-8") as f:
writer = csv.writer(f, lineterminator='\n')
writer.writerow([*data])
def start_from_page(page_number: int, driver: WebDriver) -> None:
driver.execute_script(
f"""
document.querySelector('button.next').value = {page_number};
document.querySelector('button.next').click();
"""
)
def titles_validation(driver: WebDriver) -> None:
"""replace empty title name with '_'"""
driver.execute_script(
"""
let titles = document.querySelectorAll('input+.tab-pane tr:not(:first-child)>td:last-child')
Array.from(titles).forEach((e) => {
if (!e.textContent.trim()) {
e.textContent = '_';
}
});
"""
)
def get_network_data(log: dict, driver: WebDriver) -> dict:
log = json.loads(log["message"])["message"]
if all([
"Network.responseReceived" in log["method"],
"params" in log.keys(),
'CaptchaAudio' in str(log["params"].values())
]):
return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': log["params"]["requestId"]})
def get_captcha_text(driver: WebDriver, timeout: float) -> str:
"""Return captcha text
Arguments:
- driver: WebDriver
- timeout: pause before receiving data from the web driver log
"""
driver.execute_script(
"""
// document.querySelector('img[title="Captcha"]').click()
document.querySelector('img[title="Captcha Audio"]').click()
"""
)
sleep(timeout)
logs = driver.get_log('performance')
if responses := [get_network_data(log, driver) for log in logs if get_network_data(log, driver)]:
return json.loads(responses[0]['body'])['CaptchaImageText']
else:
return get_captcha_text(driver, timeout)
def submit_captcha(captcha_text: str, btn_name: str) -> None:
"""Submit captcha
Arguments:
- btn_name: captcha send button name["submit" or "search"]
"""
if btn_name == 'search':
captcha_locator = (By.CSS_SELECTOR, 'input[name="submit"]')
elif btn_name == 'submit':
captcha_locator = (By.ID, 'btnSubmit')
wait.until(EC.visibility_of_element_located((By.ID, 'CaptchaText'))).send_keys(captcha_text)
wait.until(EC.visibility_of_element_located(captcha_locator)).click()
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options, desired_capabilities=capabilities)
wait = WebDriverWait(driver, 15)
table_values_locator = (By.CSS_SELECTOR, 'input+.tab-pane tr:not(:first-child)>td:last-child')
applicant_name_locator = (By.ID, 'TextField6')
page_number_locator = (By.CSS_SELECTOR, 'span.Selected')
app_satus_locator = (By.CSS_SELECTOR, 'button.btn')
next_btn_locator = (By.CSS_SELECTOR, 'button.next')
driver.get('https://ipindiaservices.gov.in/PublicSearch/')
# sometimes an alert with an error message("") may appear, so a small pause is used
sleep(1)
wait.until(EC.visibility_of_element_located(applicant_name_locator)).send_keys('ltd')
# on the start page and the page with the table, the names of the buttons are different
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "search")
# the page where the search starts
start_from_page(1, driver)
while True:
start = time()
# get current page number
current_page = wait.until(EC.visibility_of_element_located(page_number_locator)).text
print(f"Current page: {current_page}")
# get all application status WebElements
app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
for element in range(len(app_status_elements)):
print(f"App number: {element}")
# update application status WebElements
app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
# click on application status
wait.until(EC.visibility_of(app_status_elements[element])).click()
# wait 2 seconds for the captcha to change
sleep(2)
# get text and submit captcha
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "submit")
try:
# get all table data values(without titles) WebElements
table_data_values = wait.until(EC.visibility_of_all_elements_located(table_values_locator))
# if there are empty rows in the table replace them with "_"
titles_validation(driver)
# save data to csv
save_to_csv([val.text.replace('\n', ' ') for val in table_data_values])
except TimeoutException:
print("Application Number does not exist")
finally:
driver.back()
# print the time spent on the page to the console
print(f"Time per page: {round(time()-start, 3)}")
# if the current page is equal to the specified one, then stop the search and close the driver
if current_page == '3776':
break
# click next page
wait.until(EC.visibility_of_element_located(next_btn_locator)).click()
driver.quit()
Output is ipindiaservices.csv
202247059447,PCT NATIONAL PHASE APPLICATION,18/10/2022,"PANASONIC INTELLECTUAL PROPERTY MANAGEMENT CO., LTD.",SOLID ELECTROLYTE MATERIAL AND BATTERY USING SAME,ELECTRICAL,patent#depenning.com,_,_,PCT/JP2021/004427,05/02/2021,31/03/2020,--,21/10/2022
202247059470,PCT NATIONAL PHASE APPLICATION,18/10/2022,"SHENZHEN SKYWORTH-RGB ELECTRONIC CO., LTD.","ATTACHMENT FORCE ADJUSTMENT METHOD AND APPARATUS, DEVICE, AND STORAGE MEDIUM",COMPUTER SCIENCE,ipo#knspartners.com,_,_,PCT/CN2020/125555,30/10/2020,02/04/2020,18/10/2022,21/10/2022
202247058733,PCT NATIONAL PHASE APPLICATION,14/10/2022,"SUMITOMO ELECTRIC OPTIFRONTIER CO., LTD.","FUSION SPLICING SYSTEM, FUSION SPLICING DEVICE, AND DETERIORATION DETERMINATION METHOD",COMPUTER SCIENCE,patent#depenning.com,_,_,PCT/JP2021/017016,28/04/2021,30/04/2020,--,21/10/2022
The performance of this solution is about 230 sec per page.
Sometimes there is no data on the application status page (for example, for the number "00054/CAL/1998" we get "Application Number does not exist"), so the script simply skips such an application.
The 2 sec timeout before receiving the captcha text is needed because, after clicking on "Application Status", one captcha is shown and after about 1 sec it changes to another one, which is the one we must enter.
Fix:
Since the captcha that appeared after clicking on Application Status has been removed from the site, its handling has also been removed from the script. Also, the developers of this resource gave Application Number and Application Status the same classes, so the CSS selector for Application Status had to be changed.
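For reference, this is the changed locator as it appears in the updated script below (the old one was 'button.btn'):
# CSS selector for the Application Status buttons after the site change
app_satus_locator = (By.CSS_SELECTOR, 'button[name="ApplicationSatus"]')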
Solution for Google Colab:
import csv
import json
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
def save_to_csv(data: list) -> None:
with open(file='ipindiaservices.csv', mode='a', encoding="utf-8") as f:
writer = csv.writer(f, lineterminator='\n')
writer.writerow([*data])
def start_from_page(page_number: int, driver: WebDriver) -> None:
driver.execute_script(
f"""
document.querySelector('button.next').value = {page_number};
document.querySelector('button.next').click();
"""
)
def titles_validation(driver: WebDriver) -> None:
"""replace empty title name with '_'"""
driver.execute_script(
"""
let titles = document.querySelectorAll('input+.tab-pane tr:not(:first-child)>td:last-child')
Array.from(titles).forEach((e) => {
if (!e.textContent.trim()) {
e.textContent = '_';
}
});
"""
)
def get_network_data(log: dict, driver: WebDriver) -> dict:
log = json.loads(log["message"])["message"]
if all([
"Network.responseReceived" in log["method"],
"params" in log.keys(),
'CaptchaAudio' in str(log["params"].values())
]):
return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': log["params"]["requestId"]})
def get_captcha_text(driver: WebDriver, timeout: float) -> str:
"""Return captcha text
Arguments:
- driver: WebDriver
- timeout: pause before receiving data from the web driver log
"""
driver.execute_script(
"""
// document.querySelector('img[title="Captcha"]').click()
document.querySelector('img[title="Captcha Audio"]').click()
"""
)
sleep(timeout)
logs = driver.get_log('performance')
responses = [get_network_data(log, driver) for log in logs if get_network_data(log, driver)]
if responses:
return json.loads(responses[0]['body'])['CaptchaImageText']
else:
return get_captcha_text(driver, timeout)
def submit_captcha(captcha_text: str, btn_name: str) -> None:
"""Submit captcha
Arguments:
- btn_name: captcha send button name["submit" or "search"]
"""
if btn_name == 'search':
captcha_locator = (By.CSS_SELECTOR, 'input[name="submit"]')
elif btn_name == 'submit':
captcha_locator = (By.ID, 'btnSubmit')
wait.until(EC.visibility_of_element_located((By.ID, 'CaptchaText'))).send_keys(captcha_text)
wait.until(EC.visibility_of_element_located(captcha_locator)).click()
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
driver = webdriver.Chrome(executable_path='chromedriver', options=options, desired_capabilities=capabilities)
wait = WebDriverWait(driver, 10)
table_values_locator = (By.CSS_SELECTOR, 'input+.tab-pane tr:not(:first-child)>td:last-child')
applicant_name_locator = (By.ID, 'TextField6')
page_number_locator = (By.CSS_SELECTOR, 'span.Selected')
app_satus_locator = (By.CSS_SELECTOR, 'button[name="ApplicationSatus"]')
next_btn_locator = (By.CSS_SELECTOR, 'button.next')
driver.get('https://ipindiaservices.gov.in/PublicSearch/')
# sometimes an alert with an error message("") may appear, so a small pause is used
sleep(1)
wait.until(EC.visibility_of_element_located(applicant_name_locator)).send_keys('ltd')
# on the start page and the page with the table, the names of the buttons are different
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "search")
# the page where the search starts
start_from_page(1, driver)
while True:
start = time()
# get current page number
current_page = wait.until(EC.visibility_of_element_located(page_number_locator)).text
print(f"Current page: {current_page}")
# get all application status WebElements
app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
for element in range(len(app_status_elements)):
print(f"App number: {element}")
# update application status WebElements
app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
# click on application status
wait.until(EC.visibility_of(app_status_elements[element])).click()
try:
# switch to new tab
driver.switch_to.window(driver.window_handles[-1])
# get all table data values(without titles) WebElements
table_data_values = wait.until(EC.visibility_of_all_elements_located(table_values_locator))
# if there are empty rows in the table replace them with "_"
titles_validation(driver)
# save data to csv
save_to_csv([val.text.replace('\n', ' ') for val in table_data_values])
except TimeoutException:
print("Application Number does not exist")
finally:
# close new tab
driver.close()
# switch to main tab
driver.switch_to.window(driver.window_handles[0])
# print the time spent on the page to the console
print(f"Time per page: {round(time()-start, 3)}")
# if the current page is equal to the specified one, then stop the search and close the driver
if current_page == '100':
break
# click next page
wait.until(EC.visibility_of_element_located(next_btn_locator)).click()
driver.quit()
These fixes have been tested on Google Colab (Python 3.7.15).
The performance of this solution is about 55 sec per page.
Previous versions of the script are left for comparison.

You can use any captcha-solving site. On these sites, humans usually solve the captcha themselves, so they can be slow, but they do the job.
Sample website (this is not an advertisement).
You can use Selenium to pull the information; it is enough to locate the elements by their "id" attributes on the site.
Use a library for reading/writing Excel files in Python.
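For example, a minimal sketch that writes already-scraped rows to an Excel file with pandas (the sample rows are taken from the CSV output above; the column names are assumptions):
import pandas as pd

rows = [
    ('202247057786', 'NON-AQUEOUS ELECTROLYTE SECONDARY BATTERY', '10/10/2022', 'Published'),
    ('202247057932', 'COMMUNICATION METHOD, APPARATUS AND SYSTEM', '10/10/2022', 'Published'),
]
df = pd.DataFrame(rows, columns=['Application Number', 'Title', 'Application Date', 'Status'])
df.to_excel('ipindiaservices.xlsx', index=False)  # requires the openpyxl package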

Related

How to fetch two tables' information from the same webpage?

I have to go to here.
Here I have to choose applicant name = "ASIAN PAINTS" (as an example).
I am using this code [Google Colab]:
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
import re
import csv
import json
from time import sleep
from typing import Generator, List, Tuple
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
driver = webdriver.Chrome('chromedriver', chrome_options=options, desired_capabilities=capabilities)
import csv
import json
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
def save_to_csv(data: list) -> None:
with open(file='ipindiaservices.csv', mode='a', encoding="utf-8") as f:
writer = csv.writer(f, lineterminator='\n')
writer.writerow([*data])
def start_from_page(page_number: int, driver: WebDriver) -> None:
driver.execute_script(
f"""
document.querySelector('button.next').value = {page_number};
document.querySelector('button.next').click();
"""
)
def titles_validation(driver: WebDriver) -> None:
"""replace empty title name with '_'"""
driver.execute_script(
"""
let titles = document.querySelectorAll('input+.tab-pane tr:not(:first-child)>td:last-child')
Array.from(titles).forEach((e) => {
if (!e.textContent.trim()) {
e.textContent = '_';
}
});
"""
)
def get_network_data(log: dict, driver: WebDriver) -> dict:
log = json.loads(log["message"])["message"]
if all([
"Network.responseReceived" in log["method"],
"params" in log.keys(),
'CaptchaAudio' in str(log["params"].values())
]):
return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': log["params"]["requestId"]})
def get_captcha_text(driver: WebDriver, timeout: float) -> str:
"""Return captcha text
Arguments:
- driver: WebDriver
- timeout: pause before receiving data from the web driver log
"""
driver.execute_script(
"""
// document.querySelector('img[title="Captcha"]').click()
document.querySelector('img[title="Captcha Audio"]').click()
"""
)
sleep(timeout)
logs = driver.get_log('performance')
responses = [get_network_data(log, driver) for log in logs if get_network_data(log, driver)]
if responses:
return json.loads(responses[0]['body'])['CaptchaImageText']
else:
return get_captcha_text(driver, timeout)
def submit_captcha(captcha_text: str, btn_name: str) -> None:
"""Submit captcha
Arguments:
- btn_name: captcha send button name["submit" or "search"]
"""
if btn_name == 'search':
captcha_locator = (By.CSS_SELECTOR, 'input[name="submit"]')
elif btn_name == 'submit':
captcha_locator = (By.ID, 'btnSubmit')
wait.until(EC.visibility_of_element_located((By.ID, 'CaptchaText'))).send_keys(captcha_text)
wait.until(EC.visibility_of_element_located(captcha_locator)).click()
'''
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
# service = Service(executable_path="path/to/your/chromedriver.exe")
# driver = webdriver.Chrome(service=service, options=options, desired_capabilities=capabilities)
driver = webdriver.Chrome('chromedriver', chrome_options=options, desired_capabilities=capabilities)
'''
wait = WebDriverWait(driver, 15)
table_values_locator = (By.CSS_SELECTOR, 'input+.tab-pane tr:not(:first-child)>td:last-child')
applicant_name_locator = (By.ID, 'TextField6')
page_number_locator = (By.CSS_SELECTOR, 'span.Selected')
app_satus_locator = (By.CSS_SELECTOR, 'button.btn')
next_btn_locator = (By.CSS_SELECTOR, 'button.next')
driver.get('https://ipindiaservices.gov.in/PublicSearch/')
# sometimes an alert with an error message("") may appear, so a small pause is used
sleep(1)
wait.until(EC.visibility_of_element_located(applicant_name_locator)).send_keys('ltd')
# on the start page and the page with the table, the names of the buttons are different
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "search")
# the page where the search starts
start_from_page(1, driver)
while True:
start = time()
# get current page number
current_page = wait.until(EC.visibility_of_element_located(page_number_locator)).text
print(f"Current page: {current_page}")
# get all application status WebElements
app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
for element in range(len(app_status_elements)):
print(f"App number: {element}")
# update application status WebElements
app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
# click on application status
wait.until(EC.visibility_of(app_status_elements[element])).click()
# wait 2 seconds for the captcha to change
sleep(2)
# get text and submit captcha
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "submit")
try:
# get all table data values(without titles) WebElements
table_data_values = wait.until(EC.visibility_of_all_elements_located(table_values_locator))
# if there are empty rows in the table replace them with "_"
titles_validation(driver)
# save data to csv
save_to_csv([val.text.replace('\n', ' ') for val in table_data_values])
except TimeoutException:
print("Application Number does not exist")
finally:
driver.back()
# print the time spent on the page to the console
print(f"Time per page: {round(time()-start, 3)}")
# if the current page is equal to the specified one, then stop the search and close the driver
if current_page == '1':
break
# click next page
wait.until(EC.visibility_of_element_located(next_btn_locator)).click()
driver.quit()
import pandas as pd
data = pd.read_csv('/content/ipindiaservices.csv')
df = data.set_axis(['APPLICATION NUMBER', 'APPLICATION TYPE', 'DATE OF FILING', 'APPLICANT NAME', 'TITLE OF INVENTION','FIELD OF INVENTION','E-MAIL (As Per Record)','ADDITIONAL-EMAIL (As Per Record)','E-MAIL (UPDATED Online)','PCT INTERNATIONAL APPLICATION NUMBER','PCT INTERNATIONAL FILING DATE','PRIORITY DATE','REQUEST FOR EXAMINATION DATE','PUBLICATION DATE (U/S 11A)'], axis=1, inplace=False)
df.head(2)
from google.colab import drive
drive.mount('drive')
df.to_csv('data.csv')
df.to_csv('/drive/My Drive/folder_name/name_csv_file.csv')
I am successfully able to extract this information.
I also need to extract this table's information (yellow marked). Is that possible?
I want to append this status to my previous CSV. Can it be done by modifying the existing code? TIA
def save_to_csv(data: list) -> None:
with open(file='ipindiaservices.csv', mode='a', encoding="utf-8") as f:
writer = csv.writer(f, lineterminator='\n')
writer.writerow([*data])
def start_from_page(page_number: int, driver: WebDriver) -> None:
driver.execute_script(
f"""
document.querySelector('button.next').value = {page_number};
document.querySelector('button.next').click();
"""
)
def titles_validation(driver: WebDriver) -> None:
"""replace empty title name with '_'"""
driver.execute_script(
"""
let titles = document.querySelectorAll('input+.tab-pane tr:not(:first-child)>td:last-child')
Array.from(titles).forEach((e) => {
if (!e.textContent.trim()) {
e.textContent = '_';
}
});
"""
)
def get_network_data(log: dict, driver: WebDriver) -> dict:
log = json.loads(log["message"])["message"]
if all([
"Network.responseReceived" in log["method"],
"params" in log.keys(),
'CaptchaAudio' in str(log["params"].values())
]):
return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': log["params"]["requestId"]})
def get_captcha_text(driver: WebDriver, timeout: float) -> str:
"""Return captcha text
Arguments:
- driver: WebDriver
- timeout: pause before receiving data from the web driver log
"""
driver.execute_script(
"""
// document.querySelector('img[title="Captcha"]').click()
document.querySelector('img[title="Captcha Audio"]').click()
"""
)
sleep(timeout)
logs = driver.get_log('performance')
responses = [get_network_data(log, driver) for log in logs if get_network_data(log, driver)]
if responses:
return json.loads(responses[0]['body'])['CaptchaImageText']
else:
return get_captcha_text(driver, timeout)
def submit_captcha(captcha_text: str, btn_name: str) -> None:
"""Submit captcha
Arguments:
- btn_name: captcha send button name["submit" or "search"]
"""
if btn_name == 'search':
captcha_locator = (By.CSS_SELECTOR, 'input[name="submit"]')
elif btn_name == 'submit':
captcha_locator = (By.ID, 'btnSubmit')
wait.until(EC.visibility_of_element_located((By.ID, 'CaptchaText'))).send_keys(captcha_text)
wait.until(EC.visibility_of_element_located(captcha_locator)).click()
'''
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
# service = Service(executable_path="path/to/your/chromedriver.exe")
# driver = webdriver.Chrome(service=service, options=options, desired_capabilities=capabilities)
driver = webdriver.Chrome('chromedriver', chrome_options=options, desired_capabilities=capabilities)
'''
wait = WebDriverWait(driver, 15)
table_values_locator = (By.CSS_SELECTOR, 'input+.tab-pane tr:not(:first-child)>td:last-child')
# read 2nd Table
table2_values_locator = (By.CSS_SELECTOR, 'table tr:nth-of-type(2)')
applicant_name_locator = (By.ID, 'TextField6')
page_number_locator = (By.CSS_SELECTOR, 'span.Selected')
app_satus_locator = (By.CSS_SELECTOR, 'button.btn')
next_btn_locator = (By.CSS_SELECTOR, 'button.next')
driver.get('https://ipindiaservices.gov.in/PublicSearch/')
# sometimes an alert with an error message("") may appear, so a small pause is used
sleep(1)
wait.until(EC.visibility_of_element_located(applicant_name_locator)).send_keys('ASIAN PAINTS') # give input (according to your company name)
# on the start page and the page with the table, the names of the buttons are different
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "search")
# the page where the search starts
start_from_page(1, driver)
while True:
start = time()
# get current page number
current_page = wait.until(EC.visibility_of_element_located(page_number_locator)).text
print(f"Current page: {current_page}")
# get all application status WebElements
app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
for element in range(len(app_status_elements)):
print(f"App number: {element}")
# update application status WebElements
app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
# click on application status
wait.until(EC.visibility_of(app_status_elements[element])).click()
# wait 2 seconds for the captcha to change
sleep(2)
# get text and submit captcha
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "submit")
try:
# get all table data values(without titles) WebElements
table_data_values = wait.until(EC.visibility_of_all_elements_located(table_values_locator))
# if there are empty rows in the table replace them with "_"
titles_validation(driver)
# get all 2nd-table data WebElements
table_data_values2 = wait.until(EC.visibility_of_all_elements_located(table2_values_locator))
# save data to csv
save_to_csv([val.text.replace('\n', ' ') for val in table_data_values])
save_to_csv([val.text.replace('\n', ' ') for val in table_data_values2])
except TimeoutException:
print("Application Number does not exist")
finally:
driver.back()
# print the time spent on the page to the console
print(f"Time per page: {round(time()-start, 3)}")
# if the current page is equal to the specified one, then stop the search and close the driver
if current_page == '1':
break
# click next page
wait.until(EC.visibility_of_element_located(next_btn_locator)).click()
driver.quit()
I have edited the code as per the suggestion, but I am getting this error:
First step - get the CSS selector for the 2nd table (after code line 121):
...
table_values_locator = (By.CSS_SELECTOR, 'input+.tab-pane tr:not(:first-child)>td:last-child')
# read 2nd Table
table2_values_locator = (By.CSS_SELECTOR, 'table tr:nth-of-type(2)')
....
Second step - add the data from the 2nd table's CSS selector to the CSV (after code line 163):
...
try:
# get all 1st-table data values(without titles) WebElements
table_data_values = wait.until(EC.visibility_of_all_elements_located(table_values_locator))
# if there are empty rows in the table replace them with "_"
titles_validation(driver)
# get all 2nd-table data WebElements
table_data_values2 = wait.until(EC.visibility_of_all_elements_located(table2_values_locator))
# save data to csv
save_to_csv([val.text.replace('\n', ' ') for val in table_data_values])
save_to_csv([val.text.replace('\n', ' ') for val in table_data_values2])
...

How to get all comments on 9gag using Selenium?

I'm working on scraping the memes and all their comments from 9gag.
I used the code below, but I am only getting a few extra comments.
actions = ActionChains(driver)
link = driver.find_element(By.XPATH, "//button[@class='comment-list__load-more']")
actions.move_to_element(link).click(on_element=link).perform()
I would also like to access the subcomments under a comment by simulating a click on "view more replies".
From the HTML I found that this XPath element = driver.find_element(By.XPATH, "//div[@class='vue-recycle-scroller ready page-mode direction-vertical']") holds the comments section, but I'm not sure how to iterate through each comment in this element and simulate these clicks.
This code should work directly, provided the necessary libraries are present, in case you want to test it.
Please help me with these following tasks:
Getting all the comments from view all comments
Iterating through each comment section and clicking on view more replies to get all the subcomments
My Code
import time
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import undetected_chromedriver as uc
if __name__ == '__main__':
options = Options()
# options.headless = True
options.add_argument("start-maximized") # ensure window is full-screen
driver = uc.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get("https://9gag.com/gag/a5EAv9O")
prev_h = 0
for i in range(10):
height = driver.execute_script("""
function getActualHeight() {
return Math.max(
Math.max(document.body.scrollHeight, document.documentElement.scrollHeight),
Math.max(document.body.offsetHeight, document.documentElement.offsetHeight),
Math.max(document.body.clientHeight, document.documentElement.clientHeight)
);
}
return getActualHeight();
""")
driver.execute_script(f"window.scrollTo({prev_h},{prev_h + 200})")
time.sleep(1)
prev_h += 200
if prev_h >= height:
break
time.sleep(5)
title = driver.title[:-7]
try:
upvotes_count = \
driver.find_element(By.XPATH, "//meta[@property='og:description']").get_attribute("content").split(' ')[0]
comments_count = \
driver.find_element(By.XPATH, "//meta[@property='og:description']").get_attribute("content").split(' ')[3]
upvotes_count = int(upvotes_count) if len(upvotes_count) <= 3 else int("".join(upvotes_count.split(',')))
comments_count = int(comments_count) if len(comments_count) <= 3 else int("".join(comments_count.split(',')))
date_posted = driver.find_element(By.XPATH, "//p[@class='message']")
date_posted = date_posted.text.split("·")[1].strip()
# actions = ActionChains(driver)
# link = driver.find_element(By.XPATH, "//button[@class='comment-list__load-more']")
# actions.move_to_element(link).click(on_element=link).perform()
element = driver.find_element(By.XPATH,
"//div[@class='vue-recycle-scroller ready page-mode direction-vertical']")
print(element.text)
driver.quit()
except (NoSuchElementException, Exception) as err:
print(err)
Output
Edit:
I managed to make the code work better. It scrolls through the page until it sees all the comments. It also clicks on "view more replies" if there are subcomments.
But it's only able to read comments from the middle to the end. Maybe as the page is scrolled down, the initial comments are hidden dynamically; I do not know how to overcome this. Also, clicking on "view more replies" stops after some clicks and throws the error
selenium.common.exceptions.MoveTargetOutOfBoundsException: Message: move target out of bounds
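A common mitigation for this exception (a sketch, not applied in the updated code below) is to scroll the target element into the viewport before the ActionChains move:
# scroll the "view more replies" element into view before moving to and clicking it
driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", sub_comments)
actions.move_to_element(sub_comments).click(on_element=sub_comments).perform()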
Here's the updated code
import driver as driver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import undetected_chromedriver as uc
def scroll_page(scrl_hgt):
prev_h = 0
for i in range(10):
height = driver.execute_script("""
function getActualHeight() {
return Math.max(
Math.max(document.body.scrollHeight, document.documentElement.scrollHeight),
Math.max(document.body.offsetHeight, document.documentElement.offsetHeight),
Math.max(document.body.clientHeight, document.documentElement.clientHeight)
);
}
return getActualHeight();
""")
driver.execute_script(f"window.scrollTo({prev_h},{prev_h + scrl_hgt})")
time.sleep(1)
prev_h += scrl_hgt
if prev_h >= height:
break
if __name__ == '__main__':
options = Options()
# options.headless = True
driver = uc.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.maximize_window()
driver.get("https://9gag.com/gag/a5EAv9O")
time.sleep(5)
# click on I accept cookies
actions = ActionChains(driver)
consent_button = driver.find_element(By.XPATH, '//*[@id="qc-cmp2-ui"]/div[2]/div/button[2]')
actions.move_to_element(consent_button).click().perform()
scroll_page(150)
time.sleep(2)
# click on fresh comments section
fresh_comments = driver.find_element(By.XPATH, '//*[@id="page"]/div[1]/section[2]/section/header/div/button[2]')
actions.move_to_element(fresh_comments).click(on_element=fresh_comments).perform()
time.sleep(5)
# getting meta data
title = driver.title[:-7]
upvotes_count = driver.find_element(By.XPATH, "//meta[@property='og:description']").get_attribute("content").split(' ')[0]
comments_count = driver.find_element(By.XPATH, "//meta[@property='og:description']").get_attribute("content").split(' ')[3]
upvotes_count = int(upvotes_count) if len(upvotes_count) <= 3 else int("".join(upvotes_count.split(',')))
comments_count = int(comments_count) if len(comments_count) <= 3 else int("".join(comments_count.split(',')))
date_posted = driver.find_element(By.XPATH, "//p[@class='message']")
date_posted = date_posted.text.split("·")[1].strip()
time.sleep(3)
# click on load more comments button to load all the comments
load_more_comments = driver.find_element(By.XPATH, "//button[@class='comment-list__load-more']")
actions.move_to_element(load_more_comments).click(on_element=load_more_comments).perform()
scroll_page(500)
print([my_elem.text for my_elem in driver.find_elements(By.CSS_SELECTOR, "div.comment-list-item__text")])
comments = driver.find_elements(By.CSS_SELECTOR, "div.vue-recycle-scroller__item-view")
for item in comments:
html = item.get_attribute("innerHTML")
if "comment-list-item__text" in html:
print(item.find_element(By.CSS_SELECTOR, "div.comment-list-item__text").text)
elif "comment-list-item__deleted-text" in html:
print(item.find_element(By.CSS_SELECTOR, "div.comment-list-item__deleted-text").text)
# get sub comments
if "comment-list-item__replies" in html:
#item.find_element(By.CSS_SELECTOR, "div.comment-list-item__replies").click()
sub_comments = item.find_element(By.CSS_SELECTOR, "div.comment-list-item__replies")
actions.move_to_element(sub_comments).click(on_element=sub_comments).perform()
time.sleep(2)
driver.quit()
PS: My goal is to get every single comment and all their subcomments (whether they are text, image, GIF, etc.) in the order they appear, and to save them somewhere so that I am able to recreate the comments section again.
To extract and print the comment texts you need to induce WebDriverWait for visibility_of_all_elements_located() and you can use the following Locator Strategies:
driver.get("https://9gag.com/gag/a5EAv9O")
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.comment-list__load-more"))).click()
print([my_elem.text for my_elem in driver.find_elements(By.CSS_SELECTOR, "div.comment-list-item__text")])
Console Output:
['Man, the battle of the cults is getting interesting now.', 'rent free in your head', 'Sorry saving all my money up for the Joe Biden Depends Multipack and the Karmella knee pads.', "It's basically a cult now.", "I'll take one. I'm not even American", '', 'that eagle looks familiar.', "Who doesn't want a trump card?"]
Note: You have to add the following imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
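A minimal sketch of that wait, reusing the CSS selector from the snippet above:
comments = WebDriverWait(driver, 20).until(
    EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div.comment-list-item__text"))
)
print([comment.text for comment in comments])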

Web crawler won't complete loop form autofill task with Python

I have a script linked with a CSV file that should run a loop to fill in a page form, submit the form, then go back to the prior page form and fill in the form with the next row of data from the CSV file.
Currently, the script completes the loop task for the first row of CSV data, submits the form, then goes back to the original page form, but doesn't loop again by autofilling the form for the next row of CSV inputs. In short, the loop finishes a single cycle following the page submission and then ends.
What can I do to make the loop continue autofilling for the remainder of the CSV rows? Thank you all!
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import csv
import time
import pandas as pd
# import csv file
table = pd.read_csv(r'...test2.csv')
print(table)
address1 = table['Address2'].tolist()
unit1 = table['Unit2'].tolist()
unittype1 = table['Unit Type'].tolist()
beds1 = table['Beds2'].tolist()
bath1 = table['Baths2'].tolist()
rent1 = table['Rent2'].tolist()
vouchbeds1 = table['Vouchbeds2'].tolist()
# open chrome
# driver = Webdriver.chrome("C:\Python Tools\chromedriver.exe")
s = Service("C:\Python Tools\chromedriver.exe")
driver = webdriver.Chrome(service=s)
# Enter login
driver.get("https://hadashboard.gosection8.com/pages/login/Login.aspx")
driver.implicitly_wait(5)
driver.find_element(By.CSS_SELECTOR, ".form > input:nth-child(3)").send_keys("hiddenlogin")
driver.find_element(By.CSS_SELECTOR, ".form > input:nth-child(6)").send_keys("hiddenpassword")
driver.find_element(By.CSS_SELECTOR, ".m-col-12:nth-child(8)").click()
driver.implicitly_wait(10)
# go to rent reasonableness analysis
driver.find_element(By.CSS_SELECTOR, ".not-now-btn").click()
driver.find_element(By.CSS_SELECTOR, ".clear-fix > div > .rent-btn-row > .primary-button").click()
driver.implicitly_wait(10)
# https://stackoverflow.com/questions/66933061/looping-through-several-columns-and-rows-from-csv-to-fill-a-form
address = driver.find_element(By.ID, "SubjectPage_AutocompleteAddress")
unit = driver.find_element(By.ID, 'SubjectPage_AddressLine2_Auto')
beds = driver.find_element(By.ID, "SubjectPage_BedroomCount")
baths = driver.find_element(By.ID, "SubjectPage_FullBathCount")
rent = driver.find_element(By.ID, "SubjectPage_AskingRent")
vouchbeds = driver.find_element(By.ID, "SubjectPage_VoucherBedroomCount")
for address1, unit1, unittype1, beds1, bath1, rent1, vouchbeds1 in zip(address1, unit1, unittype1, beds1, bath1, rent1, vouchbeds1):
address.send_keys(address1)
time.sleep(4)
unit.send_keys(unit1)
driver.implicitly_wait(10)
beds.send_keys(beds1)
driver.implicitly_wait(10)
baths.send_keys(bath1)
driver.implicitly_wait(10)
driver.find_element(By.CSS_SELECTOR, "#SubjectPage_PropertyType_Fake > select").click()
dropdown = driver.find_element(By.CSS_SELECTOR, "#SubjectPage_PropertyType_Fake > select")
dropdown.find_element(By.XPATH, "//option[. = 'Apartment']").click()
#time.sleep(2)
rent.send_keys(rent1)
driver.implicitly_wait(10)
driver.find_element(By.ID, "SubjectPage_VoucherBedroomCount").click()
vouchbeds.send_keys(vouchbeds1)
driver.implicitly_wait(10)
submit = driver.find_element(By.ID, "SubjectPage_AnalyzeBottom").click()
time.sleep(10)
driver.find_element(By.CSS_SELECTOR, ".subject-cmn-btns:nth-child(1)").click()
time.sleep(5)
# return to page forms for next loop: https://hadashboard.gosection8.com/RentWatch5/RentWatch5.aspx
driver.get("https://hadashboard.gosection8.com/RentWatch5/RentWatch5.aspx")
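One possible cause (an assumption, not verified against this site): the driver.get() at the end of each iteration reloads the form page, so the WebElement references located once before the loop go stale on the next pass and the script stops with an exception. A minimal sketch of re-locating the fields inside the loop:
for (row_address, row_unit, row_unittype, row_beds,
     row_bath, row_rent, row_vouchbeds) in zip(
        address1, unit1, unittype1, beds1, bath1, rent1, vouchbeds1):
    # re-locate the form fields on every pass, because the previous pass
    # ended by re-loading the form page with driver.get()
    address = driver.find_element(By.ID, "SubjectPage_AutocompleteAddress")
    unit = driver.find_element(By.ID, "SubjectPage_AddressLine2_Auto")
    beds = driver.find_element(By.ID, "SubjectPage_BedroomCount")
    baths = driver.find_element(By.ID, "SubjectPage_FullBathCount")
    rent = driver.find_element(By.ID, "SubjectPage_AskingRent")
    vouchbeds = driver.find_element(By.ID, "SubjectPage_VoucherBedroomCount")
    address.send_keys(row_address)
    # ... the rest of the original loop body stays the same,
    # using row_unit, row_beds, row_bath, row_rent, row_vouchbeds ...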

Random TimeoutException even after using ui.WebDriverWait() chrome selenium python

I get a random timeout exception for the code below and I am not sure what the best approach is to address it. The timeout does not happen every time, and the script does find the elements some or most of the time.
I would appreciate your comments and suggestions. Apparently the explicit wait is not holding until the elements are loaded into the browser, or the elements take a different amount of time to load each time a new page is loaded.
"""
"""
import platform , logging
import os,re
from time import sleep
import selenium.webdriver.support.ui as ui
import selenium.webdriver.support.expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
class Cloud(object):
"""
cloud class to get query and response
"""
def __init__(self, username='xxxxxx', password='xxxx'):
logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
self.logger = logging.getLogger(__name__)
self.url = "https://www.amazon.com"
self.username = username
self.password = password
self.timeout = 100
self.driver = None
self.get_chrome_driver()
def get_chrome_driver(self):
"""
get chrome driver
"""
if platform.system().lower() == 'windows':
if self.driver is None:
chrome_options = Options()
#chrome_options.add_argument('--disable-extensions')
chrome_options.add_argument('--start-maximized')
chrome_options.add_argument('--disable-popup-blocking')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--allow-insecure-localhost')
chrome_options.add_argument('--disable-infobars')
chrome_options.add_argument("--log-level=3")
chrome_driver_path = os.path.join(str(os.environ['PYTHONPATH'].split(';')[0]),"bin","chromedriver","chromedriver.exe")
self.driver = WebDriver(executable_path=chrome_driver_path, chrome_options=chrome_options)
return self.driver
def login(self, username='xxxxxxx', password='xxxxx'):
"""
Login into amazon cloud
"""
self.logger.info("logging in amazon cloud username: %s and password: %s" %(self.username, re.sub(r".", "*", self.password)))
self.driver.get(self.url)
# wait for login username textbox
self.wait_visibility_element(By.XPATH, "//div[@id='nav-signin-tooltip']//span[@class='nav-action-inner'][contains(text(),'Sign in')]")
self.driver.find_element_by_xpath("//div[@id='nav-signin-tooltip']//span[@class='nav-action-inner'][contains(text(),'Sign in')]").click()
self.wait_visibility_element(By.XPATH,"//label[@class='a-form-label']")
self.wait_visibility_element(By.XPATH,"//input[@id='ap_email']")
username_textbox = self.driver.find_element_by_xpath("//input[@id='ap_email']")
username_textbox.clear()
username_textbox.send_keys(self.username)
self.driver.find_element_by_xpath("//input[@id='continue']").click()
self.wait_visibility_element(By.XPATH,"//input[@id='ap_password']") #//label[@class='a-form-label']
password_textbox = self.driver.find_element_by_xpath("//input[@id='ap_password']")
password_textbox.clear()
password_textbox.send_keys(self.password)
# click on submit button
self.driver.find_element_by_xpath("//input[@id='signInSubmit']").click()
def wait_visibility_element(self, by_type, element_name):
"""
wait for visibility of element
:param by_type: Locate element using type of element
:param element_name: element name
"""
ui.WebDriverWait(self.driver, self.timeout).until(
EC.visibility_of_element_located((by_type, element_name)))
def get_audio_text(self, multi_turn_count=1):
self.login()
#Arrow in the Top Menu
self.wait_visibility_element(By.XPATH, "//a[#id='nav-link-accountList']//span[#class='nav-icon nav-arrow']")
ui.WebDriverWait(self.driver, self.timeout).until(EC.element_to_be_clickable((By.XPATH, "//a[#id='nav-link-accountList']//span[#class='nav-icon nav-arrow']")))
self.driver.find_element_by_xpath("//a[#id='nav-link-accountList']//span[#class='nav-icon nav-arrow']").click()
#To avoid click event ambiguity
firstLevelMenu = self.driver.find_element_by_xpath("//span[contains(#class,'nav-line-2')][contains(text(),'Account & Lists')]")
action = ActionChains(self.driver)
action.move_to_element(firstLevelMenu).perform()
#sub menu select and click
self.wait_visibility_element(By.XPATH, "//span[contains(text(),'Your Content and Devices')]")
self.driver.find_element_by_xpath("//span[contains(text(),'Your Content and Devices')]").click()
#Alexa Privacy
self.wait_visibility_element(By.XPATH, "//div[#id='ng-app']//div[2]//div[1]//div[1]//div[1]//div[1]//div[1]//div[2]//div[6]//div[1]//div[1]")
self.driver.find_element_by_xpath("//div[#id='ng-app']//div[2]//div[1]//div[1]//div[1]//div[1]//div[1]//div[2]//div[6]//div[1]//div[1]").click()
self.wait_visibility_element(By.XPATH,'//div[#class="navAlexaOptionTitle_alexaNavHeader_myx ng-binding"][contains(text(),"Review Voice History")]')
ui.WebDriverWait(self.driver, self.timeout).until(
EC.element_to_be_clickable((By.XPATH, '//div[#class="navAlexaOptionTitle_alexaNavHeader_myx ng-binding"][contains(text(),"Review Voice History")]')))
ui.WebDriverWait(self.driver, self.timeout).until(EC.text_to_be_present_in_element((By.XPATH, '//span[#class="overviewHeadingString_myx ng-binding"]'), 'Alexa Privacy'))
self.driver.find_element_by_xpath('//div[#class="navAlexaOptionTitle_alexaNavHeader_myx ng-binding"][contains(text(),"Overview")]').click()
self.driver.find_element_by_xpath("//div[#class='navAlexaOptionTitle_alexaNavHeader_myx ng-binding'][contains(text(),'Review Voice History')]").click()
# Select the dropdown box
self.wait_visibility_element(By.XPATH,"//span[#id='timePickerDesktop']//span[#class='a-button-text a-declarative']")
ui.WebDriverWait(self.driver, self.timeout).until(EC.element_to_be_clickable((By.XPATH, "//span[#id='timePickerDesktop']//span[#class='a-button-text a-declarative']")))
self.driver.find_element_by_xpath("//span[#id='timePickerDesktop']//span[#class='a-button-text a-declarative']").click()
#All history selection
self.wait_visibility_element(By.XPATH,"//a[#id='timePickerDesktop_4']")
ui.WebDriverWait(self.driver, self.timeout).until(EC.element_to_be_clickable((By.XPATH, "//a[#id='timePickerDesktop_4']")))
self.driver.find_element_by_xpath("//a[#id='timePickerDesktop_4']").click()
# read first text format of the data
self.wait_visibility_element(By.XPATH,"//span[#id='mainInfo-0']//div[contains(#class,'summaryCss')]")
txt = self.driver.find_element_by_xpath("//span[#id='mainInfo-0']//div[contains(#class,'summaryCss')]").text
question_text = txt.encode("utf-8")[3:-3]
# Dropdown the rectangle menu
self.driver.find_element_by_xpath("//div[#id='arrowExpand-0']//i[#class='fa fa-angle-down caretAlignment']").click()
# read AVS Response
self.wait_visibility_element(By.XPATH,"//div[#id='activityItemsInner-0']//div[#class='ttsInfo']")
ui.WebDriverWait(self.driver, self.timeout).until(EC.element_to_be_clickable((By.XPATH, "//div[#id='activityItemsInner-0']//div[#class='ttsInfo']")))
txt = self.driver.find_element_by_xpath("//div[#id='activityItemsInner-0']//div[#class='ttsInfo']").text
answer_text = txt.encode("utf-8")[3:-3]
self.sign_out_direct()
return question_text, answer_text
def sign_out(self):
#Sign out menu navigation
self.driver.find_element_by_xpath("//i[@class='hm-icon nav-sprite']").click()
self.wait_visibility_element(By.XPATH,"//div[contains(text(),'SHOP BY CATEGORY')]")
#sign out
sign_out_element = self.driver.find_element_by_xpath("//li[44]//a[1]")
self.driver.execute_script("arguments[0].scrollIntoView();", sign_out_element)
#self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
ui.WebDriverWait(self.driver, self.timeout).until(EC.element_to_be_clickable((By.XPATH, "//li[44]//a[1]")))
self.driver.find_element_by_xpath("//li[44]//a[1]").click()
self.sign_out_direct()
#Close current tab
self.driver.close()
def sign_out_direct(self):
#Arrow in the Top Menu
self.wait_visibility_element(By.XPATH, "//a[#id='nav-link-accountList']//span[#class='nav-icon nav-arrow']")
ui.WebDriverWait(self.driver, self.timeout).until(EC.element_to_be_clickable((By.XPATH, "//a[#id='nav-link-accountList']//span[#class='nav-icon nav-arrow']")))
self.driver.find_element_by_xpath("//a[#id='nav-link-accountList']//span[#class='nav-icon nav-arrow']").click()
#To avoid click event ambiguity
firstLevelMenu = self.driver.find_element_by_xpath("//span[contains(#class,'nav-line-2')][contains(text(),'Account & Lists')]")
action = ActionChains(self.driver)
action.move_to_element(firstLevelMenu).perform()
#sub menu select and click
self.wait_visibility_element(By.XPATH, " //span[contains(text(),'Sign Out')]")
self.driver.find_element_by_xpath("//span[contains(text(),'Sign Out')]").click()
#Close current tab
self.driver.close()
if __name__ == '__main__':
for loop in range(20):
PAGE = Cloud()
#PAGE.login()
OUTPUT = PAGE.get_audio_text()
print("\n\nQuestion:: %s"%str(list(OUTPUT)[0]).upper())
print("Answer:: %s"%str(list(OUTPUT)[1]).upper())
#PAGE.sign_out()
#PAGE.sign_out_direct()
sleep(2)
If you post the specific lines of code that are throwing the timeout exceptions, it will be easier to track down the issues.
I noticed most of your waits use visibility_of_element_located. I would recommend changing some of those to element_to_be_clickable instead, because some elements appear in the DOM before they are fully rendered.
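For example, the wait helper in the script above could be given a clickable-wait counterpart. This is a minimal sketch; the method name wait_clickable_element is just illustrative, and it assumes the same self.driver and self.timeout attributes used elsewhere in the class:
def wait_clickable_element(self, by_type, element_name):
    """
    wait for the element to be clickable, not just visible
    :param by_type: Locate element using type of element
    :param element_name: element name
    """
    return ui.WebDriverWait(self.driver, self.timeout).until(
        EC.element_to_be_clickable((by_type, element_name)))
Because the element is returned, a call such as self.wait_clickable_element(By.XPATH, "//a[@id='timePickerDesktop_4']").click() could replace the separate wait-then-find-then-click pattern used above.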

How to send text to username and password field in Instagram through Selenium and Python

I'm getting the following error messages for a Python program:
C:\Users\chanm\AppData\Local\Programs\Python\Python37-32\python.exe C:/Users/chanm/OneDrive/Desktop/bot/Commenter.py
Traceback (most recent call last):
File "C:/Users/chanm/OneDrive/Desktop/bot/Commenter.py", line 133, in <module>
com.login()
File "C:/Users/chanm/OneDrive/Desktop/bot/Commenter.py", line 28, in login
login_button = driver.find_element_by_xpath("//a[@href='/accounts/login/']")
File "C:\Users\chanm\AppData\Local\Programs\Python\Python37-32\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 394, in find_element_by_xpath
return self.find_element(by=By.XPATH, value=xpath)
File "C:\Users\chanm\AppData\Local\Programs\Python\Python37-32\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 978, in find_element
'value': value})['value']
File "C:\Users\chanm\AppData\Local\Programs\Python\Python37-32\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 321, in execute
self.error_handler.check_response(response)
File "C:\Users\chanm\AppData\Local\Programs\Python\Python37-32\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 242, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//a[@href='/accounts/login/']"}
(Session info: chrome=71.0.3578.80)
(Driver info: chromedriver=2.44.609538 (b655c5a60b0b544917107a59d4153d4bf78e1b90),platform=Windows NT 10.0.17134 x86_64)
Process finished with exit code 1
This is the code I have:
Commenter.py
import time
import random
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from chatterbot.trainers import ListTrainer
from chatterbot import ChatBot
class Commenter:
def __init__(self, username, password):
self.username = username
self.password = password
self.driver = webdriver.Chrome()
self.driver.set_window_size(700, 900)
"""closing browser"""
def closeBrowser(self):
self.driver.close()
"""login in to Instagram"""
def login(self) -> object:
driver = self.driver
driver.get("https://www.instagram.com/")
time.sleep(2)
login_button = driver.find_element_by_xpath("//a[@href='/accounts/login/']")
login_button.click()
time.sleep(2)
user_name_elem = driver.find_element_by_xpath("//input[@name='username']")
user_name_elem.clear()
user_name_elem.send_keys(self.username)
passworword_elem = driver.find_element_by_xpath("//input[@name='password']")
passworword_elem.clear()
passworword_elem.send_keys(self.password)
passworword_elem.send_keys(Keys.RETURN)
time.sleep(2)
"""getting pictures on a hashtag page"""
def get_pictures_on_page(self, hashtag, scrolls=int):
self.driver.get("https://www.instagram.com/explore/tags/" + hashtag + "/")
time.sleep(2)
# gathering photos
pic_hrefs = []
for i in range(1, scrolls):
try:
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
# get tags
hrefs_in_view = self.driver.find_elements_by_tag_name('a')
# finding relevant hrefs
hrefs_in_view = [elem.get_attribute('href') for elem in hrefs_in_view if
hashtag in elem.get_attribute('href')]
# building list of unique photos
[pic_hrefs.append(href) for href in hrefs_in_view if href not in pic_hrefs]
# print("Check: pic href length " + str(len(pic_hrefs)))
except Exception:
continue
return pic_hrefs
"""write comment in text area using lambda function"""
def write_comment(self, comment_text):
try:
comment_button = lambda: self.driver.find_element_by_link_text('Comment')
comment_button().click()
except NoSuchElementException:
pass
try:
comment_box_elem = lambda: self.driver.find_element_by_xpath("//textarea[@aria-label='Add a comment…']")
comment_box_elem().send_keys('')
comment_box_elem().clear()
for letter in comment_text:
comment_box_elem().send_keys(letter)
time.sleep((random.randint(1, 7) / 30))
return comment_box_elem
except (StaleElementReferenceException, NoSuchElementException) as e:
print(e)
return False
"""actually post a comment"""
def post_comment(self, comment_text):
time.sleep(random.randint(1,5))
comment_box_elem = self.write_comment(comment_text)
if comment_text in self.driver.page_source:
comment_box_elem().send_keys(Keys.ENTER)
try:
post_button = lambda: self.driver.find_element_by_xpath("//button[@type='Post']")
post_button().click()
print('clicked post button')
except NoSuchElementException:
pass
time.sleep(random.randint(4, 6))
self.driver.refresh()
if comment_text in self.driver.page_source:
return True
return False
"""grab comments from a picture page"""
def get_comments(self):
# load more comments if button exists
time.sleep(3)
try:
comments_block = self.driver.find_element_by_class_name('Xl2Pu')
comments_in_block = comments_block.find_elements_by_class_name('gElp9')
comments = [x.find_element_by_tag_name('span') for x in comments_in_block]
user_comment = re.sub(r'#.\w*', '', comments[0].text)
except NoSuchElementException:
return ''
return user_comment
"""have bot comment on picture"""
def comment_on_picture(self):
bot = ChatBot('YouTubeChatBot')
bot.set_trainer(ListTrainer)
picture_comment = self.get_comments()
# user's comment and bot's response
response = bot.get_response(picture_comment).__str__()
print("User's Comment", picture_comment)
print("Bot's Response", response)
return self.post_comment(response)
com: Commenter = Commenter(username='username', password='password')
com.login()
for pic in com.get_pictures_on_page(hashtag='gaming', scrolls=5)[1:]:
com.driver.get(pic)
time.sleep(3)
print('Posted Comment:', com.comment_on_picture())
time.sleep(3)
That is the script. I have tried things along the lines of changing extensions and other small tweaks, which resolved most of the problems, but now I'm stuck with these errors.
The username and password fields within Instagram are JavaScript-enabled elements, so you have to induce WebDriverWait for the desired elements to be clickable. You can use the following solution:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get('https://www.instagram.com')
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a[href='/accounts/login/?source=auth_switcher']"))).click()
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='username'][aria-label='Phone number, username, or email']"))).send_keys("JRProgrammer")
driver.find_element_by_css_selector("input[name='password'][aria-label='Password']").send_keys("JRProgrammer")
driver.find_element_by_xpath("//button[text()='Log in']").click()
If you read the stack trace, it says that it is failing to find the login button, so you've entered a bad selector. I'm not really sure why you're calling that anyway, since you wouldn't want to click the login button before entering user info.
Try:
def login(self) -> object:
driver = self.driver
driver.get("https://www.instagram.com/")
time.sleep(2)
user_name_elem = driver.find_element_by_xpath("//input[@name='username']")
user_name_elem.clear()
user_name_elem.send_keys(self.username)
passworword_elem = driver.find_element_by_xpath("//input[@name='password']")
passworword_elem.clear()
passworword_elem.send_keys(self.password)
passworword_elem.send_keys(Keys.RETURN)
time.sleep(2)
login_button = driver.find_element_by_css_selector("button[type='submit']")
login_button.click()
time.sleep(2)
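The rearranged login above still relies on fixed time.sleep calls. Below is a minimal sketch of the same method using explicit waits instead, assuming the question's Commenter class and that its selectors still match Instagram's markup:
# additional imports needed at the top of Commenter.py
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def login(self) -> object:
    driver = self.driver
    driver.get("https://www.instagram.com/")
    wait = WebDriverWait(driver, 20)
    # wait for the username field to become interactable instead of sleeping a fixed two seconds
    user_name_elem = wait.until(EC.element_to_be_clickable((By.NAME, "username")))
    user_name_elem.clear()
    user_name_elem.send_keys(self.username)
    password_elem = driver.find_element_by_name("password")
    password_elem.clear()
    password_elem.send_keys(self.password)
    # submit once the button is clickable rather than after an arbitrary pause
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']"))).click()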
