I have to go to here
Here I have to choose applicant name = “ASIAN PAINTS” (as an example)
By this code, [Google Colab]
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
import re
import csv
import json
from time import sleep
from typing import Generator, List, Tuple
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
driver = webdriver.Chrome('chromedriver', chrome_options=options, desired_capabilities=capabilities)
import csv
import json
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
def save_to_csv(data: list) -> None:
with open(file='ipindiaservices.csv', mode='a', encoding="utf-8") as f:
writer = csv.writer(f, lineterminator='\n')
writer.writerow([*data])
def start_from_page(page_number: int, driver: WebDriver) -> None:
driver.execute_script(
f"""
document.querySelector('button.next').value = {page_number};
document.querySelector('button.next').click();
"""
)
def titles_validation(driver: WebDriver) -> None:
"""replace empty title name with '_'"""
driver.execute_script(
"""
let titles = document.querySelectorAll('input+.tab-pane tr:not(:first-child)>td:last-child')
Array.from(titles).forEach((e) => {
if (!e.textContent.trim()) {
e.textContent = '_';
}
});
"""
)
def get_network_data(log: dict, driver: WebDriver) -> dict:
log = json.loads(log["message"])["message"]
if all([
"Network.responseReceived" in log["method"],
"params" in log.keys(),
'CaptchaAudio' in str(log["params"].values())
]):
return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': log["params"]["requestId"]})
def get_captcha_text(driver: WebDriver, timeout: float) -> str:
"""Return captcha text
Arguments:
- driver: WebDriver
- timeout: pause before receiving data from the web driver log
"""
driver.execute_script(
"""
// document.querySelector('img[title="Captcha"]').click()
document.querySelector('img[title="Captcha Audio"]').click()
"""
)
sleep(timeout)
logs = driver.get_log('performance')
responses = [get_network_data(log, driver) for log in logs if get_network_data(log, driver)]
if responses:
return json.loads(responses[0]['body'])['CaptchaImageText']
else:
get_captcha_text(driver, timeout)
def submit_captcha(captcha_text: str, btn_name: str) -> None:
"""Submit captcha
Arguments:
- btn_name: captcha send button name["submit" or "search"]
"""
if btn_name == 'search':
captcha_locator = (By.CSS_SELECTOR, 'input[name="submit"]')
elif btn_name == 'submit':
captcha_locator = (By.ID, 'btnSubmit')
wait.until(EC.visibility_of_element_located((By.ID, 'CaptchaText'))).send_keys(captcha_text)
wait.until(EC.visibility_of_element_located(captcha_locator)).click()
'''
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
# service = Service(executable_path="path/to/your/chromedriver.exe")
# driver = webdriver.Chrome(service=service, options=options, desired_capabilities=capabilities)
driver = webdriver.Chrome('chromedriver', chrome_options=options, desired_capabilities=capabilities)
'''
wait = WebDriverWait(driver, 15)
table_values_locator = (By.CSS_SELECTOR, 'input+.tab-pane tr:not(:first-child)>td:last-child')
applicant_name_locator = (By.ID, 'TextField6')
page_number_locator = (By.CSS_SELECTOR, 'span.Selected')
app_satus_locator = (By.CSS_SELECTOR, 'button.btn')
next_btn_locator = (By.CSS_SELECTOR, 'button.next')
driver.get('https://ipindiaservices.gov.in/PublicSearch/')
# sometimes an alert with an error message("") may appear, so a small pause is used
sleep(1)
wait.until(EC.visibility_of_element_located(applicant_name_locator)).send_keys('ltd')
# on the start page and the page with the table, the names of the buttons are different
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "search")
# the page where the search starts
start_from_page(1, driver)
while True:
start = time()
# get current page number
current_page = wait.until(EC.visibility_of_element_located(page_number_locator)).text
print(f"Current page: {current_page}")
# get all application status WebElements
app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
for element in range(len(app_status_elements)):
print(f"App number: {element}")
# update application status WebElements
app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
# click on application status
wait.until(EC.visibility_of(app_status_elements[element])).click()
# wait 2 seconds for the captcha to change
sleep(2)
# get text and submit captcha
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "submit")
try:
# get all table data values(without titles) WebElements
table_data_values = wait.until(EC.visibility_of_all_elements_located(table_values_locator))
# if there are empty rows in the table replace them with "_"
titles_validation(driver)
# save data to csv
save_to_csv([val.text.replace('\n', ' ') for val in table_data_values])
except TimeoutException:
print("Application Number does not exist")
finally:
driver.back()
# print the current page number to the console
print(f"Time per page: {round(time()-start, 3)}")
# if the current page is equal to the specified one, then stop the search and close the driver
if current_page == '1':
break
# click next page
wait.until(EC.visibility_of_element_located(next_btn_locator)).click()
driver.quit()
import pandas as pd
data = pd.read_csv('/content/ipindiaservices.csv')
df = data.set_axis(['APPLICATION NUMBER', 'APPLICATION TYPE', 'DATE OF FILING', 'APPLICANT NAME', 'TITLE OF INVENTION','FIELD OF INVENTION','E-MAIL (As Per Record)','ADDITIONAL-EMAIL (As Per Record)','E-MAIL (UPDATED Online)','PCT INTERNATIONAL APPLICATION NUMBER','PCT INTERNATIONAL FILING DATE','PRIORITY DATE','REQUEST FOR EXAMINATION DATE','PUBLICATION DATE (U/S 11A)'], axis=1, inplace=False)
df.head(2)
from google.colab import drive
drive.mount('drive')
df.to_csv('data.csv')
df.to_csv('/drive/My Drive/folder_name/name_csv_file.csv')
I am successfully able to extract this information
I also need to extract this table's information(yellow marked). Can it be possible?
I want to append this status into my previous csv . Can it be done modifying the existing code. TIA
def save_to_csv(data: list) -> None:
with open(file='ipindiaservices.csv', mode='a', encoding="utf-8") as f:
writer = csv.writer(f, lineterminator='\n')
writer.writerow([*data])
def start_from_page(page_number: int, driver: WebDriver) -> None:
driver.execute_script(
f"""
document.querySelector('button.next').value = {page_number};
document.querySelector('button.next').click();
"""
)
def titles_validation(driver: WebDriver) -> None:
"""replace empty title name with '_'"""
driver.execute_script(
"""
let titles = document.querySelectorAll('input+.tab-pane tr:not(:first-child)>td:last-child')
Array.from(titles).forEach((e) => {
if (!e.textContent.trim()) {
e.textContent = '_';
}
});
"""
)
def get_network_data(log: dict, driver: WebDriver) -> dict:
log = json.loads(log["message"])["message"]
if all([
"Network.responseReceived" in log["method"],
"params" in log.keys(),
'CaptchaAudio' in str(log["params"].values())
]):
return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': log["params"]["requestId"]})
def get_captcha_text(driver: WebDriver, timeout: float) -> str:
"""Return captcha text
Arguments:
- driver: WebDriver
- timeout: pause before receiving data from the web driver log
"""
driver.execute_script(
"""
// document.querySelector('img[title="Captcha"]').click()
document.querySelector('img[title="Captcha Audio"]').click()
"""
)
sleep(timeout)
logs = driver.get_log('performance')
responses = [get_network_data(log, driver) for log in logs if get_network_data(log, driver)]
if responses:
return json.loads(responses[0]['body'])['CaptchaImageText']
else:
get_captcha_text(driver, timeout)
def submit_captcha(captcha_text: str, btn_name: str) -> None:
"""Submit captcha
Arguments:
- btn_name: captcha send button name["submit" or "search"]
"""
if btn_name == 'search':
captcha_locator = (By.CSS_SELECTOR, 'input[name="submit"]')
elif btn_name == 'submit':
captcha_locator = (By.ID, 'btnSubmit')
wait.until(EC.visibility_of_element_located((By.ID, 'CaptchaText'))).send_keys(captcha_text)
wait.until(EC.visibility_of_element_located(captcha_locator)).click()
'''
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
# service = Service(executable_path="path/to/your/chromedriver.exe")
# driver = webdriver.Chrome(service=service, options=options, desired_capabilities=capabilities)
driver = webdriver.Chrome('chromedriver', chrome_options=options, desired_capabilities=capabilities)
'''
wait = WebDriverWait(driver, 15)
table_values_locator = (By.CSS_SELECTOR, 'input+.tab-pane tr:not(:first-child)>td:last-child')
# read 2nd Table
table2_values_locator = (By.CSS_SELECTOR, 'table tr:nth-of-type(2)')
applicant_name_locator = (By.ID, 'TextField6')
page_number_locator = (By.CSS_SELECTOR, 'span.Selected')
app_satus_locator = (By.CSS_SELECTOR, 'button.btn')
next_btn_locator = (By.CSS_SELECTOR, 'button.next')
driver.get('https://ipindiaservices.gov.in/PublicSearch/')
# sometimes an alert with an error message("") may appear, so a small pause is used
sleep(1)
wait.until(EC.visibility_of_element_located(applicant_name_locator)).send_keys('ASIAN PAINTS') # give input (according to your company name)
# on the start page and the page with the table, the names of the buttons are different
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "search")
# the page where the search starts
start_from_page(1, driver)
while True:
start = time()
# get current page number
current_page = wait.until(EC.visibility_of_element_located(page_number_locator)).text
print(f"Current page: {current_page}")
# get all application status WebElements
app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
for element in range(len(app_status_elements)):
print(f"App number: {element}")
# update application status WebElements
app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
# click on application status
wait.until(EC.visibility_of(app_status_elements[element])).click()
# wait 2 seconds for the captcha to change
sleep(2)
# get text and submit captcha
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "submit")
try:
# get all table data values(without titles) WebElements
table_data_values = wait.until(EC.visibility_of_all_elements_located(table_values_locator))
# if there are empty rows in the table replace them with "_"
titles_validation(driver)
# get all 2nd-table data WebElements
table_data_values2 = wait.until(EC.visibility_of_all_elements_located(table2_values_locator))
# save data to csv
save_to_csv([val.text.replace('\n', ' ') for val in table_data_values])
save_to_csv([val.text.replace('\n', ' ') for val in table_data_values2])
except TimeoutException:
print("Application Number does not exist")
finally:
driver.back()
# print the current page number to the console
print(f"Time per page: {round(time()-start, 3)}")
# if the current page is equal to the specified one, then stop the search and close the driver
if current_page == '1':
break
# click next page
wait.until(EC.visibility_of_element_located(next_btn_locator)).click()
driver.quit()
I have edited the code as per the suggestion, but getting this error
First-Step - GET the 2nd table CSS-Selector (after Code-Line 121):
...
table_values_locator = (By.CSS_SELECTOR, 'input+.tab-pane tr:not(:first-child)>td:last-child')
# read 2nd Table
table2_values_locator = (By.CSS_SELECTOR, 'table tr:nth-of-type(2)')
....
Second-Step - add the data form 2nd table css-selector to CSV (after Code-Line 163):
...
try:
# get all 1st-table data values(without titles) WebElements
table_data_values = wait.until(EC.visibility_of_all_elements_located(table_values_locator))
# if there are empty rows in the table replace them with "_"
titles_validation(driver)
# get all 2nd-table data WebElements
table_data_values2 = wait.until(EC.visibility_of_all_elements_located(table2_values_locator))
# save data to csv
save_to_csv([val.text.replace('\n', ' ') for val in table_data_values])
save_to_csv([val.text.replace('\n', ' ') for val in table_data_values2])
...
Related
# 뉴스 크롤링.py
#######################################'사용후핵연료' 키워드 검색##################################################
import sys, os
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import selenium
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime, timedelta
from pandas import DataFrame
import time
from openpyxl.workbook import Workbook
sleep_sec = 0.5
wb = Workbook()
# User-Agent를 입력해주세요.
headers = {'User-Agent' : '________________'}
query = 'spent nuclear fuel'
yesterday = (datetime.today() - timedelta(1)).strftime("%Y.%m.%d")
def news_crawling():
service = Service(executable_path=ChromeDriverManager().install())
browser = webdriver.Chrome(service=service)
print('브라우저를 실행시킵니다(자동 제어)\n')
news_url = 'https://www.google.com/search?q={0}&tbm=nws&source-news]'.format(query, yesterday)
browser.get(news_url)
time.sleep(sleep_sec)
print('\n크롤링을 시작합니다.')
#####동적 제어로 페이지 넘어가며 크롤링
news_dict = {}
idx = 1
cur_page = 1
news_num = 1000000
while True:
table = browser.find_element("xpath",'.//div[#data-hveid="CBAQAA"]')
li_list = table.find_elements("xpath",'.//li[contains(#class="vJOb1e aIfcHf Hw13jc"]')
area_list = [li.find_element("xpath",'.//div[#class="mCBkyc y355M ynAwRc MBeuO nDgy9d"]') for li in li_list]
for a in area_list[:min(len(area_list), news_num-idx+1)]:
n = a.find_element("xpath",'.//div[#role="heading"]')
n_url = n.get_attribute('href')
try:
img = a.find_element(By.CSS_SELECTOR,'img#dimg_').find_element(By.CSS_SELECTOR, 'img')
img = img.get_attribute('src')
except:
img = " "
news_dict[idx] = {'Title' : n.get_attribute('title'),
'url' : n_url,
'thumbnail': img}
idx += 1
try:
next_btn = browser.find_element(By.CSS_SELECTOR, 'a#pnnext')
next_btn.click()
cur_page +=1
# pages = browser.find_element("xpath",'//div[#class="sc_page_inner"]')
# next_page_url = [p for p in pages.find_elements("xpath",'.//a') if p.text == str(cur_page)][0].get_attribute('href')
pages = browser.find_element("xpath",'//table[#class="fl"]')
next_page_url = [p for p in pages.find_elements("xpath",'.//a') if p.text == str(cur_page)][0].get_attribute('aria-lable')
browser.get(next_page_url)
time.sleep(sleep_sec)
except:
print('\n브라우저를 종료합니다.\n' + '=' * 100)
time.sleep(0.7)
browser.close()
break
########################################################여기까지 수정 완료################################################################
# 엑셀파일 추출
print('데이터프레임 변환\n')
news_df = DataFrame(news_dict).T
folder_path = os.getcwd()
xlsx_file_name = '{}_{}.xlsx'.format(query, yesterday)
news_df.to_excel(xlsx_file_name, index=False)
print('엑셀 저장 완료 | 경로 : {}\\{}\n'.format(folder_path, xlsx_file_name))
news_crawling()
this is my code. I use it on Korean website and it works well. But after I modified it for google search, it wouldn't work.
I want to search something on google and then get the news titles into a xlsx file.
I before used it in Korean website, so I changed the part below
table = browser.find_element("xpath",'.//div[#data-hveid="CBAQAA"]')
li_list = table.find_elements("xpath",'.//li[contains(#class="vJOb1e aIfcHf Hw13jc"]')
area_list = [li.find_element("xpath",'.//div[#class="mCBkyc y355M ynAwRc MBeuO nDgy9d"]') for li in li_list]
and when I run the code, it only gives me an empty xlsx file.
can anyone help with this please? I would be so appreciate.
Here is one possible solution:
from openpyxl import Workbook
from datetime import datetime
from datetime import timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
def get_url(query: str, min_date: str, max_date: str) -> str:
return f'https://www.google.com/search?q={query}&tbm=nws&source-news&tbs=cdr:1,cd_min:{min_date},cd_max:{max_date}'
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 5)
yesterday = (datetime.now() - timedelta(1)).strftime("%m.%d.%Y")
driver.get(get_url('spent nuclear fuel', yesterday, yesterday))
url_locator = (By.CSS_SELECTOR, '#rso a')
title_locator = (By.CSS_SELECTOR, 'a div[role="heading"]')
thumbnail_locator = (By.CSS_SELECTOR, '#rso a>div>div:first-child img')
workbook = Workbook()
worksheet = workbook.active
page = 1
while True:
print(f'Current page: {page}')
url_web_elements = wait.until(EC.visibility_of_all_elements_located(url_locator))
title_web_elements = wait.until(EC.presence_of_all_elements_located(title_locator))
thumbnail_web_elements = wait.until(EC.visibility_of_all_elements_located(thumbnail_locator))
titles = [title.text.replace(',', '.') for title in title_web_elements]
urls = [link.get_attribute('href') for link in url_web_elements]
thumbnails = [thumbnail.get_attribute('src') for thumbnail in thumbnail_web_elements]
for data in zip(titles, urls, thumbnails):
news = {
'title' : data[0],
'url' : data[1],
'thumbnail': data[2]
}
worksheet.append(list(news.values()))
try:
page += 1
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#pnnext'))).click()
except TimeoutException:
break
workbook.save(f'google_news_{yesterday}.xlsx')
driver.quit()
Output is xlsx file google_news_11.10.2022.xlsx
In the get_url function, you can pass a range of dates for which the news will be displayed. For example get_url('spent nuclear fuel', 01.11.2022, 11.11.2022)
You can also save data to csv using this solution:
import csv
from datetime import datetime
from datetime import timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
def get_url(query: str, min_date: str, max_date: str) -> str:
return f'https://www.google.com/search?q={query}&tbm=nws&source-news&tbs=cdr:1,cd_min:{min_date},cd_max:{max_date}'
def save_to_csv(data: list) -> None:
with open(file='google_news.csv', mode='a', encoding="utf-8") as f:
writer = csv.writer(f, lineterminator='\n')
writer.writerow([*data])
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 5)
yesterday = (datetime.now() - timedelta(1)).strftime("%m.%d.%Y")
driver.get(get_url('spent nuclear fuel', yesterday, yesterday))
url_locator = (By.CSS_SELECTOR, '#rso a')
title_locator = (By.CSS_SELECTOR, 'a div[role="heading"]')
thumbnail_locator = (By.CSS_SELECTOR, '#rso a>div>div:first-child img')
page = 1
while True:
print(f'Current page: {page}')
url_web_elements = wait.until(EC.visibility_of_all_elements_located(url_locator))
title_web_elements = wait.until(EC.presence_of_all_elements_located(title_locator))
thumbnail_web_elements = wait.until(EC.visibility_of_all_elements_located(thumbnail_locator))
titles = [title.text.replace(',', '.') for title in title_web_elements]
urls = [link.get_attribute('href') for link in url_web_elements]
thumbnails = [thumbnail.get_attribute('src') for thumbnail in thumbnail_web_elements]
for data in zip(titles, urls, thumbnails):
news = {
'title' : data[0],
'url' : data[1],
'thumbnail': data[2]
}
save_to_csv(news.values())
try:
page += 1
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#pnnext'))).click()
except TimeoutException:
break
driver.quit()
Output is csv file google_news.csv:
COP27: nuclear boss doesn't expect surge in waste recycling,https://news.yahoo.com/cop27-nuclear-boss-doesnt-expect-072631885.html,""
UN Nuclear Chief Says Recycling Nuclear Waste 'Difficult ...,https://www.theepochtimes.com/un-nuclear-chief-says-recycling-nuclear-waste-difficult-after-biden-looks-to-fund-reprocessing-projects_4855151.html,""
COP27: UN nuclear chief says radioactive waste recycling is 'difficult' technology,https://www.deccanherald.com/international/world-news-politics/cop27-un-nuclear-chief-says-radioactive-waste-recycling-is-difficult-technology-1161036.html,""
Tested on Python 3.9.10. Used Selenium 4.5.0, openpyxl 3.0.10
I have to go to here
Here I have to choose applicant name = “ltd”
But here before submitting the page, I have to solve a captcha. How to fetch the next page's information(application number, application title, date, application status etc.....) in an excel format using web scrapping?
---------------- Running the following script, getting error -----
import csv
import json
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
def save_to_csv(data: list) -> None:
with open(file='ipindiaservices.csv', mode='a', encoding="utf-8") as f:
writer = csv.writer(f, lineterminator='\n')
writer.writerow([*data])
def start_from_page(page_number: int, driver: WebDriver) -> None:
driver.execute_script(
f"""
document.querySelector('button.next').value = {page_number};
document.querySelector('button.next').click();
"""
)
def titles_validation(driver: WebDriver) -> None:
"""replace empty title name with '_'"""
driver.execute_script(
"""
let titles = document.querySelectorAll('input+.tab-pane tr:not(:first-child)>td:last-child')
Array.from(titles).forEach((e) => {
if (!e.textContent.trim()) {
e.textContent = '_';
}
});
"""
)
def get_network_data(log: dict, driver: WebDriver) -> dict:
log = json.loads(log["message"])["message"]
if all([
"Network.responseReceived" in log["method"],
"params" in log.keys(),
'CaptchaAudio' in str(log["params"].values())
]):
return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': log["params"]["requestId"]})
def get_captcha_text(driver: WebDriver, timeout: float) -> str:
"""Return captcha text
Arguments:
- driver: WebDriver
- timeout: pause before receiving data from the web driver log
"""
driver.execute_script(
"""
// document.querySelector('img[title="Captcha"]').click()
document.querySelector('img[title="Captcha Audio"]').click()
"""
)
sleep(timeout)
logs = driver.get_log('performance')
responses = [get_network_data(log, driver) for log in logs if get_network_data(log, driver)]
if responses:
return json.loads(responses[0]['body'])['CaptchaImageText']
else:
get_captcha_text(driver)
def submit_captcha(captcha_text: str, btn_name: str) -> None:
"""Submit captcha
Arguments:
- btn_name: captcha send button name["submit" or "search"]
"""
if btn_name == 'search':
captcha_locator = (By.CSS_SELECTOR, 'input[name="submit"]')
elif btn_name == 'submit':
captcha_locator = (By.ID, 'btnSubmit')
wait.until(EC.visibility_of_element_located((By.ID, 'CaptchaText'))).send_keys(captcha_text)
wait.until(EC.visibility_of_element_located(captcha_locator)).click()
# options = webdriver.ChromeOptions()
# options.add_argument('--headless')
# options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
# capabilities = DesiredCapabilities.CHROME
# capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
# service = Service(executable_path="path/to/your/chromedriver.exe")
# # driver = webdriver.Chrome(service=service, options=options, desired_capabilities=capabilities)
wait = WebDriverWait(driver, 15)
table_values_locator = (By.CSS_SELECTOR, 'input+.tab-pane tr:not(:first-child)>td:last-child')
applicant_name_locator = (By.ID, 'TextField6')
page_number_locator = (By.CSS_SELECTOR, 'span.Selected')
app_satus_locator = (By.CSS_SELECTOR, 'button.btn')
next_btn_locator = (By.CSS_SELECTOR, 'button.next')
driver.get('https://ipindiaservices.gov.in/PublicSearch/')
# sometimes an alert with an error message("") may appear, so a small pause is used
sleep(1)
wait.until(EC.visibility_of_element_located(applicant_name_locator)).send_keys('ltd')
# on the start page and the page with the table, the names of the buttons are different
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "search")
# the page where the search starts
start_from_page(1, driver)
while True:
start = time()
# get current page number
current_page = wait.until(EC.visibility_of_element_located(page_number_locator)).text
print(f"Current page: {current_page}")
# get all application status WebElements
app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
for element in range(len(app_status_elements)):
print(f"App number: {element}")
# update application status WebElements
app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
# click on application status
wait.until(EC.visibility_of(app_status_elements[element])).click()
# wait 2 seconds for the captcha to change
sleep(2)
# get text and submit captcha
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "submit")
try:
# get all table data values(without titles) WebElements
table_data_values = wait.until(EC.visibility_of_all_elements_located(table_values_locator))
# if there are empty rows in the table replace them with "_"
titles_validation(driver)
# save data to csv
save_to_csv([val.text.replace('\n', ' ') for val in table_data_values])
except TimeoutException:
print("Application Number does not exist")
finally:
driver.back()
# print the current page number to the console
print(f"Time per page: {round(time()-start, 3)}")
# if the current page is equal to the specified one, then stop the search and close the driver
if current_page == '3776':
break
# click next page
wait.until(EC.visibility_of_element_located(next_btn_locator)).click()
driver.quit()
On this site, captcha can be solved without resorting to third-party services. When you click on the "Captcha Audio" button, a GET request is sent to the endpoint https://ipindiaservices.gov.in/PublicSearch/Captcha/CaptchaAudio The response is a dictionary {"CaptchaImageText":"hnnxd"} which you can access get from Selenium via the "Chrome Devtools Protocol" using the Network.getResponseBody method, or you can use the requests library.
To save data in csv, you can use, for example, the csv module included in the standard library.
Here is one possible solution:
import re
import csv
import json
from time import sleep
from selenium import webdriver
from typing import Generator, List, Tuple
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
def get_captcha_text(log: dict, driver: WebDriver) -> dict:
log = json.loads(log["message"])["message"]
if all([
"Network.responseReceived" in log["method"],
"params" in log.keys(),
'CaptchaAudio' in str(log["params"].values())
]):
return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': log["params"]["requestId"]})
def save_to_csv(table_data: Generator[None, None, List[Tuple[str, str, str, str]]]) -> None:
for app_num, title, app_date, status in zip(*list(zip(*table_data))):
data = {
'Application Number': app_num,
'Title': title,
'Application Date': app_date,
'Status': status
}
with open(file='ipindiaservices.csv', mode='a', encoding="utf-8") as f:
writer = csv.writer(f, lineterminator='\n')
writer.writerow([data['Application Number'], data['Title'], data['Application Date'], data['Status']])
def start_from_page(page_number: int, driver: WebDriver) -> None:
driver.execute_script(
f"""
document.querySelector('button.next').value = {page_number};
document.querySelector('button.next').click();
"""
)
def titles_validation(driver: WebDriver) -> None:
"""replace empty title name with '_'"""
driver.execute_script(
"""
let titles = document.querySelectorAll('#tableData>tbody>tr>td.title')
Array.from(titles).forEach((e) => {
if (!e.textContent.trim()) {
e.textContent = '_';
}
});
"""
)
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options, desired_capabilities=capabilities)
wait = WebDriverWait(driver, 10)
# regular expression to search for data in a table
pattern = r'^([0-9A-Z\/\-,]+) (.+)? ([0-9\/]+) (\w+)'
driver.get('https://ipindiaservices.gov.in/PublicSearch/')
# sometimes an alert with an error message("") may appear, so a small pause is used
sleep(1)
driver.find_element(By.CSS_SELECTOR, 'img[title="Captcha Audio"]').click()
driver.find_element(By.ID, 'TextField6').send_keys('ltd')
# short pause is needed here to write the log, otherwise we will get an empty list
sleep(1)
logs = driver.get_log('performance')
# get request data that is generated when click on the button listen to the text of the captcha
responses = [get_captcha_text(log, driver) for log in logs if get_captcha_text(log, driver)]
# get captcha text
captcha_text = json.loads(responses[0]['body'])['CaptchaImageText']
# enter the captcha text and click "Search" button
driver.find_element(By.ID, 'CaptchaText').send_keys(captcha_text)
driver.find_element(By.CSS_SELECTOR, 'input[name="submit"]').click()
# the page where the search starts
start_from_page(1, driver)
while True:
# get current page number
current_page = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.Selected'))).text
# print the current page number to the console
print(f"Current page: {current_page}")
# get all fields of the table
table_web_elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#tableData>tbody>tr')))
# check title name
titles_validation(driver)
# get all table data on current page
table_data = (re.findall(pattern, data.text)[0] for data in table_web_elements)
# save table data to csv
save_to_csv(table_data)
# if the current page is equal to the specified one, then stop the search and close the driver
if current_page == '3768':
break
# click next page
driver.find_element(By.CSS_SELECTOR, 'button.next').click()
driver.quit()
The performance of this solution is about 280sec per 100 pages.
It will take about 2.5-3 hours to collect all the data.
Therefore, the ability to stop data collection on a specific page has been added (by default, this is the last page):
if current_page == '3768':
break
And start collection data from the specified page (by default, this is the first page):
start_from_page(1, driver)
Output is ipindiaservices.csv
202247057786,NON-AQUEOUS ELECTROLYTE SECONDARY BATTERY,10/10/2022,Published
202247057932,"COMMUNICATION METHOD, APPARATUS AND SYSTEM",10/10/2022,Published
202247057855,POLYOLEFIN RESIN FILM,10/10/2022,Published
202247057853,CEMENT COMPOSITION AND CURED PRODUCT THEREOF,10/10/2022,Published
...
To use this solution in Google Collab, follow these steps:
Install Selenium and ChromeDriver
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
Make the necessary imports
import re
import csv
import json
from time import sleep
from typing import Generator, List, Tuple
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
Set options
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
driver = webdriver.Chrome('chromedriver', chrome_options=options, desired_capabilities=capabilities)
Everything else remains unchanged.
And don't put all the code in one cell.
Update:
This is one possible solution to collect all the information from the table of each application:
import csv
import json
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
def save_to_csv(data: list) -> None:
with open(file='ipindiaservices.csv', mode='a', encoding="utf-8") as f:
writer = csv.writer(f, lineterminator='\n')
writer.writerow([*data])
def start_from_page(page_number: int, driver: WebDriver) -> None:
driver.execute_script(
f"""
document.querySelector('button.next').value = {page_number};
document.querySelector('button.next').click();
"""
)
def titles_validation(driver: WebDriver) -> None:
"""replace empty title name with '_'"""
driver.execute_script(
"""
let titles = document.querySelectorAll('input+.tab-pane tr:not(:first-child)>td:last-child')
Array.from(titles).forEach((e) => {
if (!e.textContent.trim()) {
e.textContent = '_';
}
});
"""
)
def get_network_data(log: dict, driver: WebDriver) -> dict:
log = json.loads(log["message"])["message"]
if all([
"Network.responseReceived" in log["method"],
"params" in log.keys(),
'CaptchaAudio' in str(log["params"].values())
]):
return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': log["params"]["requestId"]})
def get_captcha_text(driver: WebDriver, timeout: float) -> str:
"""Return captcha text
Arguments:
- driver: WebDriver
- timeout: pause before receiving data from the web driver log
"""
driver.execute_script(
"""
// document.querySelector('img[title="Captcha"]').click()
document.querySelector('img[title="Captcha Audio"]').click()
"""
)
sleep(timeout)
logs = driver.get_log('performance')
if responses := [get_network_data(log, driver) for log in logs if get_network_data(log, driver)]:
return json.loads(responses[0]['body'])['CaptchaImageText']
else:
get_captcha_text(driver, timeout)
def submit_captcha(captcha_text: str, btn_name: str) -> None:
"""Submit captcha
Arguments:
- btn_name: captcha send button name["submit" or "search"]
"""
if btn_name == 'search':
captcha_locator = (By.CSS_SELECTOR, 'input[name="submit"]')
elif btn_name == 'submit':
captcha_locator = (By.ID, 'btnSubmit')
wait.until(EC.visibility_of_element_located((By.ID, 'CaptchaText'))).send_keys(captcha_text)
wait.until(EC.visibility_of_element_located(captcha_locator)).click()
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options, desired_capabilities=capabilities)
wait = WebDriverWait(driver, 15)
table_values_locator = (By.CSS_SELECTOR, 'input+.tab-pane tr:not(:first-child)>td:last-child')
applicant_name_locator = (By.ID, 'TextField6')
page_number_locator = (By.CSS_SELECTOR, 'span.Selected')
app_satus_locator = (By.CSS_SELECTOR, 'button.btn')
next_btn_locator = (By.CSS_SELECTOR, 'button.next')
driver.get('https://ipindiaservices.gov.in/PublicSearch/')
# sometimes an alert with an error message("") may appear, so a small pause is used
sleep(1)
wait.until(EC.visibility_of_element_located(applicant_name_locator)).send_keys('ltd')
# on the start page and the page with the table, the names of the buttons are different
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "search")
# the page where the search starts
start_from_page(1, driver)
while True:
start = time()
# get current page number
current_page = wait.until(EC.visibility_of_element_located(page_number_locator)).text
print(f"Current page: {current_page}")
# get all application status WebElements
app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
for element in range(len(app_status_elements)):
print(f"App number: {element}")
# update application status WebElements
app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
# click on application status
wait.until(EC.visibility_of(app_status_elements[element])).click()
# wait 2 seconds for the captcha to change
sleep(2)
# get text and submit captcha
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "submit")
try:
# get all table data values(without titles) WebElements
table_data_values = wait.until(EC.visibility_of_all_elements_located(table_values_locator))
# if there are empty rows in the table replace them with "_"
titles_validation(driver)
# save data to csv
save_to_csv([val.text.replace('\n', ' ') for val in table_data_values])
except TimeoutException:
print("Application Number does not exist")
finally:
driver.back()
# print the current page number to the console
print(f"Time per page: {round(time()-start, 3)}")
# if the current page is equal to the specified one, then stop the search and close the driver
if current_page == '3776':
break
# click next page
wait.until(EC.visibility_of_element_located(next_btn_locator)).click()
driver.quit()
Output is ipindiaservices.csv
202247059447,PCT NATIONAL PHASE APPLICATION,18/10/2022,"PANASONIC INTELLECTUAL PROPERTY MANAGEMENT CO., LTD.",SOLID ELECTROLYTE MATERIAL AND BATTERY USING SAME,ELECTRICAL,patent#depenning.com,_,_,PCT/JP2021/004427,05/02/2021,31/03/2020,--,21/10/2022
202247059470,PCT NATIONAL PHASE APPLICATION,18/10/2022,"SHENZHEN SKYWORTH-RGB ELECTRONIC CO., LTD.","ATTACHMENT FORCE ADJUSTMENT METHOD AND APPARATUS, DEVICE, AND STORAGE MEDIUM",COMPUTER SCIENCE,ipo#knspartners.com,_,_,PCT/CN2020/125555,30/10/2020,02/04/2020,18/10/2022,21/10/2022
202247058733,PCT NATIONAL PHASE APPLICATION,14/10/2022,"SUMITOMO ELECTRIC OPTIFRONTIER CO., LTD.","FUSION SPLICING SYSTEM, FUSION SPLICING DEVICE, AND DETERIORATION DETERMINATION METHOD",COMPUTER SCIENCE,patent#depenning.com,_,_,PCT/JP2021/017016,28/04/2021,30/04/2020,--,21/10/2022
The performance of this solution is about 230sec per page.
Sometimes there may be no data on the aplication status page (for example, by the number "00054/CAL/1998" we get "Application Number does not exist") Therefore, the script simply ignores this aplication.
2 sec timeout before receiving the captcha text is due to the fact that after clicking on "Application Status" one captcha is shown and after ~1sec it changes to another one which we must enter
Fix:
Since the captcha that appeared after clicking on Application status was removed from the site, its solution from the script was also removed. Also, the developers of this resource made the same classes for Application Number and Application status, so we needed to change the css selector for Application status.
Solution for Google Colab:
import csv
import json
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
def save_to_csv(data: list) -> None:
with open(file='ipindiaservices.csv', mode='a', encoding="utf-8") as f:
writer = csv.writer(f, lineterminator='\n')
writer.writerow([*data])
def start_from_page(page_number: int, driver: WebDriver) -> None:
driver.execute_script(
f"""
document.querySelector('button.next').value = {page_number};
document.querySelector('button.next').click();
"""
)
def titles_validation(driver: WebDriver) -> None:
"""replace empty title name with '_'"""
driver.execute_script(
"""
let titles = document.querySelectorAll('input+.tab-pane tr:not(:first-child)>td:last-child')
Array.from(titles).forEach((e) => {
if (!e.textContent.trim()) {
e.textContent = '_';
}
});
"""
)
def get_network_data(log: dict, driver: WebDriver) -> dict:
log = json.loads(log["message"])["message"]
if all([
"Network.responseReceived" in log["method"],
"params" in log.keys(),
'CaptchaAudio' in str(log["params"].values())
]):
return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': log["params"]["requestId"]})
def get_captcha_text(driver: WebDriver, timeout: float) -> str:
"""Return captcha text
Arguments:
- driver: WebDriver
- timeout: pause before receiving data from the web driver log
"""
driver.execute_script(
"""
// document.querySelector('img[title="Captcha"]').click()
document.querySelector('img[title="Captcha Audio"]').click()
"""
)
sleep(timeout)
logs = driver.get_log('performance')
responses = [get_network_data(log, driver) for log in logs if get_network_data(log, driver)]
if responses:
return json.loads(responses[0]['body'])['CaptchaImageText']
else:
get_captcha_text(driver, timeout)
def submit_captcha(captcha_text: str, btn_name: str) -> None:
"""Submit captcha
Arguments:
- btn_name: captcha send button name["submit" or "search"]
"""
if btn_name == 'search':
captcha_locator = (By.CSS_SELECTOR, 'input[name="submit"]')
elif btn_name == 'submit':
captcha_locator = (By.ID, 'btnSubmit')
wait.until(EC.visibility_of_element_located((By.ID, 'CaptchaText'))).send_keys(captcha_text)
wait.until(EC.visibility_of_element_located(captcha_locator)).click()
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
driver = webdriver.Chrome(executable_path='chromedriver', options=options, desired_capabilities=capabilities)
wait = WebDriverWait(driver, 10)
table_values_locator = (By.CSS_SELECTOR, 'input+.tab-pane tr:not(:first-child)>td:last-child')
applicant_name_locator = (By.ID, 'TextField6')
page_number_locator = (By.CSS_SELECTOR, 'span.Selected')
app_satus_locator = (By.CSS_SELECTOR, 'button[name="ApplicationSatus"]')
next_btn_locator = (By.CSS_SELECTOR, 'button.next')
driver.get('https://ipindiaservices.gov.in/PublicSearch/')
# sometimes an alert with an error message("") may appear, so a small pause is used
sleep(1)
wait.until(EC.visibility_of_element_located(applicant_name_locator)).send_keys('ltd')
# on the start page and the page with the table, the names of the buttons are different
captcha_text = get_captcha_text(driver, 1)
submit_captcha(captcha_text, "search")
# the page where the search starts
start_from_page(1, driver)
while True:
start = time()
# get current page number
current_page = wait.until(EC.visibility_of_element_located(page_number_locator)).text
print(f"Current page: {current_page}")
# get all application status WebElements
app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
for element in range(len(app_status_elements)):
print(f"App number: {element}")
# update application status WebElements
app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
# click on application status
wait.until(EC.visibility_of(app_status_elements[element])).click()
try:
# switch to new tab
driver.switch_to.window(driver.window_handles[-1])
# get all table data values(without titles) WebElements
table_data_values = wait.until(EC.visibility_of_all_elements_located(table_values_locator))
# if there are empty rows in the table replace them with "_"
titles_validation(driver)
# save data to csv
save_to_csv([val.text.replace('\n', ' ') for val in table_data_values])
except TimeoutException:
print("Application Number does not exist")
finally:
# close new tab
driver.close()
# switch to main tab
driver.switch_to.window(driver.window_handles[0])
# print the current page number to the console
print(f"Time per page: {round(time()-start, 3)}")
# if the current page is equal to the specified one, then stop the search and close the driver
if current_page == '100':
break
# click next page
wait.until(EC.visibility_of_element_located(next_btn_locator)).click()
driver.quit()
These fixes have been tested on Google Colab(python 3.7.15).
The performance of this solution is about 55sec per page.
Previous versions of script are left for visual comparison.
You can use any capctha solving sites. On these sites, users usually fix it themselves, so they can be slow, but it does the job.
Sample website (I didn't receive any ads)
You can use selenium to pull the information. It will be enough to take and place the elements with the "id" tag on the site.
Library for reading/writing excel in python
My project consists of making a competitive watch table for hotel rates for an agency. It is a painful action that I wanted to automate, the code extract correctly the name of hotels and the prices I want to extract but it's working correctly only for the first hotel and I don't know where is the problem. I provide you with the code and the output, if any of you can help me and thank you in advance.
NB : the code 2 works correctly but when i've added more operations the problem appeared
code 1
#!/usr/bin/env python
# coding: utf-8
import time
from time import sleep
import ast
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
driver = webdriver.Chrome("C:\\Users\\marketing2\\Documents\\chromedriver.exe")
driver.get('https://tn.tunisiebooking.com/')
# params to select
params = {
'destination': 'Tozeur',
'date_from': '11/09/2021',
'date_to': '12/09/2021',
'bedroom': '1'
}
# select destination
destination_select = Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, 'ville_des'))))
destination_select.select_by_value(params['destination'])
# select bedroom
bedroom_select = Select(WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'select_ch'))))
bedroom_select.select_by_value(params['bedroom'])
# select dates
script = f"document.getElementById('checkin').value ='{params['date_from']}';"
script += f"document.getElementById('checkout').value ='{params['date_to']}';"
script += f"document.getElementById('depart').value ='{params['date_from']}';"
script += f"document.getElementById('arrivee').value ='{params['date_to']}';"
driver.execute_script(script)
# submit form
btn_rechercher = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="boutonr"]')))
btn_rechercher.click()
urls = []
hotels = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//div[starts-with(#id,'produit_affair')]")))
for hotel in hotels:
link = hotel.find_element_by_xpath(".//span[#class='tittre_hotel']/a").get_attribute("href")
urls.append(link)
for url in urls:
driver.get(url)
def existsElement(xpath):
try:
driver.find_element_by_id(xpath);
except NoSuchElementException:
return "false"
else:
return "true"
if (existsElement('result_par_arrangement')=="false"):
btn_t = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="moteur_rech"]/form/div/div[3]/div')))
btn_t.click()
sleep(10)
else :
pass
try:
name = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//div[#class='bloc_titre_hotels']/h2"))).text
arropt = driver.find_element_by_xpath("//div[contains(#class,'line_result')][1]")
opt = arropt.find_element_by_tag_name("b").text
num = len(arropt.find_elements_by_tag_name("option"))
optiondata = {}
achats = {}
marges= {}
selection = Select(driver.find_element_by_id("arrangement"))
for i in range(num):
try:
selection = Select(driver.find_element_by_id("arrangement"))
selection.select_by_index(i)
time.sleep(2)
arr = driver.find_element_by_xpath("//select[#id='arrangement']/option[#selected='selected']").text
prize = driver.find_element_by_id("prix_total").text
optiondata[arr] = (int(prize))
btn_passe = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="resultat"]/div/form/div/div[2]/div[1]/div[2]/div[2]/div')))
btn_passe.click()
# params to select
params = {
'civilite_acheteur': 'Mlle',
'prenom_acheteur': 'test',
'nom_acheteur': 'test',
'e_mail_acheteur': 'test#gmail.com',
'portable_acheteur': '22222222',
'ville_acheteur': 'Test',
}
# select civilite
civilite_acheteur = Select(WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.NAME, 'civilite_acheteur'))))
civilite_acheteur.select_by_value(params['civilite_acheteur'])
# saisir prenom
script = f"document.getElementsByName('prenom_acheteur')[0].value ='{params['prenom_acheteur']}';"
script += f"document.getElementsByName('nom_acheteur')[0].value ='{params['nom_acheteur']}';"
script += f"document.getElementsByName('e_mail_acheteur')[0].value ='{params['e_mail_acheteur']}';"
script += f"document.getElementsByName('portable_acheteur')[0].value ='{params['portable_acheteur']}';"
script += f"document.getElementsByName('ville_acheteur')[0].value ='{params['ville_acheteur']}';"
driver.execute_script(script)
# submit form
btn_agence = driver.find_element_by_id('titre_Nabeul')
btn_agence.click()
btn_continuez = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'boutonr')))
btn_continuez.click()
achat = int(driver.find_element_by_xpath('/html/body/header/div[2]/div[1]/div[1]/div[4]/div[2]/div[2]').text.replace(' TND', ''))
achats[arr]=achat
marge =int(((float(prize) - float(achat)) / float(achat)) * 100);
marges[arr]=marge
optiondata[arr]=prize,achat,marge
driver.get(url)
btn_display = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="moteur_rech"]/form/div/div[3]/div')))
btn_display.click()
sleep(10)
except StaleElementReferenceException:
pass
except NoSuchElementException:
pass
s="- {} | {} : {}".format(name, opt, optiondata)
print(s)
ds = []
for l in s.splitlines():
d = l.split('-')
if len(d) > 1:
df = pd.DataFrame(ast.literal_eval(d[1].strip()))
ds.append(df)
for df in ds:
df.reset_index(drop=True, inplace=True)
df = pd.concat(ds, axis= 1)
cols = df.columns
cols = [((col.split('.')[0], col)) for col in df.columns]
df.columns=pd.MultiIndex.from_tuples(cols)
print(df.T)
#print("{} : {} - {}".format(name, opt, optiondata))
code 2
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import StaleElementReferenceException,NoSuchElementException
urls = []
hotels = driver.find_elements_by_xpath("//div[starts-with(#id,'produit_affair')]")
for hotel in hotels:
link = hotel.find_element_by_xpath(".//span[#class='tittre_hotel']/a").get_attribute("href")
urls.append(link)
for url in urls:
driver.get(url)
try:
name = driver.find_element_by_xpath("//div[#class='bloc_titre_hotels']/h2").text
arropt = driver.find_element_by_xpath("//div[contains(#class,'line_result')][1]")
opt = arropt.find_element_by_tag_name("b").text
num = len(arropt.find_elements_by_tag_name("option"))
optiondata = {}
selection = Select(driver.find_element_by_id("arrangement"))
for i in range(num):
try:
selection = Select(driver.find_element_by_id("arrangement"))
selection.select_by_index(i)
time.sleep(2)
arr = driver.find_element_by_xpath("//select[#id='arrangement']/option[#selected='selected']").text
prize = driver.find_element_by_id("prix_total").text
optiondata[arr]=prize
except StaleElementReferenceException:
pass
except NoSuchElementException:
pass
print("{} : {} - {} - {}".format(name,opt,num,optiondata))
Your code is outdated. The HTML has been changed/updated and elements such as the one with identity boutonr doesn't exist on the page anymore.
Your loop and order of execution is wrong so this makes the code evaluating still the same fields.
You should not use or at least minimise the usage of time.sleep() to an absolute minimum as it is a waste of time for your code execution. Use WebDriverWait(...) instead
I don't speak French so I could not understand what you are after in your code, but this minimised example below should help you to understand the principle.
#!/usr/bin/env python
# coding: utf-8
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
driver = webdriver.Chrome("C:\chromedriver.exe")
driver.get('https://tn.tunisiebooking.com/')
# params to select
params = { 'destination': 'Nabeul',
'date_from': '25/08/2021',
'date_to': '26/08/2021',
'bedroom': '1' }
# select destination
destination_select = Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, 'ville_des'))))
destination_select.select_by_value(params['destination'])
# select bedroom
bedroom_select = Select(WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'select_ch'))))
bedroom_select.select_by_value(params['bedroom'])
# select dates
script = f"document.getElementById('checkin').value ='{params['date_from']}';"
script += f"document.getElementById('checkout').value ='{params['date_to']}';"
script += f"document.getElementById('depart').value ='{params['date_from']}';"
script += f"document.getElementById('arrivee').value ='{params['date_to']}';"
driver.execute_script(script)
# submit form
btn_rechercher = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//div[#onclick="return submit_hotel_recherche()"]')))
btn_rechercher.click()
urls = []
hotels = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//div[starts-with(#id,'produit_affair')]")))
for hotel in hotels:
link = hotel.find_element_by_xpath(".//span[#class='tittre_hotel']/a").get_attribute("href")
urls.append(link)
for url in urls:
driver.get(url)
try:
name = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//div[#class='bloc_titre_hotels']/h2"))).text
arropt = driver.find_element_by_xpath("//div[contains(#class,'line_result')][1]")
opt = arropt.find_element_by_tag_name("b").text
num = len(arropt.find_elements_by_tag_name("option"))
optiondata = {}
achats = {}
marges= {}
for i in range(num):
try:
selection = Select(WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'arrangement')))).select_by_index(i)
time.sleep(0.5)
arr = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//select[#id='arrangement']/option[#selected='selected']"))).text
prize = driver.find_element_by_id("prix_total").text
optiondata[arr] = int(prize)
except StaleElementReferenceException:
pass
print("{} : {} - {}".format(name, opt, optiondata))
except NoSuchElementException:
pass
driver.quit()
Result:
Byzance Nabeul : Chambre Double - {'All Inclusive soft': 93, 'Demi Pension': 38, 'Petit Dejeuner': 28, 'Pension Complete': 78}
Palmyra Club Nabeul Nabeul : Double Standard - {'All Inclusive soft': 92}
The following code goes to the payment page and extracts all the info there:
#!/usr/bin/env python
# coding: utf-8
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
driver = webdriver.Chrome("/usr/local/bin/chromedriver")
driver.get('https://tn.tunisiebooking.com/')
# params to select
params = {
'destination': 'Nabeul',
'date_from': '29/08/2021',
'date_to': '30/08/2021',
'bedroom': '1'
}
# select destination
destination_select = Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, 'ville_des'))))
destination_select.select_by_value(params['destination'])
# select bedroom
bedroom_select = Select(WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'select_ch'))))
bedroom_select.select_by_value(params['bedroom'])
# select dates
script = f"document.getElementById('checkin').value ='{params['date_from']}';"
script += f"document.getElementById('checkout').value ='{params['date_to']}';"
script += f"document.getElementById('depart').value ='{params['date_from']}';"
script += f"document.getElementById('arrivee').value ='{params['date_to']}';"
driver.execute_script(script)
# submit form
btn_rechercher = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//div[#onclick="return submit_hotel_recherche()"]')))
btn_rechercher.click()
urls = []
hotels = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//div[starts-with(#id,'produit_affair')]")))
for hotel in hotels:
link = hotel.find_element_by_xpath(".//span[#class='tittre_hotel']/a").get_attribute("href")
urls.append(link)
for url in urls:
driver.get(url)
try:
name = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//div[#class='bloc_titre_hotels']/h2"))).text
arropt = driver.find_element_by_xpath("//div[contains(#class,'line_result')][1]")
opt = arropt.find_element_by_tag_name("b").text
num = len(arropt.find_elements_by_tag_name("option"))
optiondata = {}
achats = {}
marges= {}
try:
selection = Select(WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'arrangement'))))
time.sleep(0.5)
arr = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//select[#id='arrangement']/option[#selected='selected']"))).text
prize = driver.find_element_by_id("prix_total").text
optiondata[arr] = (int(prize))
btn_passe = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'resa')))
btn_passe.click()
tot = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'montant_total_apres_code')))
total = int(tot.text.replace(' €', ''))
# params to select
params = {
'civilite_acheteur': 'Mlle',
'prenom_acheteur': 'test',
'nom_acheteur': 'test',
'e_mail_acheteur': 'test#gmail.com',
'portable_acheteur': '22222222',
'ville_acheteur': 'Test',
}
# select civilite
civilite_acheteur = Select(WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.NAME, 'civilite_acheteur'))))
civilite_acheteur.select_by_value(params['civilite_acheteur'])
# saisir prenom
script = f"document.getElementsByName('prenom_acheteur')[0].value ='{params['prenom_acheteur']}';"
script += f"document.getElementsByName('nom_acheteur')[0].value ='{params['nom_acheteur']}';"
script += f"document.getElementsByName('e_mail_acheteur')[0].value ='{params['e_mail_acheteur']}';"
script += f"document.getElementsByName('portable_acheteur')[0].value ='{params['portable_acheteur']}';"
script += f"document.getElementsByName('ville_acheteur')[0].value ='{params['ville_acheteur']}';"
driver.execute_script(script)
# submit form
btn_agence = driver.find_element_by_class_name('continuez_resa')
btn_agence.click()
achat1 = int(driver.find_element_by_id('montant_a_payer').text.replace(' €', ''))
achat = int(driver.find_element_by_id('montant_restant').text.replace(' €', ''))
achat3 = float(driver.find_element_by_xpath('//div[#class="ligne_interne_total"]/div[3]/div[#class="prix_total1 text_shadow"]').text.replace(' TND', ''))
achats[arr]=achat
marge =int(((float(prize) - float(achat)) / float(achat)) * 100);
marges[arr]=marge
optiondata[arr]=prize,total,achat1,achat,achat3,marge
except StaleElementReferenceException:
pass
print("{} : {} - {}".format(name, opt, optiondata))
except NoSuchElementException:
pass
driver.quit()
Output:
Byzance Nabeul : Chambre Double - {'Petit Dejeuner': (36, 41, 12, 29, 4.0, 24)}
Where:
36 = Prix Total
41 = Montant Total
12 = Montant de l'acompte
29 = Vous payerez le reste à votre arrivée à l'hôtel
4.0 = Total taxe de séjour à payer sur place à l'hôtel est
24 = Marges
Hotel page:
You are using sleeps to load the pages in your first example but not in your second one (the one that you state works just fine).
This is typically not the way you want to actually use selenium and leads me to believe that your timing is off.
This SO answer shows you how to use "Explicit Waits" on "expected_conditions" to not have "specific timings" which can/will fail.
You even create a wait object but never use it.
Use it in conjunction with expected_conditions and remove the specific timed sleeps and things will get better.
expected_conditions docs are here
The problem was that it can't access to the element listing arrangements for the rest of the hotels in the list i've added a function that tests the presence of the data and it workod
for url in urls:
driver.get(url)
def existsElement(xpath):
try:
driver.find_element_by_id(xpath);
except NoSuchElementException:
return "false"
else:
return "true"
if (existsElement('result_par_arrangement')=="false"):
btn_t = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="moteur_rech"]/form/div/div[3]/div')))
btn_t.click()
else :
pass
Random timeout exception for the code below not sure whats the best approach to address these issues, and this timeout not happen all time, and also it does find elements sometime or all time
we appreciate your comments and suggestions and apparently explicit wait is not handling until the elements gets loaded into the browser application or elements are getting different interval in every single time when new page gets loaded
"""
"""
import platform , logging
import os,re
from time import sleep
import selenium.webdriver.support.ui as ui
import selenium.webdriver.support.expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
class Cloud(object):
"""
cloud class to get query and response
"""
def __init__(self, username='xxxxxx', password='xxxx'):
logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
self.logger = logging.getLogger(__name__)
self.url = "https://www.amazon.com"
self.username = username
self.password = password
self.timeout = 100
self.driver = None
self.get_chrome_driver()
def get_chrome_driver(self):
"""
get chrome driver
"""
if platform.system().lower() == 'windows':
if self.driver is None:
chrome_options = Options()
#chrome_options.add_argument('--disable-extensions')
chrome_options.add_argument('--start-maximized')
chrome_options.add_argument('--disable-popup-blocking')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--allow-insecure-localhost')
chrome_options.add_argument('--disable-infobars')
chrome_options.add_argument("--log-level=3")
chrome_driver_path = os.path.join(str(os.environ['PYTHONPATH'].split(';')[0]),"bin","chromedriver","chromedriver.exe")
self.driver = WebDriver(executable_path=chrome_driver_path, chrome_options=chrome_options)
return self.driver
def login(self, username='xxxxxxx', password='xxxxx'):
"""
Login into amazon cloud
"""
self.logger.info("logging in amazon cloud username: %s and password: %s" %(self.username, re.sub(r".", "*", self.password)))
self.driver.get(self.url)
# wait for login username textbox
self.wait_visibility_element(By.XPATH, "//div[#id='nav-signin-tooltip']//span[#class='nav-action-inner'][contains(text(),'Sign in')]")
self.driver.find_element_by_xpath(" //div[#id='nav-signin-tooltip']//span[#class='nav-action-inner'][contains(text(),'Sign in')]").click()
self.wait_visibility_element(By.XPATH,"//label[#class='a-form-label']")
self.wait_visibility_element(By.XPATH,"//input[#id='ap_email']")
username_textbox = self.driver.find_element_by_xpath("//input[#id='ap_email']")
username_textbox.clear()
username_textbox.send_keys(self.username)
self.driver.find_element_by_xpath("//input[#id='continue']").click()
self.wait_visibility_element(By.XPATH,"//input[#id='ap_password']") #//label[#class='a-form-label']
password_textbox = self.driver.find_element_by_xpath("//input[#id='ap_password']")
password_textbox.clear()
password_textbox.send_keys(self.password)
# click on submit button
self.driver.find_element_by_xpath("//input[#id='signInSubmit']").click()
def wait_visibility_element(self, by_type, element_name):
"""
wait for visibility of element
:param by_type: Locate element using type of element
:param element_name: element name
"""
ui.WebDriverWait(self.driver, self.timeout).until(
EC.visibility_of_element_located((by_type, element_name)))
def get_audio_text(self, multi_turn_count=1):
self.login()
#Arrow in the Top Menu
self.wait_visibility_element(By.XPATH, "//a[#id='nav-link-accountList']//span[#class='nav-icon nav-arrow']")
ui.WebDriverWait(self.driver, self.timeout).until(EC.element_to_be_clickable((By.XPATH, "//a[#id='nav-link-accountList']//span[#class='nav-icon nav-arrow']")))
self.driver.find_element_by_xpath("//a[#id='nav-link-accountList']//span[#class='nav-icon nav-arrow']").click()
#To avoid click event ambiguity
firstLevelMenu = self.driver.find_element_by_xpath("//span[contains(#class,'nav-line-2')][contains(text(),'Account & Lists')]")
action = ActionChains(self.driver)
action.move_to_element(firstLevelMenu).perform()
#sub menu select and click
self.wait_visibility_element(By.XPATH, "//span[contains(text(),'Your Content and Devices')]")
self.driver.find_element_by_xpath("//span[contains(text(),'Your Content and Devices')]").click()
#Alexa Privacy
self.wait_visibility_element(By.XPATH, "//div[#id='ng-app']//div[2]//div[1]//div[1]//div[1]//div[1]//div[1]//div[2]//div[6]//div[1]//div[1]")
self.driver.find_element_by_xpath("//div[#id='ng-app']//div[2]//div[1]//div[1]//div[1]//div[1]//div[1]//div[2]//div[6]//div[1]//div[1]").click()
self.wait_visibility_element(By.XPATH,'//div[#class="navAlexaOptionTitle_alexaNavHeader_myx ng-binding"][contains(text(),"Review Voice History")]')
ui.WebDriverWait(self.driver, self.timeout).until(
EC.element_to_be_clickable((By.XPATH, '//div[#class="navAlexaOptionTitle_alexaNavHeader_myx ng-binding"][contains(text(),"Review Voice History")]')))
ui.WebDriverWait(self.driver, self.timeout).until(EC.text_to_be_present_in_element((By.XPATH, '//span[#class="overviewHeadingString_myx ng-binding"]'), 'Alexa Privacy'))
self.driver.find_element_by_xpath('//div[#class="navAlexaOptionTitle_alexaNavHeader_myx ng-binding"][contains(text(),"Overview")]').click()
self.driver.find_element_by_xpath("//div[#class='navAlexaOptionTitle_alexaNavHeader_myx ng-binding'][contains(text(),'Review Voice History')]").click()
# Select the dropdown box
self.wait_visibility_element(By.XPATH,"//span[#id='timePickerDesktop']//span[#class='a-button-text a-declarative']")
ui.WebDriverWait(self.driver, self.timeout).until(EC.element_to_be_clickable((By.XPATH, "//span[#id='timePickerDesktop']//span[#class='a-button-text a-declarative']")))
self.driver.find_element_by_xpath("//span[#id='timePickerDesktop']//span[#class='a-button-text a-declarative']").click()
#All history selection
self.wait_visibility_element(By.XPATH,"//a[#id='timePickerDesktop_4']")
ui.WebDriverWait(self.driver, self.timeout).until(EC.element_to_be_clickable((By.XPATH, "//a[#id='timePickerDesktop_4']")))
self.driver.find_element_by_xpath("//a[#id='timePickerDesktop_4']").click()
# read first text format of the data
self.wait_visibility_element(By.XPATH,"//span[#id='mainInfo-0']//div[contains(#class,'summaryCss')]")
txt = self.driver.find_element_by_xpath("//span[#id='mainInfo-0']//div[contains(#class,'summaryCss')]").text
question_text = txt.encode("utf-8")[3:-3]
# Dropdown the rectangle menu
self.driver.find_element_by_xpath("//div[#id='arrowExpand-0']//i[#class='fa fa-angle-down caretAlignment']").click()
# read AVS Response
self.wait_visibility_element(By.XPATH,"//div[#id='activityItemsInner-0']//div[#class='ttsInfo']")
ui.WebDriverWait(self.driver, self.timeout).until(EC.element_to_be_clickable((By.XPATH, "//div[#id='activityItemsInner-0']//div[#class='ttsInfo']")))
txt = self.driver.find_element_by_xpath("//div[#id='activityItemsInner-0']//div[#class='ttsInfo']").text
answer_text = txt.encode("utf-8")[3:-3]
self.sign_out_direct()
return question_text, answer_text
def sign_out(self):
#Sigout menu nevigation
self.driver.find_element_by_xpath("//i[#class='hm-icon nav-sprite']").click()
self.wait_visibility_element(By.XPATH,"//div[contains(text(),'SHOP BY CATEGORY')]")
#sign out
sign_out_element = self.driver.find_element_by_xpath("//li[44]//a[1]")
self.driver.execute_script("arguments[0].scrollIntoView();", sign_out_element)
#self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
ui.WebDriverWait(self.driver, self.timeout).until(EC.element_to_be_clickable((By.XPATH, "//li[44]//a[1]")))
self.driver.find_element_by_xpath("//li[44]//a[1]").click()
self.sign_out_direct()
#Close current tab
self.driver.close()
def sign_out_direct(self):
#Arrow in the Top Menu
self.wait_visibility_element(By.XPATH, "//a[#id='nav-link-accountList']//span[#class='nav-icon nav-arrow']")
ui.WebDriverWait(self.driver, self.timeout).until(EC.element_to_be_clickable((By.XPATH, "//a[#id='nav-link-accountList']//span[#class='nav-icon nav-arrow']")))
self.driver.find_element_by_xpath("//a[#id='nav-link-accountList']//span[#class='nav-icon nav-arrow']").click()
#To avoid click event ambiguity
firstLevelMenu = self.driver.find_element_by_xpath("//span[contains(#class,'nav-line-2')][contains(text(),'Account & Lists')]")
action = ActionChains(self.driver)
action.move_to_element(firstLevelMenu).perform()
#sub menu select and click
self.wait_visibility_element(By.XPATH, " //span[contains(text(),'Sign Out')]")
self.driver.find_element_by_xpath("//span[contains(text(),'Sign Out')]").click()
#Close current tab
self.driver.close()
if __name__ == '__main__':
for loop in range(20):
PAGE = Cloud()
#PAGE.login()
OUTPUT = PAGE.get_audio_text()
print("\n\nQuestion:: %s"%str(list(OUTPUT)[0]).upper())
print("Answer:: %s"%str(list(OUTPUT)[1]).upper())
#PAGE.sign_out()
#PAGE.sign_out_direct()
sleep(2)
If you post the lines of code in particular that are throwing the timeout exceptions, this will help track down the issues easier.
I noticed most of your waits are for visibility_of_element_located. I would recommend trying to change some of those to element_to_be_clickable instead, because some elements will appear on the DOM before they are fully rendered.
Im having the error messages for a python program I got they are :
C:\Users\chanm\AppData\Local\Programs\Python\Python37-32\python.exe C:/Users/chanm/OneDrive/Desktop/bot/Commenter.py
Traceback (most recent call last):
File "C:/Users/chanm/OneDrive/Desktop/bot/Commenter.py", line 133, in <module>
com.login()
File "C:/Users/chanm/OneDrive/Desktop/bot/Commenter.py", line 28, in login
login_button = driver.find_element_by_xpath("//a[#href='/accounts/login/']")
File "C:\Users\chanm\AppData\Local\Programs\Python\Python37-32\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 394, in find_element_by_xpath
return self.find_element(by=By.XPATH, value=xpath)
File "C:\Users\chanm\AppData\Local\Programs\Python\Python37-32\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 978, in find_element
'value': value})['value']
File "C:\Users\chanm\AppData\Local\Programs\Python\Python37-32\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 321, in execute
self.error_handler.check_response(response)
File "C:\Users\chanm\AppData\Local\Programs\Python\Python37-32\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 242, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//a[#href='/accounts/login/']"}
(Session info: chrome=71.0.3578.80)
(Driver info: chromedriver=2.44.609538 (b655c5a60b0b544917107a59d4153d4bf78e1b90),platform=Windows NT 10.0.17134 x86_64)
Process finished with exit code 1
This is my code that I have
Commenter.py
import time
import random
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from chatterbot.trainers import ListTrainer
from chatterbot import ChatBot
class Commenter:
def __init__(self, username, password):
self.username = username
self.password = password
self.driver = webdriver.Chrome()
self.driver.set_window_size(700, 900)
"""closing browser"""
def closeBrowser(self):
self.driver.close()
"""login in to Instagram"""
def login(self) -> object:
driver = self.driver
driver.get("https://www.instagram.com/")
time.sleep(2)
login_button = driver.find_element_by_xpath("//a[#href='/accounts/login/']")
login_button.click()
time.sleep(2)
user_name_elem = driver.find_element_by_xpath("//input[#name='username']")
user_name_elem.clear()
user_name_elem.send_keys(self.username)
passworword_elem = driver.find_element_by_xpath("//input[#name='password']")
passworword_elem.clear()
passworword_elem.send_keys(self.password)
passworword_elem.send_keys(Keys.RETURN)
time.sleep(2)
"""getting pictures on a hashtag page"""
def get_pictures_on_page(self, hashtag, scrolls=int):
self.driver.get("https://www.instagram.com/explore/tags/" + hashtag + "/")
time.sleep(2)
# gathering photos
pic_hrefs = []
for i in range(1, scrolls):
try:
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
# get tags
hrefs_in_view = self.driver.find_elements_by_tag_name('a')
# finding relevant hrefs
hrefs_in_view = [elem.get_attribute('href') for elem in hrefs_in_view if
hashtag in elem.get_attribute('href')]
# building list of unique photos
[pic_hrefs.append(href) for href in hrefs_in_view if href not in pic_hrefs]
# print("Check: pic href length " + str(len(pic_hrefs)))
except Exception:
continue
return pic_hrefs
"""write comment in text area using lambda function"""
def write_comment(self, comment_text):
try:
comment_button = lambda: self.driver.find_element_by_link_text('Comment')
comment_button().click()
except NoSuchElementException:
pass
try:
comment_box_elem = lambda: self.driver.find_element_by_xpath("//textarea[#aria-label='Add a comment…']")
comment_box_elem().send_keys('')
comment_box_elem().clear()
for letter in comment_text:
comment_box_elem().send_keys(letter)
time.sleep((random.randint(1, 7) / 30))
return comment_box_elem
except StaleElementReferenceException and NoSuchElementException as e:
print(e)
return False
"""actually post a comment"""
def post_comment(self, comment_text):
time.sleep(random.randint(1,5))
comment_box_elem = self.write_comment(comment_text)
if comment_text in self.driver.page_source:
comment_box_elem().send_keys(Keys.ENTER)
try:
post_button = lambda: self.driver.find_element_by_xpath("//button[#type='Post']")
post_button().click()
print('clicked post button')
except NoSuchElementException:
pass
time.sleep(random.randint(4, 6))
self.driver.refresh()
if comment_text in self.driver.page_source:
return True
return False
"""grab comments from a picture page"""
def get_comments(self):
# load more comments if button exists
time.sleep(3)
try:
comments_block = self.driver.find_element_by_class_name('Xl2Pu')
comments_in_block = comments_block.find_elements_by_class_name('gElp9')
comments = [x.find_element_by_tag_name('span') for x in comments_in_block]
user_comment = re.sub(r'#.\w*', '', comments[0].text)
except NoSuchElementException:
return ''
return user_comment
"""have bot comment on picture"""
def comment_on_picture(self):
bot = ChatBot('YouTubeChatBot')
bot.set_trainer(ListTrainer)
picture_comment = self.get_comments()
# user's comment and bot's response
response = bot.get_response(picture_comment).__str__()
print("User's Comment", picture_comment)
print("Bot's Response", response)
return self.post_comment(response)
com: Commenter = Commenter(username='username', password='password')
com.login()
for pic in com.get_pictures_on_page(hashtag='gaming', scrolls=5)[1:]:
com.driver.get(pic)
time.sleep(3)
print('Posted Comment:', com.comment_on_picture())
time.sleep(3)
that is the script I have the most problems I have I have tried things along the lines of changing extensions and other little things it resolved most of them now im stuck with these ones
The username and password field within Instagram are JavaScript enabled element so you have to induce WebDriverWait for the desired element to be clickable and you can use the following solution:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get('https://www.instagram.com')
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a[href='/accounts/login/?source=auth_switcher']"))).click()
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='username'][aria-label='Phone number, username, or email']"))).send_keys("JRProgrammer")
driver.find_element_by_css_selector("input[name='password'][aria-label='Password']").send_keys("JRProgrammer")
driver.find_element_by_xpath("//button[text()='Log in']").click()
If you read the stack trace, it says that it is failing to find the login button, so you've entered a bad selector. I'm not really sure why you're calling that anyway, since you wouldn't want to click the login button before entering user info.
Try:
def login(self) -> object:
driver = self.driver
driver.get("https://www.instagram.com/")
time.sleep(2)
user_name_elem = driver.find_element_by_xpath("//input[#name='username']")
user_name_elem.clear()
user_name_elem.send_keys(self.username)
passworword_elem = driver.find_element_by_xpath("//input[#name='password']")
passworword_elem.clear()
passworword_elem.send_keys(self.password)
passworword_elem.send_keys(Keys.RETURN)
time.sleep(2)
login_button = driver.find_element_by_css_selector("button[type='submit']")
login_button.click()
time.sleep(2)