Here's how I'm getting page content
from selenium.webdriver.support.wait import WebDriverWait
import os
from seleniumwire import webdriver
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from seleniumwire.handler import log as seleniumLog
from seleniumwire.server import logger as selenium_server_log
from webdriver_manager.firefox import GeckoDriverManager
# Configure a headless Firefox session (driven through selenium-wire).
options = Options()
# Firefox has no --user-agent command-line switch; the user agent is
# overridden through the general.useragent.override preference instead.
# NOTE(review): user_agent is not defined in this snippet -- it is assumed
# to be set earlier by the surrounding script.
options.set_preference("general.useragent.override", user_agent)
options.add_argument("--start-maximized")
options.add_argument("--headless")
driver = webdriver.Firefox(
    executable_path=GeckoDriverManager().install(),
    options=options,
)
driver.set_page_load_timeout(30)
try:
    driver.get('https://xn--e1aicoccdeejjbbl0l.xn--p1ai/uslugi/stroitelstvo/price/')
    # Wait (up to 40 s) until the <html> element is present in the DOM.
    WebDriverWait(driver, 40).until(ec.presence_of_element_located((By.TAG_NAME, "html")))
except Exception as e:
    # Top-level boundary of the script: flag the failure and report it.
    error = True
    print(e)
This is the output I have:
Message: Reached error page: about:neterror?e=dnsNotFound&u=https%3A//xn--e1aicoccdeejjbbl0l.xn--p1ai/uslugi/stroitelstvo/price/&c=UTF-8&d=We%20can%E2%80%99t%20connect%20to%20the%20server%20at%20xn--e1aicoccdeejjbbl0l.xn--p1ai.
When I fetch content from an ordinary Latin-alphabet URL, everything works fine. The problem occurs only when I use Cyrillic or Punycode URLs.
What can I do about it?
Related
I am not getting the price; the script gives me an empty output. This is the page link: https://www.amazon.com/dp/B00M0DWQYI?th=1
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
# Open the product page and store its price in ``item`` ('' when not found).
url = 'https://www.amazon.com/dp/B00M0DWQYI?th=1'
# Raw string: otherwise "\P" and "\c" are parsed as (invalid) escape sequences.
PATH = r"C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH)
driver.get(url)
item = dict()
try:
    # XPath attribute tests are written with '@' (e.g. @id, @class).
    price_element = driver.find_element(
        By.XPATH,
        "//div[@id='corePrice_feature_div']//span[@class='a-offscreen']",
    )
    # WebElement.text returns only rendered (visible) text. The "a-offscreen"
    # span appears to be visually hidden, which matches the empty price the
    # script reported, so read the DOM text content instead.
    item['price'] = price_element.get_attribute('textContent')
except Exception:
    # Best effort: keep the empty-string fallback, but do not swallow
    # KeyboardInterrupt/SystemExit the way a bare ``except:`` would.
    item['price'] = ''
print(item)
You may want to wait for that element to properly load, prior to locating it:
[...]
# Poll for up to 10 s for the price node instead of looking it up immediately.
wait = WebDriverWait(driver, 10)
# XPath attribute tests are written with '@' (e.g. @id, @class).
item['price'] = wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@id='corePrice_feature_div']//span[@class='a-offscreen']"))).text
Selenium documentation can be found at https://www.selenium.dev/documentation/
EDIT: Here is a complete example of how you can get that information:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time as t
# Start Chrome with a fixed window size and notifications disabled.
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1920,1080")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)
wait = WebDriverWait(driver, 5)
items = dict()
driver.get('https://www.amazon.com/dp/B00M0DWQYI?th=1')
# NOTE(review): the answer does not explain the pause + reload; presumably the
# first load does not always render the price block -- confirm before removing.
t.sleep(1)
driver.refresh()
# XPath attribute tests are written with '@' (e.g. @id, @class).
# The visible price text is split across lines, so the newline is replaced
# with '.' to produce a single value such as '$32.98'.
items['price'] = wait.until(EC.element_to_be_clickable((By.XPATH, '//div[@id="corePrice_feature_div"]//span[@class="a-price aok-align-center"]'))).text.replace('\n', '.')
print(items)
Result in terminal:
{'price': '$32.98'}
You need to wait for the element to become visible and then extract its text.
The following Selenium code works:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# Start a maximized Chrome window and print the visible text of the price block.
options = Options()
options.add_argument("start-maximized")
# Raw string: otherwise "\w" and "\c" are parsed as (invalid) escape sequences.
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(service=webdriver_service, options=options)
url = 'https://www.amazon.com/dp/B00M0DWQYI'
driver.get(url)
wait = WebDriverWait(driver, 10)
# XPath attribute tests are written with '@' (e.g. @id).
print(wait.until(EC.visibility_of_element_located((By.XPATH, "//div[@id='corePrice_feature_div']"))).text)
The output is
$32
98
You can use bs4 and it will work fine
from bs4 import BeautifulSoup
# Parse the page Selenium has already loaded and read the price from the
# hidden <input id="attach-base-product-price"> element.
# NOTE(review): ``driver`` and ``item`` are assumed to come from the
# question's script; neither is defined in this snippet.
soup = BeautifulSoup(driver.page_source, 'lxml')
try:
    price_input = soup.find('input', id="attach-base-product-price")
    # find() returns None when the input is missing; fall back to '' in that
    # case rather than hiding every possible error behind a bare ``except:``.
    item['price'] = price_input.get('value') if price_input is not None else ''
finally:
    # quit() closes every window and ends the session, so a preceding
    # close() call is not needed.
    driver.quit()
print(item)
I need some help.
There is URL: https://www.inipec.gov.it/cerca-pec/-/pecs/companies.
I need to click the CAPTCHA checkbox.
My code looks like this:
import os, urllib.request, requests, datetime, time, random, ssl, json, codecs, csv, urllib
from urllib.request import Request, urlopen
from urllib.request import urlretrieve
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoAlertPresentException
from selenium.webdriver.chrome.options import Options
# Open the page, switch into the second iframe and click the reCAPTCHA box.
chromedriver = "chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
chrome_options = webdriver.ChromeOptions()
driver = webdriver.Chrome(executable_path=chromedriver, chrome_options=chrome_options)
driver.get("https://www.inipec.gov.it/cerca-pec/-/pecs/companies")
# switch_to.* replaces the deprecated switch_to_default_content/switch_to_frame.
driver.switch_to.default_content()
element = driver.find_elements(By.CSS_SELECTOR, 'iframe')[1]
driver.switch_to.frame(element)
# find_element (singular) returns one WebElement; the plural form returns a
# list, which has no click() and caused the reported AttributeError.
# XPath attribute tests are written with '@' (e.g. @id).
driver.find_element(By.XPATH, '//*[@id="recaptcha-anchor"]/div[1]').click()
During the execution, there is an error:
driver.find_elements_by_xpath('//*[@id="recaptcha-anchor"]/div[1]').click()
AttributeError: 'list' object has no attribute 'click'
Please help me fix it.
Solution update (11-Feb-2020)
Using the following set of binaries:
Selenium v3.141.0
ChromeDriver v80.0
Chrome Version 80.0
You can use the following updated block of code as a solution:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# Start Chrome maximized, without the "enable-automation" switch and without
# the automation extension.
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options, executable_path=r'C:\WebDrivers\chromedriver.exe')
driver.get("https://www.inipec.gov.it/cerca-pec/-/pecs/companies")
# The checkbox is inside the reCAPTCHA iframe: wait for it and switch into it.
WebDriverWait(driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR,"iframe[name^='a-'][src^='https://www.google.com/recaptcha/api2/anchor?']")))
# XPath attribute tests are written with '@' (e.g. @id).
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//span[@id='recaptcha-anchor']"))).click()
Original solution
Within the URL https://www.inipec.gov.it/cerca-pec/-/pecs/companies to invoke click() on the reCAPTCHA checkbox you need to:
Induce WebDriverWait for the desired frame to be available and switch to it.
Induce WebDriverWait for the desired element to be clickable.
You can use the following solution:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
# Start Chrome maximized with the info bar disabled.
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_argument('disable-infobars')
driver = webdriver.Chrome(executable_path=r'C:\WebDrivers\chromedriver.exe', chrome_options=options)
driver.get("https://www.inipec.gov.it/cerca-pec/-/pecs/companies")
# The checkbox is inside the reCAPTCHA iframe: wait for it and switch into it.
WebDriverWait(driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR,"iframe[name^='a-'][src^='https://www.google.com/recaptcha/api2/anchor?']")))
# XPath attribute tests are written with '@' (e.g. @class).
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//span[@class='recaptcha-checkbox goog-inline-block recaptcha-checkbox-unchecked rc-anchor-checkbox']/div[@class='recaptcha-checkbox-checkmark']"))).click()
I resolved this; you can try the same approach with your own landing-page URL.
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import SessionNotCreatedException
# Open the reCAPTCHA demo page, copy the text between ";k=" and "&co" out of
# the widget's inner HTML and type it into the g-recaptcha-response textarea.
options = webdriver.ChromeOptions()
# NOTE(review): download_dir is not defined in this snippet -- it is assumed
# to be set earlier by the surrounding script.
prefs = {"download.default_directory": download_dir}
options.add_experimental_option("prefs", prefs)
options.add_argument("--no-sandbox")
driver = webdriver.Chrome("/usr/bin/chromedriver", chrome_options = options)
driver.get("https://www.google.com/recaptcha/api2/demo")
driver.maximize_window()
# XPath attribute tests are written with '@' (e.g. @class).
price = driver.find_element_by_xpath("//div[@class='g-recaptcha']")
price_content = price.get_attribute('innerHTML')
# Convert once instead of once per find() call.
content_text = str(price_content)
# NOTE(review): find() returns -1 when a marker is missing; the slice below
# is then taken from unintended positions -- confirm both markers always exist.
start = content_text.find(";k=")+len(";k=")
end = content_text.find("&co")
driver.implicitly_wait(20)
# Un-hide the response textarea so that it can be cleared and typed into.
driver.execute_script("document.getElementById('g-recaptcha-response').style.display = '';")
recaptcha_text_area = driver.find_element_by_id("g-recaptcha-response")
recaptcha_text_area.clear()
recaptcha_text_area.send_keys(price_content[start:end])
#.....................................................................................
button = driver.find_element_by_id("recaptcha-demo-submit")
I want to download PDF files from a website using Beautiful Soup and Selenium.
I've written the code below, but it is incomplete because I can't find the link to download the PDF files.
#!/usr/bin/python
from bs4 import BeautifulSoup
from selenium import webdriver
import webbrowser
import os
import requests
import urllib2
import time
import urllib
# Start headless Chrome; on success load the page and collect table rows.
try:
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument('--no-sandbox')
    driver = webdriver.Chrome("/usr/bin/chromedriver", chrome_options=options)
except urllib2.HTTPError as e:
    print(e)
except urllib2.URLError:
    print ("Server down or incorrect domains.")
else:
    def not_relative_uri(href):
        """Return True when *href* starts with ``https://``."""
        # The script never imports ``re``; str.startswith performs the same
        # anchored prefix test without needing it.
        return href.startswith('https://')

    driver.get("https://xxxxxx")
    # print(driver.page_source.encode('utf-8'))
    my_folder="/home/python/"
    soup_res = BeautifulSoup(driver.page_source.encode('utf-8'), 'html.parser')
    # Walk down to the grid table and keep rows 1..20 (row 0 is skipped).
    # NOTE(review): each find() returns None when its element is missing, so
    # the next call in the chain would raise AttributeError.
    tr = (
        soup_res.find("div", {"id":"pageWrapper"})
        .find("div", {"class":"EGZDefault-List"})
        .find("div", {"class":"EGZDefault-List-Info-List"})
        .find("table", {"class":"gridview"})
        .find("tbody")
        .find_all('tr')[1:21]
    )
I hope someone can help me.
With Selenium you can do it as follows:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
# Click every image button in the grid table, pausing after each click.
# NOTE(review): ``options`` is not defined in this snippet -- it is assumed to
# be the ChromeOptions object built in the question's script.
driver = webdriver.Chrome("/usr/bin/chromedriver", chrome_options=options)
wait = WebDriverWait(driver, 20)
image_selector = "table.gridview input[type='image']"
driver.get("https://xxxxxx")
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, image_selector)))
time.sleep(2)
images = driver.find_elements(By.CSS_SELECTOR, image_selector)
for image in images:
    # A fresh chain per element: a reused ActionChains object keeps its
    # already-performed actions queued, so earlier moves would be replayed.
    ActionChains(driver).move_to_element(image).perform()
    time.sleep(0.5)
    image.click()
    # NOTE(review): the original indentation was lost; this pause is assumed
    # to belong to the loop body (one wait per click).
    time.sleep(5)
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def main(driver):
    """Open the T-Mobile sign-in page, enter a user name and click "next".

    Args:
        driver: a started Selenium WebDriver instance.

    A TimeoutException raised while waiting for the user-name box is printed
    rather than propagated.
    """
    driver.get('https://account.t-mobile.com/signin/')
    waiter = WebDriverWait(driver, 30)
    try:
        # Wait up to 30 s for the user-name field to appear in the DOM.
        username_box = waiter.until(
            EC.presence_of_element_located((By.ID, 'usernameTextBox')))
        username_box.send_keys("test@test.com")
        driver.find_element(By.ID, "lp1-next-btn").click()
    except TimeoutException as e:
        print(repr(e))
if __name__ == "__main__":
    # Start Firefox with the dom.webdriver.enabled preference switched off,
    # then run the sign-in flow.
    options = Options()
    options.set_preference("dom.webdriver.enabled", False)
    driver = webdriver.Firefox(options=options)
    main(driver)
I'm unable to reach the login page: it keeps loading forever when I use Selenium.
I also tried Chrome, which leads to the same result.
It looks like an issue with Firefox on the application side.
I tried switching to Chrome and it worked.
Code :
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Path to the chromedriver binary. In a raw string each backslash is already
# literal, so the separators must not be doubled.
executablePath = r'C:\Users\Desktop\Selenium+Python\chromedriver.exe'
def main(driver):
    """Open the T-Mobile sign-in page, enter a user name and click "next".

    Args:
        driver: a started Selenium WebDriver instance.

    A TimeoutException raised while waiting for the user-name box is printed
    rather than propagated.
    """
    driver.get('https://account.t-mobile.com/signin/')
    waiter = WebDriverWait(driver, 30)
    try:
        # Wait up to 30 s for the user-name field to appear in the DOM.
        username_box = waiter.until(
            EC.presence_of_element_located((By.ID, 'usernameTextBox')))
        username_box.send_keys("test@test.com")
        driver.find_element(By.ID, "lp1-next-btn").click()
    except TimeoutException as e:
        print(repr(e))
if __name__ == "__main__":
    # Start Chrome with default options and run the sign-in flow.
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(executable_path = executablePath, options=options)
    main(driver)
You can get the latest chromedriver from the official website here
I'm trying to use Selenium to scrape some data from the mouser.com website, but after I send some text to the search bar the website responds with "access denied". I need help to bypass this. I tried using a custom user agent, but the result is the same.
import time
from openpyxl import load_workbook
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent
# Open mouser.com, type a query into the header search box and submit it.
driver = webdriver.Chrome(executable_path='C:/Users/amuri/AppData/Local/Microsoft/WindowsApps/PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0/site-packages/chromedriver.exe')
driver.implicitly_wait(1)
#def get_comp_type(comp_pn):
url ='https://www.mouser.com/'
driver.get(url)
print(driver.title)
wait = WebDriverWait(driver, timeout=10)
search_box_selector = ".form-control.headerSearchBox.search-input.js-search-autosuggest.as-input"
# until() returns the element once it is clickable, so it does not need to be
# looked up a second time.
elem = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, search_box_selector)))
elem.click()
elem.send_keys("myString")
elem.send_keys(Keys.RETURN)
time.sleep(1)
# Build Chrome options that carry a randomly chosen user-agent string.
# The class is named ``Options``; importing ``Option`` raises ImportError.
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent

options = Options()
ua = UserAgent()
user_agent = ua.random
print(user_agent)
options.add_argument(f'user-agent={user_agent}')
driver = webdriver.Chrome(executable_path='C:/Users/amuri/AppData/Local/Microsoft/WindowsApps/PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0/site-packages/chromedriver.exe',options=options)
You didn't use your user agent anywhere in the options. You also need to enable JavaScript and cookies.