I'm trying to scrape some site data and have manually cleared the CAPTCHA I keep triggering - however, I still land on the CAPTCHA success page after I close and reopen my session:
Code:
import urllib, os, urllib.request, time, requests, random, pandas as pd
from datetime import date
from time import sleep
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from google_trans_new import google_translator
chrome_options = Options()
chrome_options.add_argument("user-data-dir=C:\\environments\\selenium")
driver = webdriver.Chrome(options=chrome_options)
driver.maximize_window()
driver.implicitly_wait(10)
driver.get("https://ca.indeed.com/")
search_company = driver.find_element(By.XPATH, "//*[@id='text-input-what']")
search_company.send_keys(Keys.CONTROL + "a")
search_company.send_keys(Keys.DELETE)
search_company.send_keys("Sales")
search_loc = driver.find_element(By.XPATH, "//*[@id='text-input-where']")
search_loc.send_keys(Keys.CONTROL + "a")
search_loc.send_keys(Keys.DELETE)
search_loc.send_keys("Quebec")
click_search = driver.find_element(By.XPATH, "//*[@id='jobsearch']/button")
click_search.click()
After running this block, I run:
page = driver.current_url
html = requests.get(page, verify=False)
soup = BeautifulSoup(html.content, 'html.parser', from_encoding='utf-8')
soup
All I get back is the CAPTCHA HTML, so I have nothing to scrape:
hCaptcha solve page
How do I stop getting the CAPTCHA success page back and return to the page I'm trying to scrape? I've pointed Chrome at a persistent user-data directory to try to retain the cookies, but I'm at a loss on how to proceed.
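A likely cause is that requests.get() opens a brand-new HTTP session that carries none of the cookies from the Selenium profile, so the site treats the request as a fresh, unverified visitor and serves the CAPTCHA page again. Below is a minimal sketch of two common workarounds - parsing the page Selenium already rendered, or replaying the driver's cookies through a requests.Session (untested against Indeed specifically):
# Option 1: skip requests entirely and parse the DOM the driver already has
soup = BeautifulSoup(driver.page_source, 'html.parser')
# Option 2: copy the Selenium cookies into a requests.Session so the
# follow-up request is sent with the same identity
session = requests.Session()
for cookie in driver.get_cookies():
    session.cookies.set(cookie['name'], cookie['value'])
html = session.get(driver.current_url)
soup = BeautifulSoup(html.content, 'html.parser')
Even with the cookies copied over, sites can fingerprint more than cookies (User-Agent, TLS, headers), so Option 1 is usually the more reliable route.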
Related
I am using the following code to go to the next page by clicking the "next" element.
However, this code is not working. Any thoughts on what I might be doing wrong?
Final goal:
Use BeautifulSoup on each of the pages.
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as ureq
import random
import time
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications": 2}
chrome_options.add_experimental_option("prefs", prefs)
# A randomizer for the delay
seconds = 1 + (random.random() * 2)
# create a new Chrome session
driver = webdriver.Chrome(options=chrome_options)
driver.implicitly_wait(30)
# driver.maximize_window()
# navigate to the application home page
driver.get("https://www.fda.gov/inspections-compliance-enforcement-and-criminal-investigations/compliance-actions-and-activities/warning-letters")
time.sleep(seconds)
time.sleep(seconds)
next_page = driver.find_element(By.CLASS_NAME, "next")
#print (next_page.get_attribute('innerHTML'), type(next_page))
next_page.find_element(By.XPATH("//a[@href='#']")).click
# next_page.find_element(By.LINK_TEXT("Next")).click()
This code does not click on the next page.
Select your element more specifically and wait with .until(EC.element_to_be_clickable()) - this will get you to the next page. Note that find_element(By.XPATH(...)) is not valid syntax either: the locator strategy and the selector are two separate arguments, and click needs parentheses, i.e. find_element(By.XPATH, "//a[@href='#']").click().
Example
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = 'https://www.fda.gov/inspections-compliance-enforcement-and-criminal-investigations/compliance-actions-and-activities/warning-letters'
driver.get(url)
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#datatable_next a'))).click()
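If the goal is to visit every page, the same wait can drive a loop - a rough sketch, assuming the FDA listing keeps the #datatable_next id and (as is DataTables' convention) marks the last page by adding a disabled class to the button's parent:
from selenium.common.exceptions import TimeoutException

while True:
    # scrape the current page here, e.g. BeautifulSoup(driver.page_source, 'html.parser')
    try:
        next_link = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#datatable_next:not(.disabled) a')))
    except TimeoutException:
        break  # "Next" never became clickable - assume the last page was reached
    next_link.click()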
Selenium does not find the "accept cookies" button.
I have tested XPath, class name, and CSS selectors.
Code:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import pandas as pd
import csv
options = Options()
options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
navegador = webdriver.Chrome(options=options)
navegador.get('https://app-vlc.hotmart.com/market/search?categoryId=25&page=1&userLanguage=PT_BR')
navegador.implicitly_wait(30)
sleep(30)
navegador.find_element(By.CSS_SELECTOR, ".cookie-policy-accept-all.hot-button.hot-button--primary").click()
navegador.implicitly_wait(30)
elem = navegador.find_element(By.XPATH, "//div[@id='hotmart-cookie-policy']").shadow_root
elem.find_element(By.CSS_SELECTOR, ".cookie-policy-accept-all.hot-button.hot-button--primary").click()
You need to find the shadow root and then search from there.
Since the above didn't work, try this one, which pulls the button out of the shadow root with JavaScript:
navegador.get('https://app-vlc.hotmart.com/market/search?categoryId=25&page=1&userLanguage=PT_BR')
time.sleep(10)
elem = navegador.find_element(By.XPATH, "//div[@id='hotmart-cookie-policy']")
script = '''return arguments[0].shadowRoot.querySelector(".cookie-policy-accept-all.hot-button.hot-button--primary")'''
elem1 = navegador.execute_script(script, elem)
elem1.click()
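On Selenium 4.1+ with a Chromium-based driver, the first approach also works natively: WebElement.shadow_root returns a ShadowRoot object you can search with CSS selectors (CSS only - XPath does not work inside a shadow root). A short sketch with the same selectors:
host = navegador.find_element(By.ID, 'hotmart-cookie-policy')
shadow = host.shadow_root  # needs Selenium 4.1+ and a recent ChromeDriver
shadow.find_element(By.CSS_SELECTOR, '.cookie-policy-accept-all.hot-button.hot-button--primary').click()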
I want to download the PDF files on a website using Beautiful Soup and Selenium.
I've written the code up to here, and it's incomplete: I can't find the link to download the PDF files.
#!/usr/bin/python
from bs4 import BeautifulSoup
from selenium import webdriver
import webbrowser
import os
import requests
import urllib2
import time
import urllib
import re
try:
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument('--no-sandbox')
    driver = webdriver.Chrome("/usr/bin/chromedriver", chrome_options=options)
except urllib2.HTTPError as e:
    print(e)
except urllib2.URLError:
    print("Server down or incorrect domains.")
else:
    def not_relative_uri(href):
        return re.compile('^https://').search(href) is not None

    driver.get("https://xxxxxx")
    # print(driver.page_source.encode('utf-8'))
    my_folder = "/home/python/"
    soup_res = BeautifulSoup(driver.page_source.encode('utf-8'), 'html.parser')
    tr = soup_res.find("div", {"id": "pageWrapper"}).find("div", {"class": "EGZDefault-List"}).find("div", {"class": "EGZDefault-List-Info-List"}).find("table", {"class": "gridview"}).find("tbody").find_all('tr')[1:21]
I hope someone can help me.
With Selenium you can do it as follows:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
driver = webdriver.Chrome("/usr/bin/chromedriver", options=options)
wait = WebDriverWait(driver, 20)
actions = ActionChains(driver)
driver.get("https://xxxxxx")
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table.gridview input[type='image']")))
time.sleep(2)
images = driver.find_elements(By.CSS_SELECTOR, "table.gridview input[type='image']")
for image in images:
    actions.move_to_element(image).perform()
    time.sleep(0.5)
    image.click()
    time.sleep(5)
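If clicking those buttons opens the files in Chrome's built-in PDF viewer instead of saving them, it may also help to pre-configure the profile so PDFs are downloaded straight to disk - a sketch using standard Chrome preferences (the target folder is a placeholder):
options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', {
    'download.default_directory': '/home/python/pdfs',  # placeholder path
    'download.prompt_for_download': False,
    'plugins.always_open_pdf_externally': True,  # save PDFs instead of opening the viewer
})
driver = webdriver.Chrome(options=options)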
When you go to:
https://www.youtube.com/feed/trending
3 buttons appear: Music, Gaming, Movies.
I wanted to select the Music element's <a> tag so I could extract the href value from it. I used the code below, but it keeps giving me an empty list.
from urllib.request import urlopen
from lxml import etree
url = "https://www.youtube.com/feed/trending"
response = urlopen(url)
htmlparser = etree.HTMLParser()
tree = etree.parse(response, htmlparser)
print(tree.xpath('//*[@id="contents"]/ytd-channel-list-sub-menu-avatar-renderer[1]/a'))
You can use Selenium if requests is not working - the trending page is rendered by JavaScript, so the static HTML that urlopen returns never contains those nodes. I have tried it at my end using Selenium and it works flawlessly. Below is the code you can refer to.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import *
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
URL = "https://www.youtube.com/feed/trending"
chrome_options = Options()
driver = webdriver.Chrome("./chromedriver/chromedriver.exe", options=chrome_options)  # download ChromeDriver and add its path here
driver.maximize_window()
driver.get(URL)
wait1 = WebDriverWait(driver, 200)
wait1.until(EC.presence_of_element_located((By.XPATH, '//*[@id="img"]')))
print('-' * 100)
print(driver.find_element(By.XPATH, '//*[@id="contents"]/ytd-channel-list-sub-menu-avatar-renderer[1]/a').get_attribute('href'))
print('-' * 100)
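A usage note: the same approach grabs all three tab links in one go with find_elements (YouTube's markup changes often, so treat this XPath as an assumption that may need updating):
links = driver.find_elements(By.XPATH, '//*[@id="contents"]/ytd-channel-list-sub-menu-avatar-renderer/a')
for link in links:
    print(link.get_attribute('href'))  # Music, Gaming, Movies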
I am struggling to scrape with the code below. Would appreciate it if someone could have a look at what I am missing.
Regards,
PyProg70
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from bs4 import BeautifulSoup
import pandas as pd
import re, time
binary = FirefoxBinary('/usr/bin/firefox')
opts = FirefoxOptions()
opts.add_argument("--headless")
browser = webdriver.Firefox(options=opts, firefox_binary=binary)
browser.implicitly_wait(10)
url = 'http://tenderbulletin.eskom.co.za/'
browser.get(url)
html = browser.page_source
soup = BeautifulSoup(html, 'lxml')
print(soup.prettify())
Not Java but JavaScript. It's a dynamic page, so you need to wait for the Ajax request to finish and the content to render, using WebDriverWait.
....
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
.....
browser.get(url)
# wait up to 30 seconds until the table has loaded
WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR , 'table.CSSTableGenerator .ng-binding')))
html = browser.find_element(By.CSS_SELECTOR, 'table.CSSTableGenerator')
soup = BeautifulSoup(html.get_attribute("outerHTML"), 'lxml')
print(soup.prettify().encode('utf-8'))
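As a follow-up, once the table's outerHTML is in hand you can also let pandas (already imported in the question) build a DataFrame directly - a sketch, noting that pd.read_html needs lxml or html5lib installed:
# read_html returns a list of every table it finds in the markup
tables = pd.read_html(html.get_attribute("outerHTML"))
df = tables[0]
print(df.head())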