Scraping a web page after accepting cookies in Python

I'm trying to scrape a web page, but before accessing the page there is a banner for accepting cookies. I am using Selenium to click the "Accept all cookies" button, but even after clicking it I can't access the right HTML page.
This is my code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
url = 'https://www.wikiparfum.fr/explore/by-name?query=dior'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
driver.get(url)
driver.find_element_by_id('onetrust-accept-btn-handler').click()
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')
print(soup)
And this is the beginning of the HTML page that gets printed:
If anyone can help me with this one, thank you!

You should wait for the accept-cookies button to appear before clicking it:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
url = 'https://www.wikiparfum.fr/explore/by-name?query=dior'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
wait = WebDriverWait(driver, 20)
driver.get(url)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#onetrust-accept-btn-handler"))).click()
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')
print(soup)
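If the page source still shows the pre-consent markup after the click, it may also help to wait for the content you actually want before reading page_source. A minimal sketch; the result-card selector here is an assumption and would need to be checked against the real page:
# wait until at least one search result is rendered (selector is a guess)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "a[href*='/fragrances/']")))
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')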

Related

Scraping webpage with tabs that do not change url

I am trying to scrape a Nasdaq webpage and have some issues with locating elements:
My code:
from selenium import webdriver
import time
import pandas as pd
# initialize the driver
driver = webdriver.Chrome()
driver.get('http://www.nasdaqomxnordic.com/shares/microsite?Instrument=CSE32679&symbol=ALK%20B&name=ALK-Abell%C3%B3%20B')
time.sleep(5)
btn_overview = driver.find_element_by_xpath('//*[@id="tabarea"]/section/nav/ul/li[2]/a')
btn_overview.click()
time.sleep(5)
employees = driver.find_element_by_xpath('//*[@id="CompanyProfile"]/div[6]')
After the last call, I receive the following error:
NoSuchElementException: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="CompanyProfile"]/div[6]"}
Normally the problem would be a wrong XPath, but I tried several items, also locating by id. I suspect it has something to do with the tabs (in my case, navigating to "Overview"). Visually the webpage changes, but if, for example, I scrape the table, it still comes from the first page:
table_test = pd.read_html(driver.page_source)[0]
What am I missing or doing wrong?
The overview page is inside an iframe:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
option = webdriver.ChromeOptions()
option.add_argument("start-maximized")
# keep Chrome open after the script finishes
option.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=option)
driver.get('http://www.nasdaqomxnordic.com/shares/microsite?Instrument=CSE32679&symbol=ALK%20B&name=ALK-Abell%C3%B3%20B')
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="tabarea"]/section/nav/ul/li[2]/a'))).click()
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="cookieConsentOK"]'))).click()
WebDriverWait(driver, 20).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR,"iframe#MorningstarIFrame")))
employees=WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, '//*[#id="CompanyProfile"]/div[6]'))).text.split()[1]
print(employees)
Output:
2,537
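If you need to interact with the rest of the page after reading from the iframe, switch back to the top-level document first:
# leave the Morningstar iframe and return to the main document
driver.switch_to.default_content()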
You sure you need Selenium?
import requests
from bs4 import BeautifulSoup
url = 'http://lt.morningstar.com/gj8uge2g9k/stockreport/default.aspx'
payload = {
    'SecurityToken': '0P0000A5LL]3]1]E0EXG$XCSE_3060'
}
response = requests.get(url, params=payload)
soup = BeautifulSoup(response.text, 'html.parser')
employees = soup.find('h3', text='Employees').next_sibling.text
print(employees)
Output:
2,537
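The URL and SecurityToken here are presumably taken from the src of the MorningstarIFrame that the Selenium answer above switches into, so they may need updating if Morningstar rotates the token.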

Get html attribute? selenium / bs4

How do I get the value of the status attribute on a Twitch page using bs4 or Selenium?
Example of someone offline.
How it shows in the HTML source:
<a class="ScHalo-sc-1l14b0i-0 dcbwCs tw-halo" size="72" status="offline" href="/mizkif">........</a>
Code:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
op = Options()
op.add_argument("user-data-dir=C:\\Users\\bestg\\AppData\\Local\\Google\\Chrome\\bor")
driver = webdriver.Chrome(options=op)
driver.get('https://www.twitch.tv/mizkif')
html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
# How do I get the value of "status=" in soup?
# unrelated: (used to focus offline streams)
# click the avatar
stream = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, ".//a[@status='offline']")))
stream.click()
You can use the get_attribute(name) method in Selenium:
element = driver.find_element(By.XPATH,"/the/X/path")
attribute = element.get_attribute('status')
PS: driver.find_element with that syntax works with Selenium 4
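For the BeautifulSoup half of the question, attributes are exposed like dictionary keys on a tag. A minimal sketch, assuming the anchor from the snippet above is present in the parsed page (Twitch class names are generated, so tw-halo may change):
# find the halo anchor by the class shown in the question and read its attribute
halo = soup.find('a', class_='tw-halo')
if halo is not None:
    print(halo.get('status'))  # e.g. "offline"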

Selenium Scraping Javascript Table

I am struggling to scrape the page with the code below. Would appreciate it if someone could have a look at what I am missing.
Regards, PyProg70
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from bs4 import BeautifulSoup
import pandas as pd
import re, time
binary = FirefoxBinary('/usr/bin/firefox')
opts = FirefoxOptions()
opts.add_argument("--headless")
browser = webdriver.Firefox(options=opts, firefox_binary=binary)
browser.implicitly_wait(10)
url = 'http://tenderbulletin.eskom.co.za/'
browser.get(url)
html = browser.page_source
soup = BeautifulSoup(html, 'lxml')
print(soup.prettify())
Not Java but JavaScript. It is a dynamic page, so you need to wait until the Ajax request finishes and the content is rendered, using WebDriverWait.
....
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
.....
browser.get(url)
# wait up to 30 seconds until the table has loaded
WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR , 'table.CSSTableGenerator .ng-binding')))
html = browser.find_element_by_css_selector('table.CSSTableGenerator')
soup = BeautifulSoup(html.get_attribute("outerHTML"), 'lxml')
print(soup.prettify().encode('utf-8'))
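Since the question already imports pandas, the extracted table HTML can then be handed to pandas for further work; a sketch, assuming the table parses cleanly:
import pandas as pd
# parse the table's outer HTML into a DataFrame
df = pd.read_html(str(soup))[0]
print(df.head())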

Can't click button with selenium PhantomJS and get the data

I'm trying to fetch some data from Booking. However, even though I can see the prices if I open the page in a browser, when I try to download the page and its source code with Python, the request returns a page with 'Show prices' buttons instead of the prices.
The source code of the page, around the button, is:
data-click-store-id="sr-compset-2128695"
data-et-click="customGoal:YPNdKNKNKZJUESUPTOdJDUFYQC:1
customGoal:YPNdKNKNKZAMUVdFePOdXeRe:1"
data-et-focus="customGoal:OTfdASFOQJNDYBWfBQVT:1" target="_blank"
<span class="b-button__text"> Show prices </span>
I based this code on a similar question:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
web_site='https://www.booking.com/searchresults.en-gb.html?aid=376363&label=bdot-1gtfXe7K0wVduEQU2KBU*QS144456159570%3Apl%3Ata%3Ap1%3Ap21%2C093%2C000%3Aac%3Aap1t1%3Aneg%3Afi%3Atiaud-146342138710%3Akwd-334108349%3Alp1008736%3Ali%3Adec%3Adm&lang=en-gb&sid=316b1ca4ddb0b74abc941811e1a769db&sb=1&src=index&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Findex.en-gb.html%3Faid%3D376363%3Blabel%3Dbdot-1gtfXe7K0wVduEQU2KBU%252AQS144456159570%253Apl%253Ata%253Ap1%253Ap21%252C093%252C000%253Aac%253Aap1t1%253Aneg%253Afi%253Atiaud-146342138710%253Akwd-334108349%253Alp1008736%253Ali%253Adec%253Adm%3Bsid%3D316b1ca4ddb0b74abc941811e1a769db%3Bsb_price_type%3Dtotal%26%3B&ss=Rome%2C+Lazio%2C+Italy&ssne=Apia&ssne_untouched=Apia&checkin_monthday=28&checkin_month=10&checkin_year=2017&checkout_monthday=31&checkout_month=10&checkout_year=2017&no_rooms=1&group_adults=2&group_children=0&genius_rate=1&from_sf=1&ss_raw=rom&ac_position=0&ac_langcode=en&dest_id=-126693&dest_type=city&search_pageview_id=18384c2ba57602b5&search_selected=true&search_pageview_id=18384c2ba57602b5&ac_suggestion_list_length=5&ac_suggestion_theme_list_length=0'
driver = webdriver.PhantomJS()
driver.get(web_site)
driver.save_screenshot('screenshot1.png')
wait = WebDriverWait(driver, 30)
# click proceed
proceed = wait.until(EC.presence_of_element_located((By.LINK_TEXT, "\nShow prices\n")))
proceed.click()
# wait for the content to be present
wait.until(EC.presence_of_element_located((By.ID, "workskin")))
soup = BeautifulSoup(driver.page_source, "html.parser")
soup.prettify()
This is a screenshot saved by PhantomJS.

Python, Selenium, and Beautiful Soup for URL

I am trying to write a script using Selenium to access Pastebin, do a search, and print out the resulting URLs as text. I need the visible URL results and nothing else.
<div class="gs-bidi-start-align gs-visibleUrl gs-visibleUrl-long" dir="ltr" style="word-break:break-all;">pastebin.com/VYQTSbzY</div>
Current script is:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
browser = webdriver.Firefox()
browser.get('http://www.pastebin.com')
search = browser.find_element_by_name('q')
search.send_keys("test")
search.send_keys(Keys.RETURN)
soup = BeautifulSoup(browser.page_source)
for link in soup.find_all('a'):
    print link.get('href', None), link.get_text()
You don't actually need BeautifulSoup; Selenium itself is very powerful at locating elements:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
browser = webdriver.Firefox()
browser.get('http://www.pastebin.com')
search = browser.find_element_by_name('q')
search.send_keys("test")
search.send_keys(Keys.RETURN)
# wait for results to appear
wait = WebDriverWait(browser, 10)
results = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.gsc-resultsbox-visible")))
# grab results
for link in results.find_elements_by_css_selector("a.gs-title"):
    print link.get_attribute("href")
browser.close()
Prints:
http://pastebin.com/VYQTSbzY
http://pastebin.com/VYQTSbzY
http://pastebin.com/VAAQCjkj
...
http://pastebin.com/fVUejyRK
http://pastebin.com/fVUejyRK
Note the use of an Explicit Wait which helps to wait for the search results to appear.
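Note also that the find_element_by_* / find_elements_by_* helpers used in these answers were removed in Selenium 4; a sketch of the same result loop with the current API:
from selenium.webdriver.common.by import By
# Selenium 4 style: locate elements via By instead of find_elements_by_*
for link in results.find_elements(By.CSS_SELECTOR, "a.gs-title"):
    print(link.get_attribute("href"))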
