Can't click button with Selenium PhantomJS and get the data - Python

I'm trying to fetch some data from Booking.com. If I open the page in a browser I can see the prices, but when I download the page source with Python, the request returns a page with 'Show prices' buttons instead of the prices.
The source code of the page, around the button, is:
data-click-store-id="sr-compset-2128695"
data-et-click="customGoal:YPNdKNKNKZJUESUPTOdJDUFYQC:1
customGoal:YPNdKNKNKZAMUVdFePOdXeRe:1"
data-et-focus="customGoal:OTfdASFOQJNDYBWfBQVT:1" target="_blank"
<span class="b-button__text"> Show prices </span>
I based this code on a similar question:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
web_site='https://www.booking.com/searchresults.en-gb.html?aid=376363&label=bdot-1gtfXe7K0wVduEQU2KBU*QS144456159570%3Apl%3Ata%3Ap1%3Ap21%2C093%2C000%3Aac%3Aap1t1%3Aneg%3Afi%3Atiaud-146342138710%3Akwd-334108349%3Alp1008736%3Ali%3Adec%3Adm&lang=en-gb&sid=316b1ca4ddb0b74abc941811e1a769db&sb=1&src=index&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Findex.en-gb.html%3Faid%3D376363%3Blabel%3Dbdot-1gtfXe7K0wVduEQU2KBU%252AQS144456159570%253Apl%253Ata%253Ap1%253Ap21%252C093%252C000%253Aac%253Aap1t1%253Aneg%253Afi%253Atiaud-146342138710%253Akwd-334108349%253Alp1008736%253Ali%253Adec%253Adm%3Bsid%3D316b1ca4ddb0b74abc941811e1a769db%3Bsb_price_type%3Dtotal%26%3B&ss=Rome%2C+Lazio%2C+Italy&ssne=Apia&ssne_untouched=Apia&checkin_monthday=28&checkin_month=10&checkin_year=2017&checkout_monthday=31&checkout_month=10&checkout_year=2017&no_rooms=1&group_adults=2&group_children=0&genius_rate=1&from_sf=1&ss_raw=rom&ac_position=0&ac_langcode=en&dest_id=-126693&dest_type=city&search_pageview_id=18384c2ba57602b5&search_selected=true&search_pageview_id=18384c2ba57602b5&ac_suggestion_list_length=5&ac_suggestion_theme_list_length=0'
driver = webdriver.PhantomJS()
driver.get(web_site)
driver.save_screenshot('screenshot1.png')
wait = WebDriverWait(driver, 30)
# click the 'Show prices' button
proceed = wait.until(EC.presence_of_element_located((By.LINK_TEXT, "\nShow prices\n")))
proceed.click()
# wait for the content to be present
wait.until(EC.presence_of_element_located((By.ID, "workskin")))
soup = BeautifulSoup(driver.page_source, "html.parser")
print(soup.prettify())  # prettify() returns a string; print it rather than discarding it
This is the screenshot saved by PhantomJS:
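One likely cause (an assumption, not confirmed in the question): By.LINK_TEXT matches the rendered link text with the surrounding whitespace collapsed, so a locator containing literal newlines such as "\nShow prices\n" can never match. A minimal sketch of a more tolerant locator built with normalize-space(), reusing the markup quoted above:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.PhantomJS()
driver.get(web_site)  # web_site: the booking.com search URL from the question
wait = WebDriverWait(driver, 30)
# normalize-space() collapses the whitespace around the button label,
# so the comparison works regardless of the newlines in the markup
button = wait.until(EC.element_to_be_clickable(
    (By.XPATH, "//span[@class='b-button__text'][normalize-space()='Show prices']")))
button.click()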

Related

Selenium is opening the browser, but how to get the links of news articles by clicking the 'load more' button continuously

I am trying to get all the links to news articles by clicking the 'load more' or 'next' button on the given newspaper page, but I am failing to keep doing so continuously...
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# Chrome is used here
driver = webdriver.Chrome()
# URL of the website
url = "https://www.business-standard.com/search?q=economy"
# Open the website
driver.get(url)
wait = WebDriverWait(driver, 20)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.btnno"))).click()
wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@class='next-colum']"))).click()
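The two waits above click each button once. To keep clicking until the button stops appearing, one option is to loop and treat a timeout as the end of the results. A minimal sketch under that assumption; "div.listing a" is a hypothetical selector for the article links, which the question does not show:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

driver = webdriver.Chrome()
driver.get("https://www.business-standard.com/search?q=economy")
wait = WebDriverWait(driver, 20)
links = set()
while True:
    # collect whatever article links are currently rendered
    for a in driver.find_elements(By.CSS_SELECTOR, "div.listing a"):  # hypothetical selector
        href = a.get_attribute("href")
        if href:
            links.add(href)
    try:
        # click 'next' until the button stops appearing
        wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//div[@class='next-colum']"))).click()
    except TimeoutException:
        break  # no more pages to load
driver.quit()
print(links)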

Unable to scrape the src when it is nested inside a source tag inside a video, via Python Selenium and Beautiful Soup

I was scraping an anime website as a project, but when I tried to scrape the src it gave me an error. The src is nested inside a source tag. My code is below.
Code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
#launch url
url = "https://bestdubbedanime.com/Demon-Slayer-Kimetsu-no-Yaiba/26"
# create a new Firefox session
driver = webdriver.Firefox()
# driver.implicitly_wait(30)
driver.get(url)
# python_button = driver.find_element_by_class_name('playostki') #FHSU
# python_button.click() #click fhsu link
soup1 = BeautifulSoup(driver.page_source, 'html.parser')
video = soup1.find('video', id='my_video_1_html5_api')
# video = driver.find_element_by_id('my_video_1_html5_api')
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".playostki"))).click()
driver.stop_client()
driver.close()
driver.quit()
The reason you are not getting the src is that it is only set after clicking the video. You have to click on the video first, and then read the "src" attribute from the element.
driver.maximize_window()
driver.get("https://bestdubbedanime.com/Demon-Slayer-Kimetsu-no-Yaiba/26")
WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.XPATH, "//div[@class='playostki']//img"))).click()
print(WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#my_video_1_html5_api > source"))).get_attribute("src"))
driver.quit()
Output:
https://bestdubbedanime.com/xz/api/v.php?u=eVcxb0ZCUEMraFd1Vi9pM2xqWUhtbXZMWjZ0Mlpoc1U0Tmhqc2VFcVViQUc3VUVhR0pZV1EvaW1nY1duaXBMeXYvUUY4RG5ab3p4MEtEMUFHRmVaN0taVG9sY3ZVcTRoeDZoVHhWLzdiYjQ5UStNN2FYSjJBSWNKL0t5S1hLNGEyVlZqV1BYQ2MwaCsyNWcvak1Db01EMnNtWGwwTTBBVld4MkNER0V3eGNCRXJ0cEY4RHFPclhwbTJpWFBPSmJI

Scraping web page after accepting cookies in python

I'm trying to scrape a web page, but before accessing the page there is a banner for accepting cookies. I am using Selenium to click the "Accept all cookies" button, but even after clicking it I can't access the right HTML page.
This is my code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
url = 'https://www.wikiparfum.fr/explore/by-name?query=dior'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
driver.get(url)
driver.find_element_by_id('onetrust-accept-btn-handler').click()
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')
print(soup)
And this is the beginning of the HTML page that is printed:
If anyone can help me with this one, thank you!
You should wait for the accept-cookies button to appear before clicking it:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
url = 'https://www.wikiparfum.fr/explore/by-name?query=dior'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
wait = WebDriverWait(driver, 20)
driver.get(url)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#onetrust-accept-btn-handler"))).click()
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')
print(soup)

Python, Selenium. Google Chrome. Web Scraping. How to navigate between 'tabs' on a website

I'm quite a noob in Python and am currently building a web scraper in Selenium that should take all the URLs of the products in the clicked 'tab' on the web page. But my code takes the URLs from the first 'tab'. Code below. Thank you guys. I'm starting to get kind of frustrated lol.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from lxml import html
PATH = r'C:\Program Files (x86)\chromedriver.exe'  # raw string, so the backslashes are not escapes
driver = webdriver.Chrome(PATH)
url = 'https://www.alza.sk/vypredaj-akcia-zlava/e0.htm'
driver.get(url)
driver.find_element_by_xpath('//*[@id="tabs"]/ul/li[2]').click()
links = []
try:
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'blockFilter')))
    link = driver.find_elements_by_xpath("//a[@class='name browsinglink impression-binded']")
    for i in link:
        links.append(i.get_attribute('href'))
finally:
    driver.quit()
print(links)
To get the handle of the current tab:
current_tab = driver.current_window_handle
To switch between tabs (switch_to_window is deprecated; the switch_to.window form is the newer API):
driver.switch_to_window(driver.window_handles[1])
driver.switch_to.window(driver.window_handles[-1])
Assuming you have located the link element that opens the new tab as tab_link (ActionChains.click takes a web element, not a URL), you should try:
from selenium.webdriver.common.action_chains import ActionChains
action = ActionChains(driver)
action.key_down(Keys.CONTROL).click(tab_link).key_up(Keys.CONTROL).perform()
Also, apparently the li doesn't have a click event. Are you sure the element you are getting with '//*[@id="tabs"]/ul/li[2]' has the aria-selected property set to true, or any of these classes: ui-tabs-active ui-state-active?
If not, you should call click on the a tag inside this li.
Then you should increase the timeout parameter of your WebDriverWait to guarantee that the div is loaded.
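A minimal sketch of both suggestions, clicking the a tag inside the li with a longer timeout, reusing the selectors from the question (untested against the live page):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.alza.sk/vypredaj-akcia-zlava/e0.htm')
wait = WebDriverWait(driver, 30)
# click the a tag inside the li, not the li itself
wait.until(EC.element_to_be_clickable(
    (By.XPATH, '//*[@id="tabs"]/ul/li[2]/a'))).click()
# wait for the tab's content to load before collecting the links
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'blockFilter')))
links = [a.get_attribute('href') for a in driver.find_elements(
    By.XPATH, "//a[@class='name browsinglink impression-binded']")]
driver.quit()
print(links)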

Python Selenium Webdriver doesn't refresh html after changing dropdown value in AJAX pages

I'm trying to scrape an AJAX web page using Python and Selenium. The problem is that when I change the dropdown value, the page content changes according to my selection, but Selenium returns the same old HTML from the page. I'd appreciate it if anyone can help. Here is my code:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import time
url = "https://myurl.com/PATH"
driver = webdriver.Chrome()
driver.get(url)
time.sleep(5)
# change the dropdown value
sprintSelect = Select(driver.find_element_by_id("dropdown-select"))
sprintSelect.select_by_visible_text("DropDown_Value2")
html = driver.execute_script("return document.documentElement.outerHTML")
print(html)
You need to wait for the AJAX call to load the content after your selection.
Try putting an implicit or explicit wait after the selection.
driver.implicitly_wait(10) # 10 seconds
or, if you know the tag/id etc. of the web element you want, try an explicit wait:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "some_ID"))
)
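Another common pattern (not from the original answer): keep a reference to the element that the AJAX response replaces, then wait for it to go stale after the selection. Here "content" is a hypothetical id for the refreshed area:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://myurl.com/PATH")
wait = WebDriverWait(driver, 10)
old_content = driver.find_element(By.ID, "content")  # hypothetical id of the refreshed area
Select(driver.find_element(By.ID, "dropdown-select")).select_by_visible_text("DropDown_Value2")
# the old node detaching from the DOM signals that the refresh has happened
wait.until(EC.staleness_of(old_content))
html = driver.execute_script("return document.documentElement.outerHTML")
print(html)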
