I'm trying to get the embedded video URL from https://www.fmovies.is using Selenium's PhantomJS driver. The exact same code works perfectly with the Firefox driver, so it seems I'm doing something wrong during the waiting phase.
If someone could point out what I'm doing wrong, I would really appreciate it.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import DesiredCapabilities

# Pretend to be a regular desktop browser instead of PhantomJS
desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
desired_capabilities['phantomjs.page.customHeaders.User-Agent'] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome"
desired_capabilities['phantomjs.page.customHeaders.Accept'] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"

url = "https://fmovies.is/film/kung-fu-panda-2.9kx/q8kkyj"

driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'],
                             desired_capabilities=desired_capabilities)
driver.get(url)

try:
    # Wait for the JW player container to appear
    element = WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.ID, "jw")))
finally:
    driver.find_element_by_id("player").click()

pageSource = driver.page_source
soup = BeautifulSoup(pageSource, 'lxml')

url = soup.find("video", {"class": "jw-video"})
print url

videoURL = ''
if url:
    videoURL = url['src']
print videoURL
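For reference, this is roughly the waiting pattern I think I need - a minimal sketch only, reusing the "player" id and the jw-video element from my snippet above, so those locators may well be part of what's wrong:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 30)
# wait for the player container to become clickable before clicking it
player = wait.until(EC.element_to_be_clickable((By.ID, "player")))
player.click()
# then wait for the <video class="jw-video"> tag to show up in the DOM
video = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "video.jw-video")))
print(video.get_attribute("src"))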
Related
I am trying to scrape data from the following site. I was able to click on "Load More", yet the code doesn't catch most of the elements and I don't really know what to do.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time

url = 'https://www.carrefouregypt.com/mafegy/en/c/FEGY1701230'
products = []

options = Options()
driver = webdriver.Chrome(options=options)
driver.get(url)
time.sleep(8)

# click on "Load More" until the button can no longer be found
while True:
    try:
        btn_class = 'css-1n3fqy0'
        btn = driver.find_element(By.CLASS_NAME, btn_class)
        btn.click()
        driver.implicitly_wait(10)
    except NoSuchElementException:
        break

driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
time.sleep(8)
The following code will click that button until it cannot locate it, and exit gracefully:
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options as Firefox_Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
import time as t
firefox_options = Firefox_Options()
firefox_options.add_argument("--width=1280")
firefox_options.add_argument("--height=720")
# firefox_options.headless = True
firefox_options.set_preference("general.useragent.override", "Mozilla/5.0 (Linux; Android 7.0; SM-A310F Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.91 Mobile Safari/537.36 OPR/42.7.2246.114996")
driverService = Service('chromedriver/geckodriver')
browser = webdriver.Firefox(service=driverService, options=firefox_options)
url = 'https://www.carrefouregypt.com/mafegy/en/c/FEGY1701230'
browser.get(url)
t.sleep(5)
while True:
    try:
        load_more_button = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '//button[text()="Load More"]')))
        browser.execute_script('window.scrollBy(0, 100);')
        load_more_button.click()
        print('clicked')
        t.sleep(3)
    except TimeoutException:
        print('all elements loaded in page')
        break
It's using Firefox on a Linux setup (for some reason Chrome was temperamental on this one). The parts to pay attention to are the imports and the code after the browser/driver is defined. Selenium documentation: https://www.selenium.dev/documentation/
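Once the loop finishes, the loaded products still have to be collected. A minimal sketch of that last step - note that the CSS selector below is only a placeholder, so inspect the page and substitute whatever the product cards actually use:

# placeholder selector - adjust it to the real product card markup
product_cards = browser.find_elements(By.CSS_SELECTOR, 'div[data-testid="product_card"]')
print(f'{len(product_cards)} products found')
for card in product_cards:
    print(card.text)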
I am trying to scrape the blog post titles, using Selenium with Python, from the following URL: https://blog.coinbase.com/tagged/coinbase-pro. When I use Selenium to get the page source, it does not contain the blog post titles, but the source shown in Chrome does when I right-click and select "View page source". I'm using the following code:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.headless = True
driver = webdriver.Chrome(options=options)
driver.get("https://blog.coinbase.com/tagged/coinbase-pro")
pageSource = driver.page_source
print(pageSource)
Any help would be appreciated.
Thanks.
You can fetch all the titles from that webpage in several ways. The most efficient and fastest would be to opt for requests.
This is how you can grab the titles using requests:
import re
import json
import time
import requests

link = 'https://medium.com/the-coinbase-blog/load-more'
params = {
    'sortBy': 'tagged',
    'tagSlug': 'coinbase-pro',
    'limit': 25,
    'to': int(time.time() * 1000),
}

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
    s.headers['accept'] = 'application/json'
    s.headers['referer'] = 'https://blog.coinbase.com/tagged/coinbase-pro'
    while True:
        res = s.get(link, params=params)
        container = json.loads(re.findall("[^{]+(.*)", res.text)[0])
        for k, v in container['payload']['references']['Post'].items():
            title = v['title']
            print(title)
        try:
            next_page = container['payload']['paging']['next']['to']
        except KeyError:
            break
        params['to'] = next_page
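The 'to' parameter is what drives the pagination - each response carries the cursor for the next page under payload.paging.next.to. If you would rather collect the titles into a list (and cap the number of pages) instead of printing them, the same loop can be wrapped up like this; it's just a rearrangement of the code above, not a different endpoint:

def fetch_titles(session, link, params, max_pages=5):
    # collect post titles from the paginated load-more endpoint into a list
    titles = []
    for _ in range(max_pages):
        res = session.get(link, params=params)
        container = json.loads(re.findall("[^{]+(.*)", res.text)[0])
        for post in container['payload']['references']['Post'].values():
            titles.append(post['title'])
        try:
            # cursor for the next page of results
            params['to'] = container['payload']['paging']['next']['to']
        except KeyError:
            break
    return titles

# usage: titles = fetch_titles(s, link, params)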
However, if it is selenium you want to stick with, try the following:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC

def scroll_down_to_the_bottom():
    check_height = driver.execute_script("return document.body.scrollHeight;")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            WebDriverWait(driver, 10).until(lambda driver: driver.execute_script("return document.body.scrollHeight;") > check_height)
            check_height = driver.execute_script("return document.body.scrollHeight;")
        except TimeoutException:
            break

with webdriver.Chrome() as driver:
    driver.get("https://blog.coinbase.com/tagged/coinbase-pro")
    scroll_down_to_the_bottom()
    for item in WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".section-content h3.graf--title"))):
        print(item.text)
wait = WebDriverWait(driver, 30)
driver.get("https://blog.coinbase.com/tagged/coinbase-pro")
elements = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".graf.graf--h3.graf-after--figure.graf--trailing.graf--title")))
for elem in elements:
    print(elem.text)
If you want just the 8 titles shown on the page, you can grab them by their CSS selector using waits, as in the snippet above.
Import:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
Outputs:
Inverse Finance (INV), Liquity (LQTY), Polyswarm (NCT) and Propy (PRO) are launching on Coinbase Pro
Goldfinch Protocol (GFI) is launching on Coinbase Pro
Decentralized Social (DESO) is launching on Coinbase Pro
API3 (API3), Bluezelle (BLZ), Gods Unchained (GODS), Immutable X (IMX), Measurable Data Token (MDT) and Ribbon…
Circuits of Value (COVAL), IDEX (IDEX), Moss Carbon Credit (MCO2), Polkastarter (POLS), ShapeShift FOX Token (FOX)…
Voyager Token (VGX) is launching on Coinbase Pro
Alchemix (ALCX), Ethereum Name Service (ENS), Gala (GALA), mStable USD (MUSD) and Power Ledger (POWR) are launching…
Crypto.com Protocol (CRO) is launching on Coinbase Pro
I have a problem even opening a website using webdriver Chrome. Just trying to open the website ends with an "Access Denied" message and I don't know why.
Below is my code:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time

class PriceCheckPhoenix:
    def __init__(self):
        self.url_login = "https://www.phoenixcontact.com/online/portal/pl?1dmy&urile=wcm%3apath%3a/plpl/web/home"
        self.create_session()

    def create_session(self):
        # Run browser with webdriver
        driver = webdriver.Chrome(executable_path="D:/chromedriver_v84.exe")
        driver.get(self.url_login)
        time.sleep(2)
        # Find link to sub-website with login
        link = driver.find_element_by_xpath('//*[@id="pxc-funcnav"]/div[3]/ul/li[1]/a').get_attribute("href")
        driver.get(link)
        time.sleep(100)
Description of the code:
#1 I create a Chrome browser session
#2 The first website is loaded from self.url_login
#3 It loads fine
#4 I need to find the link behind the active text on the website in order to log in
#5 I find it and try to open it, but the response after getting the link is:
Access Denied
You don't have permission to access
"http://www.phoenixcontact.com/online/portal/pl/pxc/offcontext/login/!ut/p/z1/tZJNa4NAEIZ_Sw45yszuuro9WkO1xqY2EqN7EbXGWPzYFDGlv74Gcio0oYTMZRgY3mcYHpAQg-yysa6yoe67rJnmRBqpu4zownzixDEYx2cWmIYTeYgrHSKQIFVRv0MieJZTZEITglFNLwTXRPaw03RGC6Qm10nOTttFN6hhD4lqVDPHY5nPcd-3JSQTy0ypQ5C4Onl5XUcmvgXCttzNWo-WCNuxLo-w6frPdjot_CfZxWsEciPhSjy7a7xN7xt_63M8kJdNmlSrPw4HaU2G9N1Qfg0Q_1Zke4JeiPHIeQH_KAshVE0a-GkQ24EPqm0F41WbLh5XWuKN3-fm78KgsmazH7dw0Ts!/dz/d5/L0lJSklKQ2dwUkEhIS9JRGpBQUF4QUFFUkNwcVlxLzRObEdRb1lwTWhUalVFZyEvWjZfR0FMNjE0ODI4RzNEQzBJMklPMlA2OTFHMDMvWjdfR0FMNjE0ODI4RzNEQzBJMklPMlA2OTFHSTcvdGFyZ2V0Vmlldy9sb2dpbg!!/" on this server.
Reference #18.d58655f.1597921471.5b29112
Does anyone know what is wrong here? :( When I load the website from the link in a normal Chrome browser, it's all fine :/
Thank you all for any help.
Please try the below code and let me know if it works for you :-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
options = Options()
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36'
options.add_argument('user-agent={0}'.format(user_agent))
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 20)
action = ActionChains(driver)
driver.get("https://www.phoenixcontact.com/online/portal/pl?1dmy&urile=wcm%3apath%3a/plpl/web/home")
Login_Btn = wait.until(EC.element_to_be_clickable((By.XPATH, "//*[@class='pxc-fn-login']/a")))
action.move_to_element(Login_Btn).click().perform()
Note - Please make the changes in your code accordingly.
Google search brought me here. After trying several options, undetected-chromedriver with a very simple script and no options worked for me.
import undetected_chromedriver as uc
driver = uc.Chrome()
driver.get(<url here>)
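For completeness, here is the same idea applied to the URL from the question, with an explicit wait added so nothing is read from the page too early (waiting for the body tag is purely illustrative):

import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = uc.Chrome()
driver.get("https://www.phoenixcontact.com/online/portal/pl?1dmy&urile=wcm%3apath%3a/plpl/web/home")
# wait until the page body is present before reading anything from it
WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
print(driver.title)
driver.quit()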
I'm trying to write a program that extracts the prices from the website below. I'm downloading the site with Selenium and then trying to parse it either with BeautifulSoup or with Selenium itself.
I determined that the information I want always has class="totalPrice", and I would like to extract all of those elements, ideally as a list.
<td class="totalPrice" colspan="3">
Total: £560
<span class="sr_room_reinforcement"></span>
</td>
For some reason the queries below never find any totalPrice elements. Any suggestions about what I'm doing wrong would be appreciated.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as bs
url='http://www.booking.com/searchresults.en-gb.html?label=gen173nr-17CAEoggJCAlhYSDNiBW5vcmVmaFCIAQGYAS64AQTIAQTYAQHoAQH4AQs;sid=1a43e0952558ac0ad0061d5b6523a7bc;dcid=1;checkin_monthday=4;checkin_year_month=2016-2;checkout_monthday=11;checkout_year_month=2016-2;city=-2601889;class_interval=1;csflt=%7B%7D;group_adults=7;group_children=0;highlighted_hotels=1192837;hp_sbox=1;label_click=undef;no_rooms=1;review_score_group=empty;room1=A%2CA%2CA%2CA%2CA%2CA%2CA;sb_price_type=total;score_min=0;si=ai%2Cco%2Cci%2Cre%2Cdi;ss=London;ssafas=1;ssb=empty;ssne=London;ssne_untouched=London&;order=price_for_two'
driver = webdriver.PhantomJS(r"C:\\Program Files (x86)\\phantomjs-2.0.0-windows\\bin\\phantomjs.exe")
#driver = webdriver.Firefox()
driver.get(url)
# for elm in driver.find_element_by_class_name("totalPrice"):
# print(elm.text)
content = driver.page_source
soup = bs(content, 'lxml')
for e in soup.find_all('totalPrice'):
    print(e.name)
driver.close()
First of all, you need to wait until the total prices are loaded. Use the WebDriverWait class with the presence_of_element_located Expected Condition.
I've also found that you need to pretend not to be PhantomJS by overriding the browser's User-Agent through the Desired Capabilities.
Complete working code:
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs
url = 'http://www.booking.com/searchresults.en-gb.html?label=gen173nr-17CAEoggJCAlhYSDNiBW5vcmVmaFCIAQGYAS64AQTIAQTYAQHoAQH4AQs;sid=1a43e0952558ac0ad0061d5b6523a7bc;dcid=1;checkin_monthday=4;checkin_year_month=2016-2;checkout_monthday=11;checkout_year_month=2016-2;city=-2601889;class_interval=1;csflt=%7B%7D;group_adults=7;group_children=0;highlighted_hotels=1192837;hp_sbox=1;label_click=undef;no_rooms=1;review_score_group=empty;room1=A%2CA%2CA%2CA%2CA%2CA%2CA;sb_price_type=total;score_min=0;si=ai%2Cco%2Cci%2Cre%2Cdi;ss=London;ssafas=1;ssb=empty;ssne=London;ssne_untouched=London&;order=price_for_two'
# setting a custom User-Agent
user_agent = (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) " +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"
)
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = user_agent
driver = webdriver.PhantomJS(desired_capabilities=dcap)
driver.get(url)
# wait for the total prices to become present
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".totalPrice")))
content = driver.page_source
driver.close()
soup = bs(content, 'lxml')
for e in soup.select('.totalPrice'):
    print(e.text.strip())
It prints:
Total: US$781
Total: US$814
Total: US$831
Total: US$864
Total: US$895
Total: US$914
Total: US$915
Total: US$967
Total: US$1,031
As a side note, you don't really need BeautifulSoup - you can locate elements with selenium - it is quite powerful. Here is how you can locate the total prices:
for price in driver.find_elements_by_css_selector(".totalPrice"):
    print(price.text.strip())
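(If you are on a recent Selenium release, the find_elements_by_css_selector helper has since been removed; the equivalent call looks like this:)

from selenium.webdriver.common.by import By

for price in driver.find_elements(By.CSS_SELECTOR, ".totalPrice"):
    print(price.text.strip())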
I'm trying to screen-scrape a web site from a Python script (using Selenium) without having to launch an actual browser instance. I can do this with Chrome or Firefox - I've tried it and it works - but I want to use PhantomJS so it's headless.
The code looks like this:
import sys
import traceback
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 "
"(KHTML, like Gecko) Chrome/15.0.87"
)
try:
# Choose our browser
browser = webdriver.PhantomJS(desired_capabilities=dcap)
#browser = webdriver.PhantomJS()
#browser = webdriver.Firefox()
#browser = webdriver.Chrome(executable_path="/usr/local/bin/chromedriver")
# Go to the login page
browser.get("https://www.whatever.com")
# For debug, see what we got back
html_source = browser.page_source
with open('out.html', 'w') as f:
f.write(html_source)
# PROCESS THE PAGE (code removed)
except Exception, e:
browser.save_screenshot('screenshot.png')
traceback.print_exc(file=sys.stdout)
finally:
browser.close()
The output is merely:
<html><head></head><body></body></html>
But when I use the Chrome or Firefox options, it works fine. I thought maybe the web site was returning junk based on the user agent, so I tried faking that out. No difference.
What am I missing?
UPDATED: I will try to keep the snippet below updated until it works. What's below is what I'm currently trying.
import sys
import traceback
import time
import re
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 (KHTML, like Gecko) Chrome/15.0.87")

try:
    # Set up our browser
    browser = webdriver.PhantomJS(desired_capabilities=dcap, service_args=['--ignore-ssl-errors=true'])
    #browser = webdriver.Chrome(executable_path="/usr/local/bin/chromedriver")

    # Go to the login page
    print "getting web page..."
    browser.get("https://www.website.com")

    # Need to wait for the page to load
    timeout = 10
    print "waiting %s seconds..." % timeout
    wait = WebDriverWait(browser, timeout)
    element = wait.until(EC.element_to_be_clickable((By.ID, 'the_id')))
    print "done waiting. Response:"

    # Rest of code snipped. Fails as "wait" above.
I was facing the same problem, and no amount of code to make the driver wait was helping.
The problem is SSL on https websites; telling PhantomJS to ignore SSL errors does the trick.
Call the PhantomJS driver as:
driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true', '--ssl-protocol=TLSv1'])
This solved the problem for me.
You need to wait for the page to load. Usually, it is done by using an Explicit Wait to wait for a key element to be present or visible on a page. For instance:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# ...
browser.get("https://www.whatever.com")
wait = WebDriverWait(browser, 10)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.content")))
html_source = browser.page_source
# ...
Here, we'll wait up to 10 seconds for a div element with class="content" to become visible before getting the page source.
Additionally, you may need to ignore SSL errors:
browser = webdriver.PhantomJS(desired_capabilities=dcap, service_args=['--ignore-ssl-errors=true'])
Though, I'm pretty sure this is related to the redirect issues in PhantomJS. There is an open ticket in the PhantomJS bug tracker:
PhantomJS does not follow some redirects
driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true', '--ssl-protocol=TLSv1'])
This worked for me