I'm trying to write a program that extracts the prices from the website below. I'm downloading the site with Selenium and then trying to parse it either with BeautifulSoup or with Selenium itself.
I determined that the information I want is always in an element with class="totalPrice", and I would like to extract all of these values, ideally as a list.
<td class="totalPrice" colspan="3">
Total: £560
<span class="sr_room_reinforcement"></span>
</td>
For some reason the queries below never find any totalPrice elements. Any suggestions as to what I'm doing wrong would be appreciated.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as bs
url='http://www.booking.com/searchresults.en-gb.html?label=gen173nr-17CAEoggJCAlhYSDNiBW5vcmVmaFCIAQGYAS64AQTIAQTYAQHoAQH4AQs;sid=1a43e0952558ac0ad0061d5b6523a7bc;dcid=1;checkin_monthday=4;checkin_year_month=2016-2;checkout_monthday=11;checkout_year_month=2016-2;city=-2601889;class_interval=1;csflt=%7B%7D;group_adults=7;group_children=0;highlighted_hotels=1192837;hp_sbox=1;label_click=undef;no_rooms=1;review_score_group=empty;room1=A%2CA%2CA%2CA%2CA%2CA%2CA;sb_price_type=total;score_min=0;si=ai%2Cco%2Cci%2Cre%2Cdi;ss=London;ssafas=1;ssb=empty;ssne=London;ssne_untouched=London&;order=price_for_two'
driver = webdriver.PhantomJS(r"C:\\Program Files (x86)\\phantomjs-2.0.0-windows\\bin\\phantomjs.exe")
#driver = webdriver.Firefox()
driver.get(url)
# for elm in driver.find_element_by_class_name("totalPrice"):
#     print(elm.text)
content = driver.page_source
soup = bs(content, 'lxml')
for e in soup.find_all('totalPrice'):
    print(e.name)
driver.close()
First of all, you need to wait until the total prices are loaded. Use the WebDriverWait class with the presence_of_element_located Expected Condition.
I've also found that you need to pretend not to be PhantomJS by overriding the browser's User-Agent through the Desired Capabilities.
Complete working code:
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs
url = 'http://www.booking.com/searchresults.en-gb.html?label=gen173nr-17CAEoggJCAlhYSDNiBW5vcmVmaFCIAQGYAS64AQTIAQTYAQHoAQH4AQs;sid=1a43e0952558ac0ad0061d5b6523a7bc;dcid=1;checkin_monthday=4;checkin_year_month=2016-2;checkout_monthday=11;checkout_year_month=2016-2;city=-2601889;class_interval=1;csflt=%7B%7D;group_adults=7;group_children=0;highlighted_hotels=1192837;hp_sbox=1;label_click=undef;no_rooms=1;review_score_group=empty;room1=A%2CA%2CA%2CA%2CA%2CA%2CA;sb_price_type=total;score_min=0;si=ai%2Cco%2Cci%2Cre%2Cdi;ss=London;ssafas=1;ssb=empty;ssne=London;ssne_untouched=London&;order=price_for_two'
# setting a custom User-Agent
user_agent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) " +
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"
)
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = user_agent
driver = webdriver.PhantomJS(desired_capabilities=dcap)
driver.get(url)
# wait for the total prices to become present
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".totalPrice")))
content = driver.page_source
driver.close()
soup = bs(content, 'lxml')
for e in soup.select('.totalPrice'):
    print(e.text.strip())
It prints:
Total: US$781
Total: US$814
Total: US$831
Total: US$864
Total: US$895
Total: US$914
Total: US$915
Total: US$967
Total: US$1,031
As a side note, you don't really need BeautifulSoup here - you can locate elements with Selenium itself, which is quite powerful. Here is how you can locate the total prices:
for price in driver.find_elements_by_css_selector(".totalPrice"):
    print(price.text.strip())
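Since you mentioned you would ideally like the prices as a list, here is a minimal sketch of one way to pull just the numbers out of the "Total: £560" style strings; the regular expression is an assumption based on the markup shown in the question:
import re

# collect the numeric part of each "Total: £560" style string into a list
prices = []
for price in driver.find_elements_by_css_selector(".totalPrice"):
    match = re.search(r"([\d,]+(?:\.\d+)?)", price.text)
    if match:
        # drop thousands separators and convert the remainder to a number
        prices.append(float(match.group(1).replace(",", "")))
print(prices)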
I am trying to write code that scrapes all reviews for a single hotel on TripAdvisor. The code runs through all pages except the last one, where it has a problem. It says that the problem is the next.click() in the loop. I am assuming this is because "next" is still present in the DOM on the last page, but just disabled. Does anyone know how to fix this? I basically want it to not try to click next when it reaches the last page, i.e. when the button is disabled but still technically present. Any help would be much appreciated!
#maybe3.1
from argparse import Action
from calendar import month
from distutils.command.clean import clean
from lib2to3.pgen2 import driver
from os import link
import unittest
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import ElementNotInteractableException
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from dateutil import relativedelta
from selenium.webdriver.common.action_chains import ActionChains
import time
import datetime
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import requests
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Extract the HTML and create a BeautifulSoup object.
url = ('https://www.tripadvisor.com/Hotel_Review-g46833-d256905-Reviews-Knights_Inn_South_Hackensack-South_Hackensack_New_Jersey.html#REVIEWS')
user_agent = ({'User-Agent':
                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
                    AppleWebKit/537.36 (KHTML, like Gecko) \
                    Chrome/90.0.4430.212 Safari/537.36',
               'Accept-Language': 'en-US, en;q=0.5'})
driver = webdriver.Chrome()
driver.get(url)
# Find and extract the data elements.
wait = WebDriverWait(driver,30)
wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="component_15"]/div/div[3]/div[13]/div')))
#explicit wait here
next = driver.find_element(By.XPATH, './/a[@class="ui_button nav next primary "]')
here = next.is_displayed()
while here == True:
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    time.sleep(2)
    Titles = []
    for title in soup.findAll('a', {'Qwuub'}):
        Titles.append(title.text.strip())
    reviews = []
    for review in soup.findAll('q', {'class': 'QewHA H4 _a'}):
        reviews.append(review.text.strip())
    next.click()
    if here != True:
        time.sleep(2)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        time.sleep(8)
        break
# Create the dictionary.
dict = {'Review Title':Titles,'Reviews/Feedback':reviews}
# Create the dataframe.
datafr = pd.DataFrame.from_dict(dict)
datafr.head(10)
# Convert dataframe to CSV file.
datafr.to_csv('hotels1.855.csv', index=False, header=True)
This question might be in the same vein as:
python selenium to check if this text field is disabled or not
You can check whether an element is enabled with:
driver.find_element_by_id("id").is_enabled()
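Applied to your loop, a minimal sketch could look like the following. The XPath is taken from your code; whether the last-page button actually reports its disabled state through is_enabled() is an assumption, which is why the class attribute is checked as well:
import time
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

while True:
    # ... scrape the current page with BeautifulSoup here ...
    try:
        next_btn = driver.find_element(By.XPATH, './/a[@class="ui_button nav next primary "]')
    except NoSuchElementException:
        break  # no next button at all, assume this is the last page
    if not next_btn.is_enabled() or "disabled" in next_btn.get_attribute("class"):
        break  # button still present but disabled, so stop paginating
    next_btn.click()
    time.sleep(2)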
You can also wrap the code in a try/except block.
import time

page = 2
while True:
    try:
        #your code
        driver.find_element(By.XPATH, f"//a[@class='pageNum ' and text()='{page}']").click()
        page += 1
        time.sleep(1)
    except:
        break
This should be a simple loop that goes through all the pages and waits until the a tag in question is no longer valid.
I am trying to scrape the blog post titles using Selenium with Python from the following URL: https://blog.coinbase.com/tagged/coinbase-pro. When I use Selenium to get the page source, it does not contain the blog post titles, but the Chrome source code does when I right-click and select "View page source". I'm using the following code:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.headless = True
driver = webdriver.Chrome(options=options)
driver.get("https://blog.coinbase.com/tagged/coinbase-pro")
pageSource = driver.page_source
print(pageSource)
Any help would be appreciated.
Thanks.
You can fetch all the titles from that webpage in several ways. The most efficient and fastest way would be to opt for requests.
This is how you can grab the titles using requests:
import re
import json
import time
import requests
link = 'https://medium.com/the-coinbase-blog/load-more'
params = {
    'sortBy': 'tagged',
    'tagSlug': 'coinbase-pro',
    'limit': 25,
    'to': int(time.time() * 1000),
}

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
    s.headers['accept'] = 'application/json'
    s.headers['referer'] = 'https://blog.coinbase.com/tagged/coinbase-pro'
    while True:
        res = s.get(link, params=params)
        container = json.loads(re.findall("[^{]+(.*)", res.text)[0])
        for k, v in container['payload']['references']['Post'].items():
            title = v['title']
            print(title)
        try:
            next_page = container['payload']['paging']['next']['to']
        except KeyError:
            break
        params['to'] = next_page
However, if it is selenium you want to stick with, try the following:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
def scroll_down_to_the_bottom():
    check_height = driver.execute_script("return document.body.scrollHeight;")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            WebDriverWait(driver, 10).until(lambda driver: driver.execute_script("return document.body.scrollHeight;") > check_height)
            check_height = driver.execute_script("return document.body.scrollHeight;")
        except TimeoutException:
            break

with webdriver.Chrome() as driver:
    driver.get("https://blog.coinbase.com/tagged/coinbase-pro")
    scroll_down_to_the_bottom()
    for item in WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".section-content h3.graf--title"))):
        print(item.text)
If you want the 8 titles, you can grab them by their css selector using waits.
wait = WebDriverWait(driver, 30)
driver.get("https://blog.coinbase.com/tagged/coinbase-pro")
elements = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".graf.graf--h3.graf-after--figure.graf--trailing.graf--title")))
for elem in elements:
    print(elem.text)
Import:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
Outputs:
Inverse Finance (INV), Liquity (LQTY), Polyswarm (NCT) and Propy (PRO) are launching on Coinbase Pro
Goldfinch Protocol (GFI) is launching on Coinbase Pro
Decentralized Social (DESO) is launching on Coinbase Pro
API3 (API3), Bluezelle (BLZ), Gods Unchained (GODS), Immutable X (IMX), Measurable Data Token (MDT) and Ribbon…
Circuits of Value (COVAL), IDEX (IDEX), Moss Carbon Credit (MCO2), Polkastarter (POLS), ShapeShift FOX Token (FOX)…
Voyager Token (VGX) is launching on Coinbase Pro
Alchemix (ALCX), Ethereum Name Service (ENS), Gala (GALA), mStable USD (MUSD) and Power Ledger (POWR) are launching…
Crypto.com Protocol (CRO) is launching on Coinbase Pro
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.common.exceptions import TimeoutException
driver = webdriver.Chrome(r'C:\chromedriver.exe')
driver.get('https://www.gigadocs.com/hyderabad/dentist')
driver.find_element_by_xpath('//*[@id="listingTab"]/div[2]/div/div[1]/div[1]/div/div[2]/div[2]/ul/li[1]/span').click()
soup = BeautifulSoup(driver.page_source,'html.parser')
mobile = soup.find('ul',class_='detailsList')
print(mobile)
I am trying to click on "View Contact" to scrape the mobile number, but after clicking I am still getting "View Contact" as the output.
You don't need the overhead of selenium. The page makes POST requests using ids for the doctor and clinic to retrieve telephone numbers. You can scrape these ids from the initial page and then mimic those requests to get the telephone numbers. I use the doctor id as the key of a dictionary and update the values with the telephone numbers.
import requests
from bs4 import BeautifulSoup as bs
headers = {
    'User-Agent': 'Mozilla/5.0',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest'
}

data = {
    'doctorId': '3806',  # [data-doctor]
    'clinicId': '1519',  # [data-clinic]
    'clickSource': 'mobile'
}

with requests.Session() as s:
    s.headers = headers
    r = s.get('https://www.gigadocs.com/hyderabad/dentist')
    soup = bs(r.content, 'lxml')
    tel_numbers = {i['data-doctor']: i['data-clinic'] for i in soup.select('.appointmentBtn')}
    for k, v in tel_numbers.items():
        data['doctorId'] = k
        data['clinicId'] = v
        r = s.post('https://www.gigadocs.com/search/getmobilenumbers', data=data).json()
        tel_numbers[k] = r['mobile']
    print(tel_numbers)
Induce WebDriverWait with element_to_be_clickable(), click the View Contact element, and then get the li tag under the ul tag.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
driver = webdriver.Chrome(r'C:\chromedriver.exe')
driver.get('https://www.gigadocs.com/hyderabad/dentist')
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//span[@data-source='mobile'][text()='View Contact']"))).click()
time.sleep(1)
soup = BeautifulSoup(driver.page_source,'html.parser')
mobile = soup.find('ul',class_='detailsList')
print(mobile.find('li').text)
I have written a script in Python using Selenium to fetch the business summary (which is within a p tag) located at the bottom right corner of a webpage, under the header Company Profile. The webpage is heavily dynamic, so I decided to use a browser simulator. I created a CSS selector which is able to parse the summary if I copy the HTML elements directly from that webpage and try it locally. For some reason, when I tried the same selector within the script below, it doesn't do the trick. It throws a timeout exception error instead. How can I fetch it?
This is my attempt:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
link = "https://in.finance.yahoo.com/quote/AAPL?p=AAPL"
def get_information(driver, url):
    driver.get(url)
    item = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "[id$='-QuoteModule'] p[class^='businessSummary']")))
    driver.execute_script("arguments[0].scrollIntoView();", item)
    print(item.text)

if __name__ == "__main__":
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 20)
    try:
        get_information(driver, link)
    finally:
        driver.quit()
It seems that there is no Business Summary block initially; it is generated after you scroll the page down. Try the solution below:
from selenium.webdriver.common.keys import Keys
def get_information(driver, url):
    driver.get(url)
    driver.find_element_by_tag_name("body").send_keys(Keys.END)
    item = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "[id$='-QuoteModule'] p[class^='businessSummary']")))
    print(item.text)
You have to scroll the page down twice before the element becomes present:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time

link = "https://in.finance.yahoo.com/quote/AAPL?p=AAPL"

def get_information(driver, url):
    driver.get(url)
    driver.find_element_by_tag_name("body").send_keys(Keys.END)  # scroll page
    time.sleep(1)  # small pause between
    driver.find_element_by_tag_name("body").send_keys(Keys.END)  # one more time
    item = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "[id$='-QuoteModule'] p[class^='businessSummary']")))
    driver.execute_script("arguments[0].scrollIntoView();", item)
    print(item.text)

if __name__ == "__main__":
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 20)
    try:
        get_information(driver, link)
    finally:
        driver.quit()
If you scroll only once it won't work properly, for some reason (at least for me). I think it depends on the window dimensions: on a smaller window you have to scroll more than on a bigger one.
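Since the number of scrolls needed seems to depend on the window size, a more robust variant could keep scrolling until the summary shows up. This is only a sketch reusing the selector and imports from the snippet above, with the maximum number of attempts chosen arbitrarily:
def get_information(driver, url, max_scrolls=10):
    driver.get(url)
    body = driver.find_element_by_tag_name("body")
    for _ in range(max_scrolls):
        body.send_keys(Keys.END)  # scroll one screen further down
        try:
            # short wait per attempt instead of one long wait
            item = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.CSS_SELECTOR, "[id$='-QuoteModule'] p[class^='businessSummary']")))
            print(item.text)
            return
        except TimeoutException:
            continue  # not loaded yet, scroll again
    raise TimeoutException("business summary never appeared")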
Here is a much simpler approach using requests, working with the JSON data that is already in the page. I would also recommend always using requests if possible: it may take some extra work, but the end result is a lot more reliable and cleaner. You could also take my example a lot further and parse the JSON to work with it directly (you need to clean up the text to be valid JSON). In my example I just use split, which was faster to write, but it could lead to problems down the road when doing something more complex.
import requests
from lxml import html
url = 'https://in.finance.yahoo.com/quote/AAPL?p=AAPL'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
r = requests.get(url, headers=headers)
tree = html.fromstring(r.text)
data= [e.text_content() for e in tree.iter('script') if 'root.App.main = ' in e.text_content()][0]
data = data.split('longBusinessSummary":"')[1]
data = data.split('","city')[0]
print (data)
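If you do want to parse the JSON properly instead of relying on split, a minimal sketch could look like this; both the regular expression and the key path down to longBusinessSummary are assumptions about how the root.App.main payload is laid out, so verify them against the actual page:
import re
import json

# `data` here is the text_content() of the matching <script> tag,
# i.e. its value right after the list comprehension above, before the splits
match = re.search(r'root\.App\.main\s*=\s*(\{.*\})\s*;', data, re.DOTALL)
payload = json.loads(match.group(1))

# assumed key path; inspect `payload` if this raises a KeyError
summary = (payload['context']['dispatcher']['stores']
           ['QuoteSummaryStore']['assetProfile']['longBusinessSummary'])
print(summary)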
I was trying to get the embedded video URL from https://www.fmovies.is. I'm using selenium.PhantomJS(). The exact same code works perfectly if I use the selenium.Firefox() driver. It seems as though I'm doing something wrong during the waiting phase.
If someone could point out what I'm doing wrong, I would really appreciate it.
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import DesiredCapabilities
desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
desired_capabilities['phantomjs.page.customHeaders.User-Agent'] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5)AppleWebKit 537.36 (KHTML, like Gecko) Chrome"
desired_capabilities['phantomjs.page.customHeaders.Accept'] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
url = "https://fmovies.is/film/kung-fu-panda-2.9kx/q8kkyj"
driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'],desired_capabilities=desired_capabilities)
driver.get(url)
try:
    element = WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.ID, "jw")))
finally:
    driver.find_element_by_id("player").click()
pageSource = driver.page_source
soup = BeautifulSoup(pageSource,'lxml')
url = soup.find("video",{"class":"jw-video"})
print url
videoURL = ''
if url:
    videoURL = url['src']
print videoURL