Scraping Blog Post Titles with Selenium - Python

Scraping Blog Post Titles with Selenium - Python - python

I am trying to scrape the blog post titles using Selenium with Python of the following URL: https://blog.coinbase.com/tagged/coinbase-pro. When I use Selenium to get the page source, it does not contain the blog post titles, but the Chrome source code does when I right click and select "view page source". I'm using the following code:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.headless = True
driver = webdriver.Chrome(options=options)
driver.get("https://blog.coinbase.com/tagged/coinbase-pro")
pageSource = driver.page_source
print(pageSource)
Any help would be appreciated.
Thanks.

You can fetch all the titles from that webpage in several ways. The efficient and fastest way would be to opt for requests.
This is how you can grab the titles using requests:
import re
import json
import time
import requests
link = 'https://medium.com/the-coinbase-blog/load-more'
params = {
'sortBy': 'tagged',
'tagSlug': 'coinbase-pro',
'limit': 25,
'to': int(time.time() * 1000),
}
with requests.Session() as s:
s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
s.headers['accept'] = 'application/json'
s.headers['referer'] = 'https://blog.coinbase.com/tagged/coinbase-pro'
while True:
res = s.get(link,params=params)
container = json.loads(re.findall("[^{]+(.*)",res.text)[0])
for k,v in container['payload']['references']['Post'].items():
title = v['title']
print(title)
try:
next_page = container['payload']['paging']['next']['to']
except KeyError:
break
params['to'] = next_page
However, if it is selenium you want to stick with, try the following:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
def scroll_down_to_the_bottom():
check_height = driver.execute_script("return document.body.scrollHeight;")
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
try:
WebDriverWait(driver,10).until(lambda driver: driver.execute_script("return document.body.scrollHeight;") > check_height)
check_height = driver.execute_script("return document.body.scrollHeight;")
except TimeoutException:
break
with webdriver.Chrome() as driver:
driver.get("https://blog.coinbase.com/tagged/coinbase-pro")
scroll_down_to_the_bottom()
for item in WebDriverWait(driver,10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR,".section-content h3.graf--title"))):
print(item.text)

wait=WebDriverWait(driver,30)
driver.get("https://blog.coinbase.com/tagged/coinbase-pro")
elements=wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR,".graf.graf--h3.graf-after--figure.graf--trailing.graf--title")))
for elem in elements:
print(elem.text)
If you wanted the 8 titles you can grab them by their css selector using waits.
Import:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
Outputs:
Inverse Finance (INV), Liquity (LQTY), Polyswarm (NCT) and Propy (PRO) are launching on Coinbase Pro
Goldfinch Protocol (GFI) is launching on Coinbase Pro
Decentralized Social (DESO) is launching on Coinbase Pro
API3 (API3), Bluezelle (BLZ), Gods Unchained (GODS), Immutable X (IMX), Measurable Data Token (MDT) and Ribbon…
Circuits of Value (COVAL), IDEX (IDEX), Moss Carbon Credit (MCO2), Polkastarter (POLS), ShapeShift FOX Token (FOX)…
Voyager Token (VGX) is launching on Coinbase Pro
Alchemix (ALCX), Ethereum Name Service (ENS), Gala (GALA), mStable USD (MUSD) and Power Ledger (POWR) are launching…
Crypto.com Protocol (CRO) is launching on Coinbase Pro

Related

Selenium Scrape not reading all elements

I am trying to scrape data from the following site. I was able to click on load more yet the code doesn't catch most of the elements and I do not really know what to do.
url = 'https://www.carrefouregypt.com/mafegy/en/c/FEGY1701230'
products = []
options = Options()
driver = webdriver.Chrome(options = options)
driver.get(url)
time.sleep(8)
#click on load more
while True:
try:
btn_class = 'css-1n3fqy0'
btn = driver.find_element(By.CLASS_NAME , btn_class)
btn.click()
driver.implicitly_wait(10)
except NoSuchElementException:
break
driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
time.sleep(8)

The following code will click that button until it cannot locate it, and exit gracefully:
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options as Firefox_Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
import time as t
firefox_options = Firefox_Options()
firefox_options.add_argument("--width=1280")
firefox_options.add_argument("--height=720")
# firefox_options.headless = True
firefox_options.set_preference("general.useragent.override", "Mozilla/5.0 (Linux; Android 7.0; SM-A310F Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.91 Mobile Safari/537.36 OPR/42.7.2246.114996")
driverService = Service('chromedriver/geckodriver')
browser = webdriver.Firefox(service=driverService, options=firefox_options)
url = 'https://www.carrefouregypt.com/mafegy/en/c/FEGY1701230'
browser.get(url)
t.sleep(5)
while True:
try:
load_more_button = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH,'//button[text()="Load More"]')))
browser.execute_script('window.scrollBy(0, 100);')
load_more_button.click()
print('clicked')
t.sleep(3)
except TimeoutException:
print('all elements loaded in page')
break
It's using Firefox, on a linux setup (for some reasons Chrome was temperamental on this one). You just have to observe the imports, and the code after defining the browser/driver. Selenium documentation: https://www.selenium.dev/documentation/

how to make driver to wait upto 10 seconds to click hyperlink

In the below URL i need to click a mail icon hyperlink, sometimes it is not working even code is correct, in this case driver needs to wait upto 10 seconds and go to the next level
https://www.sciencedirect.com/science/article/pii/S1001841718305011
tags = driver.find_elements_by_xpath('//a[#class="author size-m workspace-trigger"]//*[local-name()="svg"]')
if tags:
for tag in tags:
tag.click()
how to use explicitly or implicitly wait here-- "tag.click()"

from my understanding, after the element clicked it should wait until author popup appear then extract using details() ?
tags = driver.find_elements_by_css_selector('svg.icon-envelope')
if tags:
for tag in tags:
tag.click()
# wait until author dialog/popup on the right appear
WebDriverWait(driver, 10).until(
lambda d: d.find_element_by_class_name('e-address') # selector for email
)
try:
details()
# close the popup
driver.find_element_by_css_selector('button.close-button').click()
except Exception as ex:
print(ex)
continue

As an aside.. you can extract the author contact e-mails (which are same as for click) from json like string in one of the scripts
from selenium import webdriver
import json
d = webdriver.Chrome()
d.get('https://www.sciencedirect.com/science/article/pii/S1001841718305011#!')
script = d.find_element_by_css_selector('script[data-iso-key]').get_attribute('innerHTML')
script = script.replace(':false',':"false"').replace(':true',':"true"')
data = json.loads(script)
authors = data['authors']['content'][0]['$$']
emails = [author['$$'][3]['$']['href'].replace('mailto:','') for author in authors if len(author['$$']) == 4]
print(emails)
d.quit()
You can also use requests to get all the recommendations info
import requests
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
}
data = requests.get('https://www.sciencedirect.com/sdfe/arp/pii/S1001841718305011/recommendations?creditCardPurchaseAllowed=true&preventTransactionalAccess=false&preventDocumentDelivery=true', headers = headers).json()
print(data)
Sample view:

You have to wait until the element is clickable . You can do it with WebDriverWait function.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
driver.get('url')
elements = driver.find_elements_by_xpath('xpath')
for element in elements:
try:
WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.LINK_TEXT, element.text)))
finally:
element.click()

You can try like below to click on the hyperlinks containing mail icon. When a click is initiated, a pop up box shows up containing additional information. The following script can fetch the email address from there. It's always a great trouble to dig out anything when svg element are there. I've used BeautifulSoup library in order for the usage of .extract() function to kick out svg element so that the script can reach the content.
from bs4 import BeautifulSoup
from contextlib import closing
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
with closing(webdriver.Chrome()) as driver:
driver.get("https://www.sciencedirect.com/science/article/pii/S1001841718305011")
for elem in WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.XPATH, "//a[starts-with(#name,'baut')]")))[-2:]:
elem.click()
soup = BeautifulSoup(driver.page_source,"lxml")
[item.extract() for item in soup.select("svg")]
email = soup.select_one("a[href^='mailto:']").text
print(email)
Output:
weibingzhang#ecust.edu.cn
junhongqian#ecust.edu.cn

use the builtin time.sleep() function
from time import sleep
tags = driver.find_elements_by_xpath('//a[#class="author size-m workspace-trigger"]//*[local-name()="svg"]')
if tags:
for tag in tags:
sleep(10)
tag.click()

Unable to get a dynamically generated content from a webpage

I have written a script in python using selenium to fetch the business summary (which is within p tag) located at the bottom right corner of a webpage under the header Company profile. The webpage is heavily dynamic, so I thought to use a browser simulator. I have created a css selector, which is able to parse the summary if I copy the html elements directly from that webpage and try on it locally. For some reason, when I tried the same selector within my below script, it doesn't do the trick. It throws timeout exception error instead. How can I fetch it?
This is my try:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
link = "https://in.finance.yahoo.com/quote/AAPL?p=AAPL"
def get_information(driver, url):
driver.get(url)
item = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "[id$='-QuoteModule'] p[class^='businessSummary']")))
driver.execute_script("arguments[0].scrollIntoView();", item)
print(item.text)
if __name__ == "__main__":
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 20)
try:
get_information(driver,link)
finally:
driver.quit()

It seem that there is no Business Summary block initially, but it is generated after you scroll page down. Try below solution:
from selenium.webdriver.common.keys import Keys
def get_information(driver, url):
driver.get(url)
driver.find_element_by_tag_name("body").send_keys(Keys.END)
item = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "[id$='-QuoteModule'] p[class^='businessSummary']")))
print(item.text)

You have to scroll the page down twice until the element will be present:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
link = "https://in.finance.yahoo.com/quote/AAPL?p=AAPL"
def get_information(driver, url):
driver.get(url)
driver.find_element_by_tag_name("body").send_keys(Keys.END) # scroll page
time.sleep(1) # small pause between
driver.find_element_by_tag_name("body").send_keys(Keys.END) # one more time
item = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "[id$='-QuoteModule'] p[class^='businessSummary']")))
driver.execute_script("arguments[0].scrollIntoView();", item)
print(item.text)
if __name__ == "__main__":
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 20)
try:
get_information(driver,link)
finally:
driver.quit()
If you will scroll only one time it won't work properly at some reason(at least for me). I think it depends on window dimensions, on the smaller window you have to scroll more than on a bigger one.

Here is a much simpler approach using requests and working with the JSON data that is already in the page. I would also recommend to always use request if possible. It may take some extra work but the end result is a lot more reliable / cleaner. You could also take my example a lot further and parse the JSON to work directly with it (you need to clean up the text to be valid JSON). In my example I just use split which was just faster to do but it could lead to problems down the road when doing something more complex.
import requests
from lxml import html
url = 'https://in.finance.yahoo.com/quote/AAPL?p=AAPL'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
r = requests.get(url, headers=headers)
tree = html.fromstring(r.text)
data= [e.text_content() for e in tree.iter('script') if 'root.App.main = ' in e.text_content()][0]
data = data.split('longBusinessSummary":"')[1]
data = data.split('","city')[0]
print (data)

Selenium PhantomJS never finishes loading / incomplete loading

I was trying to get the embedded video URL from https://www.fmovies.is . I'm using selenium.PhantomJS(). The exact same code works perfectly if I use selenium.Firefox() driver . It seems as though I'm doing something wrong during the waiting phase.
If someone could point out what I was doing wrong , I would really appreciate it.
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import DesiredCapabilities
desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
desired_capabilities['phantomjs.page.customHeaders.User-Agent'] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5)AppleWebKit 537.36 (KHTML, like Gecko) Chrome"
desired_capabilities['phantomjs.page.customHeaders.Accept'] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
url = "https://fmovies.is/film/kung-fu-panda-2.9kx/q8kkyj"
driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'],desired_capabilities=desired_capabilities)
driver.get(url)
try:
element = WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.ID, "jw")))
finally:
driver.find_element_by_id("player").click()
pageSource = driver.page_source
soup = BeautifulSoup(pageSource,'lxml')
url = soup.find("video",{"class":"jw-video"})
print url
videoURL = ''
if url:
videoURL = url['src']
print videoURL

BeautifulSoup not finding elements

I'm trying to write a program that extracts the prices of the below website. I'm downloading the site with selenium and then try to parse it either with beautifulsoup or with selenium itself.
I determined that the information I want is always class="totalPrice" and I would like to extract them all, ideally as a list.
<td class="totalPrice" colspan="3">
Total: £560
<span class="sr_room_reinforcement"></span>
</td>
For some reason the below queries never find any totalPrice. Any suggestions what I'm doing wrong would be appreciated.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as bs
url='http://www.booking.com/searchresults.en-gb.html?label=gen173nr-17CAEoggJCAlhYSDNiBW5vcmVmaFCIAQGYAS64AQTIAQTYAQHoAQH4AQs;sid=1a43e0952558ac0ad0061d5b6523a7bc;dcid=1;checkin_monthday=4;checkin_year_month=2016-2;checkout_monthday=11;checkout_year_month=2016-2;city=-2601889;class_interval=1;csflt=%7B%7D;group_adults=7;group_children=0;highlighted_hotels=1192837;hp_sbox=1;label_click=undef;no_rooms=1;review_score_group=empty;room1=A%2CA%2CA%2CA%2CA%2CA%2CA;sb_price_type=total;score_min=0;si=ai%2Cco%2Cci%2Cre%2Cdi;ss=London;ssafas=1;ssb=empty;ssne=London;ssne_untouched=London&;order=price_for_two'
driver = webdriver.PhantomJS(r"C:\\Program Files (x86)\\phantomjs-2.0.0-windows\\bin\\phantomjs.exe")
#driver = webdriver.Firefox()
driver.get(url)
# for elm in driver.find_element_by_class_name("totalPrice"):
# print(elm.text)
content = driver.page_source
soup = bs(content, 'lxml')
for e in soup.find_all('totalPrice'):
print(e.name)
driver.close()

First of all, you need to wait when the total prices would be loaded. Use WebDriverWait class with a precense_of_element_located Expected Condition.
I've also found out that you would need to pretend not to be PhantomJS by overriding the browser's User-Agent through the Desired Capabilities.
Complete working code:
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs
url = 'http://www.booking.com/searchresults.en-gb.html?label=gen173nr-17CAEoggJCAlhYSDNiBW5vcmVmaFCIAQGYAS64AQTIAQTYAQHoAQH4AQs;sid=1a43e0952558ac0ad0061d5b6523a7bc;dcid=1;checkin_monthday=4;checkin_year_month=2016-2;checkout_monthday=11;checkout_year_month=2016-2;city=-2601889;class_interval=1;csflt=%7B%7D;group_adults=7;group_children=0;highlighted_hotels=1192837;hp_sbox=1;label_click=undef;no_rooms=1;review_score_group=empty;room1=A%2CA%2CA%2CA%2CA%2CA%2CA;sb_price_type=total;score_min=0;si=ai%2Cco%2Cci%2Cre%2Cdi;ss=London;ssafas=1;ssb=empty;ssne=London;ssne_untouched=London&;order=price_for_two'
# setting a custom User-Agent
user_agent = (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) " +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"
)
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = user_agent
driver = webdriver.PhantomJS(desired_capabilities=dcap)
driver.get(url)
# wait for the total prices to become present
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".totalPrice")))
content = driver.page_source
driver.close()
soup = bs(content, 'lxml')
for e in soup.select('.totalPrice'):
print(e.text.strip())
It prints:
Total: US$781
Total: US$814
Total: US$831
Total: US$864
Total: US$895
Total: US$914
Total: US$915
Total: US$967
Total: US$1,031
As a side note, you don't really need BeautifulSoup - you can locate elements with selenium - it is quite powerful. Here is how you can locate the total prices:
for price in driver.find_elements_by_css_selector(".totalPrice"):
print(price.text.strip())

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scraping Blog Post Titles with Selenium - Python - python

Related

Selenium Scrape not reading all elements

how to make driver to wait upto 10 seconds to click hyperlink

Unable to get a dynamically generated content from a webpage

Selenium PhantomJS never finishes loading / incomplete loading

BeautifulSoup not finding elements

Categories

Resources