Selenium Scrape not reading all elements - python

I am trying to scrape data from the following site. I was able to click on load more yet the code doesn't catch most of the elements and I do not really know what to do.
url = 'https://www.carrefouregypt.com/mafegy/en/c/FEGY1701230'
products = []
options = Options()
driver = webdriver.Chrome(options = options)
driver.get(url)
time.sleep(8)
#click on load more
while True:
try:
btn_class = 'css-1n3fqy0'
btn = driver.find_element(By.CLASS_NAME , btn_class)
btn.click()
driver.implicitly_wait(10)
except NoSuchElementException:
break
driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
time.sleep(8)

The following code will click that button until it cannot locate it, and exit gracefully:
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options as Firefox_Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
import time as t
firefox_options = Firefox_Options()
firefox_options.add_argument("--width=1280")
firefox_options.add_argument("--height=720")
# firefox_options.headless = True
firefox_options.set_preference("general.useragent.override", "Mozilla/5.0 (Linux; Android 7.0; SM-A310F Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.91 Mobile Safari/537.36 OPR/42.7.2246.114996")
driverService = Service('chromedriver/geckodriver')
browser = webdriver.Firefox(service=driverService, options=firefox_options)
url = 'https://www.carrefouregypt.com/mafegy/en/c/FEGY1701230'
browser.get(url)
t.sleep(5)
while True:
try:
load_more_button = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH,'//button[text()="Load More"]')))
browser.execute_script('window.scrollBy(0, 100);')
load_more_button.click()
print('clicked')
t.sleep(3)
except TimeoutException:
print('all elements loaded in page')
break
It's using Firefox, on a linux setup (for some reasons Chrome was temperamental on this one). You just have to observe the imports, and the code after defining the browser/driver. Selenium documentation: https://www.selenium.dev/documentation/

Related

Scraping Blog Post Titles with Selenium - Python

I am trying to scrape the blog post titles using Selenium with Python of the following URL: https://blog.coinbase.com/tagged/coinbase-pro. When I use Selenium to get the page source, it does not contain the blog post titles, but the Chrome source code does when I right click and select "view page source". I'm using the following code:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.headless = True
driver = webdriver.Chrome(options=options)
driver.get("https://blog.coinbase.com/tagged/coinbase-pro")
pageSource = driver.page_source
print(pageSource)
Any help would be appreciated.
Thanks.
You can fetch all the titles from that webpage in several ways. The efficient and fastest way would be to opt for requests.
This is how you can grab the titles using requests:
import re
import json
import time
import requests
link = 'https://medium.com/the-coinbase-blog/load-more'
params = {
'sortBy': 'tagged',
'tagSlug': 'coinbase-pro',
'limit': 25,
'to': int(time.time() * 1000),
}
with requests.Session() as s:
s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
s.headers['accept'] = 'application/json'
s.headers['referer'] = 'https://blog.coinbase.com/tagged/coinbase-pro'
while True:
res = s.get(link,params=params)
container = json.loads(re.findall("[^{]+(.*)",res.text)[0])
for k,v in container['payload']['references']['Post'].items():
title = v['title']
print(title)
try:
next_page = container['payload']['paging']['next']['to']
except KeyError:
break
params['to'] = next_page
However, if it is selenium you want to stick with, try the following:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
def scroll_down_to_the_bottom():
check_height = driver.execute_script("return document.body.scrollHeight;")
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
try:
WebDriverWait(driver,10).until(lambda driver: driver.execute_script("return document.body.scrollHeight;") > check_height)
check_height = driver.execute_script("return document.body.scrollHeight;")
except TimeoutException:
break
with webdriver.Chrome() as driver:
driver.get("https://blog.coinbase.com/tagged/coinbase-pro")
scroll_down_to_the_bottom()
for item in WebDriverWait(driver,10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR,".section-content h3.graf--title"))):
print(item.text)
wait=WebDriverWait(driver,30)
driver.get("https://blog.coinbase.com/tagged/coinbase-pro")
elements=wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR,".graf.graf--h3.graf-after--figure.graf--trailing.graf--title")))
for elem in elements:
print(elem.text)
If you wanted the 8 titles you can grab them by their css selector using waits.
Import:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
Outputs:
Inverse Finance (INV), Liquity (LQTY), Polyswarm (NCT) and Propy (PRO) are launching on Coinbase Pro
Goldfinch Protocol (GFI) is launching on Coinbase Pro
Decentralized Social (DESO) is launching on Coinbase Pro
API3 (API3), Bluezelle (BLZ), Gods Unchained (GODS), Immutable X (IMX), Measurable Data Token (MDT) and Ribbon…
Circuits of Value (COVAL), IDEX (IDEX), Moss Carbon Credit (MCO2), Polkastarter (POLS), ShapeShift FOX Token (FOX)…
Voyager Token (VGX) is launching on Coinbase Pro
Alchemix (ALCX), Ethereum Name Service (ENS), Gala (GALA), mStable USD (MUSD) and Power Ledger (POWR) are launching…
Crypto.com Protocol (CRO) is launching on Coinbase Pro

won't execute for loop in python selenium headless mode

Without using selenium headless, the below code works fine. But with headless mode, why the for loop won't execute??
Here is my code:-
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
options = Options()
options.add_argument("--disable-notifications")
options.add_argument('headless')
driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)
url = "https://www.justdial.com/Delhi/S-K-Premium-Par-Hari-Nagar/011PXX11-XX11-131128122154-B8G6_BZDET"
driver.get(url)
try:
pop_up = WebDriverWait(driver, 30).until(
EC.element_to_be_clickable((By.XPATH, '//*[#id="best_deal_detail_div"]/section/span')))
pop_up.click() # For disable pop-up
except TimeoutException:
pass
while True:
try:
element = WebDriverWait(driver, 20).until(
EC.element_to_be_clickable((By.XPATH, "//span[text()='Load More Reviews..']")))
element.click()
except TimeoutException:
break
except:
pass
soup = BeautifulSoup(driver.page_source, 'lxml')
services = soup.find_all('span', {'class': "rName lng_commn"})
for i in services:
print(i.text)
I want to run this code with selenium headless. Please help.
Some websites behave different when they see your "headless" user-agent.
Try changing your user-agent to Chrome and see if it works.
options.add_argument("""user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)""")

How to restart Selenium browser after quit?

I'm trying to quit and then restart a new browser session with Selenium when encountering a captcha, and I'm not sure yet why the code below isn't working.
It quits the existing driver, but after recursion browser.get() results in this error: ConnectionRefusedError: [Errno 61] Connection refused
Thanks in advance for any advice. I've included only the most relevant parts of the code below:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
path_to_chromedriver = '/Users/Myname/Desktop/a/chromedriver 2'
options = webdriver.ChromeOptions()
options.add_argument('start-maximized')
#options.add_argument('disable-infobars')
#options.add_argument('--disable-notifications')
options.add_argument('--disable-extensions')
browser = webdriver.Chrome(chrome_options=options, executable_path=path_to_chromedriver)
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}
def get_page_info(url, browser = webdriver.Chrome(chrome_options=options, executable_path=path_to_chromedriver)):
browser.get(url)
try:
body = browser.find_element_by_tag_name('body')
if "been denied because we believe" in body.text:
print("going to new session...")
browser.quit()
human(4,6) #time delay
return winery_info(url)
Edit: I normally wouldn't use this tactic to get around a captcha, but in my use case this makes sense.
Try to use the driver.delete_all_cookies() method instead of closing browser and reopening it
edit : maybe the site block your ip adress i suggest you to use tor to change ip automatically i will give you this
import os
import time
os.system("killall tor")
os.system("tor &")
time.sleep(5)
#init driver
fp = webdriver.FirefoxProfile()
fp.set_preference("network.proxy.type", 1)
fp.set_preference("network.proxy.socks", "127.0.0.1")
fp.set_preference("network.proxy.socks_port", int("9050"))
fp.update_preferences()
browser = webdriver.Firefox(firefox_profile=fp)
browser.get(...)
...
...
if captcha:
os.system("killall tor")
os.system("tor &")
time.sleep(5)
browser.get(...)
# this will change your ip adress
# You can also configure tor to change ip every 10 seconds by changing torrc file

Selenium PhantomJS never finishes loading / incomplete loading

I was trying to get the embedded video URL from https://www.fmovies.is . I'm using selenium.PhantomJS(). The exact same code works perfectly if I use selenium.Firefox() driver . It seems as though I'm doing something wrong during the waiting phase.
If someone could point out what I was doing wrong , I would really appreciate it.
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import DesiredCapabilities
desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
desired_capabilities['phantomjs.page.customHeaders.User-Agent'] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5)AppleWebKit 537.36 (KHTML, like Gecko) Chrome"
desired_capabilities['phantomjs.page.customHeaders.Accept'] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
url = "https://fmovies.is/film/kung-fu-panda-2.9kx/q8kkyj"
driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'],desired_capabilities=desired_capabilities)
driver.get(url)
try:
element = WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.ID, "jw")))
finally:
driver.find_element_by_id("player").click()
pageSource = driver.page_source
soup = BeautifulSoup(pageSource,'lxml')
url = soup.find("video",{"class":"jw-video"})
print url
videoURL = ''
if url:
videoURL = url['src']
print videoURL

PhantomJS returning empty web page (python, Selenium)

Trying to screen scrape a web site without having to launch an actual browser instance in a python script (using Selenium). I can do this with Chrome or Firefox - I've tried it and it works - but I want to use PhantomJS so it's headless.
The code looks like this:
import sys
import traceback
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 "
"(KHTML, like Gecko) Chrome/15.0.87"
)
try:
# Choose our browser
browser = webdriver.PhantomJS(desired_capabilities=dcap)
#browser = webdriver.PhantomJS()
#browser = webdriver.Firefox()
#browser = webdriver.Chrome(executable_path="/usr/local/bin/chromedriver")
# Go to the login page
browser.get("https://www.whatever.com")
# For debug, see what we got back
html_source = browser.page_source
with open('out.html', 'w') as f:
f.write(html_source)
# PROCESS THE PAGE (code removed)
except Exception, e:
browser.save_screenshot('screenshot.png')
traceback.print_exc(file=sys.stdout)
finally:
browser.close()
The output is merely:
<html><head></head><body></body></html>
But when I use the Chrome or Firefox options, it works fine. I thought maybe the web site was returning junk based on the user agent, so I tried faking that out. No difference.
What am I missing?
UPDATED: I will try to keep the below snippet updated with until it works. What's below is what I'm currently trying.
import sys
import traceback
import time
import re
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 (KHTML, like Gecko) Chrome/15.0.87")
try:
# Set up our browser
browser = webdriver.PhantomJS(desired_capabilities=dcap, service_args=['--ignore-ssl-errors=true'])
#browser = webdriver.Chrome(executable_path="/usr/local/bin/chromedriver")
# Go to the login page
print "getting web page..."
browser.get("https://www.website.com")
# Need to wait for the page to load
timeout = 10
print "waiting %s seconds..." % timeout
wait = WebDriverWait(browser, timeout)
element = wait.until(EC.element_to_be_clickable((By.ID,'the_id')))
print "done waiting. Response:"
# Rest of code snipped. Fails as "wait" above.
I was facing the same problem and no amount of code to make the driver wait was helping.
The problem is the SSL encryption on the https websites, ignoring them will do the trick.
Call the PhantomJS driver as:
driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true', '--ssl-protocol=TLSv1'])
This solved the problem for me.
You need to wait for the page to load. Usually, it is done by using an Explicit Wait to wait for a key element to be present or visible on a page. For instance:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# ...
browser.get("https://www.whatever.com")
wait = WebDriverWait(driver, 10)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.content")))
html_source = browser.page_source
# ...
Here, we'll wait up to 10 seconds for a div element with class="content" to become visible before getting the page source.
Additionally, you may need to ignore SSL errors:
browser = webdriver.PhantomJS(desired_capabilities=dcap, service_args=['--ignore-ssl-errors=true'])
Though, I'm pretty sure this is related to the redirecting issues in PhantomJS. There is an open ticket in phantomjs bugtracker:
PhantomJS does not follow some redirects
driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true', '--ssl-protocol=TLSv1'])
This worked for me

Categories

Resources