I am trying to scrape the names of my connections with Selenium in Python, but the page only scrolls once and loads a single page of results. Is there any way to load the full list of results with Selenium? I am attaching my code for reference.
import time
import requests
from bs4 import BeautifulSoup as bt
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from parsel import Selector
import urllib3
options=webdriver.ChromeOptions()
prefs={"profile.default_content_setting_values.notifications":2}
options.add_experimental_option("prefs",prefs)
options.add_argument("start-maximized")
options.add_argument("--disable-notifications")
from webdriver_manager.chrome import ChromeDriverManager
email=input("Please enter your linkedin email id: ")
password=input("Please enter your linkedin password: ")
driver = webdriver.Chrome(options=options, executable_path=r"D:\chromedriver.exe")
driver.get("http://www.linkedin.com/login/")
time.sleep(4)
ele=driver.find_element_by_name("session_key")
#print(ele.is_displayed())
pwd=driver.find_element_by_name("session_password")
ele.send_keys(str(email))
pwd.send_keys(str(password))
driver.find_element_by_xpath("/html/body/div/main/div[2]/form/div[3]/button").click()
time.sleep(3)
#logging in
driver.get("https://www.linkedin.com/mynetwork/")
driver.get("https://www.linkedin.com/mynetwork/invite-connect/connections/")
def scroll(driver, timeout):
    scroll_pause_time = timeout
    # Get the initial scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        time.sleep(scroll_pause_time)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # If heights are the same, the end of the page has been reached
            break
        last_height = new_height
scroll(driver, 2)
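Once the loop exits, the whole connection list should be in the DOM and the names can be pulled from driver.page_source. A minimal sketch (the CSS class below is an assumption about LinkedIn's markup; verify it in DevTools first):
soup = bt(driver.page_source, "html.parser")
# the class name is a guess; inspect a connection card to confirm it
names = [span.get_text(strip=True)
         for span in soup.select("span.mn-connection-card__name")]
print(len(names))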
I would like to scroll to the end of a page like: https://fr.finance.yahoo.com/quote/GM/history?period1=1290038400&period2=1612742400&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true
The problem is that the following:
# Get scroll height after the first page load
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(2)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
does not work. It should work for pages that load infinitely, but it fails on Yahoo Finance, which has a finite number of loads; the condition should break the loop when the end is reached, yet it doesn't. So I'm quite confused at the moment.
We could also use:
while driver.find_element_by_tag_name('tfoot'):
    # Scroll down three times to load the table
    for i in range(0, 3):
        driver.execute_script("window.scrollBy(0, 5000)")
        time.sleep(2)
but it sometimes gets stuck after a certain number of loads.
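One workaround I have considered (just a sketch, not verified on this page) is to tolerate a few iterations with an unchanged height before giving up, in case a load is simply slow:
last_height = driver.execute_script("return document.body.scrollHeight")
attempts = 0
while attempts < 3:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        attempts += 1  # unchanged height: maybe the end, maybe a slow load
    else:
        attempts = 0
        last_height = new_height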
What would be the best way to do this?
This requires pip install undetected-chromedriver, but it will get the job done.
It's just my webdriver of choice; you can do exactly the same with plain Selenium.
from time import sleep as s
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import undetected_chromedriver as uc
options = uc.ChromeOptions()
options.headless = False
driver = uc.Chrome(options=options)
driver.get('https://fr.finance.yahoo.com/quote/GM/history?period1=1290038400&period2=1612742400&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true')
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#consent-page > div > div > div > div.wizard-body > div.actions.couple > form > button'))).click() #clicks the cookie warning or whatever
last_scroll_pos = 0
while True:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'body'))).send_keys(Keys.DOWN)
    s(.01)
    current_scroll_pos = str(driver.execute_script('return window.pageYOffset;'))
    if current_scroll_pos == last_scroll_pos:
        print('scrolling is finished')
        break
    last_scroll_pos = current_scroll_pos
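After the loop reports that scrolling is finished, the fully loaded history table can be lifted straight from the page. A small sketch with pandas.read_html (assuming the price history is the first <table> in the HTML):
import pandas as pd

# read_html parses every <table> in the page source; index 0 is assumed
# to be the price-history table and is worth double-checking
history = pd.read_html(driver.page_source)[0]
print(history.head())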
I'm trying to scrape a website (https://harleytherapy.com/therapists?page=1) that appears to be generated by JavaScript; the element I'm trying to scrape (the ul with id="downshift-7-menu") doesn't appear in the page source, only after I click "Inspect element".
I tried to find a solution here, and so far this is the code I was able to come up with (a combination of Selenium and BeautifulSoup):
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
url = "https://harleytherapy.com/therapists?page=1"
options = webdriver.ChromeOptions()
options.add_argument('headless')
capa = DesiredCapabilities.CHROME
capa["pageLoadStrategy"] = "none"
driver = webdriver.Chrome(chrome_options=options, desired_capabilities=capa)
driver.set_window_size(1440,900)
driver.get(url)
time.sleep(15)
plain_text = driver.page_source
soup = BeautifulSoup(plain_text, 'html')
therapist_menu_id = "downshift-7-menu"
print(soup.find(id=therapist_menu_id))
I thought that allowing Selenium to wait for 15 seconds would make sure that all elements were loaded, but I still can't find any element with id downshift-7-menu in the soup. Do you guys know what's wrong with my code?
The element with ID downshift-7-menu is loaded only after opening the THERAPIST dropdown menu. You can do that by scrolling the menu into view and then clicking on it. You should also consider replacing sleep with an explicit wait:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
wait = WebDriverWait(driver, 15)
# scroll the dropdown into view to load it
side_menu = wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'inner-a377b5')))
last_height = driver.execute_script("return arguments[0].scrollHeight", side_menu)
while True:
    driver.execute_script("arguments[0].scrollTo(0, arguments[0].scrollHeight);", side_menu)
    new_height = driver.execute_script("return arguments[0].scrollHeight", side_menu)
    if new_height == last_height:
        break
    last_height = new_height
# open the menu
wait.until(EC.visibility_of_element_located((By.ID, 'downshift-7-input'))).click()
# wait for the option to load
therapist_menu_id = 'downshift-7-menu'
wait.until(EC.presence_of_element_located((By.ID, therapist_menu_id)))
# re-parse the page source now that the menu has actually been rendered
soup = BeautifulSoup(driver.page_source, 'html')
print(soup.find(id=therapist_menu_id))
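Once the wait succeeds you can also read the menu entries directly through Selenium. A short sketch (assuming the options are li elements inside the menu):
menu = driver.find_element_by_id(therapist_menu_id)
# collect the visible text of each option; 'li' is an assumption about the markup
options = [li.text for li in menu.find_elements_by_tag_name('li')]
print(options)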
I am trying to use Selenium to scroll down this webpage infinitely: https://gfycat.com/discover/trending-gifs
I tried this code:
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(options=options, executable_path=r"C:\chromedriver.exe")
driver.get("https://gfycat.com/discover/trending-gifs")
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
driver.quit()
But no scrolling happened.
I also tried:
from selenium.webdriver.common.keys import Keys
for i in range(10):
    driver.find_element_by_css_selector('html').send_keys(Keys.END)
But again, no scrolling happened.
To scroll a website that loads content infinitely, you can use the following approach in Selenium. As you can see, a while True loop keeps scrolling indefinitely; you also need to import the time module so the website has time to load between scrolls:
import time

def scroll(driver):
    timeout = 5
    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Give the website time to load
        time.sleep(timeout)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # No new content was loaded, so the end has been reached
            break
        last_height = new_height
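Calling it is then straightforward; afterwards you can collect whatever you need from the fully loaded page. A hedged example for the gfycat page (it grabs every link's href; the filtering is up to you):
driver.get("https://gfycat.com/discover/trending-gifs")
scroll(driver)
# every <a> href on the fully scrolled page; filter for gif links as needed
links = [a.get_attribute("href") for a in driver.find_elements_by_tag_name("a")]
print(len(links))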
I'm trying to scrape the reviews from this link:
https://www.google.com/search?q=google+reviews+2nd+chance+treatment+40th+street&rlz=1C1JZAP_enUS697US697&oq=google+reviews+2nd+chance+treatment+40th+street&aqs=chrome..69i57j69i64.6183j0j7&sourceid=chrome&ie=UTF-8#lrd=0x872b7179b68e33d5:0x24b5517d86a95f89,1
For that, I'm using the following code to load the page:
from selenium import webdriver
import time

# The URL of the reviews page to load
url = "https://www.google.com/search?q=google+reviews+2nd+chance+treatment+40th+street&rlz=1C1JZAP_enUS697US697&oq=google+reviews+2nd+chance+treatment+40th+street&aqs=chrome..69i57j69i64.6183j0j7&sourceid=chrome&ie=UTF-8#lrd=0x872b7179b68e33d5:0x24b5517d86a95f89,1"
# Initialize the Chrome webdriver and open the URL
#driver = webdriver.Chromium()
profile = webdriver.FirefoxProfile()
profile.set_preference("general.useragent.override", "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko")
#driver = webdriver.Firefox(profile)
# https://stackoverflow.com/questions/22476112/using-chromedriver-with-selenium-python-ubuntu
driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver")
driver.get(url)
driver.implicitly_wait(2)
SCROLL_PAUSE_TIME = 0.5
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
The page loads fine, but it does not scroll down. I have used the same code on other sites such as LinkedIn, and it works there.
Here is logic you can use without the JavaScript scroll-down: simple and effective, using the location_once_scrolled_into_view property, which scrolls the element into view.
As part of the logic below, we scroll to the last review and then check whether the desired number of reviews has been loaded.
Imports Needed:
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
Change the desiredReviewsCount variable in the code below to suit your requirement.
wait = WebDriverWait(driver, 10)
url = "https://www.google.com/search?q=google+reviews+2nd+chance+treatment+40th+street&rlz=1C1JZAP_enUS697US697&oq=google+reviews+2nd+chance+treatment+40th+street&aqs=chrome..69i57j69i64.6183j0j7&sourceid=chrome&ie=UTF-8#lrd=0x872b7179b68e33d5:0x24b5517d86a95f89,1"
driver.get(url)
x = 0
desiredReviewsCount = 30
wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='gws-localreviews__general-reviews-block']//div[@class='WMbnJf gws-localreviews__google-review']")))
while x < desiredReviewsCount:
    # accessing the property scrolls the last loaded review into view
    driver.find_element_by_xpath("(//div[@class='gws-localreviews__general-reviews-block']//div[@class='WMbnJf gws-localreviews__google-review'])[last()]").location_once_scrolled_into_view
    x = len(driver.find_elements_by_xpath("//div[@class='gws-localreviews__general-reviews-block']//div[@class='WMbnJf gws-localreviews__google-review']"))
print(x)
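One caveat: if the place has fewer reviews than desiredReviewsCount, the while loop never exits. A hedged variant that also breaks when the count stops growing:
import time

x = 0
desiredReviewsCount = 30
review_xpath = ("//div[@class='gws-localreviews__general-reviews-block']"
                "//div[@class='WMbnJf gws-localreviews__google-review']")
while x < desiredReviewsCount:
    # scroll the last review into view to trigger loading the next batch
    driver.find_element_by_xpath("(" + review_xpath + ")[last()]").location_once_scrolled_into_view
    time.sleep(1)  # give the next batch a moment to render
    new_count = len(driver.find_elements_by_xpath(review_xpath))
    if new_count == x:
        break  # count stopped growing: assume there are no more reviews
    x = new_count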
I want to scrape all tweets from Twitter using Selenium. To do that I need to reach the bottom of the page, but however much I scroll, a "Back to top" button appears instead.
How can I reach the bottom of the page or dismiss "Back to top" using Selenium? Or how can I scrape all the tweets with some other approach?
import pandas as pd
import selenium
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
driver=webdriver.Firefox(executable_path="/home/piyush/geckodriver")
url="https://twitter.com/narendramodi"
driver.get(url)
time.sleep(6)
lastHeight = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(6)
    newHeight = driver.execute_script("return document.body.scrollHeight")
    if newHeight == lastHeight:
        break
    lastHeight = newHeight
soup=BeautifulSoup(driver.page_source.encode("utf-8"),"html.parser")
tweet=[p.text for p in soup.find_all("p",class_="tweet-text")]
Just briefly looking at Twitter, it appears that the content is generated as you scroll, meaning you need to scrape and parse the data while scrolling rather than afterwards.
I would suggest moving
soup = BeautifulSoup(driver.page_source.encode("utf-8"),"html.parser")
tweet = [p.text for p in soup.find_all("p",class_="tweet-text")]
into your while loop after the scroll:
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    ###
    soup = BeautifulSoup(driver.page_source.encode("utf-8"), "html.parser")
    tweet = [p.text for p in soup.find_all("p", class_="tweet-text")]
    ###
    time.sleep(6)
    newHeight = driver.execute_script("return document.body.scrollHeight")
    if newHeight == lastHeight:
        break
    lastHeight = newHeight
If this doesn't work you are probably being fingerprinted and labeled as a bot by Twitter.
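One refinement to the loop above: tweet is overwritten on every iteration, so only the final parse survives. Accumulating the texts in a set keeps everything scraped along the way and drops duplicates between passes (a sketch using the same selectors):
all_tweets = set()
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    soup = BeautifulSoup(driver.page_source.encode("utf-8"), "html.parser")
    # merge this pass's tweets into the running set; repeats are ignored
    all_tweets.update(p.text for p in soup.find_all("p", class_="tweet-text"))
    time.sleep(6)
    newHeight = driver.execute_script("return document.body.scrollHeight")
    if newHeight == lastHeight:
        break
    lastHeight = newHeight
print(len(all_tweets))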