How do I scrape a website with an infinite scroller? - python

The code below is what I have so far, but it only pulls data for the first 25 items, which are the first 25 items on the page before scrolling down for more:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
# Remember when the run started so the elapsed time can be printed at the end.
start_time = time.time()
s = requests.Session()

# Download the public job-search page and parse the HTML it returns.
response = s.get(
    'https://www.linkedin.com/jobs/search?keywords=It%20Business%20Analyst&location=Boston%2C%20Massachusetts%2C%20United%20States&geoId=102380872&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0'
)
soup = BeautifulSoup(response.text, 'html.parser')
items = soup.find('ul', {'class': 'jobs-search__results-list'})

# Pull title, company, location and link out of every result card.
job_titles = [
    node.text.strip('\n ')
    for node in items.find_all('h3', class_='base-search-card__title')
]
job_companies = [
    node.text.strip('\n ')
    for node in items.find_all('h4', class_='base-search-card__subtitle')
]
job_locations = [
    node.text.strip('\n ')
    for node in items.find_all('span', class_='job-search-card__location')
]
job_links = [
    node["href"].strip('\n ')
    for node in items.find_all('a', class_='base-card__full-link')
]

# One single-column frame per field, then the frequency of each distinct value.
a = pd.DataFrame({'Job Titles': job_titles})
b = pd.DataFrame({'Job Companies': job_companies})
c = pd.DataFrame({'Job Locations': job_locations})
value_counts1 = a['Job Titles'].value_counts()
value_counts2 = b['Job Companies'].value_counts()
value_counts3 = c['Job Locations'].value_counts()

# Render every (value, count) pair as "value - count".
l1 = [f"{key} - {count}" for key, count in value_counts1.items()]
l2 = [f"{key} - {count}" for key, count in value_counts2.items()]
l3 = [f"{key} - {count}" for key, count in value_counts3.items()]

# Rows are the three fields; transpose so that each field becomes a column.
data = l1, l2, l3
df = pd.DataFrame(data, index=['Job Titles', 'Job Companies', 'Job Locations']).T
print(df)
print("--- %s seconds ---" % (time.time() - start_time))
I would like to pull data for more than the first 25 items, is there an efficient way of being able to do this?

Inspect the page to find the container that holds the desired data; you can then scrape the infinite-scroll page with the Selenium WebDriver, using window.scrollTo() to load more items.
For more details, see:
crawl site that has infinite scrolling using python
or web-scraping-infinite-scrolling-with-selenium

The best way is to create a function to scroll down:
# Scroll function
# Takes the driver that is being used and a timeout. The driver is used to
# scroll; the timeout is how long to wait after each scroll for the page to load.
def scroll(driver, timeout, max_scrolls=None):
    """Scroll to the bottom of the page until its height stops growing.

    Args:
        driver: WebDriver-like object exposing ``execute_script``.
        timeout: Seconds to sleep after every scroll so that lazily loaded
            content has time to be added to the page.
        max_scrolls: Optional upper bound on the number of scrolls. ``None``
            (the default) keeps scrolling until the page height is stable,
            which never happens on a feed that grows without end.

    Returns:
        None.
    """
    # Local import: the snippet defining this function carries no imports.
    import time

    scroll_pause_time = timeout
    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls_done += 1
        # Wait to load page
        time.sleep(scroll_pause_time)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Nothing new was loaded, so the bottom has been reached.
            break
        last_height = new_height
Then you can use the scroll function to scroll the desired page:
import time
import pandas as pd
from seleniumwire import webdriver
# Create a new instance of the Firefox driver
driver = webdriver.Firefox()
try:
    # Move to the page that should be scraped
    driver.get('your_url')
    # Use the "scroll" function, waiting 5 seconds after every scroll
    scroll(driver, 5)
    # Read driver.page_source or locate elements here, while the browser is
    # still open.
finally:
    # Always close the browser, even if loading or scrolling fails.
    driver.quit()

Related

Get 'src' link from image using selenium

My problem is that I am trying to find a way to get the links of YouTube thumbnails using Selenium. What I found online does not help at all: it suggested calling .get_attribute("src"), which does not work.
I tried this (everything runs if I remove .get_attribute("src") - well, I do not get any errors, but I am not able to get the thumbnails either):
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
driver = webdriver.Chrome()
driver.get("https://www.youtube.com/@MrBeast/videos")
SCROLL_PAUSE_TIME = 3
last_height = driver.execute_script("return document.documentElement.scrollHeight")
n = 0
# Scroll at most 4 times, stopping early once the page height stops growing.
while n < 4:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, arguments[0]);", last_height)
    time.sleep(SCROLL_PAUSE_TIME)
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
    n += 1
titles = driver.find_elements(By.ID, "video-title")
views = driver.find_elements(By.XPATH, '//*[@id="metadata-line"]/span[1]')
year = driver.find_elements(By.XPATH, '//*[@id="metadata-line"]/span[2]')
# NOTE: this is the line the question is about. find_elements() returns a
# list, and a list has no get_attribute() method, so this call raises
# AttributeError; "src" has to be read from each element individually.
thumbnail = driver.find_elements(By.XPATH, '//*[@id="thumbnail"]/yt-image/img').get_attribute("src")
data = []
for i, j, k, l in zip(titles, views, year, thumbnail):
    data.append([i.text, j.text, k.text, l.text])
df = pd.DataFrame(data, columns = ['Title', 'views', 'date', 'thumbnail'])
df.to_csv('MrBeastThumbnails.csv')
driver.quit()
find_elements returns a list of web elements, while .get_attribute() can only be applied to a single web element object.
To get the src attribute values, you need to iterate over the list of web elements and extract the src attribute of each one, as follows:
# Collect the "src" attribute of every thumbnail image found on the page.
src_values = []
thumbnails = driver.find_elements(By.XPATH, '//*[@id="thumbnail"]/yt-image/img')
for thumbnail in thumbnails:
    # get_attribute() works on a single element, so call it once per element.
    src_values.append(thumbnail.get_attribute("src"))

Unsure why BS4 and selenium not return correctly

from bs4 import BeautifulSoup
import requests
from csv import writer
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
#selenium Path
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.nike.com/w/mens-shoes-nik1zy7ok')
#PAGE SCROLLING
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
SCROLL_PAUSE_TIME = 1
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
#BS4 taking Selenium Driver Source
soup = BeautifulSoup(driver.page_source, 'html.parser')
# NOTE: only cards carrying exactly this class string are matched.
lists = soup.find_all('div', class_='product-card__info disable-animations for--product')
#Iterate Through HTML and filter out content as needed and store inside shoes.csv
with open('shoes.csv','w', encoding='utf8',newline='') as f:
    thewriter = writer(f)
    header = ['Name','Price']
    thewriter.writerow(header)
    for list in lists:
        try:
            name = list.find('div', class_='product-card__title').text
            price = list.find('div',class_='product-price css-11s12ax is--current-price').text
        except AttributeError:
            # find() returned None for this card (no matching tag).
            # NOTE: the loop stops at the first such card instead of skipping it.
            print("\nList finished!")
            break
        info = [name,price]
        thewriter.writerow(info)
        print(info)
#testing for other tag
soup2 = BeautifulSoup(driver.page_source, 'html.parser')
lists2 = soup2.find_all('div', class_='product-card__info disable-animations for--product')
#testing
with open('shoes2.csv','w', encoding='utf8',newline='') as f:
    thewriter = writer(f)
    header = ['Name','Price']
    thewriter.writerow(header)
    for list in lists2:
        try:
            names = list.find('div', class_='product-card__title').text
            prices = list.find('div',class_='product-price is--current-price css-s56yt7').text
        except AttributeError:
            # Same early exit as above: stops at the first card without a match.
            print("\nList finished!")
            break
        info2 = [names,prices]
        thewriter.writerow(info2)
        print(info2)
The intent is to build a web scraper that searches the Nike men's shoes store and outputs a CSV file with the name and price of each item.
The website shows 500+ items, but I'm only able to gather 100 items.
I double-checked all the tags and noticed that when I print out the HTML it skips items at random! If anyone could tell me why, I would greatly appreciate it.
UPDATE: Solved using purely Selenium!
I will be using the webdriver option for a headless browser to further lessen the resource load. Any tips for making it more efficient would be appreciated.
import requests
from csv import writer
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import itertools
#selenium Path
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.get('https://www.nike.com/w/mens-shoes-nik1zy7ok')
    #PAGE SCROLLING
    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    SCROLL_PAUSE_TIME = .5
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    Wname = driver.find_elements(By.CLASS_NAME, "product-card__title")
    Wprice = driver.find_elements(By.CLASS_NAME, "product-card__price")
    Wcolor = driver.find_elements(By.CLASS_NAME, "product-card__product-count")
    # Make 3 separate lists holding the text of the elements. The text must be
    # read while the browser is still open.
    name = [element.text for element in Wname]
    price = [element.text for element in Wprice]
    color = [element.text for element in Wcolor]
finally:
    # Close the browser even if loading or scrolling fails.
    driver.quit()
#making CSV
with open('Menshoes.csv','w', encoding='utf8',newline='') as f:
    thewriter = writer(f)
    header = ['Name','Price','#Color']
    thewriter.writerow(header)
    # zip_longest pads the shorter lists with None, which csv writes as an
    # empty field, so no row is dropped when the lists differ in length.
    thewriter.writerows(itertools.zip_longest(name, price, color))
Take a closer look on your selection with BeautifulSoup it only provides specific cards as ResultSet which correspond to your class specification :
soup.find_all('div', class_='product-card__info disable-animations for--product')
You should use a more general selection to get more cards shown in your ResultSet:
soup.find_all('div', class_='product-card__body')
Note: Also avoid shadowing built-in names like list.
While scrolling, allow a bit more time or wait until all elements are loaded. I stored the results in a list of dicts, so the data is a bit more structured and I could run some checks on it (len() or set()) before writing it to CSV.
...
# The csv module itself is needed for DictWriter below.
import csv

last_height = driver.execute_script("return document.body.scrollHeight")
SCROLL_PAUSE_TIME = 2
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
#BS4 taking Selenium Driver Source
soup = BeautifulSoup(driver.page_source, 'html.parser')
data = []
for e in soup.find_all('div', class_='product-card__body'):
    title_tag = e.select_one('.product-card__titles')
    price_tag = e.select_one('.is--current-price')
    # select_one() returns None when a card lacks the element; store an empty
    # string instead of failing on ``None.text``.
    data.append({
        'name': title_tag.text if title_tag else '',
        'price': price_tag.text if price_tag else '',
    })
with open('shoes.csv','w', encoding='utf8',newline='') as f:
    # Fixed field names keep the header valid even when no card was found.
    dict_writer = csv.DictWriter(f, fieldnames=['name', 'price'])
    dict_writer.writeheader()
    dict_writer.writerows(data)
...

How to break while loop when scraping pages in opensea

I'm trying to scrape opensea.io. I have code which scrapes all pages, but I only need the first five pages, so I have tried to break out of the loop, but it doesn't work.
# NOTE(review): indentation was lost in this listing, so which statements
# belong to the while/for/if bodies cannot be determined from here.
time.sleep(2) # Allow 2 seconds for the web page to open
# Filled by scroll(); every entry is a {'link': ...} dict.
data = []
# The value returned by ChromeDriverManager().install() is handed to Chrome.
path = ChromeDriverManager().install()
url = 'https://opensea.io/rankings?sortBy=seven_day_volume'
driver = webdriver.Chrome(path)
driver.get(url)
start = time.time()
# Scrolls the page one screen height at a time, prints and stores the links
# found in the page source, clicks a button once the scroll position passes
# the document height, and then calls itself again.
def scroll():
""" Get urls from Opensea.io """
global data
scroll_pause_time = 1
screen_height = driver.execute_script("return window.screen.height;")
i = 1
# NOTE(review): if this assignment is inside scroll(), every recursive call
# starts again from 0, so the `num == 5` check below would not count pages
# across calls - TODO confirm.
num = 0
# NOTE(review): `while num True:` is not valid Python as written; the loop
# condition appears to have been damaged. Confirm the intended condition.
while num True:
# Scroll to i screen heights from the top of the page.
driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
i += 1
time.sleep(scroll_pause_time)
main_url = 'https://opensea.io'
# update scroll height each time after scrolled, as the scroll height can change after we scrolled the page
scroll_height = driver.execute_script("return document.body.scrollHeight;")
soup = BS(driver.page_source, 'html.parser')
divs = soup.find_all('a', class_='sc-1pie21o-0 elyzfO sc-1xf18x6-0 sc-1twd32i-0 sc-1idymv7-0 dGptxx kKpYwv iLNufV fresnel-lessThan-xl')
# Build an absolute URL from every matched anchor's href.
for items in divs:
link = main_url + items['href']
print(link)
d = {'link' : link}
print('Done!')
data.append(d)
# The scroll target has passed the document height.
if (screen_height) * i > scroll_height:
# NOTE(review): `[#id="main"]` is not valid XPath; presumably `@id` was
# intended - confirm.
el = driver.find_element_by_xpath('//*[#id="main"]/div/div[3]/button[2]').click()
time.sleep(7)
scroll()
num += 1
if num == 5:
return
scroll()
print('Done ----> Opensea.io urls')
As you can see, I use recursion for my task. I know that using a while loop and recursion at the same time is not a good idea, but this is the only way I have found that scrapes more than one page.
Create a global variable called pages outside of the function, add a parameter to the function, and pass the variable to it.
Check that it is less than 5 using an if statement and increment it before each recursive call.
Like below:
time.sleep(2)  # Allow 2 seconds for the web page to open
data = []
path = ChromeDriverManager().install()
url = 'https://opensea.io/rankings?sortBy=seven_day_volume'
driver = webdriver.Chrome(path)
driver.get(url)
start = time.time()
# Number of result pages scraped so far. It lives outside of the function and
# is passed in, so the count survives the recursive calls.
pages = 0


def scroll(pages, max_pages=5):
    """Collect the collection links from Opensea.io, one result page at a time.

    Scrolls the current result page one screen height at a time, appending a
    ``{'link': ...}`` dict to the module-level ``data`` list for every matching
    anchor in the page source. When the bottom of the page is reached, the
    "next page" button is clicked and the function calls itself, until
    ``max_pages`` pages have been scraped.

    Args:
        pages: Number of result pages already scraped (0 on the first call).
        max_pages: Total number of result pages to scrape.
    """
    scroll_pause_time = 1
    main_url = 'https://opensea.io'
    screen_height = driver.execute_script("return window.screen.height;")
    i = 1
    while True:
        driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
        i += 1
        time.sleep(scroll_pause_time)
        # update scroll height each time after scrolled, as the scroll height can change after we scrolled the page
        scroll_height = driver.execute_script("return document.body.scrollHeight;")
        soup = BS(driver.page_source, 'html.parser')
        divs = soup.find_all('a', class_='sc-1pie21o-0 elyzfO sc-1xf18x6-0 sc-1twd32i-0 sc-1idymv7-0 dGptxx kKpYwv iLNufV fresnel-lessThan-xl')
        for items in divs:
            link = main_url + items['href']
            print(link)
            d = {'link' : link}
            print('Done!')
            data.append(d)
        if screen_height * i > scroll_height:
            # Bottom of the current result page: it is finished. Count it
            # before the recursion.
            pages += 1
            if pages < max_pages:
                driver.find_element_by_xpath('//*[@id="main"]/div/div[3]/button[2]').click()
                time.sleep(7)
                scroll(pages, max_pages)
            # Return in both cases; otherwise the loop would keep running
            # forever once the page limit has been reached.
            return


# Start the scrape by calling scroll(pages) once the page has loaded.

Can't get all xpath elements from dynamic webpage

First time here asking. Hope someone can help me with this, it's driving me crazy !
I'm trying to scrape a used-car webpage from my country. The data loads when you start to scroll down, so, the first part of the code is for scrolling down and load the webpage.
I'm trying to get the link of every car published here, that's why I'm using find_elements_by_xpath in the try-except part.
Well, the problem is that the cars show up in packs of 11 for every load (scroll down), so the 11 XPaths repeat every time I scroll down;
meaning XPaths from
"//*[@id='w1']/div[1]/div/div[1]/a"
to
"//*[@id='w11']/div[1]/div/div[1]/a"
All libraries are called at the start of the code, don't worry.
from selenium import webdriver
from bs4 import BeautifulSoup
import time
links = []
url = ('https://buy.olxautos.cl/buscar?VehiculoEsSearch%5Btipo_valor%5D=1&VehiculoEsSearch%5Bprecio_range%5D=3990000%3B15190000')
driver = webdriver.Chrome('')
driver.get(url)
time.sleep(5)
SCROLL_PAUSE_TIME = 3
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
try:
    # NOTE: this is the line the question is about. find_elements_by_xpath()
    # returns a list, and a list has no get_attribute() method, so this raises
    # AttributeError. The handler below swallows it, leaving ``links`` empty.
    zelda = driver.find_elements_by_xpath("//*[@id='w1']/div[1]/div/div[1]/a").get_attribute('href')
    links.append(zelda)
except Exception:
    pass
print(links)
So the expected output of this code would be something like this:
['link_car_1', 'link_car_12', 'link_car_23', '...']
But when I run this code, it returns an empty list. When I run it with find_element_by_xpath instead, it returns the first link. What am I doing wrong 😭😭? I just can't figure it out!
Thanks!
You get only one link because the XPath is not the same for all the links. You can use bs4 to extract the links from the driver's page source, as shown below.
from bs4 import BeautifulSoup
import lxml
links = []
# hrefs already collected; the page is re-parsed on every pass, so the same
# link would otherwise be appended once per scroll.
seen = set()
url = ('https://buy.olxautos.cl/buscar?VehiculoEsSearch%5Btipo_valor%5D=1&VehiculoEsSearch%5Bprecio_range%5D=3990000%3B15190000')
# ``Path`` must hold the location of the chromedriver executable.
driver = webdriver.Chrome(executable_path = Path)
driver.get(url)
time.sleep(5)
SCROLL_PAUSE_TIME = 3
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    # Snapshot the page before scrolling. On the last pass the scroll loads
    # nothing new, so that snapshot already contains every item.
    page_source_ = driver.page_source
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    #use BeautifulSoup to extract links
    sup = BeautifulSoup(page_source_, 'lxml')
    sub_ = sup.find_all('div', {'class': 'owl-item active'})
    for link_ in sub_:
        link = link_.find('a', href= True)
        if link is None:
            # This container holds no anchor with an href; nothing to collect.
            continue
        #link = 'https://buy.olxautos.cl' + link #if needed (adding prefix)
        href = link['href']
        if href not in seen:
            seen.add(href)
            links.append(href)
    if new_height == last_height:
        break
    last_height = new_height
print('>> Total length of list : ', len(links))
print('\n',links)

Scraping all comments under an Instagram post

My previous question was closed, but the suggested answer doesn't help me. Instagram comments have a very specific behaviour! I know how to programmatically scroll a website down, but with the comments on Instagram it is a bit different! I would appreciate it if my question was not closed immediately, because that really doesn't help. I would be grateful for help rather than being shut down! Thank you.
Here it is again:
I am trying to build a scraper that saves the comments under an Instagram post. I manage to log in to Instagram through my code, so I can access all comments under a post, but I cannot seem to scroll down enough times to view all the comments in order to scrape all of them. I only get around 20 comments every time.
Can anyone please help me? I am using Selenium WebDriver.
Thank you for your help in advance! I will be grateful.
This is my function for saving the comments:
import time
from selenium.webdriver.firefox.options import Options
from selenium.webdriver import Firefox
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
# Opens a Firefox browser and prints the comments found under a post URL.
# NOTE(review): indentation was lost in this listing, so which statements
# belong to the while/try/for bodies cannot be determined from here.
class Instagram_comments():
# Start a Firefox instance with default Options.
def __init__(self):
self.firefox_options = Options()
self.browser = Firefox(options=self.firefox_options)
# Load `url`, try to expand the comment list, then read and print each comment.
def get_comments(self, url):
self.browser.get(url)
time.sleep(3)
# NOTE(review): no `break` is visible for this `while True:` - confirm how
# the loop is meant to end.
while True:
try:
self.load_more_comments = self.browser.find_element_by_class_name(
'glyphsSpriteCircle_add__outline__24__grey_9')
# NOTE(review): the action chain is built but perform() is never called in
# this listing, so the move itself is not executed; only click() below acts.
self.action = ActionChains(self.browser)
self.action.move_to_element(self.load_more_comments)
self.load_more_comments.click()
time.sleep(4)
self.body_elem = self.browser.find_element_by_class_name('Mr508')
# Send the END key 100 times to the element, pausing 3 seconds each time.
for _ in range(100):
self.body_elem.send_keys(Keys.END)
time.sleep(3)
# Any failure above (e.g. the button is not found) is silently ignored.
except Exception as e:
pass
time.sleep(5)
# Every element with class 'gElp9 ' is treated as one comment.
self.comment = self.browser.find_elements_by_class_name('gElp9 ')
for c in self.comment:
self.container = c.find_element_by_class_name('C4VMK')
self.name = self.container.find_element_by_class_name('_6lAjh').text
self.content = self.container.find_element_by_tag_name('span').text
# Collapse line breaks so that each comment is a single line.
self.content = self.content.replace('\n', ' ').strip().rstrip()
# NOTE(review): this lookup runs on the whole page (self.browser), not on the
# comment container, so presumably every comment gets the same timestamp - confirm.
self.time_of_post = self.browser.find_element_by_xpath('//a/time').get_attribute("datetime")
self.comment_details = {'profile name': self.name, 'comment': self.content, 'time': self.time_of_post}
print(self.comment_details)
time.sleep(5)
# NOTE(review): comment_details holds a single comment's dict, not a list of
# all comments - confirm this is the intended return value.
return self.comment_details
This chunk worked multiple times for me:
def scroll():
    """Scroll the global ``driver`` to the bottom until the page stops growing.

    After every scroll the function waits ``SCROLL_PAUSE_TIME`` seconds and
    compares the document height with the previous one. An unchanged height
    ends the loop; otherwise the new height is printed and the loop continues.
    """
    SCROLL_PAUSE_TIME = 1
    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        print('page height:', new_height)
        last_height = new_height


scroll()
For some sites you will need to scrape as you scroll, because not all elements will still be present when you get to the bottom (Twitter, for example).
This is what my code looked like for twitter:
account_names = []
account_tags = []
account_link = []


def scroll():
    """Scroll the global ``driver`` to the bottom, scraping after every scroll.

    After each scroll the account names and handles are read with Selenium and
    the profile links with BeautifulSoup, and appended to the module-level
    ``account_names``, ``account_tags`` and ``account_link`` lists. The loop
    ends when the document height stops growing.

    Every pass re-reads everything currently on the page, so the lists may
    contain repeated entries.
    """
    SCROLL_PAUSE_TIME = 1
    name_xpath = '//*[@id="react-root"]/div/div/div/main/div/div/div/div/div/div[2]/div/div/section/div/div/div/div/div/div/div/div[2]/div[1]/div[1]/a/div/div[1]/div[1]/span/span'
    handle_xpath = '//*[@id="react-root"]/div/div/div/main/div/div/div/div/div/div[2]/div/div/section/div/div/div/div/div/div/div/div[2]/div[1]/div[1]/a/div/div[2]/div/span'
    link_classes = 'css-4rbku5 css-18t94o4 css-1dbjc4n r-1loqt21 r-1wbh5a2 r-dnmrzs r-1ny4l3l'
    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)
        # Scrape now rather than at the end: elements that have scrolled out
        # of view may no longer be in the page once the bottom is reached.
        account_names.extend(
            element.text for element in driver.find_elements_by_xpath(name_xpath)
        )
        account_tags.extend(
            element.text for element in driver.find_elements_by_xpath(handle_xpath)
        )
        soup = BeautifulSoup(driver.page_source, 'lxml')
        account_link.extend(
            anchor['href']
            for anchor in soup.find_all('a', href=True, class_=link_classes)
        )
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


scroll()
Just a note: as another user commented, Instagram is very difficult to scrape because of the dynamic HTML variables, so they would be correct to say that no one, myself included, is too interested in writing that for Instagram.
The first function returns True if an element is found
The second function is used to scroll to the bottom till the last comment and then BeautifulSoup is used to scrape all comments
def check_exists_by_xpath(self, xpath):
    """Return True if an element matching ``xpath`` is found, else False.

    Args:
        xpath: XPath expression handed to ``self.driver.find_element_by_xpath``.
    """
    try:
        self.driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return True
def get_comments(self):
    """Click "load more" until the button is gone, then scrape every comment.

    The button is clicked for as long as ``check_exists_by_xpath`` finds it.
    The page source is then parsed with BeautifulSoup and the text of every
    ``span`` inside a ``div`` with class ``C4VMK`` is collected.

    Returns:
        The list of non-empty comment texts, which is also printed.
    """
    load_more_xpath = "//div/ul/li/div/button"
    while self.check_exists_by_xpath(load_more_xpath):
        load_more_comments_element = self.driver.find_element_by_xpath(load_more_xpath)
        load_more_comments_element.click()
        sleep(1)
    # Give the last batch of comments time to render before parsing.
    sleep(2)
    soup = BeautifulSoup(self.driver.page_source, 'lxml')
    comms = soup.find_all('div', attrs={'class': 'C4VMK'})
    print(len(comms))
    # Read the spans straight from the matched divs; re-parsing str(comms) is
    # not needed. Spans whose text is empty are dropped.
    comments = []
    for comm in comms:
        for span in comm.find_all('span'):
            text = span.text.strip()
            if text:
                comments.append(text)
    print(comments)
    return comments
I hope this helps - be aware I'm also still learning.
This worked for me: it programmatically clicks the "load more" button for as long as it is displayed.
# CSS selector of the "load more comments" button.
load_more_selector = '.MGdpg > button:nth-child(1)'
try:
    load_more_comment = driver.find_element_by_css_selector(load_more_selector)
    print("Found {}".format(str(load_more_comment)))
    # Keep clicking for as long as the button is displayed.
    while load_more_comment.is_displayed():
        load_more_comment.click()
        time.sleep(1.5)
        # Look the button up again after each click, since the page has changed.
        load_more_comment = driver.find_element_by_css_selector(load_more_selector)
        print("Found {}".format(str(load_more_comment)))
except Exception as e:
    # Reached when the button can no longer be found, which is the normal way
    # out once every comment is loaded. Any other error is printed too.
    print(e)

Categories

Resources