My problem is that I am trying to find a way to get the link of youtube thumbnails using selenium. What I found online does not help at all it suggested me to do: .get_attribute("src")' which does not work.
I tried this (everything works if I remove '.get_attribute("src")' *well, I do not get any errors and I am not capable of getting the thumbnails either):
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
driver = webdriver.Chrome()
driver.get("https://www.youtube.com/#MrBeast/videos")
SCROLL_PAUSE_TIME = 3
last_height = driver.execute_script("return document.documentElement.scrollHeight")
n=0
while n<4:
#Scroll down to bottom
driver.execute_script("window.scrollTo(0, arguments[0]);", last_height);
time.sleep(SCROLL_PAUSE_TIME)
new_height = driver.execute_script("return document.documentElement.scrollHeight")
if new_height == last_height:
break
last_height = new_height
n += 1
titles = driver.find_elements(By.ID, "video-title")
views = driver.find_elements(By.XPATH, '//*[#id="metadata-line"]/span[1]')
year = driver.find_elements(By.XPATH,'//*[#id="metadata-line"]/span[2]')
thumbnail = driver.find_elements(By.XPATH, '//*[#id="thumbnail"]/yt-image/img').get_attribute("src")
data = []
for i,j,k,l in zip(titles, views, year, thumbnail):
data.append([i.text, j.text, k.text, l.text])
df = pd.DataFrame(data, columns = ['Title', 'views', 'date', 'thumbnail'])
df.to_csv('MrBeastThumbnails.csv')
driver.quit()
find_elements returns a list of web elements while .get_attribute() can be applied on single web element object only.
To get the src attribute values you need to iterate over a list of web elements extracting their src attributes, as following:
src_values = []
thumbnails = driver.find_elements(By.XPATH, '//*[#id="thumbnail"]/yt-image/img')
for thumbnail in thumbnails:
src_values.append(thumbnail.get_attribute("src"))
Related
I'm scraping news-articles from a website where there is no load-more button in a specific category page, the news article links are being generated as I scroll down. I wrote a function which take input category_page_url and limit_page(how many times I want to scroll down) and return me back all the links of the news articles displayed in that page.
Category page link = https://www.scmp.com/topics/trade
def get_article_links(url, limit_loading):
options = webdriver.ChromeOptions()
lists = ['disable-popup-blocking']
caps = DesiredCapabilities().CHROME
caps["pageLoadStrategy"] = "normal"
options.add_argument("--window-size=1920,1080")
options.add_argument("--disable-extensions")
options.add_argument("--disable-notifications")
options.add_argument("--disable-Advertisement")
options.add_argument("--disable-popup-blocking")
driver = webdriver.Chrome(executable_path= r"E:\chromedriver\chromedriver.exe", options=options) #add your chrome path
driver.get(url)
last_height = driver.execute_script("return document.body.scrollHeight")
loading = 0
while loading < limit_loading:
loading += 1
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(8)
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
article_links = []
bsObj = BeautifulSoup(driver.page_source, 'html.parser')
for i in bsObj.find('div', {'class': 'content-box'}).find('div', {'class': 'topic-article-container'}).find_all('h2', {'class': 'article__title'}):
article_links.append(i.a['href'])
return article_links
Assuming I want to scroll 5 times in this category page,
get_article_links('https://www.scmp.com/topics/trade', 5)
But even if I change the number of my limit_page it return me back only the links from first page, there is some mistake I've done to write the scrolling part. Please help me with this.
Instead of scrolling using per body scrollHeight property, I checked to see if there was any appropriate element after the list of articles to scroll to. I noticed this appropriately named div:
<div class="topic-content__load-more-anchor" data-v-db98a5c0=""></div>
Accordingly, I primarily changed the while loop in your function get_article_links to scroll to this div using location_once_scrolled_into_view after finding the div before the loop starts, as follows:
loading = 0
end_div = driver.find_element('class name','topic-content__load-more-anchor')
while loading < limit_loading:
loading += 1
print(f'scrolling to page {loading}...')
end_div.location_once_scrolled_into_view
time.sleep(2)
If we now call the function with different limit_loading, we get different count of unique news links. Here are couple of runs:
>>> ar_links = get_article_links('https://www.scmp.com/topics/trade', 2)
>>> len(ar_links)
scrolling to page 1...
scrolling to page 2...
90
>>> ar_links = get_article_links('https://www.scmp.com/topics/trade', 3)
>>> len(ar_links)
scrolling to page 1...
scrolling to page 2...
scrolling to page 3...
120
from bs4 import BeautifulSoup
import requests
from csv import writer
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
#selenium Path
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.nike.com/w/mens-shoes-nik1zy7ok')
#PAGE SCROLLING
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
SCROLL_PAUSE_TIME = 1
while True:
# Scroll down to bottom
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Wait to load page
time.sleep(SCROLL_PAUSE_TIME)
# Calculate new scroll height and compare with last scroll height
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
#BS4 taking Selenium Driver Source
soup = BeautifulSoup(driver.page_source, 'html.parser')
lists = soup.find_all('div', class_='product-card__info disable-animations for--product')
#Iterate Through HTML and filter out content as needed and store inside shoes.csv
with open('shoes.csv','w', encoding='utf8',newline='') as f:
thewriter= writer(f)
header=['Name','Price']
thewriter.writerow(header)
for list in lists:
try:
name = list.find('div', class_='product-card__title').text
price = list.find('div',class_='product-price css-11s12ax is--current-price').text
except:
print("\nList finished!")
break
info = [name,price]
thewriter.writerow(info)
print(info)
#testing for other tag
soup2 = BeautifulSoup(driver.page_source, 'html.parser')
lists2 = soup2.find_all('div', class_='product-card__info disable-animations for--product')
#testing
with open('shoes2.csv','w', encoding='utf8',newline='') as f:
thewriter= writer(f)
header=['Name','Price']
thewriter.writerow(header)
for list in lists2:
try:
names = list.find('div', class_='product-card__title').text
prices = list.find('div',class_='product-price is--current-price css-s56yt7').text
except:
print("\nList finished!")
break
info2 = [names,prices]
thewriter.writerow(info2)
print(info2)
The intent is to build a web scraper for search through the Nike Men Shoes Store and output a CSV file with the name and price of item
So on the the website it show 500+ items and I'm only able to gather 100 items....
I double check all the tags and notice when i print out the HTML its skipping item randomly! If anyone could tell me why I would greatly appreciated
UPDATE SOLVE using purely selenium!
will be using the webdriver opinion to use headless browser to further lessen resource load! any tip for make it more efficient would be appreciated
import requests
from csv import writer
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import itertools
#selenium Path
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.nike.com/w/mens-shoes-nik1zy7ok')
#PAGE SCROLLING
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
SCROLL_PAUSE_TIME = .5
while True:
# Scroll down to bottom
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Wait to load page
time.sleep(SCROLL_PAUSE_TIME)
# Calculate new scroll height and compare with last scroll height
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
Wname=driver.find_elements(By.CLASS_NAME, "product-card__title")
Wprice=driver.find_elements(By.CLASS_NAME, "product-card__price")
Wcolor=driver.find_elements(By.CLASS_NAME, "product-card__product-count")
#Make 3 seperate list to translate over to text
name=[]
price=[]
color=[]
for i in Wname :
name.append(i.text)
for i in Wprice:
price.append(i.text)
for i in Wcolor:
color.append(i.text)
#making CSV
new_list=[]
with open('Menshoes.csv','w', encoding='utf8',newline='') as f:
thewriter= writer(f)
header=['Name','Price','#Color']
thewriter.writerow(header)
for n,p,q in itertools.zip_longest(name,price,color):
if n:
new_list.append(n)
if p:
new_list.append(p)
if q:
new_list.append(q)
info = [n,p,q]
thewriter.writerow(info)
Take a closer look on your selection with BeautifulSoup it only provides specific cards as ResultSet which correspond to your class specification :
soup.find_all('div', class_='product-card__info disable-animations for--product')
You should use a more general selection to get more cards shown in your ResultSet:
soup.find_all('div', class_='product-card__body')
Note: Also avoid using reserved keywords like list
While scrolling give a bit more time or wait until all elements are loaded - I stored the results in a list of dicts, so it is a bit more structured and I could perform some other things (check len() or set()) on it before writing it to csv.
...
last_height = driver.execute_script("return document.body.scrollHeight")
SCROLL_PAUSE_TIME = 2
while True:
# Scroll down to bottom
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Wait to load page
time.sleep(SCROLL_PAUSE_TIME)
# Calculate new scroll height and compare with last scroll height
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
#BS4 taking Selenium Driver Source
soup = BeautifulSoup(driver.page_source, 'html.parser')
data = []
for e in soup.find_all('div', class_='product-card__body'):
data.append({
'name': e.select_one('.product-card__titles').text,
'price':e.select_one('.is--current-price').text
})
with open('shoes.csv','w', encoding='utf8',newline='') as f:
writer = csv.DictWriter(f, fieldnames = data[0].keys())
writer.writeheader()
writer.writerows(data)
...
The code below is what I have so far, but it only pulls data for the first 25 items, which are the first 25 items on the page before scrolling down for more:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
start_time = time.time()
s = requests.Session()
#Get URL and extract content
response = s.get('https://www.linkedin.com/jobs/search?keywords=It%20Business%20Analyst&location=Boston%2C%20Massachusetts%2C%20United%20States&geoId=102380872&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0')
soup = BeautifulSoup(response.text, 'html.parser')
items = soup.find('ul', {'class': 'jobs-search__results-list'})
job_titles = [i.text.strip('\n ') for i in items.find_all('h3', {'class': 'base-search-card__title'})]
job_companies = [i.text.strip('\n ') for i in items.find_all('h4', {'class': 'base-search-card__subtitle'})]
job_locations = [i.text.strip('\n ') for i in items.find_all('span', {'class': 'job-search-card__location'})]
job_links = [i["href"].strip('\n ') for i in items.find_all('a', {'class': 'base-card__full-link'})]
a = pd.DataFrame({'Job Titles': job_titles})
b = pd.DataFrame({'Job Companies': job_companies})
c = pd.DataFrame({'Job Locations': job_locations})
value_counts1 = a['Job Titles'].value_counts()
value_counts2 = b['Job Companies'].value_counts()
value_counts3 = c['Job Locations'].value_counts()
l1 = [f"{key} - {value_counts1[key]}" for key in value_counts1.keys()]
l2 = [f"{key} - {value_counts2[key]}" for key in value_counts2.keys()]
l3 = [f"{key} - {value_counts3[key]}" for key in value_counts3.keys()]
data = l1, l2, l3
df = pd.DataFrame(
data, index=['Job Titles', 'Job Companies', 'Job Locations'])
df = df.T
print(df)
print("--- %s seconds ---" % (time.time() - start_time))
I would like to pull data for more than the first 25 items, is there an efficient way of being able to do this?
Get the container that holds the desired data by inspecting and you can scrape from the infinite scroll page with Selenium web driver using window.scrollTo()
check this for more >
crawl site that has infinite scrolling using python
or this web-scraping-infinite-scrolling-with-selenium
The best way is to create a function to scroll down:
# Scroll function
# This function takes two arguments. The driver that is being used and a timeout.
# The driver is used to scroll and the timeout is used to wait for the page to load.
def scroll(driver, timeout):
scroll_pause_time = timeout
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
# Scroll down to bottom
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Wait to load page
time.sleep(scroll_pause_time)
# Calculate new scroll height and compare with last scroll height
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
# If heights are the same it will exit the function
break
last_height = new_height
Then you can use the scroll function to scroll desidered page:
import time
import pandas as pd
from seleniumwire import webdriver
# Create a new instance of the Firefox driver
driver = webdriver.Firefox()
# move to some url
driver.get('your_url')
# use "scroll" function to scroll the page every 5 seconds
scroll(driver, 5)
First time here asking. Hope someone can help me with this, it's driving me crazy !
I'm trying to scrape a used-car webpage from my country. The data loads when you start to scroll down, so, the first part of the code is for scrolling down and load the webpage.
I'm trying to get the link of every car published here, that's why I'm using find_elements_by_xpath in the try-except part.
Well, the problem is, the cars are showed up in packs of 11 for every load(scroll down), so the 11 xpaths repeats when scrolling down everytime;
meaning xpaths from
"//*[#id='w1']/div[1]/div/div[1]/a"
to
"//*[#id='w11']/div[1]/div/div[1]/a"
All libraries are called at the start of the code, don't worry.
from selenium import webdriver
from bs4 import BeautifulSoup
import time
links = []
url = ('https://buy.olxautos.cl/buscar?VehiculoEsSearch%5Btipo_valor%5D=1&VehiculoEsSearch%5Bprecio_range%5D=3990000%3B15190000')
driver = webdriver.Chrome('')
driver.get(url)
time.sleep(5)
SCROLL_PAUSE_TIME = 3
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
# Scroll down to bottom
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Wait to load page
time.sleep(SCROLL_PAUSE_TIME)
# Calculate new scroll height and compare with last scroll height
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
try:
zelda = driver.find_elements_by_xpath("//*[#id='w1']/div[1]/div/div[1]/a").get_attribute('href')
links.append(zelda)
except:
pass
print(links)
So the expected output of this code would be something like this:
['link_car_1', 'link_car_12', 'link_car_23', '...']
But when I run this code, it returns an empty list. But when I run it with find_element_by_xpath returns the first link, what am I doing wrong ðŸ˜ðŸ˜, I just can't figure it out !!.
Thanks!
You get only one link because the XPATH is not the same for all the links. you can use bs4 to extract links by using the driver page source as shown below.
from bs4 import BeautifulSoup
import lxml
links = []
url = ('https://buy.olxautos.cl/buscar?VehiculoEsSearch%5Btipo_valor%5D=1&VehiculoEsSearch%5Bprecio_range%5D=3990000%3B15190000')
driver = webdriver.Chrome(executable_path = Path)
driver.get(url)
time.sleep(5)
SCROLL_PAUSE_TIME = 3
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
# Scroll down to bottom
page_source_ = driver.page_source
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Wait to load page
time.sleep(SCROLL_PAUSE_TIME)
# Calculate new scroll height and compare with last scroll height
new_height = driver.execute_script("return document.body.scrollHeight")
#use BeautifulSoup to extract links
sup = BeautifulSoup(page_source_, 'lxml')
sub_ = sup.findAll('div', {'class': 'owl-item active'})
for link_ in sub_:
link = link_.find('a', href= True)
#link = 'https://buy.olxautos.cl' + link #if needed (adding prefix)
links.append(link['href'])
if new_height == last_height:
break
last_height = new_height
print('>> Total length of list : ', len(links))
print('\n',links)
My Code:
from selenium import webdriver
from time import sleep
import csv
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
PATH = "C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH)
driver.get("https://shopee.ph/search?keyword=arduino&noCorrection=true&page=0&withDiscount=true")
sleep(2)
SCROLL_PAUSE_TIME = 0.5
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
sleep(SCROLL_PAUSE_TIME)
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
Categories = []
Categories.append(["NAME"])
driver.implicitly_wait(10)
items = driver.find_elements_by_xpath('//*[#id="main"]/div/div[3]/div/div[2]/div/div[2]/div')
for item in items:
Name = item.find_element_by_xpath('.//*[#class="fBhek2 _2xt0JJ"]/div[2]/div/div').text
print(Name)
Categories.append(Name)
sleep(1)
with open('Shopee.csv', 'a', encoding='utf-8') as file:
Import = csv.writer(file,lineterminator='\n')
Import.writerows(Categories)
so im trying to data scrape the Shopee using Selenium and Pycharm. first is the product name is appearing in the Run Console but after 19 product..it causing error(you can see in the picture). 2nd, even if the product are appearing on Rune Console. NONE of them are being stored inside. here is the link of the Shopee i want to data scrape: https://shopee.ph/search?keyword=arduino&noCorrection=true&page=0&withDiscount=true
i wonder what is wrong? why none of them are being stored inside. then for every 19 product name display, it causing error(i also tried this on different product search such as laptop, etc).
Screenshot of CSV File with no inputs
First of all, you should improve your locators.
Try this:
items = driver.find_elements_by_xpath('//div[#class="col-xs-2-4 shopee-search-item-result__item"]')
Name = item.find_element_by_xpath('.//div[#data-sqe="name"]').text
Also I see you first scrolling till the page bottom with
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
sleep(SCROLL_PAUSE_TIME)
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
and only after that you trying to get the products data with
items = driver.find_elements_by_xpath('//*[#id="main"]/div/div[3]/div/div[2]/div/div[2]/div')
for item in items:
Name = item.find_element_by_xpath('.//*[#class="fBhek2 _2xt0JJ"]/div[2]/div/div').text
print(Name)
Categories.append(Name)
sleep(1)
while you should get the products data inside the scrolling, otherwise you will get only the products presented on the lower part of the screen