Youtube Scraping (List of Videos) - python

On a YouTube channel page, I'm trying to get a list of the videos in the channel (i.e. link, title, view count, etc.).
However, my code doesn't return any results. Any help would be appreciated!
from bs4 import BeautifulSoup as bs
import requests
from selenium.webdriver import Chrome
import re
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
# URL of the channel's "videos" page to scrape.
address = "https://www.youtube.com/channel/UCepWEz3BW6EMKA4CU-yGDMw/videos"
# Disabled Selenium attempt: open the page in Chrome and press PAGE_DOWN 250
# times, pausing one second between presses.
#driver = webdriver.Chrome('./chromedriver')
#driver.get(address)
#driver.maximize_window()
#body = driver.find_element_by_css_selector('body')
#for i in range(250):
# body.send_keys(Keys.PAGE_DOWN)
# time.sleep(1)
# Fetch the page with a plain HTTP GET; r.text is the response body as text.
r = requests.get(address)
page = r.text
soup=bs(page,'html.parser')
# Collect every <div> whose class attribute is 'videoId'.
# NOTE(review): the question reports that nothing is returned, so presumably
# the fetched HTML contains no such <div> -- confirm by inspecting `page`.
result=soup.find_all('div', attrs={"class": 'videoId'})
print(result)

Try this Selenium-based approach, which reads the video list from the page as rendered in a real browser:
# Collect title, link and view count for each video tile on a channel page.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

url = "https://www.youtube.com/channel/UCepWEz3BW6EMKA4CU-yGDMw/videos"
browser = webdriver.Firefox()
# Three parallel lists: index i of each list describes the same tile.
result = {"title": [], "link": [], "views": []}
try:
    browser.get(url)
    # find_elements(By..., ...) replaces the find_elements_by_* helpers that
    # were removed in Selenium 4.
    for data in browser.find_elements(By.CSS_SELECTOR, ".ytd-grid-renderer"):
        # Title and link come from the same anchor, so look it up once.
        try:
            anchor = data.find_element(By.CSS_SELECTOR, "#video-title")
        except NoSuchElementException:
            title, link = "", ""
        else:
            title = anchor.text
            link = anchor.get_attribute("href") or ""
        try:
            views = data.find_element(
                By.CSS_SELECTOR,
                "#metadata-line .ytd-grid-video-renderer:nth-child(1)",
            ).text
        except NoSuchElementException:
            views = ""
        # Append to all three lists every iteration so they stay aligned.
        result["title"].append(title)
        result["link"].append(link)
        result["views"].append(views)
finally:
    # quit() ends the whole browser session even if scraping failed.
    browser.quit()
print(result)

Related

Python Selenium .send_keys() only sending first character of my string

I was trying to automate a post to Facebook using Python Selenium, and it was 90% complete. The only issue is that the string I give is "test," but when Facebook posts, it just sends the first character of "test," which is "t."
This is the code:
#libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
import selenium.webdriver.common.keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import bs4
from bs4 import BeautifulSoup as soup
from urllib.request import Request, urlopen
from time import sleep
import pyautogui
#fetching hashtags
def hashtags(hash_idea):
    """Fetch suggested hashtags for *hash_idea* from best-hashtags.com.

    Returns the slice of the tag box markup from its first '#' up to the
    first '</p1>'. Returns an empty string (after printing a message) when
    the page cannot be fetched or that markup is not found, so the result can
    always be concatenated to a caption.
    """
    from urllib.parse import quote  # local import keeps the snippet self-contained

    failure_message = 'Something went wrong While Fetching hashtags'
    url = 'http://best-hashtags.com/hashtag/' + quote(hash_idea, safe='')
    try:
        req = Request(url, headers={'User-Agent' : 'Mozilla/5.0'})
        # The context manager closes the response even if read() raises.
        with urlopen(req, timeout=10) as page:
            page_html = page.read()
    except OSError as exc:
        # URLError, HTTPError and socket timeouts all derive from OSError.
        print(failure_message)
        print(f'Cause: {exc!r}')
        return ''
    page_soup = soup(page_html, 'html.parser')
    result = page_soup.find('div',{'class':'tag-box tag-box-v3 margin-bottom-40'})
    tags = result.decode() if result is not None else ''
    start_index = tags.find('#')
    end_index = tags.find('</p1>')
    # find() returns -1 when a marker is missing; slicing with -1 would
    # silently return the wrong text, so treat that as a failure.
    if start_index == -1 or end_index < start_index:
        print(failure_message)
        return ''
    return tags[start_index:end_index]
def login(username, password):
    """Open facebook.com and submit the login form with the given credentials.

    Uses the module-level ``driver``. Each form element is awaited for up to
    20 seconds. Failures are reported on stdout rather than raised; the
    underlying exception is printed too so the failing step can be found.
    """
    try:
        wait = WebDriverWait(driver,20)  # one waiter shared by all three lookups
        driver.get('https://facebook.com')
        user = wait.until(EC.presence_of_element_located((By.NAME, 'email')))
        user.send_keys(username)
        pas = wait.until(EC.presence_of_element_located((By.NAME, 'pass')))
        pas.send_keys(password)
        login_btn = wait.until(EC.presence_of_element_located((By.NAME,'login')))
        login_btn.click()
    except Exception as exc:
        # A bare ``except:`` also swallowed KeyboardInterrupt/SystemExit and
        # discarded the reason for the failure.
        print('Something went wrong while login process')
        print(f'Cause: {exc!r}')
def upload(img_path,caption):
    """Post the image at *img_path* with the text *caption*.

    Uses the module-level ``driver``: clicks three page elements in turn,
    types *img_path* with pyautogui and presses Enter, sends the caption to
    the caption element, waits five seconds and clicks the post element.
    Failures are reported on stdout rather than raised; the underlying
    exception is printed too so the failing step can be identified.
    """
    try:
        wait = WebDriverWait(driver,20)  # one 20-second waiter shared by every lookup
        # NOTE(review): these absolute XPaths depend on the exact page layout
        # and will stop matching if the markup changes.
        btn1 = wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/div[1]/div/div[3]/div/div/div/div[1]/div[1]/div/div[2]/div/div/div/div[3]/div/div[2]/div/div/div/div[1]/div/div[1]')))
        btn1.click()
        btn2 = wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/div[1]/div/div[4]/div/div/div[1]/div/div[2]/div/div/div/form/div/div[1]/div/div/div/div[3]/div[1]/div[2]/div/div[1]/div/span/div/div/div[1]/div/div/div[1]/i')))
        btn2.click()
        btn3 = wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/div[1]/div/div[4]/div/div/div[1]/div/div[2]/div/div/div/form/div/div[1]/div/div/div/div[2]/div[1]/div[2]/div/div[1]/div/div/div/div[1]/div/div/div/div[1]/div/i')))
        btn3.click()
        # pyautogui sends OS-level keystrokes -- presumably into the file
        # dialog opened by the previous click; verify the window has focus.
        pyautogui.write(img_path)
        pyautogui.press('enter')
        cap = wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/div[1]/div/div[4]/div/div/div[1]/div/div[2]/div/div/div/form/div/div[1]/div/div/div/div[2]/div[1]/div[1]/div[1]/div/div/div[1]')))
        cap.send_keys(caption)
        sleep(5) # this is mandatory while doing some thing with bot
        btn_post = wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/div[1]/div/div[4]/div/div/div[1]/div/div[2]/div/div/div/form/div/div[1]/div/div/div/div[3]/div[2]/div/div/div[1]/div')))
        btn_post.click()
    except Exception as exc:
        # A bare ``except:`` hid the real error and also swallowed Ctrl+C.
        print('Something Went Wrong While posting the image or video')
        print(f'Cause: {exc!r}')
if __name__== "__main__":
    # Collect credentials, build the caption, start the driver and post.
    from getpass import getpass

    username = input('username : ')
    password = getpass('pass : ')  # read without echoing the password
    img_path = 'pic1.jpg'
    hash_idea = 'covid'
    caption = 'test' # if you want to
    # hashtags() may yield nothing (None or '') when the lookup fails;
    # concatenating None to a str would raise TypeError.
    tags = hashtags(hash_idea)
    if tags:
        caption = caption + '\n' + tags
    driver = webdriver.Firefox(executable_path="C:/Users/Asus/Downloads/Compressed/geckodriver-v0.32.0-win64/geckodriver.exe")
    login(username,password)
    upload(img_path,caption)
I wanted to automate the post with the text I provided in the code.
You can try several alternatives:
1. In the definition of cap, replace presence_of_element_located with element_to_be_clickable.
2. Do as in 1. and additionally clear and click the element before typing:
cap = ...
cap.clear()
cap.click()
cap.send_keys(caption)
3. Do as in 1. and additionally use ActionChains:
from selenium.webdriver.common.action_chains import ActionChains
actions = ActionChains(driver)
cap = ...
actions.move_to_element(cap) # move the mouse to the middle of element
actions.click()
actions.send_keys(caption).perform()
4. If none of these works, you can always send one character at a time:
[cap.send_keys(c) for c in caption]

Python Selenium 'post' button of Facebook is not working

Previously I asked about the Python Selenium .send_keys() only sending first character of my string. The issue has now been resolved, but the post button is no longer functional. The post button functions properly when there is a string problem, but when that problem is fixed, the post button stops functioning properly.
This is the previous code (only sending first character of my string) :
#libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
import selenium.webdriver.common.keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import bs4
from bs4 import BeautifulSoup as soup
from urllib.request import Request, urlopen
from time import sleep
import pyautogui
#fetching hashtags
def hashtags(hash_idea):
    """Fetch suggested hashtags for *hash_idea* from best-hashtags.com.

    Returns the slice of the tag box markup from its first '#' up to the
    first '</p1>'. Returns an empty string (after printing a message) when
    the page cannot be fetched or that markup is not found, so the result can
    always be concatenated to a caption.
    """
    from urllib.parse import quote  # local import keeps the snippet self-contained

    failure_message = 'Something went wrong While Fetching hashtags'
    url = 'http://best-hashtags.com/hashtag/' + quote(hash_idea, safe='')
    try:
        req = Request(url, headers={'User-Agent' : 'Mozilla/5.0'})
        # The context manager closes the response even if read() raises.
        with urlopen(req, timeout=10) as page:
            page_html = page.read()
    except OSError as exc:
        # URLError, HTTPError and socket timeouts all derive from OSError.
        print(failure_message)
        print(f'Cause: {exc!r}')
        return ''
    page_soup = soup(page_html, 'html.parser')
    result = page_soup.find('div',{'class':'tag-box tag-box-v3 margin-bottom-40'})
    tags = result.decode() if result is not None else ''
    start_index = tags.find('#')
    end_index = tags.find('</p1>')
    # find() returns -1 when a marker is missing; slicing with -1 would
    # silently return the wrong text, so treat that as a failure.
    if start_index == -1 or end_index < start_index:
        print(failure_message)
        return ''
    return tags[start_index:end_index]
def login(username, password):
    """Open facebook.com and submit the login form with the given credentials.

    Uses the module-level ``driver``. Each form element is awaited for up to
    20 seconds. Failures are reported on stdout rather than raised; the
    underlying exception is printed too so the failing step can be found.
    """
    try:
        wait = WebDriverWait(driver,20)  # one waiter shared by all three lookups
        driver.get('https://facebook.com')
        user = wait.until(EC.presence_of_element_located((By.NAME, 'email')))
        user.send_keys(username)
        pas = wait.until(EC.presence_of_element_located((By.NAME, 'pass')))
        pas.send_keys(password)
        login_btn = wait.until(EC.presence_of_element_located((By.NAME,'login')))
        login_btn.click()
    except Exception as exc:
        # A bare ``except:`` also swallowed KeyboardInterrupt/SystemExit and
        # discarded the reason for the failure.
        print('Something went wrong while login process')
        print(f'Cause: {exc!r}')
def upload(img_path,caption):
    """Post the image at *img_path* with the text *caption*.

    Uses the module-level ``driver``: clicks three page elements in turn,
    types *img_path* with pyautogui and presses Enter, sends the caption to
    the caption element, waits five seconds and clicks the post element.
    Failures are reported on stdout rather than raised; the underlying
    exception is printed too so the failing step can be identified.
    """
    try:
        wait = WebDriverWait(driver,20)  # one 20-second waiter shared by every lookup
        # NOTE(review): these absolute XPaths depend on the exact page layout
        # and will stop matching if the markup changes.
        btn1 = wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/div[1]/div/div[3]/div/div/div/div[1]/div[1]/div/div[2]/div/div/div/div[3]/div/div[2]/div/div/div/div[1]/div/div[1]')))
        btn1.click()
        btn2 = wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/div[1]/div/div[4]/div/div/div[1]/div/div[2]/div/div/div/form/div/div[1]/div/div/div/div[3]/div[1]/div[2]/div/div[1]/div/span/div/div/div[1]/div/div/div[1]/i')))
        btn2.click()
        btn3 = wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/div[1]/div/div[4]/div/div/div[1]/div/div[2]/div/div/div/form/div/div[1]/div/div/div/div[2]/div[1]/div[2]/div/div[1]/div/div/div/div[1]/div/div/div/div[1]/div/i')))
        btn3.click()
        # pyautogui sends OS-level keystrokes -- presumably into the file
        # dialog opened by the previous click; verify the window has focus.
        pyautogui.write(img_path)
        pyautogui.press('enter')
        cap = wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/div[1]/div/div[4]/div/div/div[1]/div/div[2]/div/div/div/form/div/div[1]/div/div/div/div[2]/div[1]/div[1]/div[1]/div/div/div[1]')))
        cap.send_keys(caption)
        sleep(5) # this is mandatory while doing some thing with bot
        btn_post = wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/div[1]/div/div[4]/div/div/div[1]/div/div[2]/div/div/div/form/div/div[1]/div/div/div/div[3]/div[2]/div/div/div[1]/div')))
        btn_post.click()
    except Exception as exc:
        # A bare ``except:`` hid the real error and also swallowed Ctrl+C.
        print('Something Went Wrong While posting the image or video')
        print(f'Cause: {exc!r}')
if __name__== "__main__":
    # Collect credentials, build the caption, start the driver and post.
    from getpass import getpass

    username = input('username : ')
    password = getpass('pass : ')  # read without echoing the password
    img_path = 'pic1.jpg'
    hash_idea = 'covid'
    caption = 'test' # if you want to
    # hashtags() may yield nothing (None or '') when the lookup fails;
    # concatenating None to a str would raise TypeError.
    tags = hashtags(hash_idea)
    if tags:
        caption = caption + '\n' + tags
    driver = webdriver.Firefox(executable_path="C:/Users/Asus/Downloads/Compressed/geckodriver-v0.32.0-win64/geckodriver.exe")
    login(username,password)
    upload(img_path,caption)
This is the new code (String problem fixed but 'post' button is not working) :
#libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import bs4
from bs4 import BeautifulSoup as soup
from urllib.request import Request, urlopen
from time import sleep
import pyautogui
#fetching hashtags
def hashtags(hash_idea):
    """Fetch suggested hashtags for *hash_idea* from best-hashtags.com.

    Returns the slice of the tag box markup from its first '#' up to the
    first '</p1>'. Returns an empty string (after printing a message) when
    the page cannot be fetched or that markup is not found, so the result can
    always be concatenated to a caption.
    """
    from urllib.parse import quote  # local import keeps the snippet self-contained

    failure_message = 'Something went wrong While Fetching hashtags'
    url = 'http://best-hashtags.com/hashtag/' + quote(hash_idea, safe='')
    try:
        req = Request(url, headers={'User-Agent' : 'Mozilla/5.0'})
        # The context manager closes the response even if read() raises.
        with urlopen(req, timeout=10) as page:
            page_html = page.read()
    except OSError as exc:
        # URLError, HTTPError and socket timeouts all derive from OSError.
        print(failure_message)
        print(f'Cause: {exc!r}')
        return ''
    page_soup = soup(page_html, 'html.parser')
    result = page_soup.find('div',{'class':'tag-box tag-box-v3 margin-bottom-40'})
    tags = result.decode() if result is not None else ''
    start_index = tags.find('#')
    end_index = tags.find('</p1>')
    # find() returns -1 when a marker is missing; slicing with -1 would
    # silently return the wrong text, so treat that as a failure.
    if start_index == -1 or end_index < start_index:
        print(failure_message)
        return ''
    return tags[start_index:end_index]
def login(username, password):
    """Open facebook.com and submit the login form with the given credentials.

    Uses the module-level ``driver``. Each form element is awaited for up to
    20 seconds. Failures are reported on stdout rather than raised; the
    underlying exception is printed too so the failing step can be found.
    """
    try:
        wait = WebDriverWait(driver,20)  # one waiter shared by all three lookups
        driver.get('https://facebook.com')
        user = wait.until(EC.presence_of_element_located((By.NAME, 'email')))
        user.send_keys(username)
        pas = wait.until(EC.presence_of_element_located((By.NAME, 'pass')))
        pas.send_keys(password)
        login_btn = wait.until(EC.presence_of_element_located((By.NAME,'login')))
        login_btn.click()
    except Exception as exc:
        # A bare ``except:`` also swallowed KeyboardInterrupt/SystemExit and
        # discarded the reason for the failure.
        print('Something went wrong while login process')
        print(f'Cause: {exc!r}')
def upload(img_path,caption):
    """Post the image at *img_path* with the text *caption*.

    Uses the module-level ``driver``: clicks three page elements in turn,
    types *img_path* with pyautogui and presses Enter, moves to the caption
    element, clicks it and types the caption through an ActionChains
    sequence, waits five seconds and clicks the post element. Failures are
    reported on stdout rather than raised; the underlying exception is
    printed too, so the step that fails (for example the post click) can be
    identified instead of only seeing the generic message.
    """
    try:
        wait = WebDriverWait(driver,20)  # one 20-second waiter shared by every lookup
        # NOTE(review): these absolute XPaths depend on the exact page layout
        # and will stop matching if the markup changes.
        btn1 = wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/div[1]/div/div[3]/div/div/div/div[1]/div[1]/div/div[2]/div/div/div/div[3]/div/div[2]/div/div/div/div[1]/div/div[1]')))
        btn1.click()
        btn2 = wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/div[1]/div/div[4]/div/div/div[1]/div/div[2]/div/div/div/form/div/div[1]/div/div/div/div[3]/div[1]/div[2]/div/div[1]/div/span/div/div/div[1]/div/div/div[1]/i')))
        btn2.click()
        btn3 = wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[1]/div[1]/div/div[4]/div/div/div[1]/div/div[2]/div/div/div/form/div/div[1]/div/div/div/div[2]/div[1]/div[2]/div/div[1]/div/div/div/div[1]/div/div/div/div[1]/div/i')))
        btn3.click()
        # pyautogui sends OS-level keystrokes -- presumably into the file
        # dialog opened by the previous click; verify the window has focus.
        pyautogui.write(img_path)
        pyautogui.press('enter')
        actions = ActionChains(driver)
        cap = wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div[1]/div[1]/div/div[4]/div/div/div[1]/div/div[2]/div/div/div/form/div/div[1]/div/div/div/div[2]/div[1]/div[1]/div[1]/div/div/div[1]')))
        # Queue move -> click -> type; nothing is sent until perform().
        actions.move_to_element(cap)
        actions.click()
        actions.send_keys(caption).perform()
        sleep(5) # this is mandatory while doing some thing with bot
        btn_post = wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div[1]/div[1]/div/div[4]/div/div/div[1]/div/div[2]/div/div/div/form/div/div[1]/div/div/div/div[3]/div[2]/div/div/div[1]/div')))
        btn_post.click()
    except Exception as exc:
        # A bare ``except:`` hid the real error and also swallowed Ctrl+C.
        print('Something Went Wrong While posting the image or video')
        print(f'Cause: {exc!r}')
if __name__== "__main__":
    # Collect credentials, build the caption, start the driver and post.
    from getpass import getpass

    username = input('username : ')
    password = getpass('pass : ')  # read without echoing the password
    img_path = 'pic1.jpg'
    hash_idea = 'covid'
    caption = 'test' # if you want to
    # hashtags() may yield nothing (None or '') when the lookup fails;
    # concatenating None to a str would raise TypeError.
    tags = hashtags(hash_idea)
    if tags:
        caption = caption + '\n' + tags
    driver = webdriver.Firefox(executable_path="C:/Users/Asus/Downloads/Compressed/geckodriver-v0.32.0-win64/geckodriver.exe")
    login(username,password)
    upload(img_path,caption)
I wanted to automate the post with the text I provided in the code. I've tried a number of different approaches, but the post button is still not functioning properly. When I execute the script, it gives the error message "Something Went Wrong While posting the image or video." Any helpful suggestion or response would be greatly appreciated.

Something is wrong with my Google news crawler. Please help.

# News crawling.py
####################################### Search for the keyword 'spent nuclear fuel' ##################################################
import sys, os
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import selenium
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime, timedelta
from pandas import DataFrame
import time
from openpyxl.workbook import Workbook
# Pause, in seconds, passed to time.sleep() after each page load in news_crawling().
sleep_sec = 0.5
wb = Workbook()
# Please enter your User-Agent.
headers = {'User-Agent' : '________________'}
# Search phrase; also used in the output file name.
query = 'spent nuclear fuel'
# Yesterday's date formatted as YYYY.MM.DD.
yesterday = (datetime.today() - timedelta(1)).strftime("%Y.%m.%d")
# Open a Google News search for `query` in Chrome, walk the result pages
# collecting title / url / thumbnail per item into news_dict, then write the
# collected rows to '<query>_<yesterday>.xlsx' in the working directory.
def news_crawling():
service = Service(executable_path=ChromeDriverManager().install())
browser = webdriver.Chrome(service=service)
print('브라우저를 실행시킵니다(자동 제어)\n')
# Only the {0} placeholder exists, so `yesterday` is passed but not used here.
# NOTE(review): the trailing ']' in the URL looks unintended -- confirm.
news_url = 'https://www.google.com/search?q={0}&tbm=nws&source-news]'.format(query, yesterday)
browser.get(news_url)
time.sleep(sleep_sec)
print('\n크롤링을 시작합니다.')
##### Crawl while moving from page to page under dynamic (browser) control
news_dict = {}
idx = 1
cur_page = 1
# Upper bound on the number of items collected (see the slice below).
news_num = 1000000
while True:
# NOTE(review): XPath attribute tests are normally written with '@'
# (e.g. [@class="..."]); the '#' form used in these expressions and the
# unbalanced contains( below look invalid -- confirm against the XPath spec.
table = browser.find_element("xpath",'.//div[#data-hveid="CBAQAA"]')
li_list = table.find_elements("xpath",'.//li[contains(#class="vJOb1e aIfcHf Hw13jc"]')
area_list = [li.find_element("xpath",'.//div[#class="mCBkyc y355M ynAwRc MBeuO nDgy9d"]') for li in li_list]
# Take at most as many items as are still allowed by news_num.
for a in area_list[:min(len(area_list), news_num-idx+1)]:
n = a.find_element("xpath",'.//div[#role="heading"]')
n_url = n.get_attribute('href')
# Thumbnail is optional: fall back to a single space when lookup fails.
try:
img = a.find_element(By.CSS_SELECTOR,'img#dimg_').find_element(By.CSS_SELECTOR, 'img')
img = img.get_attribute('src')
except:
img = " "
# One record per item, keyed by a running 1-based index.
news_dict[idx] = {'Title' : n.get_attribute('title'),
'url' : n_url,
'thumbnail': img}
idx += 1
# Advance to the next page; any failure here ends the crawl.
try:
next_btn = browser.find_element(By.CSS_SELECTOR, 'a#pnnext')
next_btn.click()
cur_page +=1
# pages = browser.find_element("xpath",'//div[#class="sc_page_inner"]')
# next_page_url = [p for p in pages.find_elements("xpath",'.//a') if p.text == str(cur_page)][0].get_attribute('href')
pages = browser.find_element("xpath",'//table[#class="fl"]')
# NOTE(review): 'aria-lable' looks like a misspelling of 'aria-label', and the
# value is then used as a URL -- confirm which attribute holds the link.
next_page_url = [p for p in pages.find_elements("xpath",'.//a') if p.text == str(cur_page)][0].get_attribute('aria-lable')
browser.get(next_page_url)
time.sleep(sleep_sec)
except:
print('\n브라우저를 종료합니다.\n' + '=' * 100)
time.sleep(0.7)
browser.close()
break
######################################################## Modified up to this point ################################################################
# Export to an Excel file
print('데이터프레임 변환\n')
# Transpose so that each collected item becomes one row.
news_df = DataFrame(news_dict).T
folder_path = os.getcwd()
xlsx_file_name = '{}_{}.xlsx'.format(query, yesterday)
news_df.to_excel(xlsx_file_name, index=False)
print('엑셀 저장 완료 | 경로 : {}\\{}\n'.format(folder_path, xlsx_file_name))
news_crawling()
This is my code. I used it on a Korean website and it worked well, but after I modified it for Google search it no longer works.
I want to search for something on Google and then save the news titles to an xlsx file.
Because I previously used it on a Korean website, I changed the part below:
table = browser.find_element("xpath",'.//div[#data-hveid="CBAQAA"]')
li_list = table.find_elements("xpath",'.//li[contains(#class="vJOb1e aIfcHf Hw13jc"]')
area_list = [li.find_element("xpath",'.//div[#class="mCBkyc y355M ynAwRc MBeuO nDgy9d"]') for li in li_list]
When I run the code, it only gives me an empty xlsx file.
Can anyone help with this, please? I would really appreciate it.
Here is one possible solution:
from openpyxl import Workbook
from datetime import datetime
from datetime import timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
def get_url(query: str, min_date: str, max_date: str) -> str:
    """Build a Google News search URL for *query* limited to a date range.

    *query* is URL-encoded so that spaces and characters such as '&' or '#'
    cannot corrupt the query string. *min_date* and *max_date* are inserted
    verbatim into the ``tbs=cdr`` parameter.
    """
    from urllib.parse import quote_plus  # local import keeps the snippet self-contained

    return f'https://www.google.com/search?q={quote_plus(query)}&tbm=nws&source-news&tbs=cdr:1,cd_min:{min_date},cd_max:{max_date}'
# Scrape yesterday's Google News results for a query, page by page, into an
# .xlsx workbook with one row per result: title, url, thumbnail.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 5)
yesterday = (datetime.now() - timedelta(1)).strftime("%m.%d.%Y")
url_locator = (By.CSS_SELECTOR, '#rso a')
title_locator = (By.CSS_SELECTOR, 'a div[role="heading"]')
thumbnail_locator = (By.CSS_SELECTOR, '#rso a>div>div:first-child img')
workbook = Workbook()
worksheet = workbook.active
page = 1
try:
    driver.get(get_url('spent nuclear fuel', yesterday, yesterday))
    while True:
        print(f'Current page: {page}')
        url_web_elements = wait.until(EC.visibility_of_all_elements_located(url_locator))
        title_web_elements = wait.until(EC.presence_of_all_elements_located(title_locator))
        thumbnail_web_elements = wait.until(EC.visibility_of_all_elements_located(thumbnail_locator))
        titles = [title.text.replace(',', '.') for title in title_web_elements]
        urls = [link.get_attribute('href') for link in url_web_elements]
        thumbnails = [thumbnail.get_attribute('src') for thumbnail in thumbnail_web_elements]
        # NOTE(review): the three lists come from independent selectors; zip
        # pairs them by position and stops at the shortest -- confirm that
        # the selectors always match the same results in the same order.
        for title, url, thumbnail in zip(titles, urls, thumbnails):
            worksheet.append([title, url, thumbnail])
        try:
            wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#pnnext'))).click()
        except TimeoutException:
            # No "next" link within the wait period: last page reached.
            break
        page += 1
finally:
    # Runs even when one of the waits above times out, so the headless
    # browser is always released and rows gathered so far are still saved.
    driver.quit()
    workbook.save(f'google_news_{yesterday}.xlsx')
Output is xlsx file google_news_11.10.2022.xlsx
In the get_url function, you can pass a range of dates for which the news will be displayed, for example get_url('spent nuclear fuel', '01.11.2022', '11.11.2022'). The dates are strings in the same %m.%d.%Y format that the code uses for yesterday.
You can also save data to csv using this solution:
import csv
from datetime import datetime
from datetime import timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
def get_url(query: str, min_date: str, max_date: str) -> str:
    """Build a Google News search URL for *query* limited to a date range.

    *query* is URL-encoded so that spaces and characters such as '&' or '#'
    cannot corrupt the query string. *min_date* and *max_date* are inserted
    verbatim into the ``tbs=cdr`` parameter.
    """
    from urllib.parse import quote_plus  # local import keeps the snippet self-contained

    return f'https://www.google.com/search?q={quote_plus(query)}&tbm=nws&source-news&tbs=cdr:1,cd_min:{min_date},cd_max:{max_date}'
def save_to_csv(data: list) -> None:
    """Append *data* as one row to ``google_news.csv`` (UTF-8).

    *data* may be any iterable of field values (a list, ``dict.values()``,
    ...); it is materialised into a list before being written.
    """
    # newline='' lets the csv module control line endings itself, as the csv
    # documentation requires; without it '\n' becomes '\r\n' on Windows and
    # newlines embedded in quoted fields are altered.
    with open(file='google_news.csv', mode='a', encoding="utf-8", newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(list(data))
# Scrape yesterday's Google News results for a query, page by page, and
# append one CSV row per result: title, url, thumbnail.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 5)
yesterday = (datetime.now() - timedelta(1)).strftime("%m.%d.%Y")
url_locator = (By.CSS_SELECTOR, '#rso a')
title_locator = (By.CSS_SELECTOR, 'a div[role="heading"]')
thumbnail_locator = (By.CSS_SELECTOR, '#rso a>div>div:first-child img')
page = 1
try:
    driver.get(get_url('spent nuclear fuel', yesterday, yesterday))
    while True:
        print(f'Current page: {page}')
        url_web_elements = wait.until(EC.visibility_of_all_elements_located(url_locator))
        title_web_elements = wait.until(EC.presence_of_all_elements_located(title_locator))
        thumbnail_web_elements = wait.until(EC.visibility_of_all_elements_located(thumbnail_locator))
        titles = [title.text.replace(',', '.') for title in title_web_elements]
        urls = [link.get_attribute('href') for link in url_web_elements]
        thumbnails = [thumbnail.get_attribute('src') for thumbnail in thumbnail_web_elements]
        # NOTE(review): the three lists come from independent selectors; zip
        # pairs them by position and stops at the shortest -- confirm that
        # the selectors always match the same results in the same order.
        for title, url, thumbnail in zip(titles, urls, thumbnails):
            save_to_csv([title, url, thumbnail])
        try:
            wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#pnnext'))).click()
        except TimeoutException:
            # No "next" link within the wait period: last page reached.
            break
        page += 1
finally:
    # Runs even when one of the waits above times out, so the headless
    # browser is always released.
    driver.quit()
Output is csv file google_news.csv:
COP27: nuclear boss doesn't expect surge in waste recycling,https://news.yahoo.com/cop27-nuclear-boss-doesnt-expect-072631885.html,""
UN Nuclear Chief Says Recycling Nuclear Waste 'Difficult ...,https://www.theepochtimes.com/un-nuclear-chief-says-recycling-nuclear-waste-difficult-after-biden-looks-to-fund-reprocessing-projects_4855151.html,""
COP27: UN nuclear chief says radioactive waste recycling is 'difficult' technology,https://www.deccanherald.com/international/world-news-politics/cop27-un-nuclear-chief-says-radioactive-waste-recycling-is-difficult-technology-1161036.html,""
Tested on Python 3.9.10. Used Selenium 4.5.0, openpyxl 3.0.10

How to scrape data from each product page from Aliexpress using python selenium

I am trying to scrape each product page from this website: https://www.aliexpress.com/wholesale?catId=0&initiative_id=SB_20220315022920&SearchText=bluetooth+earphones
In particular, I want to get the comments and the customer countries, as shown in the photo:
enter image description here
The main issue is that my code does not find the right elements, and this is what I am struggling with.
First, I tried my scraping on this product : https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000027213624098%22%7D&pdp_pi=-1%3B40.81%3B-1%3B-1%40salePrice%3BMAD%3Bsearch-mainSearch
Here is my code :
from selenium import webdriver
from selenium.webdriver.common.by import By
from lxml import html
import cssselect
from time import sleep
from itertools import zip_longest
import csv
# Open one product page in Edge, switch to its feedback tab and print the
# country and comment found for each feedback item.
driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")
url = "https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000027213624098%22%7D&pdp_pi=-1%3B40.81%3B-1%3B-1%40salePrice%3BMAD%3Bsearch-mainSearch"
# Only the header row is written to data.csv; the values below are printed.
with open ("data.csv", "w", encoding="utf-8") as csvfile:
wr = csv.writer(csvfile)
wr.writerow(["Comment","Custumer country"])
driver.get(url)
# Scroll to the bottom of the page before looking for the feedback tab.
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
# NOTE(review): XPath attribute tests are normally written with '@'
# (e.g. [@id="..."]); the '#' form used in these expressions looks invalid.
review_buttom = driver.find_element_by_xpath('//li[#ae_button_type="tab_feedback"]')
review_buttom.click()
# NOTE(review): find_element_by_xpath returns an element object, not markup,
# while html.fromstring presumably expects an HTML string -- confirm.
html_source = driver.find_element_by_xpath('//div[#id="transction-feedback"]')
tree = html.fromstring(html_source)
#tree = html.fromstring(driver.page_source)
for rvw in tree.xpath('//div[#class="feedback-item clearfix"]'):
# NOTE(review): a leading '//' searches the whole document rather than only
# inside rvw, so every item would yield the same first match -- confirm.
country = rvw.xpath('//div[#class="user-country"]//b/text()')
if country:
country = country[0]
else:
country = ''
print('country:', country)
comment = rvw.xpath('//dt[#id="buyer-feedback"]//span/text()')
if comment:
comment = comment[0]
else:
comment = ''
print('comment:', comment)
driver.close()
Thank you !!
What happens?
There is one main issue: the feedback you are looking for is inside an iframe, so you won't get the information by querying the elements directly on the product page.
How to fix?
Scroll the element that holds the iframe into view, navigate to the iframe's source URL, and use its pagination to collect all the feedback.
Example
# Collect country and comment for every feedback item of one product, across
# all feedback pages, and write them to filename.csv.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

url = 'https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000027213624098%22%7D&pdp_pi=-1%3B40.81%3B-1%3B-1%40salePrice%3BMAD%3Bsearch-mainSearch'
# Selenium 4 takes the driver path through a Service object.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
data = []
try:
    driver.maximize_window()
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    # Scroll the tab container into view, then wait for the iframe to appear.
    driver.execute_script("arguments[0].scrollIntoView();", wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.tab-content'))))
    # Load the iframe's own URL so its content can be queried directly.
    driver.get(wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#product-evaluation'))).get_attribute('src'))
    while True:
        for e in driver.find_elements(By.CSS_SELECTOR, 'div.feedback-item'):
            # Either field may be absent on an item; record None in that case.
            try:
                country = e.find_element(By.CSS_SELECTOR, '.user-country > b').text
            except NoSuchElementException:
                country = None
            try:
                comment = e.find_element(By.CSS_SELECTOR, '.buyer-feedback span').text
            except NoSuchElementException:
                comment = None
            data.append({
                'country': country,
                'comment': comment,
            })
        try:
            wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#complex-pager a.ui-pagination-next'))).click()
        except WebDriverException:
            # Timeout or any other Selenium failure: treat as the last page
            # and keep what was collected, but let Ctrl+C through.
            break
finally:
    driver.quit()
pd.DataFrame(data).to_csv('filename.csv',index=False)

if or try loop for an element in a page selenium

I am trying to scrape agents data here. I am able to get the links from the first page. I am using numbered loops because I know the total number of pages. I tried to run this as long as the "next" page option is there. I tried both "try" and "if not" but wasn't able to figure it out. Any help is welcome. Here is the code.
from selenium import webdriver
import time
from selenium.common.exceptions import ElementNotVisibleException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome('C:/Users/../Downloads/cd79/chromedriver.exe', options=options)
links_total = []
driver.get("https://www.cbp.gov/contact/find-broker-by-port?field_port_location_tid=All&field_port_code_value=")
# Append the href of every link found inside <td> cells of the current page to
# the module-level links_total list, then look for a "next" link, click it and
# collect the links of the following page the same way. Returns nothing.
def first_links():
initial_data = driver.find_elements_by_tag_name('td')
for initial in initial_data:
page_links = initial.find_elements_by_tag_name('a')
for page in page_links:
page_link = page.get_attribute("href")
links_total.append(page_link)
driver.refresh()
# NOTE(review): find_element_by_partial_link_text presumably raises
# NoSuchElementException instead of returning a falsy value when nothing
# matches, so this `if` cannot act as an existence check -- confirm.
if driver.find_element_by_partial_link_text('next'):
next_page = driver.find_element_by_partial_link_text('next')
next_page.click()
# Fixed two-second pause before reading the next page.
time.sleep(2)
new_data = driver.find_elements_by_tag_name('td')
for new in new_data:
links = new.find_elements_by_tag_name('a')
for link in links:
new_link = link.get_attribute("href")
links_total.append(new_link)
for i in range(1, 23):
first_links()
for link in links_total:
print(link)
A try/except would be a better option:
from selenium import webdriver
import time
from selenium.common.exceptions import ElementNotVisibleException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome('C:/Users/../Downloads/cd79/chromedriver.exe', options=options)
driver.implicitly_wait(10)
# links_total = []
driver.get("https://www.cbp.gov/contact/find-broker-by-port?field_port_location_tid=All&field_port_code_value=")
def first_links(links_total=None):
    """Collect the href of every link inside a table cell, across all pages.

    Starts from the page currently loaded in the module-level ``driver`` and
    keeps clicking the link whose text contains 'next' until none is found.

    Args:
        links_total: optional list to extend; a new list is created when
            omitted.

    Returns:
        The list of collected hrefs.
    """
    # A mutable default ([]) is created once and shared by every call, so a
    # second call would start with the first call's links.
    if links_total is None:
        links_total = []
    # Iterate instead of recursing once per page, so the number of pages is
    # not bounded by the recursion limit.
    while True:
        for cell in driver.find_elements(By.TAG_NAME, 'td'):
            for anchor in cell.find_elements(By.TAG_NAME, 'a'):
                links_total.append(anchor.get_attribute("href"))
        try:
            next_page = driver.find_element(By.PARTIAL_LINK_TEXT, 'next')
            next_page.click()
        except (ElementNotVisibleException, NoSuchElementException):
            print("NEXT btn not found : ")
            break
        # Fixed pause before reading the next page.
        time.sleep(2)
    return links_total
all_links = first_links()
for link in all_links:
print(link)
You don't actually need Selenium. You could do it with BeautifulSoup like so:
import requests
from bs4 import BeautifulSoup
page_num=0
url_cbp = r"https://www.cbp.gov/contact/find-broker-by-port?field_port_location_tid=All&field_port_code_value=&page={}"
def get_links(links_total=None, page_num=0):
    """Collect the first link of every 'views-field' table cell, page by page.

    Fetches ``url_cbp`` formatted with the page number, starting at
    *page_num*, and continues while the page contains a 'pager-next' item.

    Args:
        links_total: optional list to extend; a new list is created when
            omitted.
        page_num: number of the first page to fetch.

    Returns:
        The list of collected href values.
    """
    # A mutable default ([]) is created once and shared by every call, so a
    # second call would start with the first call's links.
    if links_total is None:
        links_total = []
    # Iterate instead of recursing once per page.
    while True:
        # Without a timeout a stalled connection would block forever.
        page = requests.get(url_cbp.format(page_num), timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')
        results = soup.find(id='region-content')
        if results is None:
            # Unexpected page (error page, layout change): stop with what we have.
            break
        for cell in results.find_all('td', class_='views-field'):
            cell_link = cell.find('a')
            # Some cells may hold no link; indexing None would raise TypeError.
            if cell_link is not None and cell_link.has_attr('href'):
                links_total.append(cell_link['href'])
        if not results.find('li', class_='pager-next'):
            break
        page_num += 1
    return links_total
all_links = get_links()
for link in all_links:
print(link)

Categories

Resources