How to extract the comments count correctly - python

I am trying to extract the number of YouTube comments and have tried several methods.
My code:
from selenium import webdriver
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
DRIVER_PATH = <your chromedriver path>
wd = webdriver.Chrome(executable_path=DRIVER_PATH)
url = 'https://www.youtube.com/watch?v=5qzKTbnhyhc'
wd.get(url)
wait = WebDriverWait(wd, 100)
time.sleep(40)
v_title = wd.find_element_by_xpath('//*[@id="container"]/h1/yt-formatted-string').text
print("title Is ")
print(v_title)
comments_xpath = '//h2[@id="count"]/yt-formatted-string/span[1]'
v_comm_cnt = wait.until(EC.visibility_of_element_located((By.XPATH, comments_xpath)))
#wd.find_element_by_xpath(comments_xpath)
print(len(v_comm_cnt))
I get the following error:
selenium.common.exceptions.TimeoutException: Message:
I get the correct value for the title but not for the comment count. Can anyone please tell me what is wrong with my code?
Please note that the comments count path //h2[@id="count"]/yt-formatted-string/span[1] points to the correct place when I inspect the value in the browser's element inspector.

Updated answer
Well, it was tricky!
There are several issues here:
This page has some bad JavaScript on it that makes the Selenium driver.get() method wait until the timeout even though the page looks loaded. To overcome that I used the eager page load strategy (a Selenium 4 sketch of setting it follows this list).
This page has several blocks of markup for the same areas, so sometimes one of them is used (visible) and sometimes the other. This makes working with element locators difficult. So here I wait for the visibility of the title element from one of those blocks; if it becomes visible I extract the text from there, otherwise I wait for the visibility of the second element (it appears immediately) and extract the text from there.
There are several ways to scroll the page. Not all of them worked here. I found one that works and does not scroll too far.
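For reference, newer Selenium 4 releases let you set the same strategy directly on the Options object instead of going through DesiredCapabilities; a minimal sketch of the eager setup (assuming Selenium 4.6+, with Selenium Manager locating the driver):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
# "eager" makes driver.get() return once the DOM is ready,
# without waiting for scripts and images to finish loading
options.page_load_strategy = "eager"
driver = webdriver.Chrome(options=options)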
The full code below works; I ran it several times.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
options = Options()
options.add_argument("--start-maximized")
caps = DesiredCapabilities().CHROME
caps["pageLoadStrategy"] = "eager"
s = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(options=options, desired_capabilities=caps, service=s)
url = 'https://www.youtube.com/watch?v=5qzKTbnhyhc'
driver.get(url)
driver.maximize_window()
wait = WebDriverWait(driver, 10)
title_xpath = "//div[@class='style-scope ytd-video-primary-info-renderer']/h1"
alternative_title = "//*[@id='title']/h1"
v_title = ""
try:
    v_title = wait.until(EC.visibility_of_element_located((By.XPATH, title_xpath))).text
except:
    v_title = wait.until(EC.visibility_of_element_located((By.XPATH, alternative_title))).text
print("Title is " + v_title)
comments_xpath = "//div[@id='title']//*[@id='count']//span[1]"
driver.execute_script("window.scrollBy(0, arguments[0]);", 600)
try:
    v_comm_cnt = wait.until(EC.visibility_of_element_located((By.XPATH, comments_xpath)))
except:
    pass
v_comm_cnt = driver.find_element(By.XPATH, comments_xpath).text
print("Video has " + v_comm_cnt + " comments")
The output is:
Title is Music for when you are stressed 🍀 Chil lofi | Music to Relax, Drive, Study, Chill
Video has 834 comments
Process finished with exit code 0
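If you need the count as a number for further processing, the displayed string can be converted; a small sketch, assuming YouTube keeps its comma-grouped format for larger counts:
# "8,345" -> 8345; assumes a plain comma-grouped count string
comment_count = int(v_comm_cnt.replace(",", ""))
print(comment_count)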

Related

Page navigation click without using current_url python selenium

How can I navigate through each page without using driver.current_url? In my full code, I get a bunch of errors once I navigate through the pages in a loop. Without the loop it runs fine, but it can only go through one page; I want to navigate through as many pages as there are. Any help appreciated, thanks.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from time import sleep
driver_service = Service(executable_path=r"C:\Program Files (x86)\chromedriver.exe")
driver = webdriver.Chrome(service=driver_service)
driver.maximize_window() # load web driver
wait = WebDriverWait(driver, 5)
url_test = driver.get('https://www.seek.com.au/data-jobs-in-information-communication-technology/in-All-Perth-WA')
url_template = driver.current_url
template = url_template + '?page={}'
for page in range(2, 5):
    link_job = [x.get_attribute('href') for x in driver.find_elements(By.XPATH, "//a[@data-automation='jobTitle']")]
    for job in link_job:
        driver.get(job)
        try:
            quick_apply = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.XPATH, "(//a[@data-automation='job-detail-apply' and @target='_self'])")))
            quick_apply.click()
            #sleep(3)
        except:
            print("No records found " + job)
            pass
        sleep(3)
    driver.get(template.format(page))
If I understand you correctly, you want to determine dynamically how many pages there are and loop over each of them.
I managed to achieve this with a while loop that checks on each page whether the "Next" button at the bottom is present. If it is not, the last page has been reached and you can exit the loop.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from time import sleep
driver_service = Service(executable_path="C:\\Users\\Stefan\\bin\\chromedriver.exe")
driver = webdriver.Chrome(service=driver_service)
driver.maximize_window() # load web driver
wait = WebDriverWait(driver, 5)
url_test = driver.get('https://www.seek.com.au/data-jobs-in-information-communication-technology/in-All-Perth-WA')
url_template = driver.current_url
template = url_template+ '?page={}'
page = 1
while True:
    # check if "Next" button is visible
    # -> if not, the last page was reached
    try:
        driver.find_element(By.XPATH, "//a[@title='Next']")
    except:
        # last page reached
        break
    link_job = [x.get_attribute('href') for x in driver.find_elements(By.XPATH, "//a[@data-automation='jobTitle']")]
    for job in link_job:
        driver.get(job)
        try:
            quick_apply = WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.XPATH, "(//a[@data-automation='job-detail-apply' and @target='_self'])")))
            quick_apply.click()
            #sleep(3)
        except:
            print("No records found " + job)
            pass
        sleep(3)
    page += 1
    driver.get(template.format(page))
driver.close()
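As a side note, the try/except presence check can also be written with find_elements, which returns an empty list instead of raising when nothing matches; a sketch of the same termination test, meant to replace the try/except inside the loop:
# find_elements returns [] when there is no match, so no exception handling is needed
if not driver.find_elements(By.XPATH, "//a[@title='Next']"):
    break  # last page reached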
It seems your problem is a StaleElementReferenceException raised when you go back from a job page to the jobs search results page.
The simplest approach to overcome this problem is to keep the jobs search results page URL.
Actually, I changed your code only on this point and it works.
I also changed driver.find_elements(By.XPATH, "//a[@data-automation='jobTitle']") to wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//a[@data-automation='jobTitle']"))) for better reliability.
The code below works, but the web site itself responds badly.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(service=webdriver_service, options=options)
wait = WebDriverWait(driver, 10)
url = 'https://www.seek.com.au/data-jobs-in-information-communication-technology/in-All-Perth-WA?page={p}'
for p in range(1, 20):
    driver.get(url.format(p=p))
    link_job = [x.get_attribute('href') for x in wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//a[@data-automation='jobTitle']")))]
    for job in link_job:
        driver.get(job)
        try:
            wait.until(EC.element_to_be_clickable((By.XPATH, "(//a[@data-automation='job-detail-apply' and @target='_self'])"))).click()
            print("applied")
        except:
            print("No records found " + job)
            pass
        driver.get(url.format(p=p))
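If stale references still show up elsewhere, a small retry wrapper is a common workaround; a sketch, assuming the element can simply be re-located with the same locator (the helper name is illustrative):
from selenium.common.exceptions import StaleElementReferenceException

def get_attribute_with_retry(driver, locator, name, attempts=3):
    # A stale reference means the DOM node was replaced after a re-render,
    # so just find the element again and retry.
    for _ in range(attempts):
        try:
            return driver.find_element(*locator).get_attribute(name)
        except StaleElementReferenceException:
            continue
    return None

# usage: get_attribute_with_retry(driver, (By.XPATH, "//a[@data-automation='jobTitle']"), 'href')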

Selenium screenshots are chopped off

I'm trying to take screenshots of different elements (in different sections) of a long page. The first screenshot I take is near the top of the page and comes out complete.
However, once Selenium has to scroll to take a screenshot of an element, the screenshot comes out chopped off.
I'm pretty sure this is happening because Selenium isn't scrolling far enough for the entire element to be exposed, and thus takes a screenshot that comes out incomplete, but I don't know how to solve the problem. Any help would be greatly appreciated. Below is my code so far (screen-shotting the element happens in the last few lines):
from time import sleep
from os import chdir
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
option = webdriver.ChromeOptions()
option.add_experimental_option("excludeSwitches", ["enable-automation"])
option.add_experimental_option('useAutomationExtension', False)
option.add_argument("--disable-infobars")
option.add_argument("start-maximized")
option.add_argument("--disable-extensions")
option.add_experimental_option("detach", True)
option.add_experimental_option("prefs", {
"profile.default_content_setting_values.notifications": 2
})
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=option)
driver.get("https://www.reddit.com/r/AskReddit/")
#Show top reddit posts on the subreddit
topButton = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "/html/body/div[1]/div/div[2]/div[2]/div/div/div/div[2]/div[4]/div[1]/div[1]/div[2]/a[3]")))
topButton.click()
#Topic
topic = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div/div[2]/div[2]/div/div/div/div[2]/div[4]/div[1]/div[4]/div[2]')))
topic.click()
sleep(1)
topicText = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/div[3]/div/div/div/div[2]/div[1]/div[2]/div[1]/div/div[3]/div[1]/div/h1').text
topicPicture = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/div[3]/div/div/div/div[2]/div[1]/div[2]/div[1]/div')
chdir(r'C:\Users\jack_l\Documents\PLAYING_WITH_REDDIT\currentVideo')
topicPicture.screenshot('topic.png')
#Comments
comment = ''
verified = 0
counter4 = 0
while verified <= 10:
    try:
        comment = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/div[3]/div/div/div/div[2]/div[1]/div[2]/div[5]/div/div/div/div[' + str(counter4) + ']')
    except Exception:
        counter4 = counter4 + 1
    else:
        if 'level 1' in comment.text:
            comment.screenshot('comment ' + str(verified) + '.png')
            verified = verified + 1
        counter4 = counter4 + 1
Add this line in the try block after the definition of comment. What it does is scroll the page so that the web element comment is at the center of the viewport; this way the screenshot captures the full element:
try:
    comment = driver.find_element(By.XPATH, '...')
    driver.execute_script('arguments[0].scrollIntoView({block: "center"});', comment)
except:
    ...
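Put together, an element screenshot after centering could look like the sketch below, assuming comment_xpath is a hypothetical variable holding the locator built in the loop above and that a short pause is enough for the scroll to settle:
comment = driver.find_element(By.XPATH, comment_xpath)  # comment_xpath: illustrative name for the locator above
# center the element so no part of it is outside the viewport
driver.execute_script('arguments[0].scrollIntoView({block: "center"});', comment)
sleep(0.5)  # let the scroll settle before capturing
comment.screenshot('comment.png')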

Selenium Web Scraping: Find element by text not working in script

I am working on a script to gather information from Newegg to look at changes in graphics card prices over time. Currently, my script opens a Newegg search for RTX 3080s in Chromedriver and then clicks the link for Desktop Graphics Cards to narrow down my search. The part I am struggling with is developing a for-item-in-range loop that lets me iterate through all 8 search result pages. I know that I could do this by simply changing the page number in the URL, but as this is an exercise I'm using to learn relative XPath better, I want to do it using the pagination buttons at the bottom of the page. I know that each button should contain inner text of "1, 2, 3, 4, etc.", but whenever I use text() = {item} in my for loop, it doesn't click the button. The script runs and doesn't raise any exceptions, but it doesn't do what I want it to. Below I have attached the HTML for the page as well as my current script. Any suggestions or hints are appreciated.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import time
options = Options()
PATH = 'C://Program Files (x86)//chromedriver.exe'
driver = webdriver.Chrome(PATH)
url = 'https://www.newegg.com/p/pl?d=RTX+3080'
driver.maximize_window()
driver.get(url)
card_path = '/html/body/div[8]/div[3]/section/div/div/div[1]/div/dl[1]/dd/ul[2]/li/a'
desktop_graphics_cards = driver.find_element(By.XPATH, card_path)
desktop_graphics_cards.click()
time.sleep(5)
graphics_card = []
shipping_cost = []
price = []
total_cost = []
for item in range(9):
    try:
        #next_page_click = driver.find_element(By.XPATH("//button[text() = '{item + 1}']"))
        print(next_page_click)
        next_page_click.click()
    except:
        pass
The pagination buttons are outside the initially visible area.
In order to click these elements you will have to scroll the page until the element appears.
Also, you will need to click the next-page buttons starting from 2 up to 9 (inclusive), while you are trying to do this with numbers from 1 up to 9.
I think this should work better:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import time
from selenium.webdriver.common.action_chains import ActionChains
options = Options()
PATH = 'C://Program Files (x86)//chromedriver.exe'
driver = webdriver.Chrome(PATH)
url = 'https://www.newegg.com/p/pl?d=RTX+3080'
actions = ActionChains(driver)
driver.maximize_window()
driver.get(url)
card_path = '/html/body/div[8]/div[3]/section/div/div/div[1]/div/dl[1]/dd/ul[2]/li/a'
desktop_graphics_cards = driver.find_element(By.XPATH, card_path)
desktop_graphics_cards.click()
time.sleep(5)
graphics_card = []
shipping_cost = []
price = []
total_cost = []
for item in range(2, 10):
    try:
        next_page_click = driver.find_element(By.XPATH, f"//button[text() = '{item}']")
        actions.move_to_element(next_page_click).perform()
        time.sleep(2)
        #print(next_page_click) - printing a web element itself will not give you usable information
        next_page_click.click()
        #let the next page load, it takes some time
        time.sleep(5)
    except:
        pass
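The fixed sleeps work but make the loop slow and brittle; an explicit wait for clickability is usually more reliable. A sketch with the same locator, assuming the imports already present above:
wait = WebDriverWait(driver, 10)
for item in range(2, 10):
    # wait until the pagination button is actually clickable instead of sleeping
    next_page = wait.until(EC.element_to_be_clickable((By.XPATH, f"//button[text() = '{item}']")))
    actions.move_to_element(next_page).perform()
    next_page.click()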

Selenium python getting stuck post driver.get('URL') for Internet Explorer

My Selenium script gets stuck after driver.get("url") and does not move forward; later it errors out. I am using the code below. After executing it, the script pauses for a long time. I have tried all the options in the advanced settings of the IE browser.
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
email='XXXXX'
password='XXXXX'
options = webdriver.IeOptions()
options.ignore_protected_mode_settings = True
driver = webdriver.Ie(r'C:\Program Files (x86)\selenium-3.141.0\selenium\webdriver\ie\IEdriverServer.exe')
driver.get('https://s2fs.axisbank.com/EFTClient/Account/Login.htm')
email_box = driver.find_element_by_name('username')
email_box.send_keys(email)
pass_box = driver.find_element_by_name('password')
pass_box.send_keys(password)
submit_button = driver.find_element_by_id('loginSubmit')
submit_button.click()
time.sleep(3)
File2393= driver.find_element_by_link_text('Checkbox For Item 919020028802393.csv')
File2393.click()
time.sleep(1)
File3303= driver.find_element_by_link_text('Checkbox For Item 920020034873303.csv')
File3303.click()
time.sleep(1)
download = driver.find_element_by_css_selector('.icomoon.icon-download2.toolbar-button')
download.click()
print("File is been downloaded")
You are missing a wait / delay before accessing the first element on the page.
You can simply add a sleep there, like this:
driver.get('https://s2fs.axisbank.com/EFTClient/Account/Login.htm')
time.sleep(10)
email_box = driver.find_element_by_name('username')
email_box.send_keys(email)
But it is better to use explicit waits
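For example, a minimal explicit wait for the username field (a sketch, assuming the field is located by name and the email variable, both as in the question):
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver.get('https://s2fs.axisbank.com/EFTClient/Account/Login.htm')
# wait up to 10 seconds for the field instead of sleeping a fixed time
email_box = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, 'username')))
email_box.send_keys(email)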
Well, this URL:
https://s2fs.axisbank.com/EFTClient/Account/Login.htm
is not loading at all in my browser, but if it works for you then you can try an explicit wait, as below:
options = webdriver.IeOptions()
options.ignore_protected_mode_settings = True
driver = webdriver.Ie(r'C:\Program Files (x86)\selenium-3.141.0\selenium\webdriver\ie\IEdriverServer.exe')
driver.maximize_window()
driver.implicitly_wait(30)
driver.get("https://s2fs.axisbank.com/EFTClient/Account/Login.htm")
wait = WebDriverWait(driver, 10)
email_box = wait.until(EC.element_to_be_clickable((By.NAME, "username")))
email_box.send_keys('email')
These would be the imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
I think the site is not reachable; try to use the correct URL to access the page.
To access the element, you could use an explicit wait:
email_box = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, "element_XPATH")))
email_box.send_keys("UserName")
Imports:
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

Selenium - Google Travel Scraping Price History missing

I am getting HTML back with this Python script, but it doesn't return the price history (see screenshot). Using a non-Selenium browser does return HTML with the prices (even without expanding this section, found with a simple regex); Chrome/Safari/Firefox all do, incognito as well.
from selenium import webdriver
import time
url = 'https://www.google.com/flights?hl=en#flt=SFO.JFK.2021-06-01*JFK.SFO.2021-06-07'
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)
driver.get(url)
time.sleep(10)
html = driver.page_source
print(html)
driver.quit()
I can't quite pinpoint whether it's some setting in chromedriver. It should be possible, because there is a 3rd-party scraper that currently returns this data.
I tried this to no avail: Can a website detect when you are using Selenium with chromedriver?
Any thoughts appreciated.
After I added chrome_options.add_argument("--disable-blink-features=AutomationControlled"), I started to see this block. I'm not sure why it is not always loaded.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
import time
url = 'https://www.google.com/flights?hl=en#flt=SFO.JFK.2021-06-01*JFK.SFO.2021-06-07'
chrome_options = Options()
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
driver = webdriver.Chrome(executable_path='/snap/bin/chromium.chromedriver', chrome_options=chrome_options)
driver.get(url)
# wait = WebDriverWait(driver, 20)
# wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".EA71Tc.q7Eewe")))
time.sleep(10)
history = driver.find_element_by_css_selector(".EA71Tc.q7Eewe").get_attribute("innerHTML")
print(history)
Here the full block is returned, including all tag names. As you can see, I tried explicit waits, but this block was never reported visible, so experiment with adding another explicit wait.
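One variant worth trying is waiting for presence rather than visibility, since the block may exist in the DOM without being displayed; a sketch, assuming the same CSS selector and the imports already present above:
wait = WebDriverWait(driver, 20)
# presence_of_element_located succeeds as soon as the node is in the DOM,
# even if it is never reported visible
history_el = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".EA71Tc.q7Eewe")))
print(history_el.get_attribute("innerHTML"))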
