Whenever I try to scrape a number from a website and print it, it always returns 0, even if I add a delay to let the window load first.
Here's my code:
from selenium import webdriver
import time
url = 'https://hytrack.me/'
browser = webdriver.Chrome(r'C:\Users\kinet\OneDrive\Documents\webscraper\chromedriver.exe')
browser.get(url)
text = browser.find_element_by_xpath('//*[@id="stat_totalPlayers"]').text
time.sleep(10)
print(text)
All I need it to do is print some text that it takes from a website.
Have I done something wrong or am I just completely missing something?
You should put the delay before getting the element!
from selenium import webdriver
import time
url = 'https://hytrack.me/'
browser = webdriver.Chrome(r'C:\Users\kinet\OneDrive\Documents\webscraper\chromedriver.exe')
browser.get(url)
time.sleep(10)
text = browser.find_element_by_xpath('//*[@id="stat_totalPlayers"]').text
print(text)
That said, it's better to use an explicit wait, like this:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = 'https://hytrack.me/'
browser = webdriver.Chrome(r'C:\Users\kinet\OneDrive\Documents\webscraper\chromedriver.exe')
wait = WebDriverWait(browser, 20)
browser.get(url)
text = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="stat_totalPlayers"]'))).text
print(text)
I am trying to extract the number of YouTube comments and have tried several methods.
My code:
from selenium import webdriver
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
DRIVER_PATH = <your chromedriver path>
wd = webdriver.Chrome(executable_path=DRIVER_PATH)
url = 'https://www.youtube.com/watch?v=5qzKTbnhyhc'
wd.get(url)
wait = WebDriverWait(wd, 100)
time.sleep(40)
v_title = wd.find_element_by_xpath('//*[@id="container"]/h1/yt-formatted-string').text
print("title Is ")
print(v_title)
comments_xpath = '//h2[@id="count"]/yt-formatted-string/span[1]'
v_comm_cnt = wait.until(EC.visibility_of_element_located((By.XPATH, comments_xpath)))
#wd.find_element_by_xpath(comments_xpath)
print(len(v_comm_cnt))
I get the following error:
selenium.common.exceptions.TimeoutException: Message:
I get the correct value for the title but not for the comment count. Can anyone please tell me what is wrong with my code?
Please note that the comment-count path, //h2[@id="count"]/yt-formatted-string/span[1], points to the correct place when I search for the value with inspect element.
Updated answer
Well, it was tricky!
There are several issues here:
This page has some problematic JavaScript that makes Selenium's driver.get() call wait until the page-load timeout even though the page looks loaded. To overcome that, I used the eager page load strategy.
The page has several blocks of markup for the same areas, so sometimes one of them is used (visible) and sometimes the other, which makes element locators difficult to work with. So here I wait for visibility of the title element in one of those blocks; if it becomes visible I extract the text from there, otherwise I wait for visibility of the second element (it appears immediately) and extract the text from that one.
There are several ways to scroll a page, and not all of them worked here. I found one that works and does not scroll too far.
The code below works; I ran it several times.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.service import Service

options = Options()
options.add_argument("--start-maximized")
caps = DesiredCapabilities().CHROME
caps["pageLoadStrategy"] = "eager"
s = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(options=options, desired_capabilities=caps, service=s)
url = 'https://www.youtube.com/watch?v=5qzKTbnhyhc'
driver.get(url)
driver.maximize_window()
wait = WebDriverWait(driver, 10)
title_xpath = "//div[@class='style-scope ytd-video-primary-info-renderer']/h1"
alternative_title = "//*[@id='title']/h1"
v_title = ""
try:
    v_title = wait.until(EC.visibility_of_element_located((By.XPATH, title_xpath))).text
except:
    v_title = wait.until(EC.visibility_of_element_located((By.XPATH, alternative_title))).text
print("Title is " + v_title)
comments_xpath = "//div[@id='title']//*[@id='count']//span[1]"
driver.execute_script("window.scrollBy(0, arguments[0]);", 600)
try:
    v_comm_cnt = wait.until(EC.visibility_of_element_located((By.XPATH, comments_xpath)))
except:
    pass
v_comm_cnt = driver.find_element(By.XPATH, comments_xpath).text
print("Video has " + v_comm_cnt + " comments")
The output is:
Title is Music for when you are stressed 🍀 Chil lofi | Music to Relax, Drive, Study, Chill
Video has 834 comments
Process finished with exit code 0
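A side note on the page load strategy: on Selenium 4 you can set the eager strategy directly on the options object instead of going through DesiredCapabilities. A minimal sketch, reusing the chromedriver path from the answer above:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

# Selenium 4 exposes the page load strategy on the options object
options = Options()
options.page_load_strategy = "eager"  # get() returns once the DOM is ready
service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(options=options, service=service)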
I am trying to scrape ethplorer.io. I want to scrape many pages. My code is below, but it scrapes page 11 three times even though I use range(11, 14), and I can't understand why.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
driver = webdriver.Chrome()
base_url = "https://ethplorer.io/tr/address/0xf87e31492faf9a91b02ee0deaad50d51d56d5d4d#pageSize=100&tab=tab-holders&holders="
results = []
for page_number in range(11, 14):
    url = base_url + str(page_number)
    driver.get(url)
    data = driver.find_elements(By.CLASS_NAME, "local-link")
    for x in data:
        results.append(x.text)
driver.quit()
with open("all_data.txt", "w") as file:
    for x in results:
        file.write(x + "\n")
The root cause is that everything after the # in your URL is a fragment: a driver.get() call whose URL differs only in the fragment does not trigger a new page load, so every iteration kept reading page 11. I have applied some modifications to your code so that it works through the several pages you are calling by clicking the pagination links instead, and captures the text within the hyperlinks I assume you are targeting. Please check below:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.common.exceptions import StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(ChromeDriverManager().install())
url = "https://ethplorer.io/tr/address/0xf87e31492faf9a91b02ee0deaad50d51d56d5d4d#pageSize=100&tab=tab-holders&holders=11" # Here we are calling the first page we need, which is page #11 in this case.
xpath = "//div[#id='token-holders-tab']//div[#id='address-token-holders']//div[#class='block']//table//tr//td//a[contains(#class,'local-link')]"
driver.get(url)
data =driver.find_elements_by_xpath("//div[#id='token-holders-tab']//div[#id='address-token-holders']//div[#class='block']//table//tr//td//a[contains(#class,'local-link')]")
results=[]
for page_number in range(12,15): # Range should start from the next page (page # 12 in this case). Range end with last page you need + 2, in this case you need to scrape from 11 untill 13 ,so rage end should be 15.
for x in data: # On first round it will get page # 11 data.
results.append(x.text)
nextPage = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH,"//div[#id='token-holders-tab']//tr[contains(#class,'paginationFooter')]//A[#class='page-link'][text()='"+str(page_number)+"']")))
driver.execute_script("arguments[0].click();", nextPage)
time.sleep(3)
data =driver.find_elements_by_xpath("//div[#id='token-holders-tab']//div[#id='address-token-holders']//div[#class='block']//table//tr//td//a[contains(#class,'local-link')]")
driver.quit
with open("all_data.txt" , "w") as file:
for x in results:
file.write(x + "\n")
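Alternatively, since the page number lives only in the URL fragment, you could keep your original get()-based loop and force a real reload after each navigation. This is only a hedged sketch, assuming ethplorer.io re-reads the holders= fragment on page load (base_url and results as in your script):

for page_number in range(11, 14):
    driver.get(base_url + str(page_number))
    driver.refresh()  # a get() that changes only the fragment does not re-navigate, so reload explicitly
    time.sleep(3)
    for x in driver.find_elements(By.CLASS_NAME, "local-link"):
        results.append(x.text)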
I've been attempting to use Selenium to go through elements on SoundCloud's website and am having trouble interacting with the input tags. When I try to write into the input tag of class "headerSearch__input" with the send_keys command, I get back the error "Message: element not interactable". Can someone please explain what I'm doing wrong?
from tkinter import *
import random
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import requests
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
driver = webdriver.Chrome(executable_path='/Users/quanahbennett/PycharmProjects/SeleniumTest/chromedriver')
url= "https://soundcloud.com/"
driver.get(url)
#time.sleep(30)
wait = WebDriverWait(driver, 30)
#link = driver.find_elements_by_link_text("Sign in")
#link[0].click()
#driver.execute_script("arguments[0].click();", link[0])
#SUCCESFUL LOGIN BUTTON PUSH
#please = driver.find_element_by_css_selector('button.frontHero__loginButton')
#please.click()
attempt = driver.find_element_by_css_selector('input.headerSearch__input')
time.sleep(10)
attempt.send_keys('Hello')
breakpoint()
#driver.quit()
The locator input.headerSearch__input matches two different elements in the DOM; it's important to find unique locators. Also close the cookie pop-up first, and then try to interact with the elements.
Try it like below and confirm:
driver.get("https://soundcloud.com/")
wait = WebDriverWait(driver,30)
# Click on Accept cookies button
wait.until(EC.element_to_be_clickable((By.ID,"onetrust-accept-btn-handler"))).click()
search_field = wait.until(EC.element_to_be_clickable((By.XPATH,"//div[#id='content']//input")))
search_field.send_keys("Sample text")
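If you then want to submit the search, a short follow-up; this assumes SoundCloud navigates to the results view when ENTER is pressed in the field:

from selenium.webdriver.common.keys import Keys

search_field.send_keys(Keys.ENTER)  # assumption: the page reacts to ENTER in the search box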
My Selenium script is stuck after driver.get("url") and does not move forward; later it errors out. I am using the code below. After executing it, the script pauses for a long time. I have tried all the options in the advanced settings of the IE browser.
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
email='XXXXX'
password='XXXXX'
options = webdriver.IeOptions()
options.ignore_protected_mode_settings = True
driver = webdriver.Ie(r'C:\Program Files (x86)\selenium-3.141.0\selenium\webdriver\ie\IEdriverServer.exe')
driver.get('https://s2fs.axisbank.com/EFTClient/Account/Login.htm')
email_box = driver.find_element_by_name('username')
email_box.send_keys(email)
pass_box = driver.find_element_by_name('password')
pass_box.send_keys(password)
submit_button = driver.find_element_by_id('loginSubmit')
submit_button.click()
time.sleep(3)
File2393= driver.find_element_by_link_text('Checkbox For Item 919020028802393.csv')
File2393.click()
time.sleep(1)
File3303= driver.find_element_by_link_text('Checkbox For Item 920020034873303.csv')
File3303.click()
time.sleep(1)
download = driver.find_element_by_css_selector('.icomoon.icon-download2.toolbar-button')
download.click()
print("File has been downloaded")
You are missing a wait / delay before accessing the first element on the page.
You can simply add a sleep there, like this:
driver.get('https://s2fs.axisbank.com/EFTClient/Account/Login.htm')
time.sleep(10)
email_box = driver.find_element_by_name('username')
email_box.send_keys(email)
But it is better to use explicit waits.
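For example, a minimal sketch with WebDriverWait, using the username locator from your script:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver.get('https://s2fs.axisbank.com/EFTClient/Account/Login.htm')
# Wait up to 10 seconds for the username field instead of sleeping blindly
email_box = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.NAME, 'username'))
)
email_box.send_keys(email)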
Well, this URL:
https://s2fs.axisbank.com/EFTClient/Account/Login.htm
is not loading at all in my browser, but if it works for you, then you can try an explicit wait as below:
options = webdriver.IeOptions()
options.ignore_protected_mode_settings = True
driver = webdriver.Ie(r'C:\Program Files (x86)\selenium-3.141.0\selenium\webdriver\ie\IEdriverServer.exe', options=options)
driver.maximize_window()
driver.implicitly_wait(30)
driver.get("https://s2fs.axisbank.com/EFTClient/Account/Login.htm")
wait = WebDriverWait(driver, 10)
email_box = wait.until(EC.element_to_be_clickable((By.NAME, "username")))
email_box.send_keys('email')
These would be the imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
I think the site is not reachable; try to use the correct URL to access the page.
To access the element, you could use an explicit wait:
email_box = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, "element_XPATH")))
email_box.send_keys("UserName")
The imports:
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
This time I would like to click a button in order to load more real-time searches: I need to click the "more events" button to get each event's link, title, date, etc. Please help me.
Check Now:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
import time
url = "https://www.rsc.org/events/subject/organic"
driver = webdriver.Chrome("./chromedriver.exe")
driver.maximize_window()
driver.get(url)
details = []
while True:
    try:
        get_btn = driver.find_element_by_class_name('ev-more-results-link')
        get_btn.click()
        time.sleep(2)
    except:
        complete = driver.find_element_by_xpath('//*[@id="divInternalEvents"]/div')
        posts = complete.find_elements_by_class_name('ev-always-col')
        dts = complete.find_elements_by_tag_name('h3')
        for idx, post in enumerate(posts):
            title = post.text
            link = post.get_attribute("href")
            date = dts[idx].text
            details.append({"title": title, "link": link, "date": date})
        break
print(details)
Try this:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
import time
url = "https://www.rsc.org/events/subject/organic"
driver = webdriver.Chrome("./chromedriver.exe")
driver.maximize_window()
driver.get(url)
while True:
    try:
        get_btn = driver.find_element_by_class_name('ev-more-results-link')
        get_btn.click()
        time.sleep(2)
    except:
        print("completed")
        break
If you have any problems, feel free to ask.