Extracting user comments from a news website - Python

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait(dr, x):
    element = WebDriverWait(dr, 50).until(
        EC.presence_of_all_elements_located((By.XPATH, x))
    )
    return element

browser = webdriver.Firefox()
browser.get("http://www.dinamalar.com/user_comments.asp? uid=14701&name=%E0%AE%A4%E0%AE%AE%E0%AE%BF%E0%AE%B4%E0%AF%8D%E0%AE%9A%E0%AF%86%E0%AE%B2%E0%AF%8D%E0%AE%B5%E0%AE%A9%E0%AF%8D")

for elem in wait(browser, '//*[@id="commsec"]/div[2]/div[1]'):
    print(elem.text)
This is the link from which I need to extract all the comments: http://www.dinamalar.com/user_comments.asp?uid=14701&name=%E0%AE%A4%E0%AE%AE%E0%AE%BF%E0%AE%B4%E0%AF%8D%E0%AE%9A%E0%AF%86%E0%AE%B2%E0%AF%8D%E0%AE%B5%E0%AE%A9%E0%AF%8D
But my code extracts only the first 10 comments. After clicking the button, the next 10 comments are loaded dynamically. How can I extract all of these comments using Python and Selenium?

The idea would be to check how many "more ideas" elements are present on the page. Every time you click the button and more comments load, one more "more ideas" red button becomes present. Implementation:
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver

browser = webdriver.Firefox()
wait = WebDriverWait(browser, 10)
browser.get("http://www.dinamalar.com/user_comments.asp?uid=14701&name=%E0%AE%A4%E0%AE%AE%E0%AE%BF%E0%AE%B4%E0%AF%8D%E0%AE%9A%E0%AF%86%E0%AE%B2%E0%AF%8D%E0%AE%B5%E0%AE%A9%E0%AF%8D")

# initial wait for the page to load
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".morered")))

pages = 1
while True:
    browser.find_elements_by_css_selector(".morered")[-1].click()

    # wait for more "load more" buttons to be present
    try:
        wait.until(lambda browser: len(browser.find_elements_by_css_selector(".morered")) > pages)
    except TimeoutException:
        break  # no more data loaded, exit the loop

    print("Comments loaded: %d" % len(browser.find_elements_by_css_selector(".dateg")))
    pages += 1

browser.close()
Note that I've also removed that extra space inside the URL.
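Side note: the find_elements_by_css_selector helpers used above were removed in Selenium 4, so on a current Selenium the same loop needs find_elements(By.CSS_SELECTOR, ...) instead. A minimal sketch of just the affected lookups:

# Selenium 4 equivalents of the element lookups above
browser.find_elements(By.CSS_SELECTOR, ".morered")[-1].click()
wait.until(lambda browser: len(browser.find_elements(By.CSS_SELECTOR, ".morered")) > pages)
print("Comments loaded: %d" % len(browser.find_elements(By.CSS_SELECTOR, ".dateg")))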

Related

Python & Selenium: Scroll down to avoid overlapping advertising and click button

I am trying to loop through pages, but I have overlapping advertising covering my page-number buttons.
This is what I have in my browser; the page buttons are behind this "DocuSign" advertisement:
So I tried to scroll down to be able to click on the next page, but it doesn't work.
I would like to have this, so I can click on the next pages:
I tried this:
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

driver = webdriver.Chrome(executable_path="/Users/name/Downloads/chromedriver 4")

url = 'http://www.legorafi.fr/category/france/politique'
driver.get(url)

# accept the cookie-consent dialog inside its iframe
WebDriverWait(driver, 20).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "div#appconsent>iframe")))
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.button--filled>span.baseText"))).click()
# switch back out of the consent iframe before touching the page
driver.switch_to.default_content()

page_number = 1
while True:
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(3)
    try:
        link = driver.find_element_by_xpath('//*[@id="main"]/div[5]/div/a[1]')
    except NoSuchElementException:
        break
    link.click()
    print(driver.current_url)
    page_number += 1
You can try finding the next-page button more appropriately instead, for example by its text:
driver.find_elements_by_xpath("//*[contains(text(), 'Next Page')]")
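If the advertisement still intercepts the click even after scrolling, one workaround (a sketch, not tested against this site) is to scroll the located element into view and click it through JavaScript, which bypasses the overlay:

# scroll the pager link into view, then click it via JavaScript so the
# overlaying ad cannot intercept the click
next_page = driver.find_elements_by_xpath("//*[contains(text(), 'Next Page')]")[0]
driver.execute_script("arguments[0].scrollIntoView();", next_page)
driver.execute_script("arguments[0].click();", next_page)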

Button to clipboard in Selenium

I am looking to scrape hashtags generated by a site using Selenium webdriver. Since the site uses Shadow Content (User Agent), I decided to just copy the hashtags using the button already on the site that copies them to my clipboard. However, I am failing to locate the <button>
This is the HTML
<button type="button" id="copyBtn" data-clipboard-target="#hashtag_textarea" class="btn btn-success">Copy to clipboard</button>
How is it that Selenium can't find the button? What am I doing wrong?
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait as wait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome('/Users/user/Documents/docs/chromedriver')
browser.get('https://seekmetrics.com/hashtag-generator')
delay = 15
print("Headless Chrome Initialized")
print("\n")

try:
    element = wait(browser, delay).until(EC.element_to_be_clickable((By.CLASS_NAME, 'el-input__inner')))
    element.click()
    element.send_keys('love')
    element.send_keys(Keys.ENTER)
    wait(browser, delay)
    browser.find_element_by_id('copyBtn').click()
    print('Page is ready!')
    # print(hashtags.text)
    # print(browser.page_source)
except TimeoutException:
    print("Loading took too much time!")
    browser.quit()
You don't need to click the button; just get the textarea's value. But you do need to wait until the textarea is located:
element.send_keys('love')
element.send_keys(Keys.ENTER)
# wait until hashtags generated
hashtags = wait(browser, delay).until(EC.presence_of_element_located((By.ID, 'hashtag_textarea')))
print(hashtags.get_attribute('value'))
print('Page is ready!')
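If you do want to go through the clipboard as originally intended, a third-party package such as pyperclip can read the clipboard back after the copy button is clicked. A sketch, assuming pyperclip is installed (pip install pyperclip) and the browser runs in the same desktop session:

import pyperclip

# click the site's copy button, then read the system clipboard back
browser.find_element_by_id('copyBtn').click()
print(pyperclip.paste())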
After you input a value in the textbox, the page refreshes, and that takes some time. During that time your code tries to click the button, which is not yet clickable (or not yet loaded in the DOM). Instead of doing that, wait for the button until it becomes clickable. Check the following code sample:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait as wait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome(r'C:\Python27\Scripts\chromedriver')
browser.get('https://seekmetrics.com/hashtag-generator')
delay = 15
print("Headless Chrome Initialized")
print("\n")

try:
    element = wait(browser, delay).until(EC.element_to_be_clickable((By.CLASS_NAME, 'el-input__inner')))
    element.click()
    element.send_keys('love')
    element.send_keys(Keys.ENTER)
    # wait for the copy button to become clickable before clicking it
    button = wait(browser, delay).until(EC.element_to_be_clickable((By.XPATH, "//button[text()='Copy to clipboard']")))
    button.click()
    print('Page is ready!')
    # print(hashtags.text)
    # print(browser.page_source)
except TimeoutException:
    print("Loading took too much time!")
    browser.quit()
Hope this helps you.
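As an alternative to matching on the button text, the HTML in the question also lets you target the button by its id or its data-clipboard-target attribute, which survives a label change. For example:

# locate the copy button by the attribute shown in the question's HTML
button = wait(browser, delay).until(EC.element_to_be_clickable(
    (By.CSS_SELECTOR, "button[data-clipboard-target='#hashtag_textarea']")))
button.click()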

Script fails to keep clicking on load more button

I've written a script in Python in association with selenium to keep clicking on the MORE button to load more items until there are no new items left to load from a webpage. However, my script below clicks only once on the MORE button available at the bottom of the page.
Link to that site
This is my try so far:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

link = "https://angel.co/companies?company_types[]=Startup&company_types[]=Private+Company&company_types[]=Mobile+App&locations[]=1688-United+States"

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get(link)

while True:
    for elems in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".results .name a.startup-link"))):
        print(elems.get_attribute("href"))
    try:
        loadmore = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "[class='more']")))
        driver.execute_script("arguments[0].scrollIntoView();", loadmore)
        loadmore.click()
    except Exception:
        break

driver.quit()
How can I keep clicking that MORE button until there is no such button left to click, and parse the links as I've already tried with the for loop?
I've managed to solve the problem by pursuing sir Andersson's logic within my existing script. This is what the modified script looks like:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

link = "https://angel.co/companies?company_types[]=Startup&company_types[]=Private+Company&company_types[]=Mobile+App&locations[]=1688-United+States"

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get(link)

while True:
    try:
        loadmore = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "[class='more']")))
        driver.execute_script("arguments[0].click();", loadmore)
        wait.until(EC.staleness_of(loadmore))
    except Exception:
        break

for elems in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".results .name a.startup-link"))):
    print(elems.get_attribute("href"))

driver.quit()
Why not just?

while (driver.FindElements(By.ClassName("more")).Count > 0)
{
    driver.FindElement(By.ClassName("more")).Click();
    // some delay to wait for the lazy load to complete
}

This is a C# example, but pretty sure it can be done with Python as well.
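It can; a rough Python translation of that loop, with a fixed sleep standing in for the lazy-load wait, would be:

import time

# keep clicking the MORE button while one is still present on the page
while len(driver.find_elements_by_class_name("more")) > 0:
    driver.find_element_by_class_name("more").click()
    time.sleep(2)  # some delay to wait for the lazy load to complete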

Facing issues while clicking on some links in a webpage

I've written a script in Python to click on some categories in a webpage. I could manage to click on the first two categories but got stuck when it came to initiating the final click. I've given links leading to two images in which I have marked where to click.
This is the first link, where there is a sign (marked with a pencil) to click on to enter the second portion.
This is the second link, where I get stuck when I try to click on the names (I've marked those names with a pencil).
This is the site link.
The script I've tried with so far:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get("replace_with_above_link")

wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "i4ewOd-pzNkMb-ornU0b-b0t70b-Bz112c"))).click()
post = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div[role='checkbox']")))[1]
post.click()

for item in wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".HzV7m-pbTTYe-JNdkSc .suEOdc"))):
    item.click()

driver.quit()
My intention is to click the names cyclically. Thanks in advance.
Try the code below to click each item in the list:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get(URL)

wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "i4ewOd-pzNkMb-ornU0b-b0t70b-Bz112c"))).click()
post = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div[role='checkbox']")))[1]
post.click()

# skip the first entry, click each name, close the dialog that opens,
# and wait for it to disappear before the next click
for item in wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".HzV7m-pbTTYe-JNdkSc .suEOdc")))[1:]:
    item.click()
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".HzV7m-tJHJj-LgbsSe-Bz112c.qqvbed-a4fUwd-LgbsSe-Bz112c"))).click()
    wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, ".qqvbed-p83tee")))

driver.quit()
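One caveat with iterating over a pre-fetched list: opening and closing the dialog can re-render the page and leave those references stale. If you hit a StaleElementReferenceException, re-locating the items by index on each pass is an alternative worth trying (same selectors as above):

# re-locate the items on every pass so a page update cannot leave us
# holding stale element references
count = len(wait.until(EC.visibility_of_all_elements_located(
    (By.CSS_SELECTOR, ".HzV7m-pbTTYe-JNdkSc .suEOdc"))))
for i in range(1, count):
    items = driver.find_elements_by_css_selector(".HzV7m-pbTTYe-JNdkSc .suEOdc")
    items[i].click()
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,
        ".HzV7m-tJHJj-LgbsSe-Bz112c.qqvbed-a4fUwd-LgbsSe-Bz112c"))).click()
    wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, ".qqvbed-p83tee")))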

Selenium Python - Explicit waits not working

I am unable to get explicit waits to work while waiting for the page to render its JS, so I am forced to use time.sleep() for the code to work as intended.
I read the docs and still wasn't able to get it to work.
http://selenium-python.readthedocs.io/waits.html
The commented out section of code with the time.sleep() works as intended.
The WebDriverWait part runs but does not wait.
from selenium import webdriver
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

driver = webdriver.Chrome()
url = "https://www.target.com/"

# tells the driver to wait up to 10 seconds before timing out
# for data that will be loaded on the screen
DELAY = 10
driver.implicitly_wait(DELAY)
SLEEP_TIME = 1

# navigate to the page
driver.get(url)
time.sleep(SLEEP_TIME)

try:
    WebDriverWait(driver, DELAY).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="js-toggleLeftNav"]/img'))).click()
    WebDriverWait(driver, DELAY).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="5"]'))).click()
    # gets all the category elements
    items = WebDriverWait(driver, DELAY).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#leftNavigation > ul:nth-child(2) li")))
    """
    # opens up the side bar javascript
    driver.find_element_by_xpath('//*[@id="js-toggleLeftNav"]/img').click()
    time.sleep(SLEEP_TIME)

    # clicks on browse by category
    driver.find_element_by_xpath('//*[@id="5"]').click()
    time.sleep(SLEEP_TIME)

    # gets all the category elements
    items = driver.find_element_by_css_selector("#leftNavigation > ul:nth-child(2)").find_elements_by_tag_name("li")
    time.sleep(SLEEP_TIME)
    """

    # gets the hyperlink and category name but the first and the last,
    # since the first is back to main menu and the last is exit
    category_links = {}
    for i in range(1, len(items) - 1):
        hyperlink = items[i].find_element_by_tag_name('a').get_attribute('href')
        category_name = items[i].text
        category_links[category_name] = hyperlink

    print(category_links)
except:
    print("Timed out.")
This version successfully loads the site, waits for it to render, then opens the side menu. Notice how the wait.until method is used to successfully wait until the page is loaded. You should be able to use the pattern below with the rest of your code to achieve your goal.
CODE
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get("https://www.target.com/")

# wait until the left navigation menu is present before interacting
wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#leftNavigation > ul:nth-child(2)")))
button = driver.find_element_by_xpath('//*[@id="js-toggleLeftNavLg"]')
button.click()
time.sleep(5)
driver.quit()
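One more thing worth checking: the original script calls driver.implicitly_wait() and then layers WebDriverWait on top. The Selenium documentation warns against mixing implicit and explicit waits, because the combined timeouts behave unpredictably. Dropping the implicit wait and relying on explicit conditions only looks roughly like this:

# rely on explicit waits only; no driver.implicitly_wait() anywhere
wait = WebDriverWait(driver, 10)
button = wait.until(EC.element_to_be_clickable(
    (By.XPATH, '//*[@id="js-toggleLeftNavLg"]')))
button.click()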
