I am trying to crawl the WeChat public accounts whose names include a keyword, using "http://weixin.sogou.com/".
However, I find that I must call ActionChains(driver).move_to_element(nextpage).click().perform() twice before it works and goes to the next page!
Can anyone tell me why this happens and how to fix it? Thank you!
The source code are as follow , and sorry the comments are in the Chinese .
# coding=utf-8
import time
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Search weixin.sogou.com for public accounts matching `key`, print the name,
# id and introduction of every result, and follow the "next page" link until
# it can no longer be found or clicked.
key = u"江南大学"  # keyword to search for
driver = webdriver.Chrome()
driver.get("http://weixin.sogou.com/")
assert u'搜狗微信' in driver.title
elem = driver.find_element_by_id("upquery")
elem.clear()
elem.send_keys(key)
button = driver.find_element_by_class_name("swz2")  # "search public accounts" button
button.click()
WebDriverWait(driver, 10).until(
    EC.title_contains(key)
)
count = 0  # number of accounts whose three fields were all read
while True:
    for i in range(10):
        # Result boxes are addressed by index through their element id.
        box = "//*[@id=\"sogou_vr_11002301_box_{}\"]".format(i)
        try:
            wechat_name = driver.find_element_by_xpath(box + "/div[2]/h3").text
            print(wechat_name)
            wechat_id = driver.find_element_by_xpath(box + "/div[2]/h4/span/label").text
            print(wechat_id)
            wechat_intro = driver.find_element_by_xpath(box + "/div[2]/p[1]/span[2]").text
            print(wechat_intro)
            print("*************************")
            count += 1
        except Exception:
            # Best effort: a box that is missing or incomplete is skipped.
            # Unlike a bare `except`, this still lets Ctrl-C stop the crawl.
            continue
    try:
        nextpage = driver.find_element_by_xpath("//*[@id=\"sogou_next\"]")  # "next page" link
        # The hover-and-click sequence is issued twice on purpose: a single
        # attempt was observed not to navigate to the next page.
        for _ in range(2):
            ActionChains(driver).move_to_element(nextpage).click().perform()
    except Exception as e:
        # No "next page" link (or it could not be clicked): stop paging.
        print(e)
        break
driver.quit()
print(count)
You can chain your actions, so there is no need to call perform() after each action.
# Queue the hover and the click on one chain, then run both with a single perform().
actions = ActionChains(driver)
actions.move_to_element(nextpage)
actions.click(nextpage)
actions.perform()
OR
# Queue the hover, then call perform() directly on the result of click().
actions = ActionChains(driver)
actions.move_to_element(nextpage)
actions.click(nextpage).perform()
Related
OK, this is a bit embarrassing because I asked a similar question on here some time ago. I tried the suggested solution (i.e. wait until the element is clickable), but it didn't work. So here's my code snippet.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import ElementClickInterceptedException, ElementNotInteractableException, TimeoutException, WebDriverException, NoSuchElementException
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from time import sleep
import re
import pandas as pd
def get_links(link):
    """Open *link* in Firefox and repeatedly click "Show more matches".

    The loop stops after the button has been clicked more than five times
    (a testing limit) or when the click raises
    ElementNotInteractableException. The browser is shut down on every exit
    path, including when an unexpected exception propagates.

    Args:
        link: URL of the results page to load.

    Returns:
        None.
    """
    driver = webdriver.Firefox()
    try:
        driver.get(link)
        driver.implicitly_wait(50)
        sleep(5)
        knt = 0  # number of successful clicks so far
        while True:
            try:
                # NOTE(review): contains() is given a single argument here,
                # which is not a valid XPath call. It is left unchanged because
                # correcting it changes what this wait does - confirm intent.
                WebDriverWait(driver, 50).until(EC.invisibility_of_element((By.XPATH, "//a[contains(class='ot-buttons-fw')]")))
                WebDriverWait(driver, 50).until(EC.element_to_be_clickable((By.XPATH, "//a[@class='event__more event__more--static']")))
                driver.find_element_by_xpath("//*[contains(text(), 'Show more matches')]").click()
            except ElementNotInteractableException:
                print("Error!")
                break
            print("works here!")
            print("clicked....................................")
            sleep(5)
            #tmp = driver.find_elements_by_xpath("//span[contains(text(), 'NBA - Pre-season')]")
            #if len(tmp) > 0:
            #print("Found them!")
            #_flag = False
            # Count the click so that the testing limit below can trigger.
            knt += 1
            if knt > 5:  # For testing
                print("Nuff clicked")
                break
    finally:
        # Always release the browser, even when an uncaught exception escapes.
        driver.quit()
    return None
# Results page to load; the value returned by get_links is discarded.
link = "https://www.flashscore.com/basketball/usa/nba/results/"
_ = get_links(link)
For some reason I keep getting an ElementClickInterceptedException at the driver.find_element_by_xpath("//*[contains(text(), 'Show more matches')]").click() line. Any help would be appreciated.
Your element is overlapped by another element, which causes the ElementClickInterceptedException.
Before running your code, please close the cookies popup with this code snippet:
def get_links(link):
    """Open *link* in Firefox and dismiss the cookie popup before the click loop."""
    driver = webdriver.Firefox()
    driver.get(link)
    driver.implicitly_wait(50)
    sleep(5)
    # here, close popup
    # Look the accept button up once and reuse the result instead of
    # searching for it a second time just to click it.
    accept_buttons = driver.find_elements_by_id('onetrust-accept-btn-handler')
    if accept_buttons:
        accept_buttons[0].click()
    _flag = True
    knt = 0
    while _flag:
        ...  # loop body as in the question
    ...  # rest of the function as in the question
And remove this line:
WebDriverWait(driver, 50).until(EC.invisibility_of_element((By.XPATH, "//a[contains(class='ot-buttons-fw')]")))
This is an invalid XPath expression, and it is not needed: the popup is already handled by the if (accept popup) condition above.
I'm working on scraping the memes and all their comments from 9gag.
I used the code below, but I am only getting a few extra comments.
# Hover over the "load more comments" button and click it.
actions = ActionChains(driver)
link = driver.find_element(By.XPATH, "//button[@class='comment-list__load-more']")
actions.move_to_element(link).click(on_element=link).perform()
I would also like to access the subcomments under a comment by simulating a click on "view more replies".
From the HTML I found that the element located by element = driver.find_element(By.XPATH, "//div[@class='vue-recycle-scroller ready page-mode direction-vertical']") holds the comments section, but I'm not sure how to iterate through each comment in this element and simulate these clicks.
This code should work directly, provided the necessary libraries are present, in case you want to test it.
Please help me with these following tasks:
Getting all the comments from view all comments
Iterating through each comment section and clicking on view more replies to get all the subcomments
My Code
import time
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import undetected_chromedriver as uc
if __name__ == '__main__':
    # Load one 9gag post, scroll down in 200px steps so lazy content renders,
    # then read the post metadata and print the text of the comment scroller.
    options = Options()
    # options.headless = True
    options.add_argument("start-maximized")  # ensure window is full-screen
    driver = uc.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.get("https://9gag.com/gag/a5EAv9O")
        prev_h = 0
        for _ in range(10):
            height = driver.execute_script("""
                    function getActualHeight() {
                        return Math.max(
                            Math.max(document.body.scrollHeight, document.documentElement.scrollHeight),
                            Math.max(document.body.offsetHeight, document.documentElement.offsetHeight),
                            Math.max(document.body.clientHeight, document.documentElement.clientHeight)
                        );
                    }
                    return getActualHeight();
                """)
            prev_h += 200
            # window.scrollTo takes (x, y): keep x at 0 and advance only the
            # vertical offset.
            driver.execute_script(f"window.scrollTo(0, {prev_h})")
            time.sleep(1)
            if prev_h >= height:
                break
        time.sleep(5)
        # Drops the last 7 characters of the title (presumably a site suffix).
        title = driver.title[:-7]
        try:
            # Words 0 and 3 of og:description are read as the upvote and
            # comment counts; thousands separators are removed before int().
            description = driver.find_element(
                By.XPATH, "//meta[@property='og:description']"
            ).get_attribute("content").split(' ')
            upvotes_count = int(description[0].replace(',', ''))
            comments_count = int(description[3].replace(',', ''))
            date_posted = driver.find_element(By.XPATH, "//p[@class='message']")
            date_posted = date_posted.text.split("·")[1].strip()
            element = driver.find_element(By.XPATH,
                                          "//div[@class='vue-recycle-scroller ready page-mode direction-vertical']")
            print(element.text)
        except Exception as err:
            # `except NoSuchElementException or Exception` only ever caught
            # NoSuchElementException; catch both, as the author wrote.
            print(err)
    finally:
        # Release the browser whether or not scraping succeeded.
        driver.quit()
Output
Edit:
I managed to make the code work better. It scrolls through the page until it sees all the comments. It also clicks on view more replies if there are subcomments.
But it's only able to read comments from the middle to the end. Maybe, as the page is scrolled down, the initial comments are hidden dynamically; I do not know how to overcome this. Also, clicking on "view more replies" stops after some clicks and throws the error
selenium.common.exceptions.MoveTargetOutOfBoundsException: Message: move target out of bounds
Here's the updated code
import driver as driver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import undetected_chromedriver as uc
def scroll_page(scrl_hgt, max_steps=10):
    """Scroll the page down in fixed steps, pausing one second after each.

    Uses the module-level ``driver``. Scrolling stops once the accumulated
    offset reaches the document height reported by the page, or after
    *max_steps* steps, whichever comes first.

    Args:
        scrl_hgt: Number of pixels to advance per step.
        max_steps: Upper bound on the number of steps. Defaults to 10, the
            limit that was previously hard-coded.
    """
    prev_h = 0
    for _ in range(max_steps):
        # Re-read the height every step: it can grow as content loads.
        height = driver.execute_script("""
                function getActualHeight() {
                    return Math.max(
                        Math.max(document.body.scrollHeight, document.documentElement.scrollHeight),
                        Math.max(document.body.offsetHeight, document.documentElement.offsetHeight),
                        Math.max(document.body.clientHeight, document.documentElement.clientHeight)
                    );
                }
                return getActualHeight();
            """)
        prev_h += scrl_hgt
        # window.scrollTo takes (x, y): keep x at 0 so that only the vertical
        # position changes.
        driver.execute_script(f"window.scrollTo(0, {prev_h})")
        time.sleep(1)
        if prev_h >= height:
            break
def _hover_and_click(element):
    """Scroll *element* into view, then hover over it and click it.

    Uses the module-level ``driver``. A new ActionChains is built for every
    click so that actions queued for an earlier element are not replayed, and
    the element is centred in the viewport first because a pointer move to an
    off-screen target is a likely cause of MoveTargetOutOfBoundsException.
    """
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
    ActionChains(driver).move_to_element(element).click(on_element=element).perform()


if __name__ == '__main__':
    options = Options()
    # options.headless = True
    driver = uc.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.maximize_window()
        driver.get("https://9gag.com/gag/a5EAv9O")
        time.sleep(5)
        # click on I accept cookies
        consent_button = driver.find_element(By.XPATH, '//*[@id="qc-cmp2-ui"]/div[2]/div/button[2]')
        _hover_and_click(consent_button)
        scroll_page(150)
        time.sleep(2)
        # click on the fresh comments section
        fresh_comments = driver.find_element(By.XPATH, '//*[@id="page"]/div[1]/section[2]/section/header/div/button[2]')
        _hover_and_click(fresh_comments)
        time.sleep(5)
        # getting meta data
        # Drops the last 7 characters of the title (presumably a site suffix).
        title = driver.title[:-7]
        # Words 0 and 3 of og:description are read as the upvote and comment
        # counts; thousands separators are removed before int().
        description = driver.find_element(By.XPATH, "//meta[@property='og:description']").get_attribute("content").split(' ')
        upvotes_count = int(description[0].replace(',', ''))
        comments_count = int(description[3].replace(',', ''))
        date_posted = driver.find_element(By.XPATH, "//p[@class='message']")
        date_posted = date_posted.text.split("·")[1].strip()
        time.sleep(3)
        # click on the load more comments button to load all the comments
        load_more_comments = driver.find_element(By.XPATH, "//button[@class='comment-list__load-more']")
        _hover_and_click(load_more_comments)
        scroll_page(500)
        print([my_elem.text for my_elem in driver.find_elements(By.CSS_SELECTOR, "div.comment-list-item__text")])
        comments = driver.find_elements(By.CSS_SELECTOR, "div.vue-recycle-scroller__item-view")
        for item in comments:
            html = item.get_attribute("innerHTML")
            if "comment-list-item__text" in html:
                print(item.find_element(By.CSS_SELECTOR, "div.comment-list-item__text").text)
            elif "comment-list-item__deleted-text" in html:
                print(item.find_element(By.CSS_SELECTOR, "div.comment-list-item__deleted-text").text)
            # get sub comments
            if "comment-list-item__replies" in html:
                sub_comments = item.find_element(By.CSS_SELECTOR, "div.comment-list-item__replies")
                _hover_and_click(sub_comments)
                time.sleep(2)
    finally:
        # Release the browser even when a lookup or click fails midway.
        driver.quit()
PS: My goal is to get every single comment and all of its subcomments (whether they are text, image, gif, etc.) in the order they appear, and to save them somewhere so that I am able to recreate the comments section again.
To extract and print the comment texts, you need to induce WebDriverWait for element_to_be_clickable() on the "load more comments" button, click it, and then collect the comment elements. You can use the following locator strategy:
# Open the post, wait up to 20 seconds for the "load more comments" button to
# become clickable, click it, then print the text of every comment element.
driver.get("https://9gag.com/gag/a5EAv9O")
load_more_button = WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, "button.comment-list__load-more"))
)
load_more_button.click()
comment_nodes = driver.find_elements(By.CSS_SELECTOR, "div.comment-list-item__text")
print([node.text for node in comment_nodes])
Console Output:
['Man, the battle of the cults is getting interesting now.', 'rent free in your head', 'Sorry saving all my money up for the Joe Biden Depends Multipack and the Karmella knee pads.', "It's basically a cult now.", "I'll take one. I'm not even American", '', 'that eagle looks familiar.', "Who doesn't want a trump card?"]
Note : You have to add the following imports :
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
I'm working on a project that can scrape comments off posts on instagram and write them into an excel file.
Here's my code:
from selenium.webdriver.common.by import By
from selenium import webdriver
import time
import sys
import pandas as pd
from pandas import ExcelWriter
import os.path
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
# Log in to Instagram, visit each post in `url`, try to expand its comments,
# collect (name, comment) pairs and print them as a DataFrame.
url = [
    "https://www.instagram.com/p/CcVTqRtJ2gj/",
    "https://www.instagram.com/p/CcXpLHepve-/",
]
user_names = []
user_comments = []
# Raw string: the backslash in the Windows path is a literal character.
driver = webdriver.Chrome(r"C:\chromedriver.exe")
try:
    driver.get(url[0])
    time.sleep(3)
    # Fill in and submit the login form.
    username = WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='username']")))
    password = WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='password']")))
    username.clear()
    username.send_keys("username")
    password.clear()
    password.send_keys("pwd")
    WebDriverWait(driver, 2).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']"))
    ).click()
    time.sleep(4)
    # Dismiss the "Not Now" prompt shown after logging in.
    WebDriverWait(driver, 30).until(
        EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Not Now")]'))
    ).click()
    for n in url:
        try:
            driver.get(n)
            time.sleep(3)
            # NOTE(review): `class='wpO6b '` tests a child element named
            # <class>, not the class attribute. `@class` was probably
            # intended, but several buttons share that class - left
            # unchanged; confirm the right locator before relying on it.
            load_more_comment = driver.find_element_by_xpath("//button[class='wpO6b ']")
            print("Found {}".format(str(load_more_comment)))
            i = 0
            # Click "load more" at most 10 times while the button is shown.
            while load_more_comment.is_displayed() and i < 10:
                load_more_comment.click()
                time.sleep(1.5)
                load_more_comment = driver.find_element_by_xpath(
                    "//button[class='wpO6b ']"
                )
                print("Found {}".format(str(load_more_comment)))
                i += 1
        except Exception as e:
            # Best effort: if expanding comments fails, scrape what is shown.
            print(e)
        # Collect this post's entries separately so the first one can be
        # dropped per post (presumably it is the post caption, not a comment).
        post_names = []
        post_comments = []
        for c in driver.find_elements_by_class_name("gElp9 "):
            container = c.find_element_by_class_name("C4VMK")
            name = container.find_element_by_class_name("_6lAjh ").text
            content = container.find_element_by_class_name("MOdxS ").text
            content = content.replace("\n", " ").strip()
            post_names.append(name)
            post_comments.append(content)
            print(content)
        # Slicing, unlike pop(0), is safe when a post produced no entries and
        # never removes an entry that belongs to an earlier post.
        user_names.extend(post_names[1:])
        user_comments.extend(post_comments[1:])
    # export(user_names, user_comments)
finally:
    # Release the browser even when login or scraping raised.
    driver.quit()
df = pd.DataFrame(list(zip(user_names, user_comments)), columns=["Name", "Comments"])
# df.to_excel("Anime Content Engagement.xlsx")
print(df)
The load-more-comments part doesn't seem to work.
Since there is more than one button with the same class name, I'm not able to choose the right button to click on. I'm a beginner, so if anyone has a solution for this it would be great.
You can select the button by its aria-label text:
driver.find_element_by_css_selector("svg._8-yf5[aria-label='TEXT']")
I believe the label text changes according to the Instagram language setting, so use whatever text appears on your page.
I've created a script to scrape different collection names from a webpage, traversing multiple pages. The script can parse only the first 13 names out of 100 on each page. One such collection name looks like Pudgy Penguins. How can I capture all 100 names instead of only 13 from the different pages of that site using selenium?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
link = "https://opensea.io/rankings"
def scroll_to_the_bottom():
    """Keep jumping to the bottom of the page until its height stops growing.

    Uses the module-level ``driver``. After each jump, waits up to 5 seconds
    for ``document.body.scrollHeight`` to exceed the last recorded height;
    a timeout ends the loop.
    """
    height_script = "return document.body.scrollHeight;"
    known_height = driver.execute_script(height_script)
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            WebDriverWait(driver, 5).until(
                lambda browser: browser.execute_script(height_script) > known_height
            )
        except TimeoutException:
            # The page did not grow within the timeout: nothing more to load.
            return
        known_height = driver.execute_script(height_script)
def get_collection_names(link):
    """Yield the collection name of every visible ranking row, page by page.

    Uses the module-level ``driver``. For each page the function scrolls to
    the bottom, yields the name found in each visible row, then clicks the
    forward-arrow button and waits for the last processed row to go stale.
    The generator ends when that button step raises.

    Args:
        link: URL of the rankings page to open.

    Yields:
        The text of each row's collection-name element.
    """
    driver.get(link)
    while True:
        scroll_to_the_bottom()
        for item in WebDriverWait(driver,10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR,"[role='listitem'] [class$='Ranking--row']"))):
            collection_name = WebDriverWait(item,10).until(EC.visibility_of_element_located((By.CSS_SELECTOR,"[class$='Ranking--collection-name']"))).text
            yield collection_name
        try:
            # `@class` selects the class attribute of the button.
            button = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.XPATH,"//button[contains(@class,'Buttonreact__StyledButton')][./i[contains(.,'arrow_forward')]]")))
            # Click through JavaScript rather than a native click.
            driver.execute_script('arguments[0].click();',button)
            # A stale row signals that the next page replaced the list.
            WebDriverWait(driver,10).until(EC.staleness_of(item))
        except Exception:
            # Best effort: any failure to advance is treated as the last page.
            return
if __name__ == '__main__':
    # `driver` is module-level because the helper functions read it as a global.
    driver = webdriver.Chrome()
    try:
        for collection_name in get_collection_names(link):
            print(collection_name)
    finally:
        # Shut the browser down even if scraping raises.
        driver.quit()
Scrolling to the bottom of every page seems not to have any effect on the number of results the script produces.
I have checked your description and source code, and I think the page has so many elements that it does not load them all at once. To solve this, scroll down to the bottom step by step. I have changed the function scroll_to_the_bottom as below:
def scroll_to_the_bottom():
    """Scroll down in 300px steps, pausing one second after each step.

    Uses the module-level ``driver``. The page height is read once up front,
    and stepping stops as soon as the next offset would reach or pass it.
    """
    page_height = driver.execute_script('return document.body.scrollHeight;')
    offset = 300
    while offset < page_height:
        driver.execute_script("window.scrollTo({}, {});".format(0, offset))
        time.sleep(1)
        offset += 300
Embedding the above code into yours, the script changes as below:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
link = "https://opensea.io/rankings"
def get_collection_names(link):
    """Yield each distinct collection name from every page of the rankings.

    Uses the module-level ``driver``. Each page is scrolled in 300px steps,
    and the names present in the DOM are harvested before every step and once
    more after the last one, so rows rendered by the final scroll are
    included. The forward-arrow button is then clicked; the generator ends
    when that step raises.

    Args:
        link: URL of the rankings page to open.

    Yields:
        Collection names, each one at most once across all pages.
    """
    name_locator = (By.CSS_SELECTOR, "[class$='Ranking--collection-name']")
    unique_items = set()

    def unseen_names():
        # Yield the names currently in the DOM that have not been seen yet.
        for element in WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located(name_locator)):
            name = element.text  # read once: each access is a browser round trip
            if name not in unique_items:
                unique_items.add(name)
                yield name

    driver.get(link)
    while True:
        # Kept as the staleness probe for detecting the page change below.
        item = WebDriverWait(driver, 10).until(EC.presence_of_element_located(name_locator))
        H = driver.execute_script('return document.body.scrollHeight;')
        h = 0
        while True:
            h += 300
            if h >= H:
                break
            yield from unseen_names()
            driver.execute_script("window.scrollTo(0, {});".format(h))
            time.sleep(1)
        # Harvest the rows that only appeared after the final scroll step.
        yield from unseen_names()
        try:
            # `@class` selects the class attribute of the button.
            button = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//button[contains(@class,'Buttonreact__StyledButton')][./i[contains(.,'arrow_forward')]]")))
            driver.execute_script('arguments[0].click();', button)
            WebDriverWait(driver, 10).until(EC.staleness_of(item))
        except Exception:
            # Best effort: any failure to advance is treated as the last page.
            return
if __name__ == '__main__':
    # `driver` is module-level because get_collection_names reads it as a global.
    driver = webdriver.Chrome()
    try:
        for item in get_collection_names(link):
            print(item)
    finally:
        # Quit in `finally` so the browser is released even when scraping raises.
        driver.quit()
Hope to be helpful for you. Thanks.
I am trying to scrape data from here but I am getting an error.
I have taken the code from here: Scraping using Selenium and python
This code was working perfectly fine, but now I am getting this error:
wait.until(EC.visibility_of_element_located((By.LINK_TEXT, "All Boards")))
raise TimeoutException(message, screen, stacktrace)
After clicking on pe-funnel link , you can try with this code :
# Wait until a <text> node with text-anchor="end" is visible, then print how
# many such nodes there are, followed by the text of each one.
labels_xpath = "//*[name()='text' and @text-anchor='end']"
wait.until(EC.visibility_of_element_located((By.XPATH, labels_xpath)))
all_data = driver.find_elements_by_xpath(labels_xpath)
print(len(all_data))
for data in all_data:
    print(data.text)
UPDATE1 :
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.select import Select
import time
# Log in to the CleverTap dashboard, open the "pe-funnel" board and, for each
# Highcharts chart on it, open the chart's analysis view, print the values of
# its outlined text spans, then navigate back to the board.
driver = webdriver.Chrome(executable_path = r'D:/Automation/chromedriver.exe')
driver.maximize_window()
driver.get("https://eu1.dashboard.clevertap.com/login.html")
wait = WebDriverWait(driver, 20)
action = ActionChains(driver)
driver.switch_to.default_content()
wait.until(EC.element_to_be_clickable((By.NAME, "email"))).send_keys("abhishe***")
wait.until(EC.element_to_be_clickable((By.NAME,"password"))).send_keys("***")
wait.until(EC.element_to_be_clickable((By.ID,"submitBtn"))).click()
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.ct-breadcrumb')))
driver.switch_to.default_content()
# Hover over the first sidebar entry before looking for "All Boards".
action.move_to_element(driver.find_element_by_css_selector("div.sidebar__brand+ul>li:first-child>a")).perform()
wait.until(EC.visibility_of_element_located((By.LINK_TEXT, "All Boards")))
wait.until(EC.element_to_be_clickable((By.LINK_TEXT,"All Boards"))).click()
wait.until(EC.element_to_be_clickable((By.LINK_TEXT,"pe-funnel"))).click()
time.sleep(1)
driver.execute_script("window.scrollTo(0,100)")
# `@class` selects the class attribute of the SVG root element.
charts_xpath = "//*[name()='svg' and @class='highcharts-root']"
values_xpath = "//*[name()='tspan' and @class='highcharts-text-outline']"
wait.until(EC.presence_of_all_elements_located((By.XPATH, charts_xpath)))
all_charts = driver.find_elements_by_xpath(charts_xpath)
length_of_list = len(all_charts)
print(length_of_list)
i=0
while(i<len(all_charts)):
    # Re-locate the charts on every pass: the page is reloaded at the end of
    # each iteration, so the previously found elements are not reused.
    wait.until(EC.presence_of_all_elements_located((By.XPATH, charts_xpath)))
    all_charts = driver.find_elements_by_xpath(charts_xpath)
    all_charts[i].click()
    i=i+1
    try:
        print("Switch to frame")
        wait.until(EC.frame_to_be_available_and_switch_to_it((By.ID,"wiz-iframe-intent")))
        print("Switched to frame")
        wait.until(EC.element_to_be_clickable((By.XPATH, "//button[text()='OK' and @class='annoucement-popup__btn']"))).click()
        driver.switch_to.default_content()
        print("Clicked on Ok button")
    except Exception:
        # Best effort: the popup handling is allowed to fail.
        print("in catch block")
        # If the frame switch succeeded but the OK click failed, the driver is
        # still inside the iframe; return to the top-level document so the
        # lookups below run against the main page.
        driver.switch_to.default_content()
    print("last of CATCH BLOCK")
    driver.execute_script("window.scrollTo(0,1100)")
    ActionChains(driver).move_to_element(driver.find_element_by_css_selector("input[data-introp='View your analysis']")).click().perform()
    #wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,""))).click()
    #ActionChains(driver).move_to_element(driver.find_element_by_css_selector("label[for='funnelProgressionPercent']")).send_keys(Keys.END).perform()
    wait.until(EC.presence_of_all_elements_located((By.XPATH, values_xpath)))
    all_values = driver.find_elements_by_xpath(values_xpath)
    for values in all_values:
        print(values.text)
    # Go back to the board and reload it for the next chart.
    driver.execute_script("window.history.go(-1)")
    driver.refresh()