Scrapping twitter followers using selenium scroll issue

Scrapping twitter followers using selenium scroll issue - python

I have written a script which can scrape followers' usernames. But the issue is I am getting all the usernames in the first attempt but when I try to scroll the page using javascript the page keeps on going down instead of going once and scraping the ids and then going down.Although I am getting date till 34th username but later its just messed up. I am sharing the code here you can use your own username and password to check what is the issue with the code. If you copy paste this code (entering your username and password in the empty string) it will run on your PC completely fine.
import warnings
warnings.filterwarnings('ignore')
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from getpass import getpass
from time import sleep
chrome_options = Options()
driver=webdriver.Chrome(ChromeDriverManager().install(),chrome_options=chrome_options)
driver.maximize_window()
website='https://twitter.com/i/flow/login'
driver.get(website)
print('website getting')
sleep(5)
username = driver.find_element_by_xpath('//input[#name="text"]')
username.send_keys('')
print('username running')
username.send_keys(Keys.RETURN)
sleep(3)
password = driver.find_element_by_xpath('//input[#name="password"]')
print('password running')
sleep(2)
password.send_keys('')
password.send_keys(Keys.RETURN)
website='https://twitter.com/MehroozW/followers'
driver.get(website)
print('got it')
import warnings
warnings.filterwarnings('ignore')
import csv
data = []
tweet_ids = set()
last_position = driver.execute_script("return window.pageYOffset;")
scrolling = True
count = 1
i = 1
while scrolling:
for i in range(1,190):
try:
follower_username = driver.find_element_by_xpath(f'//div[#data-testid="primaryColumn"]/div[1]/section/div[1]/div[1]/div[{count}]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/a/div[1]/div[1]/span').text
print('index', count, follower_username)
data.append(follower_username)
count +=1
sleep(1)
except Exception:
pass
scroll_attempt = 0
while True:
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
sleep(2)
curr_position = driver.execute_script("return window.pageYOffset;")
print('curr_position',curr_position, last_position, scroll_attempt, scrolling)
if last_position == curr_position:
scroll_attempt += 1
# end of scroll region
if scroll_attempt >= 3:
scrolling = False
break
else:
sleep(2) # attempt another scroll
else:
last_position = curr_position
break
data

Related

~FIREFOX~ Selenium Instagram Bot is unable to locate Follow button to follow profiles despite trying By. XPATH, LINK_TEXT, CLASS_NAME etc

I am on VS Code and my Selenium Instagram Bot's intentional design is to read from a list of profiles from a .txt file, visit those profiles, follow and like a specified number of their posts(if they are private, it just follows them) then goes on to the next profile in the list, all the while using different pre-made bot accounts who's usernames are also on a list, so the code may iterate over them once a number of profiles have been engaged with by a single bot.
I am able to iterate over target profiles, but right now I am just having problems with locating elements and having them to be clicked by the bot. I got it to work on 1 profile, after going to the next profile, it simply didn't do anything and seems to can't find the follow button to click again(I can't recreate this, after some changes were made lol, just getting back into Python after briefly touching on it in school). I still haven't even seen the bot like a post too. Although, the XPATHS on the Log In and the Pop Ups seems to work. It's now just not interacting with the profiles.
~
Any insights would be highly appreciated!
Source Code:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time, random
from selenium.webdriver.common.keys import Keys
profilePath = (r'C:\Users\****\AppData\Roaming\Mozilla\Firefox\Profiles\75d4lwz2.3rd')
options = Options()
service = Service('geckodrivere.exe')
firefox = webdriver.Firefox(options=options, service=service)
wait = WebDriverWait(firefox, 20)
file = open('scrape_archivepages.txt', 'r')
data = file.read()
igUsers = data.split('\n')
file.close()
file2 = open('botlist.txt', 'r')
data2 = file2.read()
bots = data2.split('\n')
file2.close()
def startLogIn(user_, pass_,):
firefox.get('https://www.instagram.com/')
while True:
try:
cookiesAccept = firefox.find_element(By.XPATH, '/html/body/div[2]/div/div/div/div[2]/div/div/div[1]/div/div[2]/div/div/div/div/div[2]/div/button[2]')
time.sleep(4)
cookiesAccept.click()
time.sleep(4)
break
except:
pass
username = firefox.find_element(By.XPATH, '//*[#id="loginForm"]/div/div[1]/div/label/input')
password = firefox.find_element(By.XPATH, '//*[#id="loginForm"]/div/div[2]/div/label/input')
username.click()
username.send_keys(user_)
time.sleep(random.randint(1, 2))
password.click()
password.send_keys(pass_)
time.sleep(random.randint(1, 2))
log_in=firefox.find_element(By.XPATH, '//*[#id="loginForm"]/div/div[3]')
log_in.click()
time.sleep(5)
# while True:
# try:
# credentials= firefox.find_element(By.XPATH, '//button[text()="Not Now"]')
# time.sleep(3)
# credentials.click()
# break
# except:
# pass
# while True:
# try:
# notifications = firefox.find_element(By.XPATH, '//button[text()="Not Now"]')
# time.sleep(3)
# notifications.click()
# break
# except:
# pass
def interact(igUserLink, n):
firefox.get(igUserLink)
time.sleep(2)
#while True:
# try:
follow = firefox.find_element(By.CSS_SELECTOR, '#mount_0_0_0I > div > div > div > div.x9f619.x1n2onr6.x1ja2u2z > div > div > div > div.x78zum5.xdt5ytf.x10cihs4.x1t2pt76.x1n2onr6.x1ja2u2z > div.x9f619.xnz67gz.x78zum5.x168nmei.x13lgxp2.x5pf9jr.xo71vjh.x1uhb9sk.x1plvlek.xryxfnj.x1c4vz4f.x2lah0s.x1q0g3np.xqjyukv.x1qjc9v5.x1oa3qoh.x1qughib > div.xh8yej3.x1gryazu.x10o80wk.x14k21rp.x1porb0y.x17snn68.x6osk4m > section > main > div > header > section > div.x6s0dn4.x78zum5.x1q0g3np.xs83m0k.xeuugli.x1n2onr6 > div._ab8w._ab94._ab99._ab9f._ab9k._ab9p._abb3._abcm > div > div._ab8w._ab94._ab99._ab9f._ab9m._ab9o._abb0._abcm > button > div > div')
time.sleep(2)
private = firefox.find_element(By.XPATH, '/html/body/div[2]/div/div/div/div[1]/div/div/div/div[1]/section/main/div/div/article/div[1]/div/h2')
if(bool(private)):
print('lol')
follow.click()
time.sleep(2)
##xpath of header from IG saying profile is private
if (not(bool(private))):
print('here')
follow.click()
time.sleep(2)
time.sleep(2)
c = 0
numPosts = firefox.find_element(By.XPATH, '/html/body/div[2]/div/div/div/div[1]/div/div/div/div[1]/div[1]/div[2]/section/main/div/header/section/ul/li[1]/div/span/span')
numPosts = int(numPosts.text)
if n == 0:
#do nothing
if n <= numPosts:
media = firefox.find_element(By.XPATH, '/html/body/div[2]/div/div/div/div[1]/div/div/div/div[1]/div[1]/div[2]/section/main/div/div[2]/article/div/div/div[1]/div[1]')
media.click()
time.sleep(1)
like = firefox.find_element(By.NAME, 'Like')
next = firefox.find_element(By.NAME, 'Next')
while(c<n):
like.click()
time.sleep(3)
c=c+1
next.click()
#break
#except:
#pass
def VibeFinderInteract(listOfBots, passw, userLink):
for userbot in listOfBots:
startLogIn(userbot, passw)
for u in userLink:
interact(u, 2)
print('')
VibeFinderInteract(bots, 'samepasswordforallthebots', igUsers)

How do I press load more button while scraping comments on Instagram with Selenium Python

I'm working on a project that can scrape comments off posts on instagram and write them into an excel file.
Here's my code:
from selenium.webdriver.common.by import By
from selenium import webdriver
import time
import sys
import pandas as pd
from pandas import ExcelWriter
import os.path
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
url = [
"https://www.instagram.com/p/CcVTqRtJ2gj/",
"https://www.instagram.com/p/CcXpLHepve-/",
]
user_names = []
user_comments = []
driver = driver = webdriver.Chrome("C:\chromedriver.exe")
driver.get(url[0])
time.sleep(3)
username = WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='username']")))
password = WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR,"input[name='password']")))
username.clear()
username.send_keys("username")
password.clear()
password.send_keys("pwd")
Login_button = (
WebDriverWait(driver, 2)
.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']")))
.click()
)
time.sleep(4)
not_now = (
WebDriverWait(driver, 30)
.until(
EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Not Now")]'))
)
.click()
)
for n in url:
try:
driver.get(n)
time.sleep(3)
load_more_comment = driver.find_element_by_xpath("//button[class='wpO6b ']")
print("Found {}".format(str(load_more_comment)))
i = 0
while load_more_comment.is_displayed() and i < 10:
load_more_comment.click()
time.sleep(1.5)
load_more_comment = driver.find_element_by_xpath(
"//button[class='wpO6b ']"
)
print("Found {}".format(str(load_more_comment)))
i += 1
user_names.pop(0)
user_comments.pop(0)
except Exception as e:
print(e)
pass
comment = driver.find_elements_by_class_name("gElp9 ")
for c in comment:
container = c.find_element_by_class_name("C4VMK")
name = container.find_element_by_class_name("_6lAjh ").text
content = container.find_element_by_class_name("MOdxS ").text
content = content.replace("\n", " ").strip().rstrip()
user_names.append(name)
user_comments.append(content)
print(content)
user_names.pop(0)
user_comments.pop(0)
# export(user_names, user_comments)
driver.close()
df = pd.DataFrame(list(zip(user_names, user_comments)), columns=["Name", "Comments"])
# df.to_excel("Anime Content Engagement.xlsx")
print(df)
And the load-more-comments part, doesn't seem to work.
Since there are more than one buttons with the same class name, I"m not able to choose the right button to click on. And I'm a beginner so if there's anyone with any solution to how I can solve this it would be great.

you can select by aria-label text:
driver.find_element_by_css_selector("svg._8-yf5[aria-label='TEXT']")
i believe the text inside changes according to instagram language, put it according to what appears on your

How to handle StaleElementReferenceException in Selenium Python?

I am trying to scrape Twitter followers and the bio of the followers from a certain Twitter account.
The account has more than 10000 followers. I have ran the code many times but it scrapes like 5000,7000 sometimes 9000 followers and then throws StaleElementRefrenceException.
I am a beginner, so it would be of great help if you suggest where to make what changes in the code, so it won't throw the exception.
import csv
from getpass import getpass
from time import sleep
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Chrome
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
def get_followers_and_bio(cardd):
screen_name = cardd.find_element_by_xpath('./div//span').text
username = cardd.find_element_by_xpath('.//span[contains(text(), "#")]').text
i = cardd.text.split("\n").index('Follow')
bio = cardd.text.split("\n")[i+1:]
user = (screen_name, username, bio)
return user
# Create instance of the web driver
driver = webdriver.Chrome(ChromeDriverManager().install())
# Navigate to login screen
driver.get('https://www.twitter.com/login')
driver.maximize_window()
sleep(5)
username = driver.find_element_by_xpath('//input[#name="text"]')
username.send_keys('myemail#gmail.com')
username.send_keys(Keys.RETURN)
sleep(10)
username1 = driver.find_element_by_xpath('//input[#name="text"]')
username1.send_keys('myusername')
username1.send_keys(Keys.RETURN)
my_password = getpass()
password = driver.find_element_by_xpath('//input[#name="password"]')
password.send_keys(my_password)
password.send_keys(Keys.RETURN)
sleep(5)
# Find search input and search for term or user
search_input = driver.find_element_by_xpath('//input[#aria-label="Search query"]')
search_input.send_keys('#username')
search_input.send_keys(Keys.RETURN)
sleep(5)
driver.find_element_by_link_text('People').click()
sleep(5)
driver.find_element_by_link_text('#username').click()
sleep(5)
# Opening user's followers list
driver.find_element_by_xpath("//a[#href='/username/followers']").click()
sleep(5)
# Get all followers and their bio on the page
followers_list = []
last_position = driver.execute_script("return window.pageYOffset;")
scrolling = True
while scrolling:
cards = driver.find_elements_by_xpath('//div[#data-testid="UserCell"]')
for card in cards:
data = get_followers_and_bio(card)
if data:
followers_list.append(data)
scroll_attempt = 0
while True:
# Check scroll position
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
sleep(2)
curr_position = driver.execute_script("return window.pageYOffset;")
if last_position == curr_position:
scroll_attempt += 1
# End of scroll region
if scroll_attempt >= 5:
scrolling = False
break
else:
sleep(3) # Attempt to scroll again
else:
last_position = curr_position
break

How to stop selenium scraper from redirecting to another internal weblink of the scraped website?

Was wondering if anyone knows of a way for instructing a selenium script to avoid visiting/redirecting to an internal page that wasn't part of the code. Essentially, my code opens up this page:
https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20
keeps clicking on show more button until there's none (at end of page) - which by then - it should have collected the links of all the products listed on the page it scrolled through till the end, then visit each one respectively.
What happens instead, it successfully clicks on show more till the end of the page, but then it visits this weird promotion page of the same website instead of following each of the gathered links respectively and then scraping further data points located off each of those newly opened ones.
In a nutshell, would incredibly appreciate it if someone can explain how to avoid this automated redirection on its own! And this is the code in case someone can gratefully nudge me in the right direction :)
from selenium.webdriver import Chrome
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException
import json
import selenium.common.exceptions as exception
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
webdriver = '/Users/karimnabil/projects/selenium_js/chromedriver-1'
driver = Chrome(webdriver)
driver.implicitly_wait(5)
url = 'https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20'
driver.get(url)
links_list = []
coins_list = []
all_names = []
all_cryptos = []
all_links = []
all_twitter = []
all_locations = []
all_categories = []
all_categories2 = []
wait = WebDriverWait(driver, 2)
sign_in = driver.find_element_by_xpath("//li[#class='nav-item nav-guest']/a")
sign_in.click()
time.sleep(2)
user_name = wait.until(EC.presence_of_element_located((By.XPATH, "//input[#name='login']")))
user_name.send_keys("karimnsaber95#gmail.com")
password = wait.until(EC.presence_of_element_located((By.XPATH, "//input[#name='password']")))
password.send_keys("PleomaxCW#2")
signIn_Leave = driver.find_element_by_xpath("//div[#class='form-group text-center']/button")
signIn_Leave.click()
time.sleep(3)
while True:
try:
loadMoreButton = driver.find_element_by_xpath("//button[#class='btn btn-outline-primary']")
time.sleep(2)
loadMoreButton.click()
time.sleep(2)
except exception.StaleElementReferenceException:
print('stale element')
break
print('no more elements to show')
try:
company_links = driver.find_elements_by_xpath("//div[#class='companies-list items-infinity']/div[position() > 3]/div[#class='media-body']/div[#class='title']/a")
for link in company_links:
links_list.append(link.get_attribute('href'))
except:
pass
try:
with open("links_list.json", "w") as f:
json.dump(links_list, f)
with open("links_list.json", "r") as f:
links_list = json.load(f)
except:
pass
try:
for link in links_list:
driver.get(link)
name = driver.find_element_by_xpath("//div[#class='title']/h1").text
try:
show_more_coins = driver.find_element_by_xpath("//a[#data-original-title='Show more']")
show_more_coins.click()
time.sleep(1)
except:
pass
try:
categories = driver.find_elements_by_xpath("//div[contains(#class, 'categories-list')]/a")
categories_list = []
for category in categories:
categories_list.append(category.text)
except:
pass
try:
top_page_categories = driver.find_elements_by_xpath("//ol[#class='breadcrumb']/li/a")
top_page_categories_list = []
for category in top_page_categories:
top_page_categories_list.append(category.text)
except:
pass
coins_links = driver.find_elements_by_xpath("//div[contains(#class, 'company-coins')]/a")
all_coins = []
for coin in coins_links:
all_coins.append(coin.get_attribute('href'))
try:
location = driver.find_element_by_xpath("//div[#class='addresses mt-3']/div/div/div/div/a").text
except:
pass
try:
twitter = driver.find_element_by_xpath("//div[#class='links mt-2']/a[2]").get_attribute('href')
except:
pass
try:
print('-----------')
print('Company name is: {}'.format(name))
print('Potential Categories are: {}'.format(categories_list))
print('Potential top page categories are: {}'.format(top_page_categories_list))
print('Supporting Crypto is:{}'.format(all_coins))
print('Registered location is: {}'.format(location))
print('Company twitter profile is: {}'.format(twitter))
time.sleep(1)
except:
pass
all_names.append(name)
all_categories.append(categories_list)
all_categories2.append(top_page_categories_list)
all_cryptos.append(all_coins)
all_twitter.append(twitter)
all_locations.append(location)
except:
pass
df = pd.DataFrame(list(zip(all_names, all_categories, all_categories2, all_cryptos, all_twitter, all_locations)), columns=['Company name', 'Categories1', 'Categories2', 'Supporting Crypto', 'Twitter Handle', 'Registered Location'])
CryptoWerk_Data = df.to_csv('CryptoWerk4.csv', index=False)

Redirect calls happen for two reasons, in your case either by executing some javascript code when clicking the last time on the load more button or by receiving an HTTP 3xx code, which is the least likely in your case.
So you need to identify when this javascript code is executed and send an ESC_KEY before it loads and then executing the rest of your script.
You could also scrape the links and append them to your list before clicking the load more button and each time it is clicked, make an if statement the verify the link of the page you're in, if it is that of the promotion page then execute the rest of your code, else click load more.
while page_is_same:
scrape_elements_add_to_list()
click_load_more()
verify_current_page_link()
if current_link_is_same != link_of_scraped_page:
page_is_same = False
# rest of the code here

what happens when find_elements can't find the class?

I am trying to find a particular class on a website. The class is sometimes present and sometimes it is absent.
So when the class is present, it takes a few seconds for the script to locate the element(logo). When the class is not present,the script runs for a long time and then end.
Why is that? is there any way to speed it up when the class doesn't exist?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
chrome_path = r"C:\Users\peter\Desktop\chromedriver.exe"
driver = webdriver.Chrome(executable_path=r"C:\Users\peter\Desktop\chromedriver.exe")
driver.get("https://example.com/app/login")
driver.minimize_window()
driver.implicitly_wait(300)
input_email = driver.find_element_by_xpath("//input[#type='email']")
input_email.send_keys('example#gmail.com')
input_password = driver.find_element_by_xpath("//input[#type='password']")
input_password.send_keys('example')
click_login = driver.find_element_by_xpath("//button[#type='submit']")
click_login.click()
driver.find_element_by_id("schedule-today").click()
sleep(2)
logo = driver.find_elements_by_xpath( "//*[contains(#class, 'lbl_lesson_status label label-info lbl_lesson_open')]" );
if not logo:
print("empty")
f = open("reserved_date", "a+")
for i in logo:
opendate = i.get_attribute("data-t-start-local");
f.write((opendate)+'\n')
print(opendate)
driver.close()

You Need To Add Wait And Add Try Except for example if element not found throw message and quit that script
I Simply Code For You!
Try This Code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import *
from selenium.webdriver.common.keys import Keys
import time
chrome_path = r"C:\Users\peter\Desktop\chromedriver.exe"
driver = webdriver.Chrome(executable_path=r"C:\Users\peter\Desktop\chromedriver.exe")
driver.get("https://example.com/app/login")
driver.minimize_window()
try:
input_email = WebDriverWait(driver,20).until(EC.element_to_be_clickable((By.XPATH,"//input[#type='email']")))
input_email.send_keys('example#gmail.com')
except (TimeoutException,NoSuchElementException):
print('There is No Email Input!')
quit()
try:
input_password = WebDriverWait(driver,20).until(EC.element_to_be_clickable((By.XPATH,"//input[#type='password']")))
input_password.send_keys('example')
except (TimeoutException,NoSuchElementException):
print('There is No Password Input!')
quit()
try:
click_login = WebDriverWait(driver,20).until(EC.element_to_be_clickable((By.XPATH,"//button[#type='submit']")))
click_login.click()
except (TimeoutException,NoSuchElementException):
print('There is No Login Button!')
quit()
try:
WebDriverWait(driver,20).until(EC.element_to_be_clickable((By.CSS_SELECTOR,"#schedule-today")))
time.sleep(2)
except (TimeoutException,NoSuchElementException):
print("Can't Find schedule-today id!")
quit()
try:
logo = WebDriverWait(driver,20).until(EC.element_to_be_clickable((By.XPATH,"//*[contains(#class, 'lbl_lesson_status label label-info lbl_lesson_open')]")))
f = open("reserved_date", "a+")
for i in logo:
opendate = i.get_attribute("data-t-start-local");
f.write((opendate)+'\n')
print(opendate)
except (TimeoutException,NoSuchElementException):
print("Can't Find Logo Button!")
quit()
driver.close()

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scrapping twitter followers using selenium scroll issue - python

Related

~FIREFOX~ Selenium Instagram Bot is unable to locate Follow button to follow profiles despite trying By. XPATH, LINK_TEXT, CLASS_NAME etc

How do I press load more button while scraping comments on Instagram with Selenium Python

How to handle StaleElementReferenceException in Selenium Python?

How to stop selenium scraper from redirecting to another internal weblink of the scraped website?

what happens when find_elements can't find the class?

Categories

Resources