Looping and stop duplicating output | Selenium | Python

Very new to Python and Selenium, looking to scrape a few data points. I'm struggling in three areas:
I don't understand how to loop through multiple URLs properly
I can't figure out why the script is iterating twice over each URL
I can't figure out why it's only outputting the data for the second URL
Much thanks for taking a look!
Here's my current script:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

urls = [
    'https://developers.google.com/speed/pagespeed/insights/?url=https://www.crutchfield.com/%2F&tab=mobile',
    'https://developers.google.com/speed/pagespeed/insights/?url=https://www.lastpass.com%2F&tab=mobile'
]

driver = webdriver.Chrome(executable_path='/Library/Frameworks/Python.framework/Versions/3.9/bin/chromedriver')

for url in urls:
    for page in range(0, 1):
        driver.get(url)
        wait = WebDriverWait(driver, 120).until(EC.presence_of_element_located((By.CLASS_NAME, 'origin-field-data')))
        df = pd.DataFrame(columns=['Title', 'Core Web Vitals', 'FCP', 'FID', 'CLS', 'TTI', 'TBT', 'Total Score'])
        company = driver.find_elements_by_class_name("audited-url__link")
        data = []
        for i in company:
            data.append(i.get_attribute('href'))
        for x in data:
            # Get URL name
            title = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[1]/div/div[2]/h1/a')
            co_name = title.text
            # Get Core Web Vitals text pass/fail
            cwv = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[1]/span[2]')
            core_web = cwv.text
            # Get FCP
            fcp = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[1]/div')
            first_content = fcp.text
            # Get FID
            fid = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[2]/div[3]/div[1]/div')
            first_input = fid.text
            # Get CLS
            cls = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[2]/div[4]/div[1]/div')
            layout_shift = cls.text
            # Get TTI
            tti = driver.find_element_by_xpath('//*[@id="interactive"]/div/div[1]')
            time_interactive = tti.text
            # Get TBT
            tbt = driver.find_element_by_xpath('//*[@id="total-blocking-time"]/div/div[1]')
            total_block = tbt.text
            # Get Total Score
            total_score = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[1]/div/div[1]/a/div[2]')
            score = total_score.text
            # Adding all columns to dataframe
            df.loc[len(df)] = [co_name, core_web, first_content, first_input, layout_shift, time_interactive, total_block, score]

driver.close()
# df.to_csv('Double Page Speed Test 9-10.csv')
print(df)

Q1: I don't understand how to loop through multiple URLs properly.
Ans: for url in urls: already does this correctly.
Q2: I can't figure out why the script is iterating twice over each URL.
Ans: Because you have the redundant inner loop for page in range(0, 1):, and because the link locator matches a different number of elements on each page (see Update 1 below).
Update 1:
I did not run your entire code with the DataFrame. Also, sometimes one of the pages does not show the number and href, but when I run the code below,
driver = webdriver.Chrome(driver_path)
driver.maximize_window()
driver.implicitly_wait(50)
wait = WebDriverWait(driver, 20)

urls = [
    'https://developers.google.com/speed/pagespeed/insights/?url=https://www.crutchfield.com/%2F&tab=mobile',
    'https://developers.google.com/speed/pagespeed/insights/?url=https://www.lastpass.com%2F&tab=mobile'
]

data = []
for url in urls:
    driver.get(url)
    wait = WebDriverWait(driver, 120).until(EC.presence_of_element_located((By.CLASS_NAME, 'origin-field-data')))
    company = driver.find_elements_by_css_selector("h1.audited-url a")
    for i in company:
        data.append(i.get_attribute('href'))
print(data)
the output is:
['https://www.crutchfield.com//', 'https://www.lastpass.com/', 'https://www.lastpass.com/']
which is correct, because the element locator we used matches 1 element on the first page and 2 elements on the second page.
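Putting the two answers together, here is a minimal sketch of a restructured script (shortened to two columns, and assuming the brittle XPaths and selectors above still match the live page): create the DataFrame once before the loop, drop the inner range loop, scrape each URL exactly once, and quit the driver only after the loop finishes.

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

urls = [
    'https://developers.google.com/speed/pagespeed/insights/?url=https://www.crutchfield.com/%2F&tab=mobile',
    'https://developers.google.com/speed/pagespeed/insights/?url=https://www.lastpass.com%2F&tab=mobile'
]

driver = webdriver.Chrome()
# Create the DataFrame once, outside the loop, so rows accumulate across URLs
# instead of being overwritten on every iteration.
df = pd.DataFrame(columns=['Title', 'Total Score'])

for url in urls:
    driver.get(url)
    WebDriverWait(driver, 120).until(EC.presence_of_element_located((By.CLASS_NAME, 'origin-field-data')))
    # Scrape each result page exactly once: no inner range loop, and no loop
    # over the link elements (whose count varies between pages).
    co_name = driver.find_element_by_css_selector('h1.audited-url a').text
    score = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[1]/div/div[1]/a/div[2]').text
    df.loc[len(df)] = [co_name, score]

driver.quit()  # quit only after every URL has been processed
print(df)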

Related

Why can't XPath get the target element?

I am quite new to scraping with XPath. I am trying to scrape product information on Target. Using Selenium and XPath I can successfully get the price and name, but XPath cannot return any value when scraping for product size and sales location.
For example, for this URL "https://www.target.com/p/pataday-once-daily-relief-extra-strength-drops-0-085-fl-oz/-/A-83775159?preselect=81887758#lnk=sametab", the XPath for size is:
//*[@id="pageBodyContainer"]/div[1]/div[2]/div[2]/div/div[3]/div/div[1]/text()
The XPath for sales location is:
//*[@id="pageBodyContainer"]/div[1]/div[2]/div[2]/div/div[1]/div[2]/span
I also tried to get these two elements using requests, but that did not work either. Does anyone know why this happens? Any help appreciated. Thanks.
Following is my code:
import concurrent.futures
import csv
from datetime import datetime
from time import sleep

import pytz
from apscheduler.schedulers.blocking import BlockingScheduler
from selenium import webdriver
from selenium.webdriver.common.by import By

def job_function():
    urlList = ['https://www.target.com/p/pataday-once-daily-relief-extra-strength-drops-0-085-fl-oz/-/A-83775159?preselect=81887758#lnk=sametab',
               'https://www.target.com/p/kleenex-ultra-soft-facial-tissue/-/A-84780536?preselect=12964744#lnk=sametab',
               'https://www.target.com/p/claritin-24-hour-non-drowsy-allergy-relief-tablets-loratadine/-/A-80354268?preselect=14351285#lnk=sametab',
               'https://www.target.com/p/opti-free-pure-moist-rewetting-drops-0-4-fl-oz/-/A-14358641#lnk=sametab'
               ]

    def ScrapingTarget(url):
        AArray = []
        wait_imp = 10
        CO = webdriver.ChromeOptions()
        CO.add_experimental_option('useAutomationExtension', False)
        CO.add_argument('--ignore-certificate-errors')
        CO.add_argument('--start-maximized')
        wd = webdriver.Chrome(r'D:\chromedriver\chromedriver_win32new\chromedriver_win32 (2)\chromedriver.exe',
                              options=CO)
        wd.get(url)
        wd.implicitly_wait(wait_imp)
        sleep(1)

        # start scraping
        name = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[1]/h1/span").text
        sleep(0.5)
        price = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[1]/div[1]/span").text
        sleep(0.5)
        try:
            size = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[3]/div/div[1]/text()").text
        except:
            size = "none"
        sleep(0.5)
        try:
            sales_location = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[1]/div[2]/span").text
        except:
            sales_location = "none"

        tz = pytz.timezone('Etc/GMT-0')
        GMT = datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")

        AArray.append([name, price, size, sales_location, GMT])

        with open(
                r'C:\Users\12987\PycharmProjects\python\Network\priceingAlgoriCoding\export_Target_dataframe.csv',
                'a', newline="", encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerows(AArray)

    with concurrent.futures.ThreadPoolExecutor(4) as executor:
        executor.map(ScrapingTarget, urlList)

sched = BlockingScheduler()
sched.add_job(job_function, 'interval', seconds=60)
sched.start()
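For what it's worth, the likely cause of the two failing XPaths: Selenium's find_element can only return element nodes, and an XPath ending in /text() selects a text node, so the call raises an error and always falls into the except branch. Dropping the trailing /text() and reading .text on the element itself is usually enough. A minimal sketch, reusing the wd driver and the size XPath from the snippet above:

# Select the element itself (not its text node) and let Selenium extract the text.
size_element = wd.find_element(by=By.XPATH,
                               value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[3]/div/div[1]")
size = size_element.text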

Selenium - Iterating Through Drop Down Menu - Let Page Load

I am trying to iterate through player seasons on NBA.com and pull shooting statistics after each click of the season dropdown menu. After each click, I get the error message "list index out of range" for:
headers = table[1].findAll('th')
It seems to me that the page doesn't load all the way before the source data is saved.
Looking at other similar questions, I have tried using browser.implicitly_wait() for each loop, but I am still getting the same error. It also doesn't seem that the browser waits for more than the first iteration of the loop.
from selenium.webdriver import Chrome
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import pandas as pd

player_id = str(1629216)
url = 'https://www.nba.com/stats/player/' + player_id + "/shooting/"
browser = Chrome(executable_path='/usr/local/bin/chromedriver')
browser.get(url)

select = Select(browser.find_element_by_xpath('/html/body/main/div/div/div/div[4]/div/div/div/div/div[1]/div[1]/div/div/label/select'))
options = select.options

for index in range(0, len(options)):
    select.select_by_index(index)
    browser.implicitly_wait(5)
    src = browser.page_source
    parser = BeautifulSoup(src, "lxml")
    table = parser.findAll("div", attrs={"class": "nba-stat-table__overflow"})
    headers = table[1].findAll('th')
    headerlist = [h.text.strip() for h in headers[1:]]
    headerlist = [a for a in headerlist if '\n' not in a]
    headerlist.append('AST%')
    headerlist.append('UAST%')
    row_labels = table[1].findAll("td", {"class": "first"})
    row_labels_list = [r.text.strip() for r in row_labels[0:]]
    rows = table[1].findAll('tr')[1:]
    player_stats = [[td.getText().strip() for td in rows[i].findAll('td')[1:]] for i in range(len(rows))]
    df = pd.DataFrame(data=player_stats, columns=headerlist, index=row_labels_list)
    print(df)
I found my own answer. I used time.sleep(1) at the top of the loop to give the browser a second to load all the way. Without this delay, the page's source code did not have the appropriate table that I am scraping.
Responding to those who answered: I did not want to go the API route, but I have seen people scrape nba.com using that method. table[1] is the correct table; I just needed to give the source code a chance to load after looping through the season dropdown.
import time

for index in range(0, len(options)):
    select.select_by_index(index)
    time.sleep(1)  # give the page a second to load before grabbing the source
    src = browser.page_source
    parser = BeautifulSoup(src, "lxml")
    table = parser.findAll("div", attrs={"class": "nba-stat-table__overflow"})
    headers = table[1].findAll('th')
    headerlist = [h.text.strip() for h in headers[1:]]
    headerlist = [a for a in headerlist if '\n' not in a]
    headerlist.append('AST%')
    headerlist.append('UAST%')
    row_labels = table[1].findAll("td", {"class": "first"})
    row_labels_list = [r.text.strip() for r in row_labels[0:]]
    rows = table[1].findAll('tr')[1:]
    player_stats = [[td.getText().strip() for td in rows[i].findAll('td')[1:]] for i in range(len(rows))]
    df = pd.DataFrame(data=player_stats, columns=headerlist, index=row_labels_list)
    print(df)
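A fixed time.sleep(1) works, but it wastes a second when the table renders quickly and can still race on a slow load. An explicit wait is a more robust alternative; a sketch, assuming the nba-stat-table__overflow containers are what need to appear (the code indexes table[1], so it waits for at least two):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

# Block (up to 10 s) until at least two stat tables exist before grabbing the source.
WebDriverWait(browser, 10).until(
    lambda d: len(d.find_elements(By.CLASS_NAME, "nba-stat-table__overflow")) >= 2)
src = browser.page_source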

Can I pause a scroll function in selenium, scrape the current data, and then continue scrolling later in the script?

I am a student working on a scraping project and I am having trouble completing my script because it fills my computer's memory with all of the data it stores.
It currently stores all of my data until the end, so my solution would be to break the scrape into smaller chunks and write the data out periodically, so it does not keep building one big list that is only written out at the end.
In order to do this, I would need to stop my scroll method, scrape the loaded profiles, write out the data I have collected, and then repeat this process without duplicating my data. It would be appreciated if someone could show me how to do this. Thank you for your help :)
Here's my current code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from selenium.common.exceptions import NoSuchElementException

Data = []

driver = webdriver.Chrome()
driver.get("https://directory.bcsp.org/")

count = int(input("Number of Pages to Scrape: "))
body = driver.find_element_by_xpath("//body")
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

while len(profile_count) < count:  # Get links up to "count"
    body.send_keys(Keys.END)
    sleep(1)
    profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

for link in profile_count:  # Calling up links
    temp = link.get_attribute('href')  # temp for
    driver.execute_script("window.open('');")  # open new tab
    driver.switch_to.window(driver.window_handles[1])  # focus new tab
    driver.get(temp)

    # scrape code
    Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div').text
    IssuedBy = "Board of Certified Safety Professionals"
    CertificationorDesignaationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]').text
    CertfiedorDesignatedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]').text
    try:
        AccreditedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[3]/div[2]/a').text
    except NoSuchElementException:
        AccreditedBy = "N/A"
    try:
        Expires = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[1]/div[2]').text
    except NoSuchElementException:
        Expires = "N/A"

    info = Name, IssuedBy, CertificationorDesignaationNumber, CertfiedorDesignatedSince, AccreditedBy, Expires + "\n"
    Data.extend(info)
    driver.close()
    driver.switch_to.window(driver.window_handles[0])

with open("Spredsheet.txt", "w") as output:
    output.write(','.join(Data))
driver.close()
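Staying within Selenium, the pause-flush-resume the question asks for can be done by remembering how many profile links have already been written out and appending each new batch to the file before scrolling again. A rough sketch of the idea (it only flushes the profile links; the per-profile scraping above would run on each batch the same way):

from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
driver.get("https://directory.bcsp.org/")
body = driver.find_element_by_xpath("//body")

def flush(rows):
    # Append mode: each batch goes straight to disk instead of growing one big list.
    with open("Spredsheet.txt", "a") as output:
        output.write('\n'.join(rows) + '\n')

scraped = 0  # number of profile links already written out
while True:
    links = driver.find_elements_by_xpath("//div[@align='right']/a")
    new = [link.get_attribute('href') for link in links[scraped:]]
    if new:
        flush(new)  # write out this batch before loading any more
        scraped += len(new)
    body.send_keys(Keys.END)  # resume scrolling where we left off
    sleep(1)
    if len(driver.find_elements_by_xpath("//div[@align='right']/a")) == scraped:
        break  # scrolling loaded nothing new: done
driver.quit()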
Try the below approach using requests and BeautifulSoup. In the below script I have used the API URL fetched from the website itself.
First, it creates the URL for the first iteration, then adds the headers and data to the .csv file.
On each following iteration it creates the URL again with 2 extra params, start_on_page and show_per_page, where start_on_page starts at 20 and is incremented by 20 on each iteration, and show_per_page is defaulted to 100 to extract 100 records per iteration, and so on until all the data is dumped into the .csv file.
The script dumps 4 things: number, name, location and profile URL.
On each iteration the data is appended to the .csv file, so your memory issue will be resolved by this approach.
Do not forget to add your system path to the file_path variable (where you want the .csv file created) before running the script.
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
import csv

def scrap_directory_data():
    list_of_credentials = []
    file_path = ''
    file_name = 'credential_list.csv'
    count = 0
    page_number = 0
    page_size = 100
    create_url = ''
    main_url = 'https://directory.bcsp.org/search_results.php?'
    first_iteration_url = 'first_name=&last_name=&city=&state=&country=&certification=&unauthorized=0&retired=0&specialties=&industries='
    number_of_records = 0
    csv_headers = ['#', 'Name', 'Location', 'Profile URL']

    while True:
        if count == 0:
            create_url = main_url + first_iteration_url
            print('-' * 100)
            print('1 iteration URL created: ' + create_url)
            print('-' * 100)
        else:
            create_url = main_url + 'start_on_page=' + str(page_number) + '&show_per_page=' + str(page_size) + '&' + first_iteration_url
            print('-' * 100)
            print('Other than first iteration URL created: ' + create_url)
            print('-' * 100)

        page = requests.get(create_url, verify=False)
        extracted_text = bs(page.text, 'lxml')
        result = extracted_text.find_all('tr')

        if len(result) > 0:
            for idx, data in enumerate(result):
                if idx > 0:
                    number_of_records += 1
                    name = data.contents[1].text
                    location = data.contents[3].text
                    profile_url = data.contents[5].contents[0].attrs['href']
                    list_of_credentials.append({
                        '#': number_of_records,
                        'Name': name,
                        'Location': location,
                        'Profile URL': profile_url
                    })
                print(data)
                with open(file_path + file_name, 'a+') as cred_CSV:
                    csvwriter = csv.DictWriter(cred_CSV, delimiter=',', lineterminator='\n', fieldnames=csv_headers)
                    if idx == 0 and count == 0:
                        print('Writing CSV header now...')
                        csvwriter.writeheader()
                    else:
                        for item in list_of_credentials:
                            print('Writing data rows now..')
                            print(item)
                            csvwriter.writerow(item)
                        list_of_credentials = []
        count += 1
        page_number += 20

scrap_directory_data()

How can I speed a Selenium scraper on the New York Times dining website?

Currently, I'm trying to scrape the New York Times dining website (nytimes.com/reviews/dining) and get a list of links and neighborhoods for each restaurant. Unfortunately, I've been running the code block below for about 9 hours on Google Colab, and the iterator x is on its 1,175th run. I'm trying to figure out what's going on, but I'm too scared to halt the cell and start over again. Is it the nested for loop that is causing this process to take such a long time?
driver = webdriver.Chrome('chromedriver', chrome_options=chrome_options)
driver.get("https://www.nytimes.com/reviews/dining")
WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
    (By.XPATH, "//button[text()='Show More']"))).click()

url_list = []
nyt_dining = pd.DataFrame(columns=['Restaurant', 'URL', 'servesCuisine', 'priceRange', 'addressLocality'])
x = 0
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    elements = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located(
        (By.CSS_SELECTOR, "div.css-102xbk1")))
    url_before = len(url_list)
    for e in elements:
        # 2.1 Getting the links
        link = e.find_element_by_css_selector("a.css-gg4vpm")
        link = link.get_attribute("href")
        # 2.2 Getting the restaurant name
        name = e.find_element_by_css_selector("h2.css-8aqwnr")
        name = name.text
        # 2.3 Getting other information
        info = e.find_element_by_css_selector("ul.css-o4kdzz")
        info = info.find_elements_by_tag_name('li')
        cuisine = ''
        price = ''
        neighborhood = ''
        for i in info:
            attribute = i.get_attribute("itemprop")
            if attribute == "servesCuisine":
                cuisine = i.text
            elif attribute == "priceRange":
                price = i.text
            elif attribute == "addressLocality":
                neighborhood = i.text
        # 2.4 Append to dataframe
        if link in url_list:
            continue
        else:
            url_list.append(link)
            nyt_dining = nyt_dining.append({'Restaurant': name, 'URL': link,
                                            'servesCuisine': cuisine,
                                            'priceRange': price,
                                            'addressLocality': neighborhood},
                                           ignore_index=True)
            print(x)
            x += 1
    url_after = len(url_list)
    if url_before >= url_after:
        break
    button = WebDriverWait(driver, 10).until(EC.visibility_of_element_located(
        (By.XPATH, "//button[text()='Show More']")))
    driver.execute_script("arguments[0].click();", button)
    time.sleep(2)

nyt_dining
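A plausible diagnosis rather than a certainty: the crawl is slow because every pass of the while loop re-processes every card loaded so far, and each find_element_* call is a round trip to the remote browser, so after N "Show More" clicks the work grows roughly quadratically. Since url_before already counts the cards handled on earlier passes, one cheap change is to slice them off. A sketch, under the assumption that the cards keep a stable order and yield one URL each:

# Hypothetical restructuring of the inner loop: only visit cards added since
# the last pass; the earlier ones are already in url_list.
for e in elements[url_before:]:
    link = e.find_element_by_css_selector("a.css-gg4vpm").get_attribute("href")
    if link in url_list:
        continue
    url_list.append(link)
    # ... collect name, cuisine, price, and neighborhood exactly as above ...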

Getting list of likers for an instagram post - Python & Selenium

I'm teaching myself web crawling. As an exercise, I've challenged myself to get the list of all the people who liked a post on Instagram.
My problem is that I'm stuck at the point where I only get the first 11 usernames of likers. I cannot find the right way to automate the scrolling process while getting the likes.
Here is my process in Jupyter Notebook (it doesn't work as a script yet):
from selenium import webdriver
import pandas as pd

driver = webdriver.Chrome()
driver.get('https://www.instagram.com/p/BuE82VfHRa6/')
userid_element = driver.find_elements_by_xpath('//*[@id="react-root"]/section/main/div/div/article/div[2]/section[2]/div/div/a')[0].click()
elems = driver.find_elements_by_xpath("//*[@id]/div/a")

users = []
for elem in elems:
    users.append(elem.get_attribute('title'))
print(users)
Do you guys have any idea?
Many thanks
I guess the Instagram site keeps a maximum of about 17 liked-user elements in the DOM at a time.
So each pass of the loop does:
1. get the element list from the page
2. save them to my list
3. scroll down to load new elements
4. check whether this is the last scroll
driver.get('https://www.instagram.com/p/BuE82VfHRa6/')
userid_element = driver.find_elements_by_xpath('//*[@id="react-root"]/section/main/div/div/article/div[2]/section[2]/div/div/a')[0].click()
time.sleep(2)

# here, you can see the user list you want.
# you have to scroll down to download more data from the instagram server.
# loop until the last element, using the user list view's height value.
users = []
height = driver.find_element_by_xpath("/html/body/div[3]/div/div[2]/div/div").value_of_css_property("padding-top")
match = False
while match == False:
    lastHeight = height
    # step 1
    elements = driver.find_elements_by_xpath("//*[@id]/div/a")
    # step 2
    for element in elements:
        if element.get_attribute('title') not in users:
            users.append(element.get_attribute('title'))
    # step 3
    driver.execute_script("return arguments[0].scrollIntoView();", elements[-1])
    time.sleep(1)
    # step 4
    height = driver.find_element_by_xpath("/html/body/div[3]/div/div[2]/div/div").value_of_css_property("padding-top")
    if lastHeight == height:
        match = True

print(users)
print(len(users))
driver.quit()
I tested it on posts with nearly 100 likes, and it worked.
Please try the following code and let me know if this works.
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://www.instagram.com/p/BuE82VfHRa6/')
elems = driver.find_elements_by_xpath("//a[@class='FPmhX notranslate TlrDj']")

users = []
for elem in elems:
    users.append(elem.get_attribute('title'))
    print('Title : ' + elem.get_attribute('title'))
print(users)
Output:
Title : kyliejenner
Title : saturdayshade28
Title : worldmeetzboy
Title : mrokon
Title : addieisaac
Title : addieisaac
Title : amber_doerksen
Title : amber_doerksen
Title : addieisaac
Title : zayn6117
Title : amber_doerksen
Title : amber_doerksen
Title : worldmeetzboy
Title : worldmeetzboy
Title : razvanpopic1301
Title : johanna.trmn
Title : johanna.trmn
Title : johanna.trmn
Title : americ.av
Title : gabriellcostta1.0
Title : gabriellcostta1.0
Title : gabriellcostta1.0
Title : worldmeetzboy
Title : enactusepi
Title : enactusepi
[u'kyliejenner', u'saturdayshade28', u'worldmeetzboy', u'mrokon', u'addieisaac', u'addieisaac', u'amber_doerksen', u'amber_doerksen', u'addieisaac', u'zayn6117', u'amber_doerksen', u'amber_doerksen', u'worldmeetzboy', u'worldmeetzboy', u'razvanpopic1301', u'johanna.trmn', u'johanna.trmn', u'johanna.trmn', u'americ.av', u'gabriellcostta1.0', u'gabriellcostta1.0', u'gabriellcostta1.0', u'worldmeetzboy', u'enactusepi', u'enactusepi']
I wasn't able to get the code to work as posted in predicty's answer, so I made the adaptation below, and it now gets me ~500 likers per post.
def get_post_likers(shortcode):
    chrome = ch.initialize()
    chrome.get('https://www.instagram.com/p/' + shortcode + '/')
    chrome.execute_script("window.scrollTo(0, 1080)")
    url = "/p/" + shortcode + "/liked_by/"
    time.sleep(2)
    like_link = chrome.find_element_by_xpath('//a[@href="' + url + '"]')
    like_link.click()
    time.sleep(2)
    users = []
    pb = chrome.find_element_by_xpath("//div[@role = 'dialog']/div[2]/div[1]/div[1]").value_of_css_property("padding-bottom")
    match = False
    while match == False:
        lastHeight = pb
        # step 1
        elements = chrome.find_elements_by_xpath("//*[@id]/div/a")
        # step 2
        for element in elements:
            if element.get_attribute('title') not in users:
                users.append(element.get_attribute('title'))
        # step 3
        chrome.execute_script("return arguments[0].scrollIntoView();", elements[-1])
        time.sleep(1)
        # step 4
        pb = chrome.find_element_by_xpath("//div[@role = 'dialog']/div[2]/div[1]/div[1]").value_of_css_property("padding-bottom")
        if lastHeight == pb or len(users) >= 1500:
            match = True
    return users
This worked for me:
driver.get('https://www.instagram.com/p/BuE82VfHRa6/')
time.sleep(2)
userid_element = driver.find_element_by_xpath('//*[@id="react-root"]/section/main/div/div[1]/article/div[3]/section[2]/div/div[2]/button').click()
time.sleep(2)
elems = driver.find_elements_by_xpath("//a[@class='FPmhX notranslate TlrDj']")
actionChain = webdriver.ActionChains(driver)  # note: this was not defined in the original snippet
users = []
for i in range(10):
    i += 1
    if (i % 10) == 9:
        driver.find_element_by_xpath('/html/body/div[4]/div/div/div[2]/div').click()
        actionChain.key_down(Keys.SPACE).key_up(Keys.SPACE).perform()
    print('/html/body/div[4]/div/div/div[2]/div/div/div[' + str(i) + ']/div[2]/div[1]/div/a')
    Title = driver.find_element_by_xpath('/html/body/div[4]/div/div/div[2]/div/div/div[' + str(i) + ']/div[2]/div[1]/div/a').get_attribute('title')
    users.append(Title)
    print('Title : ' + Title)
print(users)
I tried all the solutions above, but none of them are working. I think they are outdated.
Instead, I wrote my own. It works perfectly in 2020.
This code goes to the "username" address, takes the latest post in the profile, and gets the liked users.
def getPosts():
    hrefs_in_view = driver.find_elements_by_tag_name('a')
    # finding relevant hrefs
    hrefs_in_view = [elem.get_attribute('href') for elem in hrefs_in_view
                     if '.com/p/' in elem.get_attribute('href')]
    return hrefs_in_view

def getLikers(username, limit, post=1):
    driver.get('https://www.instagram.com/' + username)
    time.sleep(1)
    users = []
    # Get Latest Post
    driver.get(getPosts()[post])
    time.sleep(2)
    # Open Dialog
    followersLinkX = driver.find_element_by_xpath('//button[@class="sqdOP yWX7d _8A5w5 "]')
    followersLinkX.click()
    time.sleep(1)
    # Get Dialog
    xxx = driver.find_element_by_xpath('//div[@role="dialog"]/div[1]/div[2]/div[1]/div[1]')
    # Focus on and Scroll
    xxx.click()
    # step 3
    actionChain = webdriver.ActionChains(driver)
    count = 0
    while count < limit:
        for i in range(1, 1000):
            try:
                users.append("https://www.instagram.com/" + driver.find_element_by_xpath('//div[@role="dialog"]/div[1]/div[2]/div[1]/div[1]/div[' + str(i) + ']/div[2]/div[1]/div[1]').text)
                count += 1
            except:
                break
        actionChain.key_down(Keys.SPACE).key_up(Keys.SPACE).perform()
        time.sleep(0.5)
    return users
To run it: likers = getLikers("deirvlon", 100, 1)
