Adding to a list from a for loop - Python

I got most of my code working but have a lingering question. This is not my full code below; in the interest of readability I selected only a portion. I'm scraping a list of URLs from a web page (into imgs2) and then scraping info from each of those URLs. I would like to create a second list of URLs based on the results gathered from the first (see img_url2 below). What happens is that instead of appending every new URL to the list, it just replaces the previous one with the new one. Any idea how to have all of them added to the list?
driver.get("https://superrare.com/market?market-options=%257B%2522first%2522%3A30%2C%2522orderBy%2522%3A%2522RECENT_NFT_EVENT_BY_TOKEN_CONTRACT_ADDRESS_AND_TOKEN_ID__TIMESTAMP_DESC%2522%2C%2522fileTypes%2522%3A%255B%2522image%2Fjpeg%2522%2C%2522image%2Fpng%2522%255D%2C%2522listPrice%2522%3Afalse%2C%2522isGenesis%2522%3Afalse%2C%2522isSeries%2522%3Afalse%2C%2522neverReceivedOffer%2522%3Afalse%2C%2522reservePrice%2522%3Afalse%2C%2522liveAuctions%2522%3Afalse%2C%2522upcomingAuctions%2522%3Afalse%2C%2522hasSold%2522%3Afalse%2C%2522ownedByCreator%2522%3Afalse%2C%2522openOffers%2522%3Afalse%2C%2522artistsCollected%2522%3Afalse%2C%2522artistsYouFollow%2522%3Afalse%2C%2522artistsThatFollowYou%2522%3Afalse%2C%2522artistsFollowedByFollowed%2522%3Afalse%2C%2522lowerPriceRange%2522%3A0%2C%2522upperPriceRange%2522%3A100000%2C%2522numCreatorSales%2522%3Afalse%2C%2522lowerMintedRange%2522%3Anull%2C%2522upperMintedRange%2522%3Anull%2C%2522startCursor%2522%3A%2522WyJyZWNlbnRfbmZ0X2V2ZW50X2J5X3Rva2VuX2NvbnRyYWN0X2FkZHJlc3NfYW5kX3Rva2VuX2lkX190aW1lc3RhbXBfZGVzYyIsWyIyMDIyLTAyLTE3VDE0OjExOjMyKzAwOjAwIiwiMHhiOTMyYTcwYTU3NjczZDg5ZjRhY2ZmYmU4MzBlOGVkN2Y3NWZiOWUwIiwxNzYzMF1d%2522%2C%2522endCursor%2522%3A%2522WyJyZWNlbnRfbmZ0X2V2ZW50X2J5X3Rva2VuX2NvbnRyYWN0X2FkZHJlc3NfYW5kX3Rva2VuX2lkX190aW1lc3RhbXBfZGVzYyIsWyIyMDIyLTAyLTE2VDIwOjMxOjUxKzAwOjAwIiwiMHg0MjQyMzk5YzE2Yjc4MzgxOTZlZDMzZjE3OWU5OWUzZjk5Yjg4NGYyIiwzXV0%3D%2522%2C%2522lastEndCursor%2522%3A%2522WyJyZWNlbnRfbmZ0X2V2ZW50X2J5X3Rva2VuX2NvbnRyYWN0X2FkZHJlc3NfYW5kX3Rva2VuX2lkX190aW1lc3RhbXBfZGVzYyIsWyIyMDIyLTAyLTE3VDE0OjMwOjI3KzAwOjAwIiwiMHhiOTMyYTcwYTU3NjczZDg5ZjRhY2ZmYmU4MzBlOGVkN2Y3NWZiOWUwIiwyNzgxNl1d%2522%2C%2522lastStartCursor%2522%3Afalse%2C%2522hasPreviousPage%2522%3Atrue%2C%2522hasNextPage%2522%3Atrue%2C%2522reverse%2522%3Afalse%257D")
imgs2 = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@class,'Name-sc-7kf6vz-3')]")))
time.sleep(5)
for i in range(0, 30):
    img_url = []
    for number, item in enumerate(imgs2, 1):
        imgwors2 = item.get_attribute("href")
        driver3 = webdriver.Chrome()
        driver3.get(imgwors2)

        def check_exists_by_xpath(xpath):
            try:
                WebDriverWait(driver3, 55).until(EC.presence_of_all_elements_located((By.XPATH, xpath)))
            except TimeoutException:
                return False
            return True

        if check_exists_by_xpath("//h1[@class='collectible-detail__collectible-name']"):
            imgsrc4 = WebDriverWait(driver3, 65).until(EC.presence_of_all_elements_located((By.XPATH, "//h1[contains(@class,'collectible-detail__collectible-name')]")))
            for i in imgsrc4:
                title = i.text
        else:
            title = "none"
        print(title)
        img_url2 = []
        imgsrc2 = WebDriverWait(driver3, 55).until(EC.presence_of_all_elements_located((By.XPATH, "//p[@data-testid='artistName']/ancestor::a[contains(@class,'ChildrenLink')]")))
        for i in imgsrc2:
            biourl = i.get_attribute("href")
            img_url2.append(biourl)
        print(img_url2)
        driver.close()

I think, from your description and code, that the variable img_url2 should be initialized once, before the for loop(s), and printed after them; re-initializing it inside the loop throws away everything collected on the previous pass.
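As a minimal, self-contained illustration of the difference (the names below are made up for illustration, not taken from your scraper):

results = []                      # created once, before the loops
for page in range(3):
    # results = []                # if this were inside the loop, each page would wipe the list
    for item in ("a", "b"):
        results.append(f"{page}-{item}")
print(results)                    # ['0-a', '0-b', '1-a', '1-b', '2-a', '2-b']

Applied to your script, that looks like this: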
driver.get("https://superrare.com/market?market-options=%257B%2522first%2522%3A30%2C%2522orderBy%2522%3A%2522RECENT_NFT_EVENT_BY_TOKEN_CONTRACT_ADDRESS_AND_TOKEN_ID__TIMESTAMP_DESC%2522%2C%2522fileTypes%2522%3A%255B%2522image%2Fjpeg%2522%2C%2522image%2Fpng%2522%255D%2C%2522listPrice%2522%3Afalse%2C%2522isGenesis%2522%3Afalse%2C%2522isSeries%2522%3Afalse%2C%2522neverReceivedOffer%2522%3Afalse%2C%2522reservePrice%2522%3Afalse%2C%2522liveAuctions%2522%3Afalse%2C%2522upcomingAuctions%2522%3Afalse%2C%2522hasSold%2522%3Afalse%2C%2522ownedByCreator%2522%3Afalse%2C%2522openOffers%2522%3Afalse%2C%2522artistsCollected%2522%3Afalse%2C%2522artistsYouFollow%2522%3Afalse%2C%2522artistsThatFollowYou%2522%3Afalse%2C%2522artistsFollowedByFollowed%2522%3Afalse%2C%2522lowerPriceRange%2522%3A0%2C%2522upperPriceRange%2522%3A100000%2C%2522numCreatorSales%2522%3Afalse%2C%2522lowerMintedRange%2522%3Anull%2C%2522upperMintedRange%2522%3Anull%2C%2522startCursor%2522%3A%2522WyJyZWNlbnRfbmZ0X2V2ZW50X2J5X3Rva2VuX2NvbnRyYWN0X2FkZHJlc3NfYW5kX3Rva2VuX2lkX190aW1lc3RhbXBfZGVzYyIsWyIyMDIyLTAyLTE3VDE0OjExOjMyKzAwOjAwIiwiMHhiOTMyYTcwYTU3NjczZDg5ZjRhY2ZmYmU4MzBlOGVkN2Y3NWZiOWUwIiwxNzYzMF1d%2522%2C%2522endCursor%2522%3A%2522WyJyZWNlbnRfbmZ0X2V2ZW50X2J5X3Rva2VuX2NvbnRyYWN0X2FkZHJlc3NfYW5kX3Rva2VuX2lkX190aW1lc3RhbXBfZGVzYyIsWyIyMDIyLTAyLTE2VDIwOjMxOjUxKzAwOjAwIiwiMHg0MjQyMzk5YzE2Yjc4MzgxOTZlZDMzZjE3OWU5OWUzZjk5Yjg4NGYyIiwzXV0%3D%2522%2C%2522lastEndCursor%2522%3A%2522WyJyZWNlbnRfbmZ0X2V2ZW50X2J5X3Rva2VuX2NvbnRyYWN0X2FkZHJlc3NfYW5kX3Rva2VuX2lkX190aW1lc3RhbXBfZGVzYyIsWyIyMDIyLTAyLTE3VDE0OjMwOjI3KzAwOjAwIiwiMHhiOTMyYTcwYTU3NjczZDg5ZjRhY2ZmYmU4MzBlOGVkN2Y3NWZiOWUwIiwyNzgxNl1d%2522%2C%2522lastStartCursor%2522%3Afalse%2C%2522hasPreviousPage%2522%3Atrue%2C%2522hasNextPage%2522%3Atrue%2C%2522reverse%2522%3Afalse%257D")
imgs2 = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@class,'Name-sc-7kf6vz-3')]")))
time.sleep(5)
img_url2 = []  # <--- moved before the loop
for i in range(0, 30):
    for number, item in enumerate(imgs2, 1):
        imgwors2 = item.get_attribute("href")
        driver3 = webdriver.Chrome()
        driver3.get(imgwors2)

        def check_exists_by_xpath(xpath):
            try:
                WebDriverWait(driver3, 55).until(EC.presence_of_all_elements_located((By.XPATH, xpath)))
            except TimeoutException:
                return False
            return True

        if check_exists_by_xpath("//h1[@class='collectible-detail__collectible-name']"):
            imgsrc4 = WebDriverWait(driver3, 65).until(EC.presence_of_all_elements_located((By.XPATH, "//h1[contains(@class,'collectible-detail__collectible-name')]")))
            for i in imgsrc4:
                title = i.text
        else:
            title = "none"
        print(title)
        imgsrc2 = WebDriverWait(driver3, 55).until(EC.presence_of_all_elements_located((By.XPATH, "//p[@data-testid='artistName']/ancestor::a[contains(@class,'ChildrenLink')]")))
        for i in imgsrc2:
            biourl = i.get_attribute("href")
            img_url2.append(biourl)
        driver.close()
print(img_url2)  # <--- moved below the loop

Related

Duplicates in output (CSV) from web scraping - Python

I'm trying to scrape LinkedIn for job listings. Unfortunately, after each run I'm getting the same line repeatedly instead of all the listings. Would anyone know why this might be? I'm fairly new to web scrapers. I'm not sure if it's my loop that's causing the same result to repeat or if I'm exporting to CSV incorrectly.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import pandas as pd
import csv

job_name = "Data Analyst"
country_name = "United States"

job_url = ""
for item in job_name.split(" "):
    if item != job_name.split(" ")[-1]:
        job_url = job_url + item + "%20"
    else:
        job_url = job_url + item

country_url = ""
for item in country_name.split(" "):
    if item != country_name.split(" ")[-1]:
        country_url = country_url + item + "%20"
    else:
        country_url = country_url + item

url = "https://www.linkedin.com/jobs/search?keywords=Data%20Analyst&location=United%20States&geoId=103644278&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0"
url.format(job_url, country_url)

# Creating a webdriver instance
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
# Change the path to match the location of your "chromedriver" file
# driver = webdriver.Chrome("/home/im-admin/.scripts/Py/chromedriver")

# Opening the url we have just defined in our browser
driver.get(url)

# We find how many jobs are offered.
jobs_num = driver.find_element(By.CSS_SELECTOR, "h1>span").get_attribute("innerText")
if len(jobs_num.split(',')) > 1:
    jobs_num = int(jobs_num.split(',')[0]) * 2
else:
    jobs_num = int(jobs_num)
jobs_num = int(jobs_num)
# We create a while loop to browse all jobs.
i = 2
while i <= int(jobs_num / 2) + 1:
    # We keep scrolling down to the end of the view.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    i = i + 1
    print("Current at: ", i, "Percentage at: ", ((i + 1) / (int(jobs_num / 2) + 1)) * 100, "%", end="\r")
    try:
        # We try to click on the "load more results" button in case it is already displayed.
        infinite_scroller_button = driver.find_element(By.XPATH, ".//button[@aria-label='Load more results']")
        infinite_scroller_button.click()
        time.sleep(0.1)
    except:
        # If there is no button, there will be an error, so we keep scrolling down.
        time.sleep(0.1)
        pass
# We get a list containing all jobs that we have found.
job_lists = driver.find_element(By.CLASS_NAME, "jobs-search__results-list")
jobs = job_lists.find_elements(By.TAG_NAME, "li")  # returns a list

# We declare empty lists to keep track of all obtained data.
job_title_list = []
company_name_list = []
location_list = []
date_list = []
job_link_list = []

# We loop over every job and obtain all the wanted info.
for job in jobs:
    # job_title
    job_title = job.find_element(By.CSS_SELECTOR, "h3").get_attribute("innerText")
    job_title_list.append(job_title)
    # company_name
    company_name = job.find_element(By.CSS_SELECTOR, "h4").get_attribute("innerText")
    company_name_list.append(company_name)
    # location
    location = job.find_element(By.CSS_SELECTOR, "div>div>span").get_attribute("innerText")
    location_list.append(location)
    # date
    date = job.find_element(By.CSS_SELECTOR, "div>div>time").get_attribute("datetime")
    date_list.append(date)
    # job_link
    job_link = job.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
    job_link_list.append(job_link)
jd = []  # job_description
seniority = []
emp_type = []
job_func = []
job_ind = []

for item in range(len(jobs)):
    print(item)
    job_func0 = []
    industries0 = []
    # clicking job to view job details
    # __________________________________________________________________________ JOB Link
    try:
        job_click_path = f'/html/body/div/div/main/section/ul/li[{item+1}]'
        job_click = job.find_element(By.XPATH, job_click_path).click()
    except:
        pass
    # job_click = job.find_element(By.XPATH, './/a[@class="base-card_full-link"]')
    # __________________________________________________________________________ JOB Description
    jd_path = '/html/body/div/div/section/div/div/section/div/div/section/div'
    try:
        jd0 = job.find_element(By.XPATH, jd_path).get_attribute('innerText')
        jd.append(jd0)
    except:
        jd.append(None)
        pass
    # __________________________________________________________________________ JOB Seniority
    seniority_path = '/html/body/div/div/section/div/div/section/div/ul/li[1]/span'
    try:
        seniority0 = job.find_element(By.XPATH, seniority_path).get_attribute('innerText')
        seniority.append(seniority0)
    except:
        seniority.append(None)
        pass
    # __________________________________________________________________________ JOB Time
    emp_type_path = '/html/body/div/div/section/div/div/section/div/ul/li[2]/span'
    try:
        emp_type0 = job.find_element(By.XPATH, emp_type_path).get_attribute('innerText')
        emp_type.append(emp_type0)
    except:
        emp_type.append(None)
        pass
    # __________________________________________________________________________ JOB Function
    function_path = '/html/body/div/div/section/div/div/section/div/ul/li[3]/span'
    try:
        func0 = job.find_element(By.XPATH, function_path).get_attribute('innerText')
        job_func.append(func0)
    except:
        job_func.append(None)
        pass
    # __________________________________________________________________________ JOB Industry
    industry_path = '/html/body/div/div/section/div/div/section/div/ul/li[4]/span'
    try:
        ind0 = job.find_element(By.XPATH, industry_path).get_attribute('innerText')
        job_ind.append(ind0)
    except:
        job_ind.append(None)
        pass
    print("Current at: ", item, "Percentage at: ", (item + 1) / len(jobs) * 100, "%")
job_data = pd.DataFrame({
    'Date': date,
    'Company': company_name,
    'Title': job_title,
    'Location': location,
    'Description': jd,
    'Level': seniority,
    'Type': emp_type,
    'Function': job_func,
    'Industry': job_ind,
    'Link': job_link
})

# Change the path to jobdata.csv if you want it to output to a different folder.
# See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html#
job_data.to_csv('jobdata.csv', encoding='utf-8', index=False)
This is my output
Date,Company,Title,Location,Description,Level,Type,Function,Industry,Link
2022-10-14,LHH,Data Analyst,"McLean, VA",,,,,,https://www.linkedin.com/jobs/view/data-analyst-at-lhh-3311865718?refId=pAkR2FDOYi8W2HOa%2FLgpiw%3D%3D&trackingId=5%2FX7p1W7L0eCE4XtpbzcEQ%3D%3D&position=23&pageNum=2&trk=public_jobs_jserp-result_search-card
2022-10-14,LHH,Data Analyst,"McLean, VA",,,,,,https://www.linkedin.com/jobs/view/data-analyst-at-lhh-3311865718?refId=pAkR2FDOYi8W2HOa%2FLgpiw%3D%3D&trackingId=5%2FX7p1W7L0eCE4XtpbzcEQ%3D%3D&position=23&pageNum=2&trk=public_jobs_jserp-result_search-card
2022-10-14,LHH,Data Analyst,"McLean, VA",,,,,,https://www.linkedin.com/jobs/view/data-analyst-at-lhh-3311865718?refId=pAkR2FDOYi8W2HOa%2FLgpiw%3D%3D&trackingId=5%2FX7p1W7L0eCE4XtpbzcEQ%3D%3D&position=23&pageNum=2&trk=public_jobs_jserp-result_search-card
I've tried printing the pandas DataFrame directly, with no success.
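Not a definitive answer, but one thing stands out in the code above: the DataFrame is built from the scalar loop variables (date, company_name, job_title, location, job_link), which only hold the values from the last card scraped, so pandas broadcasts those single values across every row, while the *_list lists appended to in the first loop are never used. A hedged sketch of building the frame from those lists instead, assuming they are what you meant to export:

# sketch only: use the collected lists so each row corresponds to one job card
job_data = pd.DataFrame({
    'Date': date_list,
    'Company': company_name_list,
    'Title': job_title_list,
    'Location': location_list,
    'Description': jd,
    'Level': seniority,
    'Type': emp_type,
    'Function': job_func,
    'Industry': job_ind,
    'Link': job_link_list
})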

How to iterate a variable in XPATH, extract a link and store it into a list for further iteration

I'm following a Selenium tutorial for an Amazon price tracker (Clever Programming on YouTube) and I got stuck at getting the links from Amazon using their techniques.
tutorial link: https://www.youtube.com/watch?v=WbJeL_Av2-Q&t=4315s
I realized the problem lies in the fact that I'm only getting one link out of the 17 available after doing the product search. I need to get the links for every product after doing a search and then use them to visit each product and get its title, seller, and price.
The function get_products_links() should get all the links and store them in a list to be used by the function get_products_info().
def get_products_links(self):
    self.driver.get(self.base_url)  # Go to amazon.com using BASE_URL
    element = self.driver.find_element_by_id('twotabsearchtextbox')
    element.send_keys(self.search_term)
    element.send_keys(Keys.ENTER)
    time.sleep(2)  # Wait to load page
    self.driver.get(f'{self.driver.current_url}{self.price_filter}')
    time.sleep(2)  # Wait to load page
    result_list = self.driver.find_elements_by_class_name('s-result-list')
    links = []
    try:
        ### Trying to get a list of XPath link attributes ###
        ### Only numbers from 3 to 17 work after doing the product search where 'i' is placed in the XPath ###
        i = 3
        results = result_list[0].find_elements_by_xpath(
            f'//*[@id="search"]/div[1]/div[1]/div/span[3]/div[2]/div[{i}]/div/div/div/div/div/div[1]/div/div[2]/div/span/a')
        links = [link.get_attribute('href') for link in results]
        return links
    except Exception as e:
        print("Didn't get any products...")
        print(e)
        return links
At this point get_products_links() only returns one link since I just made 'i' a fixed value of 3 to make it work for now.
I was thinking of iterating 'i' somehow so I can save every different path, but I don't know how to implement this.
I've tried performing a for loop and appending the results to a new list, but then the app stops working.
Here is the complete code:
from amazon_config import (
    get_web_driver_options,
    get_chrome_web_driver,
    set_browser_as_incognito,
    set_ignore_certificate_error,
    NAME,
    CURRENCY,
    FILTERS,
    BASE_URL,
    DIRECTORY
)
import time
from selenium.webdriver.common.keys import Keys


class GenerateReport:
    def __init__(self):
        pass


class AmazonAPI:
    def __init__(self, search_term, filters, base_url, currency):
        self.base_url = base_url
        self.search_term = search_term
        options = get_web_driver_options()
        set_ignore_certificate_error(options)
        set_browser_as_incognito(options)
        self.driver = get_chrome_web_driver(options)
        self.currency = currency
        self.price_filter = f"&rh=p_36%3A{filters['min']}00-{filters['max']}00"
    def run(self):
        print("Starting script...")
        print(f"Looking for {self.search_term} products...")
        links = self.get_products_links()
        time.sleep(1)
        if not links:
            print("Stopped script.")
            return
        print(f"Got {len(links)} links to products...")
        print("Getting info about products...")
        products = self.get_products_info(links)
        # self.driver.quit()

    def get_products_info(self, links):
        asins = self.get_asins(links)
        product = []
        for asin in asins:
            product = self.get_single_product_info(asin)

    def get_single_product_info(self, asin):
        print(f"Product ID: {asin} - getting data...")
        product_short_url = self.shorten_url(asin)
        self.driver.get(f'{product_short_url}?language=en_GB')
        time.sleep(2)
        title = self.get_title()
        seller = self.get_seller()
        price = self.get_price()

    def get_title(self):
        try:
            return self.driver.find_element_by_id('productTitle')
        except Exception as e:
            print(e)
            print(f"Can't get title of a product - {self.driver.current_url}")
            return None

    def get_seller(self):
        try:
            return self.driver.find_element_by_id('bylineInfo')
        except Exception as e:
            print(e)
            print(f"Can't get title of a product - {self.driver.current_url}")
            return None

    def get_price(self):
        return '$99'

    def shorten_url(self, asin):
        return self.base_url + 'dp/' + asin

    def get_asins(self, links):
        return [self.get_asin(link) for link in links]

    def get_asin(self, product_link):
        return product_link[product_link.find('/dp/') + 4:product_link.find('/ref')]
    def get_products_links(self):
        self.driver.get(self.base_url)  # Go to amazon.com using BASE_URL
        element = self.driver.find_element_by_id('twotabsearchtextbox')
        element.send_keys(self.search_term)
        element.send_keys(Keys.ENTER)
        time.sleep(2)  # Wait to load page
        self.driver.get(f'{self.driver.current_url}{self.price_filter}')
        time.sleep(2)  # Wait to load page
        result_list = self.driver.find_elements_by_class_name('s-result-list')
        links = []
        try:
            ### Trying to get a list of XPath link attributes ###
            ### Only numbers from 3 to 17 work after doing the product search where 'i' is placed ###
            i = 3
            results = result_list[0].find_elements_by_xpath(
                f'//*[@id="search"]/div[1]/div[1]/div/span[3]/div[2]/div[{i}]/div/div/div/div/div/div[1]/div/div[2]/div/span/a')
            links = [link.get_attribute('href') for link in results]
            return links
        except Exception as e:
            print("Didn't get any products...")
            print(e)
            return links


if __name__ == '__main__':
    print("HEY!!!🚀🔥")
    amazon = AmazonAPI(NAME, FILTERS, BASE_URL, CURRENCY)
    amazon.run()
Steps to Run the script:
Step 1:
install Selenium==3.141.0 into your virtual environment
Step 2:
Search for ChromeDriver on Google and download the driver that matches your Chrome version. After downloading, extract the driver and paste it into your working folder.
Step 3:
create a file called amazon_config.py and insert the following code:
from selenium import webdriver

DIRECTORY = 'reports'
NAME = 'PS4'
CURRENCY = '$'
MIN_PRICE = '275'
MAX_PRICE = '650'
FILTERS = {
    'min': MIN_PRICE,
    'max': MAX_PRICE
}
BASE_URL = "https://www.amazon.com/"


def get_chrome_web_driver(options):
    return webdriver.Chrome('./chromedriver', chrome_options=options)


def get_web_driver_options():
    return webdriver.ChromeOptions()


def set_ignore_certificate_error(options):
    options.add_argument('--ignore-certificate-errors')


def set_browser_as_incognito(options):
    options.add_argument('--incognito')
If you performed the steps correctly you should be able to run the script and it will perform the following:
Go to www.amazon.com
Search for a product (In this case "PS4")
Get a link for the first product
Visit that product link
Terminal should print:
HEY!!!🚀🔥
Starting script...
Looking for PS4 products...
Got 1 links to products...
Getting info about products...
Product ID: B012CZ41ZA - getting data...
What I'm not able to do is get all the links and iterate over them so the script visits every link on the first page.
If you are able to get all links, the terminal should print:
HEY!!!🚀🔥
Starting script...
Looking for PS4 products...
Got 1 links to products...
Getting info about products...
Product ID: B012CZ41ZA - getting data...
Product ID: XXXXXXXXXX - getting data...
Product ID: XXXXXXXXXX - getting data...
Product ID: XXXXXXXXXX - getting data...
# and so on until all links are visited
I can't run it, so I can only guess how I would do it.
I would put the whole try/except in a for loop, use links.append() instead of links = [...], and use return after exiting the loop:
# --- before loop ---
links = []

# --- loop ---
for i in range(3, 18):
    try:
        results = result_list[0].find_elements_by_xpath(
            f'//*[@id="search"]/div[1]/div[1]/div/span[3]/div[2]/div[{i}]/div/div/div/div/div/div[1]/div/div[2]/div/span/a')
        for link in results:
            links.append(link.get_attribute('href'))
    except Exception as e:
        print(f"Didn't get any products... (i = {i})")
        print(e)

# --- after loop ---
return links
But I would also try to use an XPath with // to skip most of the divs - and maybe, if I skipped div[{i}] entirely, I could get all the products without the for loop.
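For example, something along these lines might work (a rough sketch only; the data-component-type attribute below is an assumption about Amazon's result markup, not something verified here):

# hedged sketch: match every search-result link in one relative query
# instead of indexing div[{i}] one position at a time
results = result_list[0].find_elements_by_xpath(
    '//div[@data-component-type="s-search-result"]//h2/a')
links = [link.get_attribute('href') for link in results]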
BTW:
In get_products_info() I see a similar problem - you create an empty list product = [], but later in the loop you assign a value with product = ..., so you discard the previous value of product. It needs product.append() to keep all the values.
Something like:
def get_products_info(self, links):
    # --- before loop ---
    asins = self.get_asins(links)
    product = []
    # --- loop ---
    for asin in asins:
        product.append(self.get_single_product_info(asin))
    # --- after loop ---
    return product

Selenium: stop loop once element class is disabled

Currently trying to loop through all the pages on this website:
https://ephisahs.microsoftcrmportals.com/disclaimer/restaurantinspections/south-facilities/
When it reaches page 53 (the last page), it continues to loop even though there are no more pages. How can I make the loop stop? I notice that the element class "disabled" appears on the last page.
Here is my code so far:
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.implicitly_wait(10)
driver.get('https://ephisahs.microsoftcrmportals.com/disclaimer/restaurantinspections/south-facilities/')

dfs = []
page_counter = 0
while True:
    wait = WebDriverWait(driver, 30)
    wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//tr[@data-name]")))
    cards = driver.find_elements_by_xpath("//tr[@data-name]")
    facilities = []
    for card in cards:
        name = card.find_element_by_xpath(".//td[@data-th='Unit Name']").text
        street1 = card.find_element_by_xpath(".//td[@data-th='Site Street 1']").text
        street2 = card.find_element_by_xpath(".//td[@data-th='Site Street 2']").text
        site_city = card.find_element_by_xpath(".//td[@data-th='Site City']").text
        site_prov = card.find_element_by_xpath(".//td[@data-th='Site Province/State']").text
        site_code = card.find_element_by_xpath(".//td[@data-th='Site Postal Code/Zip Code']").text
        site_fac = card.find_element_by_xpath(".//td[@data-th='Facility Category']").text
        site_inspection = card.find_element_by_xpath(".//td[@data-th='Inspections Completed']").text
        ref_link = card.find_element_by_xpath(".//td//a").get_attribute("href")
        facilities.append([name, street1, street2, site_city, site_prov, site_code, site_fac, site_inspection, ref_link])
    df = pd.DataFrame(facilities)
    dfs.append(df)
    print(page_counter)
    page_counter += 1
    try:
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a[aria-label='Next page']"))).click()
    except:
        break
driver.close()
driver.quit()
You can simply check the class of the li element, as mentioned in the docs:
is_disabled = "disabled" in element.get_attribute("class")
if is_disabled:
    break

is_active = "active" in target_element.get_attribute("class")
https://selenium-python.readthedocs.io/api.html#selenium.webdriver.remote.webelement.WebElement.get_attribute
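A sketch of how that check could be wired into the paging loop from the question (it assumes the "disabled" class ends up on the li element wrapping the Next-page link; the portal's exact markup may differ):

# hedged sketch: stop paging once the Next-page control's parent <li> is disabled
next_li = driver.find_element_by_xpath("//a[@aria-label='Next page']/parent::li")
if "disabled" in next_li.get_attribute("class"):
    break
next_li.find_element_by_tag_name("a").click()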

Getting list of likers for an instagram post - Python & Selenium

I'm learning web crawling. To practice, I've challenged myself to get the list of all the people who liked a post on Instagram.
My problem is that I'm stuck at the point where I only get the first 11 usernames of likers. I can't find the right way to automate the scrolling process while collecting the likes.
Here is my process in Jupyter Notebook (it doesn't work as a script yet):
from selenium import webdriver
import pandas as pd

driver = webdriver.Chrome()
driver.get('https://www.instagram.com/p/BuE82VfHRa6/')
userid_element = driver.find_elements_by_xpath('//*[@id="react-root"]/section/main/div/div/article/div[2]/section[2]/div/div/a')[0].click()
elems = driver.find_elements_by_xpath("//*[@id]/div/a")

users = []
for elem in elems:
    users.append(elem.get_attribute('title'))
print(users)
Do you guys have any idea?
Many thanks
I guess the Instagram site keeps a maximum of 17 liked-user elements loaded at once.
So, one iteration of the loop is:
get the list of elements from the page
save them to my list
scroll down to load new elements
check: was this the last scroll (no new elements)?
driver.get('https://www.instagram.com/p/BuE82VfHRa6/')
userid_element = driver.find_elements_by_xpath('//*[@id="react-root"]/section/main/div/div/article/div[2]/section[2]/div/div/a')[0].click()
time.sleep(2)

# Here you can see the user list you want.
# You have to scroll down to download more data from the Instagram server.
# Loop until the last element, using the user list view's height value.
users = []
height = driver.find_element_by_xpath("/html/body/div[3]/div/div[2]/div/div").value_of_css_property("padding-top")
match = False
while match == False:
    lastHeight = height
    # step 1
    elements = driver.find_elements_by_xpath("//*[@id]/div/a")
    # step 2
    for element in elements:
        if element.get_attribute('title') not in users:
            users.append(element.get_attribute('title'))
    # step 3
    driver.execute_script("return arguments[0].scrollIntoView();", elements[-1])
    time.sleep(1)
    # step 4
    height = driver.find_element_by_xpath("/html/body/div[3]/div/div[2]/div/div").value_of_css_property("padding-top")
    if lastHeight == height:
        match = True

print(users)
print(len(users))
driver.quit()
I tested it on posts with nearly 100 likes, and it worked.
Please try the following code and let me know if this works.
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://www.instagram.com/p/BuE82VfHRa6/')
elems = driver.find_elements_by_xpath("//a[@class='FPmhX notranslate TlrDj']")

users = []
for elem in elems:
    users.append(elem.get_attribute('title'))
    print('Title : ' + elem.get_attribute('title'))
print(users)
Output:
Title : kyliejenner
Title : saturdayshade28
Title : worldmeetzboy
Title : mrokon
Title : addieisaac
Title : addieisaac
Title : amber_doerksen
Title : amber_doerksen
Title : addieisaac
Title : zayn6117
Title : amber_doerksen
Title : amber_doerksen
Title : worldmeetzboy
Title : worldmeetzboy
Title : razvanpopic1301
Title : johanna.trmn
Title : johanna.trmn
Title : johanna.trmn
Title : americ.av
Title : gabriellcostta1.0
Title : gabriellcostta1.0
Title : gabriellcostta1.0
Title : worldmeetzboy
Title : enactusepi
Title : enactusepi
[u'kyliejenner', u'saturdayshade28', u'worldmeetzboy', u'mrokon', u'addieisaac', u'addieisaac', u'amber_doerksen', u'amber_doerksen', u'addieisaac', u'zayn6117', u'amber_doerksen', u'amber_doerksen', u'worldmeetzboy', u'worldmeetzboy', u'razvanpopic1301', u'johanna.trmn', u'johanna.trmn', u'johanna.trmn', u'americ.av', u'gabriellcostta1.0', u'gabriellcostta1.0', u'gabriellcostta1.0', u'worldmeetzboy', u'enactusepi', u'enactusepi']
I wasn't able to get the code to work as posted in predicty's answer. Therefore I made the adaptation below, and it now gets me ~500 likers per post.
def get_post_likers(shortcode):
    chrome = ch.initialize()
    chrome.get('https://www.instagram.com/p/' + shortcode + '/')
    chrome.execute_script("window.scrollTo(0, 1080)")
    url = "/p/" + shortcode + "/liked_by/"
    time.sleep(2)
    like_link = chrome.find_element_by_xpath('//a[@href="' + url + '"]')
    like_link.click()
    time.sleep(2)
    users = []
    pb = chrome.find_element_by_xpath("//div[@role = 'dialog']/div[2]/div[1]/div[1]").value_of_css_property("padding-bottom")
    match = False
    while match == False:
        lastHeight = pb
        # step 1
        elements = chrome.find_elements_by_xpath("//*[@id]/div/a")
        # step 2
        for element in elements:
            if element.get_attribute('title') not in users:
                users.append(element.get_attribute('title'))
        # step 3
        chrome.execute_script("return arguments[0].scrollIntoView();", elements[-1])
        time.sleep(1)
        # step 4
        pb = chrome.find_element_by_xpath("//div[@role = 'dialog']/div[2]/div[1]/div[1]").value_of_css_property("padding-bottom")
        if lastHeight == pb or len(users) >= 1500:
            match = True
    return users
This worked for me:
driver.get('https://www.instagram.com/p/BuE82VfHRa6/')
time.sleep(2)
userid_element = driver.find_element_by_xpath('//*[@id="react-root"]/section/main/div/div[1]/article/div[3]/section[2]/div/div[2]/button').click()
time.sleep(2)
elems = driver.find_elements_by_xpath("//a[@class='FPmhX notranslate TlrDj']")

users = []
for i in range(10):
    i += 1
    if (i % 10) == 9:
        driver.find_element_by_xpath('/html/body/div[4]/div/div/div[2]/div').click()
        actionChain.key_down(Keys.SPACE).key_up(Keys.SPACE).perform()
    print('/html/body/div[4]/div/div/div[2]/div/div/div[' + str(i) + ']/div[2]/div[1]/div/a')
    Title = driver.find_element_by_xpath('/html/body/div[4]/div/div/div[2]/div/div/div[' + str(i) + ']/div[2]/div[1]/div/a').get_attribute('title')
    users.append(Title)
    print('Title : ' + Title)
print(users)
I tried all the solutions above, but none of them are working. I think they are outdated.
Instead, I wrote my own. It works perfectly in 2020.
This code goes to the "username" profile page, takes the latest post in the profile, and gets the users who liked it.
def getPosts():
    hrefs_in_view = driver.find_elements_by_tag_name('a')
    # finding relevant hrefs
    hrefs_in_view = [elem.get_attribute('href') for elem in hrefs_in_view
                     if '.com/p/' in elem.get_attribute('href')]
    return hrefs_in_view


def getLikers(username, limit, post=1):
    driver.get('https://www.instagram.com/' + username)
    time.sleep(1)
    users = []
    # Get latest post
    driver.get(getPosts()[post])
    time.sleep(2)
    # Open dialog
    followersLinkX = driver.find_element_by_xpath('//button[@class="sqdOP yWX7d _8A5w5 "]')
    followersLinkX.click()
    time.sleep(1)
    # Get dialog
    xxx = driver.find_element_by_xpath('//div[@role="dialog"]/div[1]/div[2]/div[1]/div[1]')
    # Focus on it and scroll
    xxx.click()
    # step 3
    actionChain = webdriver.ActionChains(driver)
    count = 0
    while count < limit:
        for i in range(1, 1000):
            try:
                users.append("https://www.instagram.com/" + driver.find_element_by_xpath('//div[@role="dialog"]/div[1]/div[2]/div[1]/div[1]/div[' + str(i) + ']/div[2]/div[1]/div[1]').text)
                count += 1
            except:
                break
        actionChain.key_down(Keys.SPACE).key_up(Keys.SPACE).perform()
        time.sleep(0.5)
    return users
To run it: likers = getLikers("deirvlon", 100, 1)

Remove an element in a container using selenium

I only want to scrape the required information contained in the black box, and delete/remove/exclude the information contained in the red box
I am doing this because class names "entry" and "partial entry" exist in both boxes. Only the first "partial entry" contains the information that I need, so I plan to delete/remove/exclude the classname "mgrRspnInLine".
My code is:
while True:
    container = driver.find_elements_by_xpath('.//*[contains(@class,"review-container")]')
    for item in container:
        try:
            element = item.find_element_by_class_name('mgrRspnInline')
            driver.execute_script("""var element = document.getElementsByClassName("mgrRspnInline")[0];element.parentNode.removeChild(element);""", element)
            WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH, './/*[contains(@class,"taLnk ulBlueLinks")]')))
            element = WebDriverWait(driver, 50).until(EC.element_to_be_clickable((By.XPATH, './/*[contains(@class,"taLnk ulBlueLinks")]')))
            element.click()
            time.sleep(2)
            rating = item.find_elements_by_xpath('.//*[contains(@class,"ui_bubble_rating bubble_")]')
            for rate in rating:
                rate = rate.get_attribute("class")
                rate = str(rate)
                rate = rate[-2:]
                score_list.append(rate)
            time.sleep(2)
            stay = item.find_elements_by_xpath('.//*[contains(@class,"recommend-titleInline noRatings")]')
            for stayed in stay:
                stayed = stayed.text
                stayed = stayed.split(', ')
                stayed.append(stayed[0])
                travel_type.append(stayed[1])
            WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH, './/*[contains(@class,"noQuotes")]')))
            summary = item.find_elements_by_xpath('.//*[contains(@class,"noQuotes")]')
            for comment in summary:
                comment = comment.text
                comments.append(comment)
            WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH, './/*[contains(@class,"ratingDate")]')))
            rating_date = item.find_elements_by_xpath('.//*[contains(@class,"ratingDate")]')
            for date in rating_date:
                date = date.get_attribute("title")
                date = str(date)
                review_date.append(date)
            WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH, './/*[contains(@class,"partial_entry")]')))
            review = item.find_elements_by_xpath('.//*[contains(@class,"partial_entry")]')
            for comment in review:
                comment = comment.text
                print(comment)
                reviews.append(comment)
        except (NoSuchElementException) as e:
            continue
    try:
        element = WebDriverWait(driver, 100).until(EC.element_to_be_clickable((By.XPATH, './/*[contains(@class,"nav next taLnk ui_button primary")]')))
        element.click()
        time.sleep(2)
    except (ElementClickInterceptedException, NoSuchElementException) as e:
        print(e)
        break
Basically, within the "review-container" I searched first for the class name "mgrRspnInLine", then tried to delete it using execute_script.
But unfortunately, the output still shows the contents contained in the "mgrRspnInLine".
If you want to avoid matching the second element with your XPath, you can just modify the XPath as below:
.//*[contains(@class,"partial_entry") and not(ancestor::*[@class="mgrRspnInLine"])]
This will match an element with class name "partial_entry" only if it doesn't have an ancestor with class name "mgrRspnInLine".
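A hedged usage sketch, dropped into the existing per-container loop from the question (item being one review-container element), replacing the original partial_entry lookup:

# only partial_entry elements that are NOT inside a mgrRspnInLine block
review = item.find_elements_by_xpath(
    './/*[contains(@class,"partial_entry") and not(ancestor::*[@class="mgrRspnInLine"])]')
for comment in review:
    reviews.append(comment.text)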
If you want the first occurrence, you could use a CSS class selector instead:
.partial_entry
and retrieve it with find_element_by_css_selector:
find_element_by_css_selector(".partial_entry")
You can delete all the .mgrRspnInLine elements with:
driver.execute_script("[...document.querySelectorAll('.mgrRspnInLine')].map(el => el.parentNode.removeChild(el))")
Stitching together the comment by Andersson and the two answers provided by QHarr and pguardiario, I finally solved the problem.
The key is to target a container within the container: all the information is contained in the class name "ui_column is-9", which is itself contained in the class name "review-container". This addresses Andersson's comment about multiple mgrRspnInLine elements.
Within the nested loop, I used pguardiario's suggestion to delete the existing multiple mgrRspnInLine elements, then added QHarr's answer on .partial_entry:
while True:
    container = driver.find_elements_by_xpath('.//*[contains(@class,"review-container")]')
    for items in container:
        element = WebDriverWait(driver, 1000).until(EC.element_to_be_clickable((By.XPATH, './/*[contains(@class,"taLnk ulBlueLinks")]')))
        element.click()
        time.sleep(10)
        contained = items.find_elements_by_xpath('.//*[contains(@class,"ui_column is-9")]')
        for item in contained:
            try:
                driver.execute_script("[...document.querySelectorAll('.mgrRspnInLine')].map(el => el.parentNode.removeChild(el))")
                rating = item.find_element_by_xpath('//*[contains(@class,"ui_bubble_rating bubble_")]')
                rate = rating.get_attribute("class")
                rate = str(rate)
                rate = rate[-2:]
                score_list.append(rate)
                time.sleep(2)
                stay = item.find_element_by_xpath('.//*[contains(@class,"recommend-titleInline")]')
                stayed = stay.text
                stayed = stayed.split(', ')
                stayed.append(stayed[0])
                travel_type.append(stayed[1])
                WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH, './/*[contains(@class,"noQuotes")]')))
                summary = item.find_element_by_xpath('.//*[contains(@class,"noQuotes")]')
                comment = summary.text
                comments.append(comment)
                WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH, './/*[contains(@class,"ratingDate")]')))
                rating_date = item.find_element_by_xpath('.//*[contains(@class,"ratingDate")]')
                date = rating_date.get_attribute("title")
                date = str(date)
                review_date.append(date)
                WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH, './/*[contains(@class,"partial_entry")]')))
                review = item.find_element_by_css_selector(".partial_entry")
                comment = review.text
                print(comment)
            except (NoSuchElementException) as e:
                continue
    try:
        element = WebDriverWait(driver, 100).until(EC.element_to_be_clickable((By.XPATH, './/*[contains(@class,"nav next taLnk ui_button primary")]')))
        element.click()
        time.sleep(2)
    except (ElementClickInterceptedException, NoSuchElementException) as e:
        print(e)
        break
