I'm currently trying to loop through all the pages on this website:
https://ephisahs.microsoftcrmportals.com/disclaimer/restaurantinspections/south-facilities/
When it reaches page 53 (the last page), the loop keeps going even though there are no more pages. How can I make the loop stop? I noticed that an element with class="disabled" appears on the last page.
Here is my code so far:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.implicitly_wait(10)
driver.get('https://ephisahs.microsoftcrmportals.com/disclaimer/restaurantinspections/south-facilities/')

dfs = []
page_counter = 0
while True:
    wait = WebDriverWait(driver, 30)
    wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//tr[@data-name]")))
    cards = driver.find_elements_by_xpath("//tr[@data-name]")
    facilities = []
    for card in cards:
        name = card.find_element_by_xpath(".//td[@data-th='Unit Name']").text
        street1 = card.find_element_by_xpath(".//td[@data-th='Site Street 1']").text
        street2 = card.find_element_by_xpath(".//td[@data-th='Site Street 2']").text
        site_city = card.find_element_by_xpath(".//td[@data-th='Site City']").text
        site_prov = card.find_element_by_xpath(".//td[@data-th='Site Province/State']").text
        site_code = card.find_element_by_xpath(".//td[@data-th='Site Postal Code/Zip Code']").text
        site_fac = card.find_element_by_xpath(".//td[@data-th='Facility Category']").text
        site_inspection = card.find_element_by_xpath(".//td[@data-th='Inspections Completed']").text
        ref_link = card.find_element_by_xpath(".//td//a").get_attribute("href")
        facilities.append([name, street1, street2, site_city, site_prov, site_code, site_fac, site_inspection, ref_link])
    df = pd.DataFrame(facilities)
    dfs.append(df)
    print(page_counter)
    page_counter += 1
    try:
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a[aria-label='Next page']"))).click()
    except:
        break
driver.close()
driver.quit()
You can simply check the class of the li element, as described in the docs:

is_disabled = "disabled" in element.get_attribute("class")
if is_disabled:
    break

Similarly, you can check whether a page is the currently active one:

is_active = "active" in target_element.get_attribute("class")
https://selenium-python.readthedocs.io/api.html#selenium.webdriver.remote.webelement.WebElement.get_attribute
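Putting that together with your loop, here is a minimal sketch (assuming the "Next page" link sits inside an li whose class gains "disabled" on the last page; adjust the locator to your actual markup):

while True:
    # ... scrape the current page into `facilities` / `df` as above ...
    next_link = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "a[aria-label='Next page']")))
    parent_li = next_link.find_element(By.XPATH, "./..")  # assumption: the enclosing <li> carries "disabled" on the last page
    if "disabled" in parent_li.get_attribute("class"):
        break  # last page reached, stop paginating
    next_link.click()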
I need to loop through the Google Maps results pages; there are several pages to scrape, but I can only scrape the first one.
Here is the code I use to scrape the first page. I would like to scrape all the pages, but I don't know how to do that.
url = "https://www.google.com/search?q=contabilidade+em+manaus&biw=1366&bih=657&tbm=lcl&sxsrf=AJOqlzXTyAs7rej8A4k9tuuY9FmGpdOjLg:1676314056027&ei=yIXqY8-tAavQ1sQP0Yy2wAo&ved=0ahUKEwjPsd2-lJP9AhUrqJUCHVGGDagQ4dUDCAk&uact=5&oq=contabilidade+em+manaus&gs_lcp=Cg1nd3Mtd2l6LWxvY2FsEAMyBAgjECcyBggAEBYQHjIJCAAQFhAeEPEEMgkIABAWEB4Q8QQyCQgAEBYQHhDxBDIJCAAQFhAeEPEEMgkIABAWEB4Q8QQyBggAEBYQHjIGCAAQFhAeMgkIABAWEB4Q8QRQAFgAYPYDaABwAHgAgAHWAYgB1gGSAQMyLTGYAQDAAQE&sclient=gws-wiz-local&pccc=1#rlfi=hd:;si:;mv:[[-3.0446025000000003,-59.9553221],[-3.1346859,-60.061026600000005]];tbs:lrf:!1m4!1u3!2m2!3m1!1e1!1m4!1u2!2m2!2m1!1e1!2m1!1e2!2m1!1e3!3sIAE,lf:1,lf_ui:14"
wait = WebDriverWait(driver, 20)
########################################################################################################################
procurar = "contabilidade em Curitiba"
########################################################################################################################
links = []
Nome = []
Endereco = []
Telefone = []
########################################################################################################################
driver.get(url)
driver.maximize_window()
sleep(2)
print("O que procura?")
driver.find_element(By.XPATH, "//input[@value='contabilidade em manaus']").clear()
sleep(2)
input_buscar = driver.find_element(By.XPATH, "//input[@aria-label='Pesquisar']")
input_buscar.send_keys(procurar, Keys.ENTER)
sleep(2)
########################################################################################################################
while True:
    try:
        classe_empresas = driver.find_elements(By.XPATH, "(//div[@class='rllt__details'])")
        for empresa in classe_empresas:
            empresa.click()
            sleep(2)
            nome = driver.find_element(By.XPATH, "//h2[@data-attrid='title']").text
            print(nome)
            Nome.append(nome)
            endereco = driver.find_element(By.XPATH, "//span[@class='LrzXr']").get_attribute("innerHTML")
            print(endereco)
            Endereco.append(endereco)
            try:
                tel = driver.find_element(By.CSS_SELECTOR, ".LrzXr.zdqRlf.kno-fv").text
                print(tel)
                Telefone.append(tel)
            except:
                sem_telefone = "Não Tem Telefone Cadastrado"
                Telefone.append(sem_telefone)
                print(sem_telefone)
        driver.find_element(By.XPATH, "//span[normalize-space()='Mais']").click()
    except:
        break
data = {'Nome': Nome, 'Endereço': Endereco, 'Telefone': Telefone}
df = pd.DataFrame(data)
df.to_excel('GoogleMaps.xlsx', engine='xlsxwriter')
print(df)
You are showing the code that works, but it would help to see which URL you are actually scraping. In the Google Maps URL you posted there is a final parameter, start:x, where x is a number that defines the offset of the first item displayed on the page.
You can increase that value as you scrape, in order to page through all the results.
Here is where it appears:
https://www.google.com/search?q=contabilidade+&biw=1...;start:20 // the final parameter
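A rough sketch of that idea (the exact parameter format is an assumption to verify in your browser: it may be passed as &start= or inside the fragment as start: shown above, and the step of 20 results per page is also assumed):

base_url = "https://www.google.com/search?q=contabilidade+em+manaus&tbm=lcl"  # hypothetical, simplified search URL without the start parameter
for offset in range(0, 200, 20):  # hypothetical upper bound of 10 pages
    driver.get(base_url + "&start=" + str(offset))
    sleep(2)
    empresas = driver.find_elements(By.XPATH, "//div[@class='rllt__details']")
    if not empresas:  # no results at this offset -> no more pages
        break
    # ... scrape each result as in the loop above ...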
You can also click the page numbers at the bottom in a loop:

driver.find_element(By.XPATH, "/html/body/div[6]/div/div[9]/div[1]/div/div[2]/div[2]/div/div/div/div/div/div/div/div/div/div[2]/div/table/tbody/tr/td[" + str(yourIterateVar) + "]/a").click()

where yourIterateVar starts at page 2 and increases until an error tells you there are no more pages.
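A minimal sketch of that approach, reusing the XPath above and stopping when the next page number no longer exists:

from selenium.common.exceptions import NoSuchElementException

page = 2
while True:
    try:
        driver.find_element(By.XPATH,
            "/html/body/div[6]/div/div[9]/div[1]/div/div[2]/div[2]/div/div/div/div/div/div/div/div/div/div[2]/div/table/tbody/tr/td[" + str(page) + "]/a").click()
    except NoSuchElementException:
        break  # no more pages
    sleep(2)
    # ... scrape this page ...
    page += 1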
wait = WebDriverWait(driver, 20) #wait 20s to let webpage load
driver.get('https://beta.clinicaltrials.gov/') #getting website url
driver.maximize_window()
time.sleep(1)
country = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="content"]/div/ctg-home/div/div[2]/ctg-advanced-search-home/div[2]/div[1]/fieldset/div[2]/div[3]/ctg-location-search-input/form/div[2]/div/label')))
country.click()
searchBar = driver.find_element("id", 'location-input')
searchBar.send_keys("Singapore") #input country name into searchBar
search_dropdown = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="mat-option-14"]/span'))) #wait till xpath is visible
search_dropdown.click()
search_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="content"]/div/ctg-home/div/div[2]/ctg-advanced-search-home/div[2]/div[2]/div/div[2]/button')))
search_button.click()
#finding filter button for recruiting status
filter_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="filter-button-statusGroup"]')))
filter_button.click()
#clicking on 'recruiting' status
recruiting = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="adv-check-status"]/div[2]/div[2]/div/label')))
recruiting.click()
#scraping each clinical trial details
clinical_trial = {} #empty dict to store details
name_list = []
phone_list = []
email_list = []
enrollment = []
condition_list = []
#loop to go through all the clinical trials in the search page (10 per page)
for i in range(1, 11):
    time.sleep(2) #wait 2s to let page load
    xpath = '//*[@id="content"]/div/ctg-search-results/div[2]/div/div[2]/div/div[2]/div[1]/ctg-search-hit-card[{}]/div/header/a'.format(i)
    trials = driver.find_element("xpath", xpath)
    trials.click()
    #time.sleep(5) #wait 5s to let page load
    #getting contact person name
    name = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[2]/ctg-study-contacts-and-locations/div/div/div/ctg-study-contact-info/p[1]/span')))
    name_list.append(name.text) #adding each name to the list
    #phone number of contact person
    phone = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[2]/ctg-study-contacts-and-locations/div/div/div/ctg-study-contact-info/p[2]/span')))
    phone_list.append(phone.text) #adding each phone number to the list
    #email of contact person
    email = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[2]/ctg-study-contacts-and-locations/div/div/div/ctg-study-contact-info/p[3]/ctg-study-contact-email/span/a')))
    email_list.append(email.text) #adding each email address to the list
    #number of enrollment
    enrollment_num = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[3]/div[2]/div[3]/div[2]')))
    enrollment.append(enrollment_num.text) #adding each enrollment number to the list
    #condition of study
    conditions = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[3]/div[2]/div[1]/div[2]')))
    condition_list.append(conditions.text) #adding conditions of the study to list
    driver.back() #return to search page
#adding all the different list details to the contact_details dict
clinical_trial["name"] = name_list
clinical_trial["phone_num"] = phone_list
clinical_trial["email_address"] = email_list
clinical_trial["Enrollment"] = enrollment
clinical_trial["Conditions"] = condition_list
I am having an issue with Selenium somehow not finding the XPath for enrollment_num in the loop. The loop runs through the 10 clickable links on the page, but it raises a TimeoutException at the 9th link. Why is that? When I change the loop to iterate over only 8 links instead of the usual 10, it works fine. It's just that one link that causes the error.
Page number 9 is different from all the other pages, and the difference is hard to spot. Tip: to compare strings I use Notepad++ with the compare plugin.
This page does not have these 2 elements:
enrollment_num = ...ctg-study-overview/div[3]/div[2]/di...
Here it is:
enrollment_num = ...ctg-study-overview/div[2]/div[2]/di...
conditions = ...ctg-study-overview/div[3]/di...
Here it is:
conditions = ...ctg-study-overview/div[2]/di...
This is why it runs into a timeout. You could build a try/except/else around these calls to keep the program from crashing. Below is a quick fix; of course you should tidy it up. I hope this helps.
# number of enrollment
try:
    enrollment_num = wait.until(EC.visibility_of_element_located((By.XPATH,
        '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[3]/div[2]/div[3]/div[2]')))
    enrollment.append(enrollment_num.text)  # adding each enrollment number to the list
except:
    print("enrollment not under div[3] but div[2]")
    enrollment_num = wait.until(EC.visibility_of_element_located((By.XPATH,
        '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[2]/div[2]/div[3]/div[2]')))
    enrollment.append(enrollment_num.text)  # adding each enrollment number to the list
else:
    pass
# condition of study
try:
    conditions = wait.until(EC.visibility_of_element_located((By.XPATH,
        '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[3]/div[2]/div[1]/div[2]')))
    condition_list.append(conditions.text)  # adding conditions of the study to list
except:
    print("condition_list not under div[3] but div[2]")
    conditions = wait.until(EC.visibility_of_element_located((By.XPATH,
        '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[2]/div[2]/div[1]/div[2]')))
    condition_list.append(conditions.text)  # adding conditions of the study to list
else:
    pass
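One way to tidy it up would be a small helper that tries a list of XPaths in order and returns the first text it finds; a minimal sketch (the helper name and the empty-string fallback are my own choices, not part of the original code):

from selenium.common.exceptions import TimeoutException

def first_visible_text(wait, xpaths):
    # try each XPath in turn and return the text of the first element that becomes visible
    for xp in xpaths:
        try:
            return wait.until(EC.visibility_of_element_located((By.XPATH, xp))).text
        except TimeoutException:
            continue
    return ""  # none of the XPaths matched

enrollment.append(first_visible_text(wait, [
    '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[3]/div[2]/div[3]/div[2]',
    '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[2]/div[2]/div[3]/div[2]',
]))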
I got most of my code working but have a lingering question. This is not my full code; in the interest of readability I selected only a portion. I'm scraping a list of URLs from a web page (into imgs2) and then scraping info from that list of URLs. I would like to build a second list of URLs based on the results gathered from the first (see img_url2 below). What happens is that instead of appending every new URL to the list, it just replaces the previous one with the new one. Any idea how to have all of them added to the list?
driver.get("https://superrare.com/market?market-options=%257B%2522first%2522%3A30%2C%2522orderBy%2522%3A%2522RECENT_NFT_EVENT_BY_TOKEN_CONTRACT_ADDRESS_AND_TOKEN_ID__TIMESTAMP_DESC%2522%2C%2522fileTypes%2522%3A%255B%2522image%2Fjpeg%2522%2C%2522image%2Fpng%2522%255D%2C%2522listPrice%2522%3Afalse%2C%2522isGenesis%2522%3Afalse%2C%2522isSeries%2522%3Afalse%2C%2522neverReceivedOffer%2522%3Afalse%2C%2522reservePrice%2522%3Afalse%2C%2522liveAuctions%2522%3Afalse%2C%2522upcomingAuctions%2522%3Afalse%2C%2522hasSold%2522%3Afalse%2C%2522ownedByCreator%2522%3Afalse%2C%2522openOffers%2522%3Afalse%2C%2522artistsCollected%2522%3Afalse%2C%2522artistsYouFollow%2522%3Afalse%2C%2522artistsThatFollowYou%2522%3Afalse%2C%2522artistsFollowedByFollowed%2522%3Afalse%2C%2522lowerPriceRange%2522%3A0%2C%2522upperPriceRange%2522%3A100000%2C%2522numCreatorSales%2522%3Afalse%2C%2522lowerMintedRange%2522%3Anull%2C%2522upperMintedRange%2522%3Anull%2C%2522startCursor%2522%3A%2522WyJyZWNlbnRfbmZ0X2V2ZW50X2J5X3Rva2VuX2NvbnRyYWN0X2FkZHJlc3NfYW5kX3Rva2VuX2lkX190aW1lc3RhbXBfZGVzYyIsWyIyMDIyLTAyLTE3VDE0OjExOjMyKzAwOjAwIiwiMHhiOTMyYTcwYTU3NjczZDg5ZjRhY2ZmYmU4MzBlOGVkN2Y3NWZiOWUwIiwxNzYzMF1d%2522%2C%2522endCursor%2522%3A%2522WyJyZWNlbnRfbmZ0X2V2ZW50X2J5X3Rva2VuX2NvbnRyYWN0X2FkZHJlc3NfYW5kX3Rva2VuX2lkX190aW1lc3RhbXBfZGVzYyIsWyIyMDIyLTAyLTE2VDIwOjMxOjUxKzAwOjAwIiwiMHg0MjQyMzk5YzE2Yjc4MzgxOTZlZDMzZjE3OWU5OWUzZjk5Yjg4NGYyIiwzXV0%3D%2522%2C%2522lastEndCursor%2522%3A%2522WyJyZWNlbnRfbmZ0X2V2ZW50X2J5X3Rva2VuX2NvbnRyYWN0X2FkZHJlc3NfYW5kX3Rva2VuX2lkX190aW1lc3RhbXBfZGVzYyIsWyIyMDIyLTAyLTE3VDE0OjMwOjI3KzAwOjAwIiwiMHhiOTMyYTcwYTU3NjczZDg5ZjRhY2ZmYmU4MzBlOGVkN2Y3NWZiOWUwIiwyNzgxNl1d%2522%2C%2522lastStartCursor%2522%3Afalse%2C%2522hasPreviousPage%2522%3Atrue%2C%2522hasNextPage%2522%3Atrue%2C%2522reverse%2522%3Afalse%257D")
imgs2 = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@class,'Name-sc-7kf6vz-3')]")))
time.sleep(5)
for i in range(0, 30):
    img_url = []
    for number, item in enumerate(imgs2, 1):
        imgwors2 = item.get_attribute("href")
        driver3 = webdriver.Chrome()
        driver3.get(imgwors2)

        def check_exists_by_xpath(xpath):
            try:
                WebDriverWait(driver3, 55).until(EC.presence_of_all_elements_located((By.XPATH, xpath)))
            except TimeoutException:
                return False
            return True

        if check_exists_by_xpath("//h1[@class='collectible-detail__collectible-name']"):
            imgsrc4 = WebDriverWait(driver3, 65).until(EC.presence_of_all_elements_located((By.XPATH, "//h1[contains(@class,'collectible-detail__collectible-name')]")))
            for i in imgsrc4:
                title = i.text
        else:
            title = "none"
        print(title)

        img_url2 = []
        imgsrc2 = WebDriverWait(driver3, 55).until(EC.presence_of_all_elements_located((By.XPATH, "//p[@data-testid='artistName']/ancestor::a[contains(@class,'ChildrenLink')]")))
        for i in imgsrc2:
            biourl = i.get_attribute("href")
            img_url2.append(biourl)
        print(img_url2)

driver.close()
I think, from your description and code, that the variable img_url2 should be initialized before the for loop(s):
driver.get("https://superrare.com/market?market-options=%257B%2522first%2522%3A30%2C%2522orderBy%2522%3A%2522RECENT_NFT_EVENT_BY_TOKEN_CONTRACT_ADDRESS_AND_TOKEN_ID__TIMESTAMP_DESC%2522%2C%2522fileTypes%2522%3A%255B%2522image%2Fjpeg%2522%2C%2522image%2Fpng%2522%255D%2C%2522listPrice%2522%3Afalse%2C%2522isGenesis%2522%3Afalse%2C%2522isSeries%2522%3Afalse%2C%2522neverReceivedOffer%2522%3Afalse%2C%2522reservePrice%2522%3Afalse%2C%2522liveAuctions%2522%3Afalse%2C%2522upcomingAuctions%2522%3Afalse%2C%2522hasSold%2522%3Afalse%2C%2522ownedByCreator%2522%3Afalse%2C%2522openOffers%2522%3Afalse%2C%2522artistsCollected%2522%3Afalse%2C%2522artistsYouFollow%2522%3Afalse%2C%2522artistsThatFollowYou%2522%3Afalse%2C%2522artistsFollowedByFollowed%2522%3Afalse%2C%2522lowerPriceRange%2522%3A0%2C%2522upperPriceRange%2522%3A100000%2C%2522numCreatorSales%2522%3Afalse%2C%2522lowerMintedRange%2522%3Anull%2C%2522upperMintedRange%2522%3Anull%2C%2522startCursor%2522%3A%2522WyJyZWNlbnRfbmZ0X2V2ZW50X2J5X3Rva2VuX2NvbnRyYWN0X2FkZHJlc3NfYW5kX3Rva2VuX2lkX190aW1lc3RhbXBfZGVzYyIsWyIyMDIyLTAyLTE3VDE0OjExOjMyKzAwOjAwIiwiMHhiOTMyYTcwYTU3NjczZDg5ZjRhY2ZmYmU4MzBlOGVkN2Y3NWZiOWUwIiwxNzYzMF1d%2522%2C%2522endCursor%2522%3A%2522WyJyZWNlbnRfbmZ0X2V2ZW50X2J5X3Rva2VuX2NvbnRyYWN0X2FkZHJlc3NfYW5kX3Rva2VuX2lkX190aW1lc3RhbXBfZGVzYyIsWyIyMDIyLTAyLTE2VDIwOjMxOjUxKzAwOjAwIiwiMHg0MjQyMzk5YzE2Yjc4MzgxOTZlZDMzZjE3OWU5OWUzZjk5Yjg4NGYyIiwzXV0%3D%2522%2C%2522lastEndCursor%2522%3A%2522WyJyZWNlbnRfbmZ0X2V2ZW50X2J5X3Rva2VuX2NvbnRyYWN0X2FkZHJlc3NfYW5kX3Rva2VuX2lkX190aW1lc3RhbXBfZGVzYyIsWyIyMDIyLTAyLTE3VDE0OjMwOjI3KzAwOjAwIiwiMHhiOTMyYTcwYTU3NjczZDg5ZjRhY2ZmYmU4MzBlOGVkN2Y3NWZiOWUwIiwyNzgxNl1d%2522%2C%2522lastStartCursor%2522%3Afalse%2C%2522hasPreviousPage%2522%3Atrue%2C%2522hasNextPage%2522%3Atrue%2C%2522reverse%2522%3Afalse%257D")
imgs2 = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@class,'Name-sc-7kf6vz-3')]")))
time.sleep(5)
img_url2 = [] # <--- moved before the loop
for i in range(0, 30):
    for number, item in enumerate(imgs2, 1):
        imgwors2 = item.get_attribute("href")
        driver3 = webdriver.Chrome()
        driver3.get(imgwors2)

        def check_exists_by_xpath(xpath):
            try:
                WebDriverWait(driver3, 55).until(EC.presence_of_all_elements_located((By.XPATH, xpath)))
            except TimeoutException:
                return False
            return True

        if check_exists_by_xpath("//h1[@class='collectible-detail__collectible-name']"):
            imgsrc4 = WebDriverWait(driver3, 65).until(EC.presence_of_all_elements_located((By.XPATH, "//h1[contains(@class,'collectible-detail__collectible-name')]")))
            for i in imgsrc4:
                title = i.text
        else:
            title = "none"
        print(title)

        imgsrc2 = WebDriverWait(driver3, 55).until(EC.presence_of_all_elements_located((By.XPATH, "//p[@data-testid='artistName']/ancestor::a[contains(@class,'ChildrenLink')]")))
        for i in imgsrc2:
            biourl = i.get_attribute("href")
            img_url2.append(biourl)

driver.close()
print(img_url2)  # <--- moved below the loop
Very new to Python and Selenium, looking to scrape a few data points. I'm struggling in three areas:
I don't understand how to loop through multiple URLs properly
I can't figure out why the script is iterating twice over each URL
I can't figure out why it's only outputting the data for the second URL
Much thanks for taking a look!
Here's my current script:
urls = [
'https://developers.google.com/speed/pagespeed/insights/?url=https://www.crutchfield.com/%2F&tab=mobile',
'https://developers.google.com/speed/pagespeed/insights/?url=https://www.lastpass.com%2F&tab=mobile'
]
driver = webdriver.Chrome(executable_path='/Library/Frameworks/Python.framework/Versions/3.9/bin/chromedriver')
for url in urls:
    for page in range(0, 1):
        driver.get(url)
        wait = WebDriverWait(driver, 120).until(EC.presence_of_element_located((By.CLASS_NAME, 'origin-field-data')))
        df = pd.DataFrame(columns=['Title', 'Core Web Vitals', 'FCP', 'FID', 'CLS', 'TTI', 'TBT', 'Total Score'])
        company = driver.find_elements_by_class_name("audited-url__link")
        data = []
        for i in company:
            data.append(i.get_attribute('href'))
        for x in data:
            #Get URL name
            title = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[1]/div/div[2]/h1/a')
            co_name = title.text
            #Get Core Web Vitals text pass/fail
            cwv = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[1]/span[2]')
            core_web = cwv.text
            #Get FCP
            fcp = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[1]/div')
            first_content = fcp.text
            #Get FID
            fid = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[2]/div[3]/div[1]/div')
            first_input = fid.text
            #Get CLS
            cls = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[2]/div/div[1]/div[1]/div[2]/div[4]/div[1]/div')
            layout_shift = cls.text
            #Get TTI
            tti = driver.find_element_by_xpath('//*[@id="interactive"]/div/div[1]')
            time_interactive = tti.text
            #Get TBT
            tbt = driver.find_element_by_xpath('//*[@id="total-blocking-time"]/div/div[1]')
            total_block = tbt.text
            #Get Total Score
            total_score = driver.find_element_by_xpath('//*[@id="page-speed-insights"]/div[2]/div[3]/div[2]/div[1]/div[1]/div/div[1]/a/div[2]')
            score = total_score.text
            #Adding all columns to dataframe
            df.loc[len(df)] = [co_name, core_web, first_content, first_input, layout_shift, time_interactive, total_block, score]
driver.close()
#df.to_csv('Double Page Speed Test 9-10.csv')
print(df)
Q1: I don't understand how to loop through multiple URLs properly?
Ans: for url in urls:
Q2: I can't figure out why the script is iterating twice over each URL.
Ans: Because you have for page in range(0, 1):
Update 1:
I did not run your entire code with the DataFrame. Also, sometimes one of the pages does not show the number and href, but when I run the code below,
driver = webdriver.Chrome(driver_path)
driver.maximize_window()
driver.implicitly_wait(50)
wait = WebDriverWait(driver, 20)
urls = [
'https://developers.google.com/speed/pagespeed/insights/?url=https://www.crutchfield.com/%2F&tab=mobile',
'https://developers.google.com/speed/pagespeed/insights/?url=https://www.lastpass.com%2F&tab=mobile'
]
data = []
for url in urls:
    driver.get(url)
    wait = WebDriverWait(driver, 120).until(EC.presence_of_element_located((By.CLASS_NAME, 'origin-field-data')))
    company = driver.find_elements_by_css_selector("h1.audited-url a")
    for i in company:
        data.append(i.get_attribute('href'))
print(data)
it outputs:
['https://www.crutchfield.com//', 'https://www.lastpass.com/', 'https://www.lastpass.com/']
which makes sense, because the element locator we used matches 1 element on the first page and 2 elements on the second page.
Currently, I'm trying to scrape the New York Times dining website (nytimes.com/reviews/dining) to get a list of links and neighborhoods for each restaurant. Unfortunately, I've been running the code block below for about 9 hours on Google Colab, and the iterator x is on its 1,175th run. I'm trying to figure out what's going on, but I'm too scared to halt the cell and start over again. Is it the nested for loop that is causing this process to take such a long time?
driver = webdriver.Chrome('chromedriver', chrome_options = chrome_options)
driver.get("https://www.nytimes.com/reviews/dining")
WebDriverWait(driver, 20).until(EC.element_to_be_clickable
((By.XPATH,"//button[text()='Show More']"))).click()
url_list = []
nyt_dining = pd.DataFrame(columns = ['Restaurant', 'URL', 'servesCuisine', 'priceRange', 'addressLocality'])
x = 0
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    elements = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located
        ((By.CSS_SELECTOR, "div.css-102xbk1")))
    url_before = len(url_list)
    for e in elements:
        # 2.1 Getting the links
        link = e.find_element_by_css_selector("a.css-gg4vpm")
        link = link.get_attribute("href")
        # 2.2 Getting the restaurant name
        name = e.find_element_by_css_selector("h2.css-8aqwnr")
        name = name.text
        # 2.3 Getting other information
        info = e.find_element_by_css_selector("ul.css-o4kdzz")
        info = info.find_elements_by_tag_name('li')
        cuisine = ''
        price = ''
        neighborhood = ''
        for i in info:
            attribute = i.get_attribute("itemprop")
            if attribute == "servesCuisine":
                cuisine = i.text
            elif attribute == "priceRange":
                price = i.text
            elif attribute == "addressLocality":
                neighborhood = i.text
        # 2.4 Append to dataframe
        if link in url_list:
            continue
        else:
            url_list.append(link)
            nyt_dining = nyt_dining.append({'Restaurant': name, 'URL': link,
                                            'servesCuisine': cuisine,
                                            'priceRange': price,
                                            'addressLocality': neighborhood},
                                           ignore_index=True)
            print(x)
            x += 1
    url_after = len(url_list)
    if url_before >= url_after:
        break
    button = WebDriverWait(driver, 10).until(EC.visibility_of_element_located
        ((By.XPATH, "//button[text()='Show More']")))
    driver.execute_script("arguments[0].click();", button)
    time.sleep(2)
nyt_dining