I need to loop through the Google Maps results pages: there are several pages to scrape, but I can only scrape the first one.
Here is the code I use to scrape the first page. I would like to scrape all the pages, but I don't know how to do that.
from time import sleep

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()  # assuming Chrome; swap in your own driver setup
url = "https://www.google.com/search?q=contabilidade+em+manaus&biw=1366&bih=657&tbm=lcl&sxsrf=AJOqlzXTyAs7rej8A4k9tuuY9FmGpdOjLg:1676314056027&ei=yIXqY8-tAavQ1sQP0Yy2wAo&ved=0ahUKEwjPsd2-lJP9AhUrqJUCHVGGDagQ4dUDCAk&uact=5&oq=contabilidade+em+manaus&gs_lcp=Cg1nd3Mtd2l6LWxvY2FsEAMyBAgjECcyBggAEBYQHjIJCAAQFhAeEPEEMgkIABAWEB4Q8QQyCQgAEBYQHhDxBDIJCAAQFhAeEPEEMgkIABAWEB4Q8QQyBggAEBYQHjIGCAAQFhAeMgkIABAWEB4Q8QRQAFgAYPYDaABwAHgAgAHWAYgB1gGSAQMyLTGYAQDAAQE&sclient=gws-wiz-local&pccc=1#rlfi=hd:;si:;mv:[[-3.0446025000000003,-59.9553221],[-3.1346859,-60.061026600000005]];tbs:lrf:!1m4!1u3!2m2!3m1!1e1!1m4!1u2!2m2!2m1!1e1!2m1!1e2!2m1!1e3!3sIAE,lf:1,lf_ui:14"
wait = WebDriverWait(driver, 20)
########################################################################################################################
procurar = "contabilidade em Curitiba"
########################################################################################################################
links = []
Nome = []
Endereco = []
Telefone = []
########################################################################################################################
driver.get(url)
driver.maximize_window()
sleep(2)
print("O que procura?")  # "What are you looking for?"
driver.find_element(By.XPATH, "//input[@value='contabilidade em manaus']").clear()
sleep(2)
input_buscar = driver.find_element(By.XPATH, "//input[@aria-label='Pesquisar']")
input_buscar.send_keys(procurar, Keys.ENTER)
sleep(2)
########################################################################################################################
while True:
    try:
        classe_empresas = driver.find_elements(By.XPATH, "(//div[@class='rllt__details'])")
        for empresa in classe_empresas:
            empresa.click()
            sleep(2)
            nome = driver.find_element(By.XPATH, "//h2[@data-attrid='title']").text
            print(nome)
            Nome.append(nome)
            endereco = driver.find_element(By.XPATH, "//span[@class='LrzXr']").get_attribute("innerHTML")
            print(endereco)
            Endereco.append(endereco)
            try:
                tel = driver.find_element(By.CSS_SELECTOR, ".LrzXr.zdqRlf.kno-fv").text
                print(tel)
                Telefone.append(tel)
            except NoSuchElementException:
                sem_telefone = "Não Tem Telefone Cadastrado"  # "No phone number on record"
                Telefone.append(sem_telefone)  # append the placeholder, not the stale tel value
                print(sem_telefone)
        driver.find_element(By.XPATH, "//span[normalize-space()='Mais']").click()
    except Exception:
        break
data = {'Nome': Nome, 'Endereço': Endereco, 'Telefone': Telefone}
df = pd.DataFrame(data)
df.to_excel('GoogleMaps.xlsx', engine='xlsxwriter')
print(df)
It helps that you showed the exact URL you are scraping: the Google Maps URL you sent carries a final parameter, "start:x", where x is a number that sets the index of the first item displayed on the page.
You can change that value as you loop to scrape all the results.
Here is where the value goes (the final parameter):
https://www.google.com/search?q=contabilidade+&biw=1...;start:20
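For example, a minimal sketch; the step of 20 per page and the 100-item ceiling are assumptions you would adjust for your search:

for start in range(0, 100, 20):  # assumes 20 results per page, 100 results total
    driver.get(url + ";start:" + str(start))
    sleep(2)
    # ... scrape this page exactly as in your code ...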
Alternatively, you can click the page number at the bottom in a loop:
driver.find_element(By.XPATH, "/html/body/div[6]/div/div[9]/div[1]/div/div[2]/div[2]/div/div/div/div/div/div/div/div/div/div[2]/div/table/tbody/tr/td[" + str(yourIterateVar) + "]/a").click()
where yourIterateVar starts at 2 (page 2) and increases until an error occurs (no more pages).
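Put together, that click-through loop could look like this sketch; the XPath is the one above, and it assumes Selenium raises NoSuchElementException once there is no further page link:

from selenium.common.exceptions import NoSuchElementException

page = 2  # page 1 is already loaded
while True:
    # ... scrape the current page as in your code ...
    try:
        driver.find_element(By.XPATH, "/html/body/div[6]/div/div[9]/div[1]/div/div[2]/div[2]/div/div/div/div/div/div/div/div/div/div[2]/div/table/tbody/tr/td[" + str(page) + "]/a").click()
    except NoSuchElementException:
        break  # no more pages
    sleep(2)
    page += 1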
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()  # assuming Chrome; swap in your own driver setup

wait = WebDriverWait(driver, 20)  # wait up to 20s to let the webpage load
driver.get('https://beta.clinicaltrials.gov/')  # getting website url
driver.maximize_window()
time.sleep(1)
country = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="content"]/div/ctg-home/div/div[2]/ctg-advanced-search-home/div[2]/div[1]/fieldset/div[2]/div[3]/ctg-location-search-input/form/div[2]/div/label')))
country.click()
searchBar = driver.find_element("id", 'location-input')
searchBar.send_keys("Singapore")  # input country name into searchBar
search_dropdown = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="mat-option-14"]/span')))  # wait till xpath is visible
search_dropdown.click()
search_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="content"]/div/ctg-home/div/div[2]/ctg-advanced-search-home/div[2]/div[2]/div/div[2]/button')))
search_button.click()
# finding filter button for recruiting status
filter_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="filter-button-statusGroup"]')))
filter_button.click()
# clicking on 'recruiting' status
recruiting = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="adv-check-status"]/div[2]/div[2]/div/label')))
recruiting.click()
# scraping each clinical trial's details
clinical_trial = {}  # empty dict to store details
name_list = []
phone_list = []
email_list = []
enrollment = []
condition_list = []
# loop to go through all the clinical trials on the search page (10 per page)
for i in range(1, 11):
    time.sleep(2)  # wait 2s to let page load
    xpath = '//*[@id="content"]/div/ctg-search-results/div[2]/div/div[2]/div/div[2]/div[1]/ctg-search-hit-card[{}]/div/header/a'.format(i)
    trials = driver.find_element("xpath", xpath)
    trials.click()
    # time.sleep(5)  # wait 5s to let page load
    # getting contact person name
    name = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[2]/ctg-study-contacts-and-locations/div/div/div/ctg-study-contact-info/p[1]/span')))
    name_list.append(name.text)  # adding each name to the list
    # phone number of contact person
    phone = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[2]/ctg-study-contacts-and-locations/div/div/div/ctg-study-contact-info/p[2]/span')))
    phone_list.append(phone.text)  # adding each phone number to the list
    # email of contact person
    email = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[2]/ctg-study-contacts-and-locations/div/div/div/ctg-study-contact-info/p[3]/ctg-study-contact-email/span/a')))
    email_list.append(email.text)  # adding each email address to the list
    # number of enrollment
    enrollment_num = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[3]/div[2]/div[3]/div[2]')))
    enrollment.append(enrollment_num.text)  # adding each enrollment number to the list
    # condition of study
    conditions = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[3]/div[2]/div[1]/div[2]')))
    condition_list.append(conditions.text)  # adding conditions of the study to the list
    driver.back()  # return to search page
# adding all the different lists to the clinical_trial dict
clinical_trial["name"] = name_list
clinical_trial["phone_num"] = phone_list
clinical_trial["email_address"] = email_list
clinical_trial["Enrollment"] = enrollment
clinical_trial["Conditions"] = condition_list
I am having an issue where Selenium somehow does not find the XPath for enrollment_num in the loop. The loop runs through the 10 clickable links on the webpage, but it raises a TimeoutException at the 9th link. Why is that? When I change the loop to iterate through 8 links instead of the usual 10, it works fine. It's just that one link that causes the error.
Page number 9 is different from all the other pages, and the difference is hard to spot. (Tip: to compare strings, I use Notepad++ with the Compare plugin.)
This page does not have these 2 elements:
enrollment_num = ...ctg-study-overview/div[3]/div[2]/di...
here it is:
enrollment_num = ...ctg-study-overview/div[2]/div[2]/di...
conditions = ...ctg-study-overview/div[3]/di...
here it is:
conditions = ...ctg-study-overview/div[2]/di...
This is why it runs into a timeout. You could wrap these two lookups in try/except so the program doesn't crash. Below is a quick fix; of course you should tidy it up. I hope this helps.
from selenium.common.exceptions import TimeoutException

# number of enrollment
try:
    enrollment_num = wait.until(EC.visibility_of_element_located((By.XPATH,
        '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[3]/div[2]/div[3]/div[2]')))
except TimeoutException:
    print("enrollment: not div[3] but div[2]")
    enrollment_num = wait.until(EC.visibility_of_element_located((By.XPATH,
        '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[2]/div[2]/div[3]/div[2]')))
enrollment.append(enrollment_num.text)  # adding each enrollment number to the list
# condition of study
try:
    conditions = wait.until(EC.visibility_of_element_located((By.XPATH,
        '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[3]/div[2]/div[1]/div[2]')))
except TimeoutException:
    print("condition_list: not div[3] but div[2]")
    conditions = wait.until(EC.visibility_of_element_located((By.XPATH,
        '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[2]/div[2]/div[1]/div[2]')))
condition_list.append(conditions.text)  # adding conditions of the study to the list
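If you want to tidy it up further, one option is a small helper that tries each layout variant in turn. This is just a sketch: first_visible is a name I made up, and xpath_div3 / xpath_div2 stand for the two full XPaths shown above.

def first_visible(wait, xpaths):
    # return the first element that becomes visible among the candidate XPaths
    for xp in xpaths:
        try:
            return wait.until(EC.visibility_of_element_located((By.XPATH, xp)))
        except TimeoutException:
            continue  # try the next layout variant
    raise TimeoutException("none of the candidate XPaths became visible")

# usage, e.g.: enrollment_num = first_visible(wait, [xpath_div3, xpath_div2])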
I have an e-commerce page with multiple products on it. I need to click a product's link, return to the main page, and click the next product's link, but when I return, the elements can't be found anymore.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

Path = r"C:\Program Files (x86)\chromedriver.exe"  # raw string so the backslashes stay literal
driver = webdriver.Chrome(Path)
driver.get("https://www.emag.ro/")
search_bar = driver.find_element_by_id("searchboxTrigger")
search_bar.send_keys("laptopuri")
search_bar.send_keys(Keys.RETURN)
main = None
try:
    main = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "main-container"))
    )
    print("Page loaded, main retrieved successfully")
    print(" ")
except TimeoutException:
    driver.quit()
products = main.find_elements_by_css_selector("div.card-item.js-product-data")
for product in products:
    raw_name = product.text
    raw_price = product.find_element_by_css_selector("p.product-new-price").text
    link = product.find_element_by_tag_name("a")
    # clicking the link
    link.click()
    spec_page = None
    try:
        spec_page = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "col-md-12"))
        )
    except TimeoutException:
        driver.quit()
    print(spec_page)
    driver.back()
After the first iteration, I get the following error:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
on the line raw_name = product.text, basically at the beginning of the loop.
I assume the page is not loading properly or something like that. I tried using time.sleep before going through the loop, but it didn't help.
When you call driver.back(), the browser goes back to the previous page, and by the time you return to the original page, all of the previously located elements have become stale. You need to re-find them, like below.
This should handle the exception.
products = len(driver.find_elements_by_css_selector("div.card-item.js-product-data"))
for j in range(products):
    # re-find the product cards on every iteration; the old references
    # (including main) are stale after driver.back()
    elements = driver.find_elements_by_css_selector("div.card-item.js-product-data")
    raw_name = elements[j].text
    raw_price = elements[j].find_element_by_css_selector("p.product-new-price").text
    link = elements[j].find_element_by_tag_name("a")
    # clicking the link
    link.click()
    spec_page = None
    try:
        spec_page = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "col-md-12"))
        )
    except TimeoutException:
        driver.quit()
    print(spec_page)
    driver.back()
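An alternative that sidesteps stale references entirely: collect all the product URLs up front, then open each one directly instead of clicking and going back. A sketch using the selectors from the question:

cards = driver.find_elements_by_css_selector("div.card-item.js-product-data")
links = [c.find_element_by_tag_name("a").get_attribute("href") for c in cards]
for url in links:
    driver.get(url)  # no driver.back() needed, so nothing goes stale
    spec_page = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "col-md-12"))
    )
    print(spec_page)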
I'm trying to click on a div to get to the next page of a table (the URL does not change when the page changes). The "go to next page" div has the same class as the "go to previous page" one.
I've used:
name, data, last7day = [], [], []  # assuming these result lists were created earlier in the script
elem = driver.find_element_by_class_name('cXQSjq')
elem.click()
timeout = 30
try:
    WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.CLASS_NAME, "gxzFwa")))
except TimeoutException:
    driver.quit()
names = driver.find_elements_by_class_name('iBSZGH')
for company in names[1:]:
    name.append(company.text)
mostdata = driver.find_elements_by_class_name('gvgMSe.gJYnHB')
for most in mostdata:
    most = most.text
    most = most.replace(',','')
    data.append(most)
last7dsales = driver.find_elements_by_class_name('fGpHsy.kpsxyE')
for last in last7dsales:
    last = last.text
    last = last.replace(',','')
    last7day.append(last)
# loop for the other pages:
for i in range(6):
    elem.click()
    timeout = 30
    try:
        WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.CLASS_NAME, "gxzFwa")))
    except TimeoutException:
        driver.quit()
    names = driver.find_elements_by_class_name('iBSZGH')
    for company in names[1:]:
        name.append(company.text)
    mostdata = driver.find_elements_by_class_name('gvgMSe.gJYnHB')
    for most in mostdata:
        most = most.text.replace(',','')
        data.append(most)
    last7dsales = driver.find_elements_by_class_name('fGpHsy.kpsxyE')
    for last in last7dsales:
        last = last.text.replace(',','')
        last7day.append(last)
and it worked to get me to page 2, but after page 2 it gives me the error:
selenium.common.exceptions.ElementClickInterceptedException: Message: element click intercepted:
Element <div class="styles__Chevron-sc-1buchb9-1 cXQSjq">...</div> is not clickable at point (702, 656).
Other element would receive the click: <div id="hs-eu-cookie-confirmation-inner">...</div>
(Session info: chrome=92.0.4515.107)
Do you know if there is an issue with calling elem.click() after using Selenium to find other parts of the page? I'm scraping data from
https://nonfungible.com/market/history
I guess the issue here is that the next-page button is at the bottom of the page, so to click it you first need to scroll the element into view.
See if this works now:
import time

from selenium.webdriver.common.action_chains import ActionChains

actions = ActionChains(driver)
elem = driver.find_element_by_class_name('cXQSjq')
actions.move_to_element(elem).perform()  # scroll the button into view
time.sleep(0.3)
elem.click()
timeout = 30
try:
    WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.CLASS_NAME, "gxzFwa")))
except TimeoutException:
    driver.quit()
names = driver.find_elements_by_class_name('iBSZGH')
for company in names[1:]:
    name.append(company.text)
mostdata = driver.find_elements_by_class_name('gvgMSe.gJYnHB')
for most in mostdata:
    most = most.text
    most = most.replace(',','')
    data.append(most)
last7dsales = driver.find_elements_by_class_name('fGpHsy.kpsxyE')
for last in last7dsales:
    last = last.text
    last = last.replace(',','')
    last7day.append(last)
# loop for the other pages:
for i in range(6):
    actions.move_to_element(elem).perform()  # scroll into view again before each click
    time.sleep(0.3)
    elem.click()
    timeout = 30
    try:
        WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.CLASS_NAME, "gxzFwa")))
    except TimeoutException:
        driver.quit()
    names = driver.find_elements_by_class_name('iBSZGH')
    for company in names[1:]:
        name.append(company.text)
    mostdata = driver.find_elements_by_class_name('gvgMSe.gJYnHB')
    for most in mostdata:
        most = most.text.replace(',','')
        data.append(most)
    last7dsales = driver.find_elements_by_class_name('fGpHsy.kpsxyE')
    for last in last7dsales:
        last = last.text.replace(',','')
        last7day.append(last)
I didn't check the correctness of the rest of your code, but clicking the next button should work now.
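One more thought: your traceback says the click was intercepted by the cookie banner (hs-eu-cookie-confirmation-inner), so it may also help to remove that element once before you start paginating. A sketch; the element id is taken straight from your error message:

# drop the cookie banner so it can no longer intercept clicks
driver.execute_script(
    "var banner = document.getElementById('hs-eu-cookie-confirmation-inner');"
    "if (banner) { banner.remove(); }"
)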
I'm a beginner with Python & Selenium. I am just trying to get all the LinkedIn profile hrefs on a specific page with Selenium to add them to a list, but I don't know why it returns the same URL 10 times.
This is my code:
try:
    browser.find_element_by_id("username").send_keys(email_address)
    sleep(1)
    browser.find_element_by_id("password").send_keys(password)
    sleep(1)
    browser.find_element_by_xpath("//button[@class='btn__primary--large from__button--floating']").click()
    sleep(1)
    element = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.ID, "global-nav")))
    sleep(0.5)
    browser.get('https://www.linkedin.com/search/results/people/?geoUrn=%5B%22104246759%22%2C%2290009659%22%2C%22106383538%22%5D&keywords=mark%20johnson&origin=GLOBAL_SEARCH_HEADER')
    user = []
    url = browser.find_elements_by_xpath("//a[@class='app-aware-link']")
    for i in range(10):
        href = url[i].get_attribute('href')
        user.append(href)
    print(user)
except Exception:
    traceback.print_exc()
It looks like the XPath is matching multiple elements with the same href. You could build a list of unique hrefs instead:
user = []
url = browser.find_elements_by_xpath("//a[@class='app-aware-link']")
unique_hrefs = []
for x in url:
    href = x.get_attribute('href')
    if href not in unique_hrefs:
        unique_hrefs.append(href)
for i in unique_hrefs:
    print(i)
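Equivalently, dict.fromkeys gives you an order-preserving dedupe in one line:

hrefs = [x.get_attribute('href') for x in url]
unique_hrefs = list(dict.fromkeys(hrefs))  # keeps first-seen order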
I am dealing with pagination and would like my script to scrape a table, click on the next button, scrape the next table, and keep clicking next until the button is no longer clickable.
The only difference between the clickable and the non-clickable button seems to be a disabled attribute before the closing tag.
My idea was to create a while loop and click the button until the disabled attribute appears, but I'm not sure how to read that attribute in the first place.
Even when the button is disabled, Selenium doesn't throw an "Element not interactable" error, so I don't think I can go down that route.
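From what I understand, something like this should read the attribute, though I haven't verified it (it assumes the Next button can be located via its aria-label):

next_button = driver.find_element_by_xpath(".//button[@aria-label='Next page']")
# get_attribute returns the string "true" when disabled is present, None otherwise
is_disabled = next_button.get_attribute("disabled") is not None

My current code: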
airport_list = []
fees_list = []
airports = ["https://www.aopa.org/destinations/business/13035#fees", "https://www.aopa.org/destinations/business/35555#fees"]
for a in airports:
    driver.get(a)
    time.sleep(3)
    # Click dropdown
    driver.find_element_by_xpath('//div[@class = "mat-select-arrow"]').click()
    time.sleep(1)
    # Select "All aircraft"
    driver.find_elements_by_xpath('//span[@class = "mat-option-text"]')[8].click()
    time.sleep(2)
    try:
        # Check if fees are available
        driver.find_element_by_xpath('//mat-row[@class = "mat-row ng-star-inserted"]')
        # Scrape each row
        fees_table = driver.find_elements_by_xpath('//mat-row[@class = "mat-row ng-star-inserted"]')
        for fee in fees_table:
            fees_list.append(fee.text)
            airport_list.append(a)
        # Click on "Next" button
        driver.find_elements_by_xpath('//span[@class = "mat-button-wrapper"]')[4].click()
        time.sleep(2)
    except:
        fees_list.append("This location has not disclosed fees or does not charge fees.")
        airport_list.append(a)
driver.close()
I was able to extract the maximum number of items from the bottom of the table, divide that number by 10, and round up to the nearest whole number. I then use that number to iterate through a range of button clicks.
import math

airport_list = []
fees_list = []
airports = ["https://www.aopa.org/destinations/business/13035#fees"]
for a in airports:
    driver.get(a)
    time.sleep(3)
    # Click dropdown
    driver.find_element_by_xpath('//div[@class = "mat-select-arrow"]').click()
    time.sleep(1)
    # Select "All aircraft"
    driver.find_elements_by_xpath('//span[@class = "mat-option-text"]')[8].click()
    time.sleep(2)
    try:
        # Check if fees are available
        driver.find_element_by_xpath('//mat-row[@class = "mat-row ng-star-inserted"]')
        # Get number of items
        number_of_items = driver.find_element_by_xpath('//div[@class = "mat-paginator-range-label"]').text.split()[-1]
        # print(number_of_items)
        if float(number_of_items) >= 11:
            number_of_button_clicks = math.ceil(float(number_of_items) / 10)
        else:
            number_of_button_clicks = 1  # one pass, so the single page still gets scraped
        # print(number_of_button_clicks)
        for click in range(number_of_button_clicks):
            # Scrape each row
            fees_table = driver.find_elements_by_xpath('//mat-row[@class = "mat-row ng-star-inserted"]')
            for fee in fees_table:
                fees_list.append(fee.text)
                airport_list.append(a)
            # Click on "Next" button, except after the last page
            if click < number_of_button_clicks - 1:
                driver.find_elements_by_xpath('//span[@class = "mat-button-wrapper"]')[4].click()
                time.sleep(2)
    except:
        fees_list.append("This location has not disclosed fees or does not charge fees.")
        airport_list.append(a)
# print(fees_list)
# print(airport_list)
driver.close()
Instead of going to the next page, raise the items-per-page limit, as shown in the code below. On top of that, you really don't need the try/except block:
for a in airports:
    driver.get(a)
    time.sleep(3)
    # Click dropdown
    driver.find_element_by_xpath('//div[@class = "mat-select-arrow"]').click()
    time.sleep(1)
    # Select "All aircraft"
    driver.find_elements_by_xpath('//span[@class = "mat-option-text"]')[8].click()
    time.sleep(3)
    # select 100 items per page if the paginator is present
    if len(driver.find_elements_by_xpath(".//mat-select[@aria-label='Items per page:']")) > 0:
        driver.find_element_by_xpath(".//mat-select[@aria-label='Items per page:']").click()
        time.sleep(3)
        driver.find_element_by_xpath(".//span[@class='mat-option-text' and text()='100']/parent::mat-option").click()
    # Scrape each row
    fees_table = driver.find_elements_by_xpath('//mat-row[@class = "mat-row ng-star-inserted"]')
    for fee in fees_table:
        fees_list.append(fee.text)
    print(fees_list)
    # if needed, click on the "Next page" button with this xpath and apply the same for loop as above
    # driver.find_element_by_xpath(".//button[@aria-label='Next page']").click()
driver.close()
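And if you do end up needing the Next button, here is a sketch of the click-until-disabled loop; it assumes the Material button carries a disabled attribute on the last page, which get_attribute reports as "true" or None:

while True:
    # scrape the current page of rows
    fees_table = driver.find_elements_by_xpath('//mat-row[@class = "mat-row ng-star-inserted"]')
    for fee in fees_table:
        fees_list.append(fee.text)
    next_button = driver.find_element_by_xpath(".//button[@aria-label='Next page']")
    if next_button.get_attribute("disabled"):
        break  # last page reached
    next_button.click()
    time.sleep(2)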