wait = WebDriverWait(driver, 20) #wait up to 20s for elements to load
driver.get('https://beta.clinicaltrials.gov/') #getting website url
driver.maximize_window()
time.sleep(1)
country = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="content"]/div/ctg-home/div/div[2]/ctg-advanced-search-home/div[2]/div[1]/fieldset/div[2]/div[3]/ctg-location-search-input/form/div[2]/div/label')))
country.click()
searchBar = driver.find_element("id",'location-input')
searchBar.send_keys("Singapore") #input country name into searchBar
search_dropdown = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="mat-option-14"]/span'))) #wait till the option is clickable
search_dropdown.click()
search_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="content"]/div/ctg-home/div/div[2]/ctg-advanced-search-home/div[2]/div[2]/div/div[2]/button')))
search_button.click()
#finding filter button for recruiting status
filter_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="filter-button-statusGroup"]')))
filter_button.click()
#clicking on 'recruiting' status
recruiting = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="adv-check-status"]/div[2]/div[2]/div/label')))
recruiting.click()
#scraping each clinical trial details
clinical_trial = {} #empty dict to store details
name_list = []
phone_list = []
email_list = []
enrollment = []
condition_list = []
#loop to go through all the clinical trials in the search page (10 per page)
for i in range(1,11):
    time.sleep(2) #wait 2s to let page load
    xpath = '//*[@id="content"]/div/ctg-search-results/div[2]/div/div[2]/div/div[2]/div[1]/ctg-search-hit-card[{}]/div/header/a'.format(i)
    trials = driver.find_element("xpath", xpath)
    trials.click()
    #time.sleep(5) #wait 5s to let page load
    #getting contact person name
    name = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[2]/ctg-study-contacts-and-locations/div/div/div/ctg-study-contact-info/p[1]/span')))
    name_list.append(name.text) #adding each name to the list
    #phone number of contact person
    phone = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[2]/ctg-study-contacts-and-locations/div/div/div/ctg-study-contact-info/p[2]/span')))
    phone_list.append(phone.text) #adding each phone number to the list
    #email of contact person
    email = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[2]/ctg-study-contacts-and-locations/div/div/div/ctg-study-contact-info/p[3]/ctg-study-contact-email/span/a')))
    email_list.append(email.text) #adding each email address to the list
    #number of enrollment
    enrollment_num = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[3]/div[2]/div[3]/div[2]')))
    enrollment.append(enrollment_num.text) #adding each enrollment number to the list
    #condition of study
    conditions = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[3]/div[2]/div[1]/div[2]')))
    condition_list.append(conditions.text) #adding conditions of the study to list
    driver.back() #return to search page
#adding all the different lists to the clinical_trial dict
clinical_trial["name"] = name_list
clinical_trial["phone_num"] = phone_list
clinical_trial["email_address"] = email_list
clinical_trial["Enrollment"] = enrollment
clinical_trial["Conditions"] = condition_list
I am having an issue with Selenium somehow not finding the XPath for enrollment_num in the loop. The loop runs through the 10 clickable links on the search page, but it raises a TimeoutException at the 9th link. Why is that? When I change the loop to iterate through 8 links instead of the usual 10, it works fine. It's just that one link which creates the error.
Page number 9 is different from all the other pages. The difference is hard to spot. Tip: to compare strings I use Notepad++ with the compare plugin.
This page does not have these 2 elements:
enrollment_num = ...ctg-study-overview/div[3]/div[2]/di...
on page 9 it is:
enrollment_num = ...ctg-study-overview/div[2]/div[2]/di...
conditions = ...ctg-study-overview/div[3]/di...
on page 9 it is:
conditions = ...ctg-study-overview/div[2]/di...
This is why it runs into a timeout. You could build a try/except/else around these calls to keep the program from crashing. Below is a quick fix; of course you should tidy it up. I hope this helps.
# number of enrollment
try:
    enrollment_num = wait.until(EC.visibility_of_element_located((By.XPATH,
        '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[3]/div[2]/div[3]/div[2]')))
    enrollment.append(enrollment_num.text) # adding each enrollment number to the list
except:
    print("enrollment not under div[3] but div[2]")
    enrollment_num = wait.until(EC.visibility_of_element_located((By.XPATH,
        '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[2]/div[2]/div[3]/div[2]')))
    enrollment.append(enrollment_num.text) # adding each enrollment number to the list
else:
    pass
# condition of study
try:
    conditions = wait.until(EC.visibility_of_element_located((By.XPATH,
        '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[3]/div[2]/div[1]/div[2]')))
    condition_list.append(conditions.text) # adding conditions of the study to list
except:
    print("conditions not under div[3] but div[2]")
    conditions = wait.until(EC.visibility_of_element_located((By.XPATH,
        '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[2]/div[2]/div[1]/div[2]')))
    condition_list.append(conditions.text) # adding conditions of the study to list
else:
    pass
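One way to tidy it up (a sketch; first_visible is a hypothetical helper reusing the wait, EC and By objects from the question, and note that every absent layout variant costs a full wait timeout, so a shorter WebDriverWait may be preferable here):
from selenium.common.exceptions import TimeoutException

def first_visible(wait, xpaths):
    # return the first element among the candidate XPaths that becomes visible
    for xp in xpaths:
        try:
            return wait.until(EC.visibility_of_element_located((By.XPATH, xp)))
        except TimeoutException:
            continue  # this layout variant is absent, try the next one
    raise TimeoutException("no candidate XPath became visible")

# div[3] is the usual layout, div[2] is the variant found on page 9
enrollment_num = first_visible(wait, [
    '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[3]/div[2]/div[3]/div[2]',
    '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[2]/div[2]/div[3]/div[2]',
])
enrollment.append(enrollment_num.text)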
I need to loop through the Google Maps results pages; there are several pages to scrape, but I can only scrape the first one.
Here is the code I use to scrape the first page. I would like to scrape all the pages but I don't know how to do that.
url = "https://www.google.com/search?q=contabilidade+em+manaus&biw=1366&bih=657&tbm=lcl&sxsrf=AJOqlzXTyAs7rej8A4k9tuuY9FmGpdOjLg:1676314056027&ei=yIXqY8-tAavQ1sQP0Yy2wAo&ved=0ahUKEwjPsd2-lJP9AhUrqJUCHVGGDagQ4dUDCAk&uact=5&oq=contabilidade+em+manaus&gs_lcp=Cg1nd3Mtd2l6LWxvY2FsEAMyBAgjECcyBggAEBYQHjIJCAAQFhAeEPEEMgkIABAWEB4Q8QQyCQgAEBYQHhDxBDIJCAAQFhAeEPEEMgkIABAWEB4Q8QQyBggAEBYQHjIGCAAQFhAeMgkIABAWEB4Q8QRQAFgAYPYDaABwAHgAgAHWAYgB1gGSAQMyLTGYAQDAAQE&sclient=gws-wiz-local&pccc=1#rlfi=hd:;si:;mv:[[-3.0446025000000003,-59.9553221],[-3.1346859,-60.061026600000005]];tbs:lrf:!1m4!1u3!2m2!3m1!1e1!1m4!1u2!2m2!2m1!1e1!2m1!1e2!2m1!1e3!3sIAE,lf:1,lf_ui:14"
wait = WebDriverWait(driver, 20)
########################################################################################################################
procurar = "contabilidade em Curitiba"
########################################################################################################################
links = []
Nome = []
Endereco = []
Telefone = []
########################################################################################################################
driver.get(url)
driver.maximize_window()
sleep(2)
print("O que procura?")
driver.find_element(By. XPATH, "//input[#value='contabilidade em manaus']").clear()
sleep(2)
input_buscar = driver.find_element(By. XPATH, "//input[#aria-label='Pesquisar']")
input_buscar.send_keys(procurar, Keys. ENTER)
sleep(2)
########################################################################################################################
while True:
    try:
        classe_empresas = driver.find_elements(By.XPATH, "(//div[@class='rllt__details'])")
        for empresa in classe_empresas:
            empresa.click()
            sleep(2)
            nome = driver.find_element(By.XPATH, "//h2[@data-attrid='title']").text
            print(nome)
            Nome.append(nome)
            endereco = driver.find_element(By.XPATH, "//span[@class='LrzXr']").get_attribute("innerHTML")
            print(endereco)
            Endereco.append(endereco)
            try:
                tel = driver.find_element(By.CSS_SELECTOR, ".LrzXr.zdqRlf.kno-fv").text
                print(tel)
                Telefone.append(tel)
            except:
                sem_telefone = "Não Tem Telefone Cadastrado"  # "No phone number on record"
                Telefone.append(sem_telefone)  # append the placeholder, not a stale tel value
                print(sem_telefone)
        driver.find_element(By.XPATH, "//span[normalize-space()='Mais']").click()  # "Mais" = "More"
    except:
        break
data = {'Nome': Nome, 'Endereço': Endereco, 'Telefone': Telefone}
df = pd.DataFrame(data)
df.to_excel('GoogleMaps.xlsx', engine='xlsxwriter')
print(df)
You are showing what works; it would be great to see which URL you are scraping. However, in the Google Maps URL you sent there is a final parameter, "start:x", where x is a number that defines the offset of the first item displayed on the page.
You can change that value as you scrape, to page through all the results.
Here is the value:
https://www.google.com/search?q=contabilidade+&biw=1...;start:20  // the final parameter
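A sketch of how you could use it (assuming results come 20 per page and that appending ;start:N to the #rlfi fragment is accepted, as in the URL above):
# a sketch: raise the start offset by 20 per page until a page comes back empty
offset = 20
while True:
    driver.get(url + ";start:{}".format(offset))
    sleep(2)
    classe_empresas = driver.find_elements(By.XPATH, "(//div[@class='rllt__details'])")
    if not classe_empresas:
        break  # no more results
    # ... scrape classe_empresas exactly as in the loop above ...
    offset += 20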
Also, you can click on the page number at the bottom in a loop:
driver.find_element(By.XPATH, "/html/body/div[6]/div/div[9]/div[1]/div/div[2]/div[2]/div/div/div/div/div/div/div/div/div/div[2]/div/table/tbody/tr/td[" + yourIterateVar + "]/a").click()
where yourIterateVar starts at page 2 and increases until an error occurs (no more pages).
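In loop form that could look like this (a sketch; the pagination XPath is shortened here for readability, which is an assumption about the page structure):
from selenium.common.exceptions import NoSuchElementException

page = 2
while True:
    try:
        # click the link for the next page number
        driver.find_element(By.XPATH, "//table/tbody/tr/td[" + str(page) + "]/a").click()
    except NoSuchElementException:
        break  # no more pages
    sleep(2)
    # ... scrape this page ...
    page += 1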
I am attempting to scrape data through multiple pages (36) from a website to gather the document number and the revision number for each available document and save it to two different lists. If I run the code block below for each individual page, it works perfectly. However, when I added the while loop to loop through all 36 pages, it will loop, but only the data from the first page is saved.
#sam.gov website
url = 'https://sam.gov/search/?index=sca&page=1&sort=-modifiedDate&pageSize=25&sfm%5Bstatus%5D%5Bis_active%5D=true&sfm%5BwdPreviouslyPerformedWrapper%5D%5BpreviouslyPeformed%5D=prevPerfNo%2F'
#webdriver
driver = webdriver.Chrome(options = options_, executable_path = r'C:/Users/439528/Python Scripts/Spyder/chromedriver.exe' )
driver.get(url)
#get rid of pop up window
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#sds-dialog-0 > button > usa-icon > i-bs > svg'))).click()
#list of revision numbers
revision_num = []
#empty list for all the WD links
WD_num = []
substring = '2015'
current_page = 0
while True:
    current_page += 1
    if current_page == 36:
        #find all elements on the page named "field name". For each one, get the text. If the text is 'Revision Number',
        #get the 'sibling' element, which is the actual revision number, and append its text to the revision_num list.
        elements = driver.find_elements_by_class_name('sds-field__name')
        wd_links = driver.find_elements_by_class_name('usa-link')
        for i in elements:
            element = i.text
            if element == 'Revision Number':
                revision_numbers = i.find_elements_by_xpath("./following-sibling::div")
                for x in revision_numbers:
                    a = x.text
                    revision_num.append(a)
        #finding all links that have the partial text 2015 and putting the wd text into the WD_num list
        for link in wd_links:
            wd = link.text
            if substring in wd:
                WD_num.append(wd)
        print('Last Page Complete!')
        break
    else:
        #same scrape as above for every page before the last one
        elements = driver.find_elements_by_class_name('sds-field__name')
        wd_links = driver.find_elements_by_class_name('usa-link')
        for i in elements:
            element = i.text
            if element == 'Revision Number':
                revision_numbers = i.find_elements_by_xpath("./following-sibling::div")
                for x in revision_numbers:
                    a = x.text
                    revision_num.append(a)
        #finding all links that have the partial text 2015 and putting the wd text into the WD_num list
        for link in wd_links:
            wd = link.text
            if substring in wd:
                WD_num.append(wd)
        #click on next page
        click_icon = WebDriverWait(driver, 5, 0.25).until(EC.visibility_of_element_located([By.ID,'bottomPagination-nextPage']))
        click_icon.click()
        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, 'main-container')))
Things I've tried:
I added the WebDriverWait in order to slow the script down for the page to load and/or elements to be clickable/located
I declared the empty lists outside the loop so they are not overwritten on each iteration
I have edited the while loop multiple times, to either count up to 36 (while current_page < 37) or move the counter to the top or bottom of the loop
Any ideas? TIA.
EDIT: added screenshot of 'field name'
I have refactored your code and made things much simpler.
driver = webdriver.Chrome(options = options_, executable_path = r'C:/Users/439528/Python Scripts/Spyder/chromedriver.exe' )
revision_num = []
WD_num = []
for page in range(1,37):
    url = 'https://sam.gov/search/?index=sca&page={}&sort=-modifiedDate&pageSize=25&sfm%5Bstatus%5D%5Bis_active%5D=true&sfm%5BwdPreviouslyPerformedWrapper%5D%5BpreviouslyPeformed%5D=prevPerfNo%2F'.format(page)
    driver.get(url)
    if page==1:
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#sds-dialog-0 > button > usa-icon > i-bs > svg'))).click()
    wd_links = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.XPATH,"//a[contains(@class,'usa-link') and contains(.,'2015')]")))
    revision_numbers = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.XPATH,"//div[@class='sds-field__name' and text()='Revision Number']/following-sibling::div")))
    for revision_number in revision_numbers:
        revision_num.append(revision_number.text)
    for wd_link in wd_links:
        WD_num.append(wd_link.text)
print(revision_num)
print(WD_num)
If you know there are only 36 pages to iterate, you can pass the page value in the URL.
Wait for elements to be visible using WebDriverWait.
Construct your XPath in such a way that it identifies the elements uniquely, without the if/else branching.
console output on my terminal:
I am dealing with pagination and would like my script to scrape a table, click on the next button, scrape the next table and click on next until it is no longer clickable.
The only difference between a clickable and a non-clickable button seems to be a disabled attribute in its tag.
My idea was to create a while loop and click on the button until the disabled attribute disappears, but I'm not sure how to get that attribute in the first place.
Even if the button is disabled, Selenium doesn't throw an "Element not interactable" error so I don't think I can go down that route.
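For reference, the attribute can be read directly: get_attribute("disabled") returns a truthy string when the attribute is present and None when it is absent. A sketch of the loop idea (the aria-label XPath is an assumption about the Material paginator, also used in the last answer below):
# a sketch: scrape, then click "Next" until the button carries the disabled attribute
while True:
    fees_table = driver.find_elements_by_xpath('//mat-row[@class = "mat-row ng-star-inserted"]')
    for fee in fees_table:
        fees_list.append(fee.text)
        airport_list.append(a)
    next_button = driver.find_element_by_xpath(".//button[@aria-label='Next page']")
    if next_button.get_attribute("disabled"):  # None while more pages remain
        break
    next_button.click()
    time.sleep(2)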
airport_list = []
fees_list = []
airports = ["https://www.aopa.org/destinations/business/13035#fees", "https://www.aopa.org/destinations/business/35555#fees"]
for a in airports:
    driver.get(a)
    time.sleep(3)
    # Click dropdown
    driver.find_element_by_xpath('//div[@class = "mat-select-arrow"]').click()
    time.sleep(1)
    # Select "All aircraft"
    driver.find_elements_by_xpath('//span[@class = "mat-option-text"]')[8].click()
    time.sleep(2)
    try:
        # Check if fees are available
        driver.find_element_by_xpath('//mat-row[@class = "mat-row ng-star-inserted"]')
        #Scrape each row
        fees_table = driver.find_elements_by_xpath('//mat-row[@class = "mat-row ng-star-inserted"]')
        for fee in fees_table:
            fees_list.append(fee.text)
            airport_list.append(a)
        #Click on "Next" button
        driver.find_elements_by_xpath('//span[@class = "mat-button-wrapper"]')[4].click()
        time.sleep(2)
    except:
        fees_list.append("This location has not disclosed fees or does not charge fees.")
        airport_list.append(a)
driver.close()
I was able to extract the maximum number of items from the bottom of the table, divide that number by 10, and round up to the nearest whole number. I then use that number to iterate through a range.
import math  # needed for math.ceil below

airport_list = []
fees_list = []
airports = ["https://www.aopa.org/destinations/business/13035#fees"]
for a in airports:
    driver.get(a)
    time.sleep(3)
    # Click dropdown
    driver.find_element_by_xpath('//div[@class = "mat-select-arrow"]').click()
    time.sleep(1)
    # Select "All aircraft"
    driver.find_elements_by_xpath('//span[@class = "mat-option-text"]')[8].click()
    time.sleep(2)
    try:
        # Check if fees are available
        driver.find_element_by_xpath('//mat-row[@class = "mat-row ng-star-inserted"]')
        # Get number of items
        number_of_items = driver.find_element_by_xpath('//div[@class = "mat-paginator-range-label"]').text.split()[-1]
        #print(number_of_items)
        if float(number_of_items) >= 11:
            number_of_button_clicks = math.ceil(float(number_of_items)/10)
        else:
            number_of_button_clicks = 1  # a single page still needs one scrape pass
        #print(number_of_button_clicks)
        for click in range(0, number_of_button_clicks):
            #Scrape each row
            fees_table = driver.find_elements_by_xpath('//mat-row[@class = "mat-row ng-star-inserted"]')
            for fee in fees_table:
                fees_list.append(fee.text)
                airport_list.append(a)
            #Click on "Next" button (a no-op when the button is disabled on the last page)
            driver.find_elements_by_xpath('//span[@class = "mat-button-wrapper"]')[4].click()
            time.sleep(2)
    except:
        fees_list.append("This location has not disclosed fees or does not charge fees.")
        airport_list.append(a)
#print(fees_list)
#print(airport_list)
driver.close()
Instead of going to the next page, use the maximum page-size limit as shown in the code below. On top of that, you really don't need the try/except block:
for a in airports:
    driver.get(a)
    time.sleep(3)
    # Click dropdown
    driver.find_element_by_xpath('//div[@class = "mat-select-arrow"]').click()
    time.sleep(1)
    # Select "All aircraft"
    driver.find_elements_by_xpath('//span[@class = "mat-option-text"]')[8].click()
    time.sleep(3)
    # select 100 items per page if items are present
    if len(driver.find_elements_by_xpath(".//mat-select[@aria-label='Items per page:']")) > 0:
        driver.find_element_by_xpath(".//mat-select[@aria-label='Items per page:']").click()
        time.sleep(3)
        driver.find_element_by_xpath(".//span[@class='mat-option-text' and text()='100']/parent::mat-option").click()
    # Scrape each row
    fees_table = driver.find_elements_by_xpath('//mat-row[@class = "mat-row ng-star-inserted"]')
    for fee in fees_table:
        fees_list.append(fee.text)
    print(fees_list)
    # if needed, click on the "Next" button using this xpath and apply the same for loop as above
    #driver.find_element_by_xpath(".//button[@aria-label='Next page']").click()
driver.close()
I'm trying to loop through a dropdown menu at this URL: https://www.accuform.com/safety-sign/danger-danger-authorized-personnel-only-MADM006
So, for example, the first dropdown menu - under options - lists out different materials and I want to select each one in turn and then gather some other information from the webpage before moving on to the next material. Here is my current code:
driver = webdriver.Firefox()
driver.get('https://www.accuform.com/safety-sign/danger-danger-authorized-personnel-only-MADM006')
time.sleep(3)
driver.find_element_by_id('x-mark-icon').click()
select = Select(driver.find_element_by_name('Wiqj7mb4rsAq9LB'))
options = select.options
optionsList = []
driver.find_elements_by_class_name('select-wrapper')[0].click()
element = driver.find_element_by_xpath("//select[@name='Wiqj7mb4rsAq9LB']")
actions = ActionChains(driver)
actions.move_to_element(element).perform()
# driver.execute_script("arguments[0].scrollIntoView();", element)
for option in options: #iterate over the options, place attribute value in list
    optionsList.append(option.get_attribute("value"))
for optionValue in optionsList:
    print("starting loop on option %s" % optionValue)
    # select = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//select[@name='Wiqj7mb4rsAq9LB']")))
    # select = Select(select)
    select.select_by_value(optionValue)
I started with just the loop, but got this error:
ElementNotInteractableException: Message: Element <option> could not be scrolled into view
I then added the WebDriverWait and got a TimeoutException error.
I then realized I should probably click on the wrapper in which the dropdown is held, so I added the click, which does pop up the menu, but I still got the TimeoutException.
So I thought maybe I should move to the element, which I tried with the ActionChains lines, and I got this error:
WebDriverException: Message: TypeError: rect is undefined
I tried to avoid that error by using this code instead:
# driver.execute_script("arguments[0].scrollIntoView();", element)
Which just resulted in the TimeoutException again.
I'm pretty new to Python and Selenium and have basically just been modifying code from SO answers to similar questions, but nothing has worked.
I'm using Python 3.6 and the current versions of Selenium and the Firefox webdriver.
If anything is unclear or if you need more info just let me know.
Thanks so much!
EDIT: Based on the answer and comments by Kajal Kunda, I've updated my code to the following:
material_dropdown = driver.find_element_by_xpath("//input[@class='select-dropdown']")
driver.execute_script("arguments[0].click();", material_dropdown)
materials = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
for material in materials:
    # material_dropdown = driver.find_element_by_xpath("//input[@class='select-dropdown']")
    # driver.execute_script("arguments[0].click();", material_dropdown)
    # materials = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
    material_ele = material.find_element_by_tag_name('span')
    if material_ele.text != '':
        material_ele.click()
        time.sleep(5)
        price = driver.find_element_by_class_name("dataPriceDisplay")
        print(price.text)
The result is that it successfully prints the price for the first type of material, but then it returns:
StaleElementReferenceException: Message: The element reference of <li class=""> is stale;...
I've tried variations of having the hashed out lines in and outside of the loop, but always get a version of the StaleElementReferenceException error.
Any suggestions?
Thanks!
You could do the whole thing with requests. Grab the options from the drop-down list, then concatenate their value attributes into the requests URL that retrieves JSON containing all the info on the page. The same principle applies for the other dropdowns: the ids for each drop-down selection are the value attributes of its options, and they appear in the URL I show separated by // for each drop-down selection.
import requests
from bs4 import BeautifulSoup as bs
url = 'https://www.accuform.com/product/getSku/danger-danger-authorized-personnel-only-MADM006/1/false/null//{}//WHFIw3xXmQx8zlz//6wr93DdrFo5JV//WdnO0RpwKpc4fGF'
startURL = 'https://www.accuform.com/safety-sign/danger-danger-authorized-personnel-only-MADM006'
res = requests.get(startURL)
soup = bs(res.content, 'lxml')
materials = [item['value'] for item in soup.select('#Wiqj7mb4rsAq9LB option')]
sizes = [item['value'] for item in soup.select('#WvXESrTyQjM3Ciw option')]
languages = [item['value'] for item in soup.select('#WUYWGMePtpmpmhy option')]
units = [item['value'] for item in soup.select('#W91eqaJ0WPXwe9b option')]
for material in materials:
    data = requests.get(url.format(material)).json()
    soup = bs(data['dataMaterialBullets'], 'lxml')
    lines = [item.text for item in soup.select('li')]
    print(lines)
    print(data['dataPriceDisplay'])
    # etc......
Sample of JSON:
Try the code below. It should work.
driver = webdriver.Firefox()
driver.get('https://www.accuform.com/safety-sign/danger-danger-authorized-personnel-only-MADM006')
time.sleep(3)
driver.find_element_by_id('x-mark-icon').click()
material_dropdown = driver.find_element_by_xpath("//input[@class='select-dropdown']")
driver.execute_script("arguments[0].click();", material_dropdown)
#Code for material dropdown
materials = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
material_optionsList = []
for material in materials:
    material_ele = material.find_element_by_tag_name('span')
    if material_ele.text != '':
        material_optionsList.append(material_ele.text)
print(material_optionsList)
driver.execute_script("arguments[0].click();", material_dropdown)
size_dropdown = driver.find_element_by_xpath("(//input[@class='select-dropdown'])[2]")
driver.execute_script("arguments[0].click();", size_dropdown)
#Code for size dropdown
Sizes = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
size_optionsList = []
for size in Sizes:
    size_ele = size.find_element_by_tag_name('span')
    if size_ele.text != '':
        size_optionsList.append(size_ele.text)
driver.execute_script("arguments[0].click();", size_dropdown)
Output:
[u'Adhesive Vinyl', u'Plastic', u'Adhesive Dura-Vinyl', u'Aluminum', u'Dura-Plastic\u2122', u'Aluma-Lite\u2122', u'Dura-Fiberglass\u2122', u'Accu-Shield\u2122']
I hope you can do the rest the same way. Let me know if it works for you.
EDIT: Code to loop through the materials and get the price value for each.
for material in range(len(materials)):
    material_ele = materials[material]
    if material_ele.text != '':
        #material_optionsList.append(material_ele.text)
        #material_ele.click()
        driver.execute_script("arguments[0].click();", material_ele)
        time.sleep(2)
        price = driver.find_element_by_id("priceDisplay")
        print(price.text)
        time.sleep(2)
        material_dropdown = driver.find_element_by_xpath("//input[@class='select-dropdown']")
        driver.execute_script("arguments[0].click();", material_dropdown)
        materials = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
        material += 2  # note: this has no effect, the for loop reassigns material each iteration
Output:
$8.31
$9.06
$13.22
$15.91
$15.91
I only want to scrape the required information contained in the black box, and delete/remove/exclude the information contained in the red box
I am doing this because class names "entry" and "partial entry" exist in both boxes. Only the first "partial entry" contains the information that I need, so I plan to delete/remove/exclude the classname "mgrRspnInLine".
My code is:
while True:
    container = driver.find_elements_by_xpath('.//*[contains(@class,"review-container")]')
    for item in container:
        try:
            element = item.find_element_by_class_name('mgrRspnInline')
            driver.execute_script("""var element = document.getElementsByClassName("mgrRspnInline")[0];element.parentNode.removeChild(element);""", element)
            WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH,'.//*[contains(@class,"taLnk ulBlueLinks")]')))
            element = WebDriverWait(driver, 50).until(EC.element_to_be_clickable((By.XPATH,'.//*[contains(@class,"taLnk ulBlueLinks")]')))
            element.click()
            time.sleep(2)
            rating = item.find_elements_by_xpath('.//*[contains(@class,"ui_bubble_rating bubble_")]')
            for rate in rating:
                rate = rate.get_attribute("class")
                rate = str(rate)
                rate = rate[-2:]
                score_list.append(rate)
            time.sleep(2)
            stay = item.find_elements_by_xpath('.//*[contains(@class,"recommend-titleInline noRatings")]')
            for stayed in stay:
                stayed = stayed.text
                stayed = stayed.split(', ')
                stayed.append(stayed[0])
                travel_type.append(stayed[1])
            WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH,'.//*[contains(@class,"noQuotes")]')))
            summary = item.find_elements_by_xpath('.//*[contains(@class,"noQuotes")]')
            for comment in summary:
                comment = comment.text
                comments.append(comment)
            WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH,'.//*[contains(@class,"ratingDate")]')))
            rating_date = item.find_elements_by_xpath('.//*[contains(@class,"ratingDate")]')
            for date in rating_date:
                date = date.get_attribute("title")
                date = str(date)
                review_date.append(date)
            WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH,'.//*[contains(@class,"partial_entry")]')))
            review = item.find_elements_by_xpath('.//*[contains(@class,"partial_entry")]')
            for comment in review:
                comment = comment.text
                print(comment)
                reviews.append(comment)
        except (NoSuchElementException) as e:
            continue
    try:
        element = WebDriverWait(driver, 100).until(EC.element_to_be_clickable((By.XPATH,'.//*[contains(@class,"nav next taLnk ui_button primary")]')))
        element.click()
        time.sleep(2)
    except (ElementClickInterceptedException,NoSuchElementException) as e:
        print(e)
        break
Basically, within the "review-container" I searched first for the class name "mgrRspnInLine", then tried to delete it using execute_script.
But unfortunately, the output still shows the contents contained in the "mgrRspnInLine".
If you want to avoid matching the second element, you can modify your XPath as below:
.//*[contains(@class,"partial_entry") and not(ancestor::*[@class="mgrRspnInLine"])]
This will match elements with the class name "partial_entry" only if they don't have an ancestor with the class name "mgrRspnInLine".
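Dropped into the existing loop, that would look something like (a sketch):
review = item.find_element_by_xpath('.//*[contains(@class,"partial_entry") and not(ancestor::*[@class="mgrRspnInLine"])]')
reviews.append(review.text)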
If you want the first occurrence you could use the CSS class selector
.partial_entry
and retrieve it with find_element_by_css_selector:
find_element_by_css_selector(".partial_entry")
You can delete all the .mgrRspnInLine elements with:
driver.execute_script("[...document.querySelectorAll('.mgrRspnInLine')].map(el => el.parentNode.removeChild(el))")
Stitching together the comment by Andersson and the two answers provided by QHarr and pguardiario, I finally solved the problem.
The key is to target a container within the container: all the information is contained in the class name "ui_column is-9", which is itself contained in the class name "review-container". This addresses Andersson's comment about multiple mgrRspnInLine elements.
Within the nested loop, I used pguardiario's suggestion to delete the existing mgrRspnInLine elements, then added QHarr's answer on .partial_entry:
while True:
    container = driver.find_elements_by_xpath('.//*[contains(@class,"review-container")]')
    for items in container:
        element = WebDriverWait(driver, 1000).until(EC.element_to_be_clickable((By.XPATH,'.//*[contains(@class,"taLnk ulBlueLinks")]')))
        element.click()
        time.sleep(10)
        contained = items.find_elements_by_xpath('.//*[contains(@class,"ui_column is-9")]')
        for item in contained:
            try:
                driver.execute_script("[...document.querySelectorAll('.mgrRspnInLine')].map(el => el.parentNode.removeChild(el))")
                rating = item.find_element_by_xpath('//*[contains(@class,"ui_bubble_rating bubble_")]')
                rate = rating.get_attribute("class")
                rate = str(rate)
                rate = rate[-2:]
                score_list.append(rate)
                time.sleep(2)
                stay = item.find_element_by_xpath('.//*[contains(@class,"recommend-titleInline")]')
                stayed = stay.text
                stayed = stayed.split(', ')
                stayed.append(stayed[0])
                travel_type.append(stayed[1])
                WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH,'.//*[contains(@class,"noQuotes")]')))
                summary = item.find_element_by_xpath('.//*[contains(@class,"noQuotes")]')
                comment = summary.text
                comments.append(comment)
                WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH,'.//*[contains(@class,"ratingDate")]')))
                rating_date = item.find_element_by_xpath('.//*[contains(@class,"ratingDate")]')
                date = rating_date.get_attribute("title")
                date = str(date)
                review_date.append(date)
                WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.XPATH,'.//*[contains(@class,"partial_entry")]')))
                review = item.find_element_by_css_selector(".partial_entry")
                comment = review.text
                print(comment)
            except (NoSuchElementException) as e:
                continue
    try:
        element = WebDriverWait(driver, 100).until(EC.element_to_be_clickable((By.XPATH,'.//*[contains(@class,"nav next taLnk ui_button primary")]')))
        element.click()
        time.sleep(2)
    except (ElementClickInterceptedException,NoSuchElementException) as e:
        print(e)
        break