WebDriverWait is terminated before reach to the last page

WebDriverWait is terminated before reach to the last page - python

I scraped review score data from booking.com. My code is shown as follow with step-by-step explanation
import pandas as pd
import time
from random import randint
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
def get_random_wait(lower=5, upper=15):
max_wait = randint(lower, upper)
waiting_time = randint(lower - 1, max_wait)
print("waiting for {} seconds".format(waiting_time))
return waiting_time
name = "Eden The Residence at The Sea"
# Invoke Chrome and go to booking.com
driver = webdriver.Chrome(executable_path="../drivers/chromedriver.exe")
driver.get("https://booking.com")
print("Accessing " + driver.title)
print(driver.current_url)
# Enter hotel name
print("Searching for {}.".format(name))
time.sleep(get_random_wait())
driver.find_element_by_css_selector("input[class*='sb-searchbox__input']").send_keys(name)
# Click search
print("Clicking search button.")
time.sleep(get_random_wait())
driver.find_element_by_css_selector("button[type='submit']").click()
# Click the hotel name, i.e. Eden the Residence at The Sea
print("Selecting {} from return list.".format(name))
time.sleep(get_random_wait())
driver.find_element_by_xpath("//span[contains(text(),'" + name + "')]").click()
# Click to show toggle side bar that store reviews
print("Clicking on review tab.")
time.sleep(get_random_wait())
driver.switch_to.window(driver.window_handles[1])
# Click to show all reviews
print("Showing all reviews.")
time.sleep(get_random_wait())
driver.find_element_by_css_selector('#show_reviews_tab').click()
# Choose sorting by date from drop-down menu
print("Sorting review by date in descending order.")
time.sleep(get_random_wait())
dropdown = Select(driver.find_element_by_css_selector("select[id='review_sort'] "))
dropdown.select_by_visible_text("Date (newer to older)")
date_reviews = []
score_reviews = []
scrape_data = pd.DataFrame()
page = 1
while True:
try:
print("Scraping reviews on page {}".format(page))
# time.sleep(get_random_wait())
container = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div[class='c-review-block']")))
# Extracting dates
dates = [c.find_element_by_css_selector("div[class='c-review-block__row'] > span[class='c-review-block__date']").text.split("Reviewed: ")[-1] for c in container]
date_reviews = date_reviews + dates
# Extracting scores
scores = [float(c.find_element_by_css_selector("div[class='bui-review-score__badge']").text) for c in container]
score_reviews = score_reviews + scores
buffer = pd.DataFrame({"date": date_reviews,
"score": score_reviews})
scrape_data = pd.concat([scrape_data, buffer], axis=0, ignore_index=True, sort=False)
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a[class='pagenext']"))).click()
page += 1
except Exception:
scrape_data["date"] = pd.to_datetime(scrape_data["date"], format="%B %d, %Y")
break
With this hotel, the code should click nextpage until the 10th page but it stop at the 8th page. I could not figure out what went wrong with the code. May I have your suggestions?

Some reviews are hidden by moderators and looks like "This review is hidden because it doesn't meet our guidelines.". You cannot extract date or score from these reviews and this is why your code breakes
Try to replace below 2 lines
dates = [c.find_element_by_css_selector("div[class='c-review-block__row'] > span[class='c-review-block__date']").text.split("Reviewed: ")[-1] for c in container]
scores = [float(c.find_element_by_css_selector("div[class='bui-review-score__badge']").text) for c in container]
with
dates_selector = "div[class='c-review-block__row'] > span[class='c-review-block__date']"
dates = [c.find_element_by_css_selector(dates_selector).text.split("Reviewed: ")[-1] if c.find_elements_by_css_selector(dates_selector) else None for c in container]
score_selector = "div[class='bui-review-score__badge']"
scores = [float(c.find_element_by_css_selector(score_selector).text) if c.find_elements_by_css_selector(score_selector) else None for c in container]
In case of hidden review you'll get None as date and score (you can of cource set another value)

Related

How to extract all the google reviews from google map

I need to scrap all the google reviews. There are 90,564 reviews in my page. However the code i wrote can scrap only top 9 reviews. The other reviews are not scraped.
The code is given below:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# specify the url of the business page on Google
url = 'https://www.google.com/maps/place/ISKCON+temple+Bangalore/#13.0098328,77.5510964,15z/data=!4m7!3m6!1s0x0:0x7a7fb24a41a6b2b3!8m2!3d13.0098328!4d77.5510964!9m1!1b1'
# create an instance of the Chrome driver
driver = webdriver.Chrome()
# navigate to the specified url
driver.get(url)
# Wait for the reviews to load
wait = WebDriverWait(driver, 20) # increased the waiting time
review_elements = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'wiI7pd')))
# extract the text of each review
reviews = [element.text for element in review_elements]
# print the reviews
print(reviews)
# close the browser
driver.quit()
what should i edit/modify the code to extract all the reviews?

Here is the working code for you after launching the url
totalRev = "div div.fontBodySmall"
username = ".d4r55"
reviews = "wiI7pd"
wait = WebDriverWait(driver, 20)
totalRevCount = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, totalRev))).get_attribute("textContent").split(' ')[0].replace(',','').replace('.','')
print("totalRevCount - ", totalRevCount)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, totalRev))).click()
mydict = {}
found = 0
while found < int(totalRevCount):
review_elements = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, reviews)))
reviewer_names = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, username)))
found = len(mydict)
for rev, name in zip(review_elements, reviewer_names):
mydict[name.text] = rev.text
if len(rev.text) == 0:
found = int(totalRevCount) + 1
break
for i in range(8):
ActionChains(driver).key_down(Keys.ARROW_DOWN).perform()
print("found - ", found)
print(mydict)
time.sleep(2)
Explanation -
Get the locators for user name and review since we are going to create a key-value pair which will be useful in creating a non-duplicate result
You need to first get the total number of reviews/ratings that are present for that given location.
Get the username and review for the "visible" part of the webpage and store it in the dictionary
Scroll down the page and wait a few seconds
Get the username and review again and add them to dictionary. Only new ones will be added
As soon as a review that has no text (only rating), the loop will close and you have your results.
NOTE - If you want all reviews irrespective of the review text present or not, you can remove the "if" loop

I think you'll need to scoll down at first, and the get all the reviews.
scroll_value = 230
driver.execute_script( 'window.scrollBy( 0, '+str(scroll_value)+ ' )' ) # to scroll by value
# to get the current scroll value on the y axis
scroll_Y = driver.execute_script( 'return window.scrollY' )
That might be because the elements don't get loaded elsewise.
Since they are over 90'000, you might consider scolling down a little, then getting the reviews, repeat.
Resource: https://stackoverflow.com/a/74508235/20443541

How to extract data in the right order with Beautiful Soup

I am trying to extract the balance sheet for an example ticker "MSFT" (Microsoft) from Yahoo Finance.
Using Selenium to click on the button "Expand All" before any scraping is done. This part seems to work.
By the way, when the Chrome web driver is launched, I manually click on the button(s) to accept or reject cookies. In a later step, I plan to add some more code so that this part is also automated. My question is though not on this one now.
Below is how the code currently looks like.
# for scraping the balance sheet from Yahoo Finance
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
# importing selenium to click on the "Expand All" button before scraping the financial statements
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
def get_balance_sheet_from_yfinance(ticker):
url = f"https://finance.yahoo.com/quote/{ticker}/balance-sheet?p={ticker}"
options = Options()
options.add_argument("start-maximized")
driver = webdriver.Chrome(chrome_options=options)
driver.get(url)
WebDriverWait(driver, 3600).until(EC.element_to_be_clickable((
By.XPATH, "//section[#data-test='qsp-financial']//span[text()='Expand All']"))).click()
#content whole page in html format
soup = BeautifulSoup(driver.page_source, 'html.parser')
# get the column headers (i.e. 'Breakdown' row)
div = soup.find_all('div', attrs={'class': 'D(tbhg)'})
if len(div) < 1:
print("Fail to retrieve table column header")
exit(0)
# get the list of columns from the column headers
col = []
for h in div[0].find_all('span'):
text = h.get_text()
if text != "Breakdown":
col.append(datetime.strptime(text, "%m/%d/%Y"))
df = pd.DataFrame(columns=col)
# the following code returns an empty list for index (why?)
# and values in a list that need actually be in a DataFrame
idx = []
for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
for h in div.find_all('title'):
text = h.get_text()
idx.append(text)
val = []
for div in soup.find_all('div', attrs={'data-test': 'fin-col'}):
for h in div.find_all('span'):
num = int(h.get_text().replace(",", "")) * 1000
val.append(num)
# if the above part is commented out and this block is used instead
# the following code manages to work well until the row "Cash Equivalents"
# that is because there are no entries for years 2020 and 2019 on this row
""" for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
i = 0
idx = ""
val = []
for h in div.find_all('span'):
if i % 5 == 0:
idx = h.get_text()
else:
num = int(h.get_text().replace(",", "")) * 1000
val.append(num)
i += 1
row = pd.DataFrame([val], columns=col, index=[idx])
df = pd.concat([df, row], axis=0) """
return idx, val
get_balance_sheet_from_yfinance("MSFT")
I could not get the data scraped from the expanded table in a usable tabular format. Instead, the function above returns what I managed to scrape from the webpage. There are some additional comments in the code.
Could you give me some ideas on how to properly extract the data and put it into a DataFrame object with index which should be the text under the "Breakdown" column? Basically, the DataFrame should look like the snapshot below, with what is under the first column in there being the index.
balance-sheet-df

i've spent a long time on this, hope it helps, basically your function now returns a dataFrame with the following formatting:
2022-06-29 2021-06-29 2020-06-29 2019-06-29
Total Assets 364,840,000 333,779,000 301,311,000 286,556,000
Current Assets 169,684,000 184,406,000 181,915,000 175,552,000
Cash, Cash Equivalents & Short Term Investments 104,749,000 130,334,000 136,527,000 133,819,000
Cash And Cash Equivalents 13,931,000 14,224,000 13,576,000 11,356,000
Cash 8,258,000 7,272,000 - -
... ... ... ... ...
Tangible Book Value 87,720,000 84,477,000 67,915,000 52,554,000
Total Debt 61,270,000 67,775,000 70,998,000 78,366,000
Net Debt 35,850,000 43,922,000 49,751,000 60,822,000
Share Issued 7,464,000 7,519,000 7,571,000 7,643,000
Ordinary Shares Number 7,464,000 7,519,000 7,571,000 7,643,000
and here's the final code:
# for scraping the balance sheet from Yahoo Finance
from time import sleep
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
# importing selenium to click on the "Expand All" button before scraping the financial statements
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
def get_balance_sheet_from_yfinance(ticker):
url = f"https://finance.yahoo.com/quote/{ticker}/balance-sheet?p={ticker}"
options = Options()
options.add_argument("start-maximized")
driver = webdriver.Chrome(chrome_options=options)
driver.get(url)
WebDriverWait(driver, 3600).until(EC.element_to_be_clickable((
By.XPATH, "//section[#data-test='qsp-financial']//span[text()='Expand All']"))).click()
# content whole page in html format
soup = BeautifulSoup(driver.page_source, 'html.parser')
# get the column headers (i.e. 'Breakdown' row)
div = soup.find_all('div', attrs={'class': 'D(tbhg)'})
if len(div) < 1:
print("Fail to retrieve table column header")
exit(0)
# get the list of columns from the column headers
col = []
for h in div[0].find_all('span'):
text = h.get_text()
if text != "Breakdown":
col.append(datetime.strptime(text, "%m/%d/%Y"))
row = {}
for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
head = div.find('span').get_text()
i = 4
for h in div.find_all('span'):
if h.get_text().replace(',', '').isdigit() or h.get_text()[0] == '-':
row[head].append(h.get_text())
i += 1
else:
while i < 4:
row[head].append('')
i += 1
else:
head = h.get_text()
row[head] = []
i = 0
for k, v in row.items():
while len(v) < 4:
row[k].append('-')
df = pd.DataFrame(columns=col, index=row.keys(), data=row.values())
print(df)
return df
get_balance_sheet_from_yfinance("MSFT")
i've removed some od the unused code and added a new scrapping method, but i have kept your method of getting the dates of all the columns.
if you have any questions don't hesitate to ask in the comments.

Already complete scraping scrapes everything on the page. I would like to limit the scraping to only a certain section

I placed the code of a complete and properly functioning scraping that I own. Successfully scrapes all elements on the page.
However, I would like to scrape only a small limited section of the page with the same elements as scraping. This limited section is already scraped correctly along with all elements of the page, but I would like to scrape only it and not "all + it". The link is here
There are 4 tables on the page, but I would like to scrape just one, that is the table called "Programma", ie the html section "event-summary event" or "leagues-static event-summary-leagues ". But of this section only the elements of the last round (Matchday 14). Matchday 14 only. No round 15. So obviously that with each update of the page rounds, the last round is always scraped as well.
So I would need to insert something that makes scraping understand to download only the elements (which it already owns and scrapes) of of that section and the last round.
The code is already complete and works fine, so I'm not looking for code services, but for a little hint to tell me how to limit the scraping to just the section mentioned above. Scraping is in Selenium. I would like to stick with Selenium and my code as it is already functional and complete. Thanks
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
driver.get("url")
driver.implicitly_wait(12)
#driver.minimize_window()
wait = WebDriverWait(driver, 10)
all_rows = driver.find_elements(By.CSS_SELECTOR, "div[class^='event__round'],div[class^='event__match']")
current_round = '?'
for bundesliga in all_rows:
classes = bundesliga.get_attribute('class')
#print(classes)
if 'event__round' in classes:
#round = row.find_elements(By.CSS_SELECTOR, "[class^='event__round event__round--static']")
#current_round = row.text # full text `Round 20`
current_round = bundesliga.text.split(" ")[-1] # only `20` without `Round`
else:
datetime = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__time']")
#Divide la data e l'ora
date, time = datetime.text.split(" ")
date = date.rstrip('.') # right-strip to remove `.` at the end of date
team_home = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__participant event__participant--home']")
team_away = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__participant event__participant--away']")
score_home = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--home']")
score_away = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--away']")
bundesliga = [current_round, date, time, team_home.text, team_away.text, score_home.text, score_away.text]
bundesliga.append(bundesliga)
print(bundesliga)

I think all you need to do is limit all_rows variable. One way to do this is finding the tab you are looking for with text and then getting the parent elements.
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
driver = webdriver.Firefox()
driver.get("https://www.someurl/some/other/page")
driver.implicitly_wait(12)
#driver.minimize_window()
wait = WebDriverWait(driver, 10)
# all_rows = driver.find_elements(By.CSS_SELECTOR, "div[class^='event__round'],div[class^='event__match']")
############### UPDATE ####################
def parent_element(element):
return element.find_element(By.XPATH, './..')
programma_element = WebDriverWait(driver, 10).until(
EC.visibility_of_element_located((By.XPATH, "//div[text()='Programma']")))
programma_element_p1 = parent_element(programma_element)
programma_element_p2 = parent_element(programma_element_p1)
programma_element_p3 = parent_element(programma_element_p2)
all_rows = programma_element_p3.find_elements(By.CSS_SELECTOR, "div[class^='event__round'],div[class^='event__match']")
filter_rows = []
for row in all_rows:
if "event__match--last" in row.get_attribute('class'):
filter_rows.append(row)
break
else:
filter_rows.append(row)
############### UPDATE ####################
current_round = '?'
for bundesliga in filter_rows:
classes = bundesliga.get_attribute('class')
#print(classes)
if 'event__round' in classes:
#round = row.find_elements(By.CSS_SELECTOR, "[class^='event__round event__round--static']")
#current_round = row.text # full text `Round 20`
current_round = bundesliga.text.split(" ")[-1] # only `20` without `Round`
else:
datetime = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__time']")
#Divide la data e l'ora
date, time = datetime.text.split(" ")
date = date.rstrip('.') # right-strip to remove `.` at the end of date
team_home = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__participant event__participant--home']")
team_away = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__participant event__participant--away']")
# score_home = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--home']")
# score_away = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--away']")
try:
score_home = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--home']")
except (TimeoutException, NoSuchElementException):
MyObject = type('MyObject', (object,), {})
score_home = MyObject()
score_home.text = "-"
try:
score_away = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--away']")
except (TimeoutException, NoSuchElementException):
MyObject = type('MyObject', (object,), {})
score_away = MyObject()
score_away.text = "-"
bundesliga = [current_round, date, time, team_home.text, team_away.text, score_home.text, score_away.text]
bundesliga.append(bundesliga)
print(bundesliga)

Python Selenium: Changing from three loops to one loop repeat the same information

I am extracting google reviews of a resturant. I am interested in extracting reviewer name, rating given by reviewer, and text of the review. I used following code for the extraction:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
driver = webdriver.Chrome('')
base_url = 'https://www.google.com/search?tbs=lf:1,lf_ui:9&tbm=lcl&sxsrf=AOaemvJFjYToqQmQGGnZUovsXC1CObNK1g:1633336974491&q=10+famous+restaurants+in+Dunedin&rflfq=1&num=10&sa=X&ved=2ahUKEwiTsqaxrrDzAhXe4zgGHZPODcoQjGp6BAgKEGo&biw=1280&bih=557&dpr=2#lrd=0xa82eac0dc8bdbb4b:0x4fc9070ad0f2ac70,1,,,&rlfi=hd:;si:5749134142351780976,l,CiAxMCBmYW1vdXMgcmVzdGF1cmFudHMgaW4gRHVuZWRpbiJDUjEvZ2VvL3R5cGUvZXN0YWJsaXNobWVudF9wb2kvcG9wdWxhcl93aXRoX3RvdXJpc3Rz2gENCgcI5Q8QChgFEgIIFkiDlJ7y7YCAgAhaMhAAEAEQAhgCGAQiIDEwIGZhbW91cyByZXN0YXVyYW50cyBpbiBkdW5lZGluKgQIAxACkgESaXRhbGlhbl9yZXN0YXVyYW50mgEkQ2hkRFNVaE5NRzluUzBWSlEwRm5TVU56ZW5WaFVsOUJSUkFCqgEMEAEqCCIEZm9vZCgA,y,2qOYUvKQ1C8;mv:[[-45.8349553,170.6616387],[-45.9156414,170.4803685]]'
driver.get(base_url)
WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"//div[./span[text()='Newest']]"))).click()
total_reviews_text =driver.find_element_by_xpath("//div[#class='review-score-container']//div//div//span//span[#class='z5jxId']").text
num_reviews = int (total_reviews_text.split()[0])
all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
time.sleep(2)
total_reviews = len(all_reviews)
while total_reviews < num_reviews:
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
#all_reviews = driver.find_elements_by_css_selector('div.gws-localreviews__google-review')
time.sleep(5)
all_reviews = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
print(total_reviews)
total_reviews +=5
person_info = driver.find_elements_by_xpath("//div[#id='reviewSort']//div[contains(#class,'google-review')]")
rating_info = driver.find_elements_by_xpath("//div[#class='PuaHbe']")
review_text = driver.find_elements_by_xpath("//div[#class='Jtu6Td']")
for person in person_info:
name = person.find_element_by_xpath("./div/div/div/a").text
print(name)
for rating in rating_info:
rating_txt = person.find_element_by_xpath("./g-review-stars/span").get_attribute('aria-label')
print(rating_txt)
for text in review_text:
texts = text.find_element_by_xpath("./span").text
print(texts)
The above code worked as per expectations. I want to make slight change in above code. Instead of using three loops to display name, rating, and review_text. I wanted to extract the same information using one loop. So I made following changes in the above code:
reviews_info = driver.find_elements_by_xpath("//div[#class='jxjCjc']")
for review_info in reviews_info:
name = review_info.find_element_by_xpath("./div/div/a").text
rating = review_info.find_element_by_xpath("//div[#class='PuaHbe']//g-review-stars//span").get_attribute('aria-label')
text = review_info.find_element_by_xpath("//div[#class='Jtu6Td']//span").text
print(name)
print(rating)
print(text)
print()
The problem with a change in code is that it displays the same information (i.e. rating and text) for all reviewers names. I am not sure where am I making the mistake. Any help to fix the issue would be really appreciated.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
driver = webdriver.Chrome()
base_url = 'https://www.google.com/search?tbs=lf:1,lf_ui:9&tbm=lcl&sxsrf=AOaemvJFjYToqQmQGGnZUovsXC1CObNK1g:1633336974491&q=10+famous+restaurants+in+Dunedin&rflfq=1&num=10&sa=X&ved=2ahUKEwiTsqaxrrDzAhXe4zgGHZPODcoQjGp6BAgKEGo&biw=1280&bih=557&dpr=2#lrd=0xa82eac0dc8bdbb4b:0x4fc9070ad0f2ac70,1,,,&rlfi=hd:;si:5749134142351780976,l,CiAxMCBmYW1vdXMgcmVzdGF1cmFudHMgaW4gRHVuZWRpbiJDUjEvZ2VvL3R5cGUvZXN0YWJsaXNobWVudF9wb2kvcG9wdWxhcl93aXRoX3RvdXJpc3Rz2gENCgcI5Q8QChgFEgIIFkiDlJ7y7YCAgAhaMhAAEAEQAhgCGAQiIDEwIGZhbW91cyByZXN0YXVyYW50cyBpbiBkdW5lZGluKgQIAxACkgESaXRhbGlhbl9yZXN0YXVyYW50mgEkQ2hkRFNVaE5NRzluUzBWSlEwRm5TVU56ZW5WaFVsOUJSUkFCqgEMEAEqCCIEZm9vZCgA,y,2qOYUvKQ1C8;mv:[[-45.8349553,170.6616387],[-45.9156414,170.4803685]]'
driver.get(base_url)
WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"//div[./span[text()='Newest']]"))).click()
total_reviews_text =driver.find_element_by_xpath("//div[#class='review-score-container']//div//div//span//span[#class='z5jxId']").text
num_reviews = int (total_reviews_text.split()[0])
print("NUm reviews=", num_reviews)
all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
time.sleep(2)
total_reviews = len(all_reviews)
print("Total reviews=", total_reviews)
s = "(//div[#id='reviewSort']//div[contains(#class,'google-review')])[0]"
b = '0'
a = 1 # Index of Review button
for i in range(10):
c = str(a)
s = s.replace(b, c) # Updating Xpath's index in every loop so that it can focus on new review everytime.
b = str(a)
a = a + 1
WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
time.sleep(5)
all_reviews = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
total_reviews +=1
Info = driver.find_element_by_xpath(s).text
print(Info)
print("<------------------------------------------------------>\n\n")
Output:-
Click Here to See Program Output

I am very new to scraping please bear with me and this is my 1st project. I am trying to scrape a site using selenium

"problem lines"
for_tariff_loop = driver.find_elements_by_xpath("//span[#class='phx-radio__element']")
radio_label_list = for_tariff_loop[i].find_element_by_css_selector('span[class="phx-radio__label"]')
print(radio_label_list)
time.sleep(1)
website I'm scraping https://www.telekom.de/unterwegs/apple/apple-iphone-13-pro/graphit-512gb
label image
I was not able to print the radio buttons label according to checked button. I don't know what is the mistake and where I did it. could anyone help on this. It will be helpful for me to learn. Change tariff links given below links,
import xlwt
from selenium import webdriver
import re
import time
from datetime import date
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
class telekommobiles:
def __init__(self):
self.url="https://www.telekom.de/mobilfunk/geraete/smartphone?page=1&pageFilter=promotion"
self.country='DE'
self.currency='GBP'
self.VAT='Included'
self.shipping = 'free shipping within 3-4 weeks'
self.Pre_PromotionPrice ='N/A'
self.color ='N/A'
def telekom(self):
#try:
driver=webdriver.Chrome()
driver.maximize_window()
driver.get(self.url)
today = date.today()
time.sleep(5)
cookies = driver.find_element_by_css_selector('button.cl-btn.cl-btn--accept-all').click()
print("cookies accepted")
links_prod_check = []
prod_models = []
prod_manufacturer =[]
prod_memorys = []
product_colors =[]
product_price_monthly_payments = []
product_price_one_time_payments =[]
product_links = []
containers = driver.find_elements_by_css_selector('div[class="styles_item__12Aw4"]')
i = 1
for container in containers:
p_links =container.find_element_by_tag_name('a').get_attribute('href')
i = i + 1
product_links.append(p_links)
#print(p_links)
for links in product_links:
driver.get(links)
#time.sleep(5)
#print(driver.current_url)
#links_prod_check.append(driver.current_url)
coloroptions = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH,"//li[#data-qa='list_ColorVariant']")))
#print(coloroptions)
for i in range(len(coloroptions)):
coloroption = driver.find_elements_by_xpath("//li[#data-qa='list_ColorVariant']")
coloroption[i].click()
#print(coloroption[i])
time.sleep(3)
memoryoptions = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH,"//span[#class='phx-radio__element']")))
for i in range(len(memoryoptions)):
memoryoption = driver.find_elements_by_xpath("//span[#class='phx-radio__element']")
try:
memoryoption[i].click()
except:
pass
time.sleep(5)
change_traiff = driver.find_element_by_css_selector('button[class="phx-link phx-list-of-links__link js-mod tracking-added"]').click()
time.sleep(3)
#looping for each section
section_loops = driver.find_elements_by_css_selector('section[class="tariff-catalog--layer"]')
#print(len(section_loops))
for section_loop in section_loops:
#print(section_loop)
time.sleep(5)
#Headings
heading_1 = section_loop.find_element_by_css_selector('h2[class="page-title page-title--lowercase"]').text
print(heading_1)
# looping for each separate boxes
each_box_subcontainers = section_loop.find_elements_by_css_selector('.phx-tariff-box__section')
#print(len(each_box_subcontainers))
for subcontainer in each_box_subcontainers:
#print(subcontainer)
looping_for_tariff = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH,"//span[#class='phx-radio__element']")))
#print(looping_for_tariff)
for i in range(len(looping_for_tariff)):
#print(i)
try:
for_tariff_loop = driver.find_elements_by_xpath("//span[#class='phx-radio__element']")
for_tariff_loop[i].click()
time.sleep(3)
except:
pass
for_tariff_loop = driver.find_elements_by_xpath("//span[#class='phx-radio__element']")
radio_label_list = for_tariff_loop[i].find_element_by_css_selector('span[class="phx-radio__label"]')
print(radio_label_list)
time.sleep(1)
change_traiff_close_button = driver.find_element_by_css_selector('span[class="icon-after-yellow-close right close popup-close-tr js-popup-close"]').click()
telekom_de=telekommobiles()
telekom_de.telekom()

You are trying to find element within an element. Finding radio_label_list using for_tariff_loop[i], xpath for radio_label_list will become like below:
//span[#class='phx-radio__element']//span[#class="phx-radio__label"]
Which does not exist in the DOM.
I tried the last part of the code. And was able to print the Memory size like below. Do try and confirm:
Replaced css-selector for radio_label_list with this xpath ./following-sibling::span
looping_for_tariff = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//span[#class='phx-radio__element']")))
# print(looping_for_tariff)
for i in range(len(looping_for_tariff)):
# print(i)
try:
for_tariff_loop = driver.find_elements_by_xpath("//span[#class='phx-radio__element']")
for_tariff_loop[i].click()
time.sleep(3)
except:
pass
for_tariff_loop = driver.find_elements_by_xpath("//span[#class='phx-radio__element']")
radio_label_list = for_tariff_loop[i].find_element_by_xpath("./following-sibling::span").text
print(radio_label_list)
time.sleep(1)
As per the comments, check this code:
driver.get("https://www.telekom.de/unterwegs/apple/apple-iphone-13-pro/graphit-512gb")
wait = WebDriverWait(driver,30)
wait.until(EC.element_to_be_clickable((By.XPATH,"//button[text()='Accept All']"))).click()
wait.until(EC.element_to_be_clickable((By.XPATH,"//ul[contains(#class,'phx-tariff-notification-box-new__element--desktop-tablet')]/li[2]/button"))).click()
length = len(driver.find_elements_by_class_name("phx-tariff-box__section"))
for i in range(length):
print("----------------------------------------------------------------------------------------------------------")
options = driver.find_elements_by_class_name("phx-tariff-box__section")
datas = options[i].find_element_by_xpath(".//div[contains(#class,'phx-tariff-box__volume')]").get_attribute("innerText")
print("data: {}".format(datas))
len_types = len(options[i].find_elements_by_xpath(".//div[#class='phx-tariff-box__radios-inner']//label"))
types = options[i].find_elements_by_xpath(".//div[#class='phx-tariff-box__radios-inner']//label")
if len(types) == 0:
price = options[i].find_element_by_xpath(".//p[#data-qa='block_TariffPrice']").get_attribute("innerText")
print(price)
else:
for j in range(len_types):
types[j].click()
time.sleep(2)
options = driver.find_elements_by_class_name("phx-tariff-box__section")
types = options[i].find_elements_by_xpath(".//div[#class='phx-tariff-box__radios-inner']//label")
try:
types[j].find_element_by_xpath("./input[#checked]")
type = types[j].find_element_by_xpath("./span[2]").get_attribute("innerText")
price = options[i].find_element_by_xpath(".//p[#data-qa='block_TariffPrice']").get_attribute("innerText")
print(f"{type}: {price}")
except:
pass

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

WebDriverWait is terminated before reach to the last page - python

Related

How to extract all the google reviews from google map

How to extract data in the right order with Beautiful Soup

Already complete scraping scrapes everything on the page. I would like to limit the scraping to only a certain section

Python Selenium: Changing from three loops to one loop repeat the same information

I am very new to scraping please bear with me and this is my 1st project. I am trying to scrape a site using selenium

Categories

Resources