Python Selenium: how to click on table content when changing table page

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from bs4 import BeautifulSoup
import time

url = "https://www.bungol.ca/"
driver = webdriver.Firefox(executable_path='/usr/local/bin/geckodriver')
driver.get(url)

#Select toronto by default
driver.find_element_by_xpath("""/html/body/section/div[2]/div/div[1]/form/div/select/optgroup[1]/option[1]""").click()
time.sleep(1)
driver.find_element_by_xpath("""/html/body/section/div[2]/div/div[1]/form/div/button""").click()
driver.find_element_by_xpath("""/html/body/nav/div[1]/ul[1]/li[3]/select/option[8]""").click()
#select last 2 years
driver.find_element_by_xpath("""//*[@id="activeListings"]""").click()
#opening sold listing in that area
driver.find_element_by_xpath("""/html/body/div[5]/i""").click()  #closes property type slide
driver.find_element_by_xpath("""//*[@id="navbarDropdown"]""").click()
driver.find_element_by_xpath("""//*[@id="listViewToggle"]""").click()

def data_collector():
    hidden_next = driver.find_element_by_class_name("nextPaginate")
    #inputs in textbox
    inputElement = driver.find_element_by_id('navbarSearchAddressInput')
    inputElement.send_keys('M3B2B6')
    time.sleep(1)
    #inputElement.send_keys(Keys.ENTER)
    row_count = 3
    table = driver.find_elements_by_css_selector("""#listViewTableBody""")
    while hidden_next.is_displayed():  #while there is a next page button to be pressed
        time.sleep(3)  #delay for table refresh
        #row_count = len(driver.find_elements_by_css_selector("""html body#body div#listView.table-responsive table#listViewTable.table.table-hover.mb-0 tbody#listViewTableBody tr.mb-2"""))
        for row in range(row_count):  #loop through the rows found
            #alternate row by changing the tr index
            driver.find_element_by_xpath("""/html/body/div[8]/table/tbody/tr[""" + str(row + 1) + """]/td[1]""").click()
            time.sleep(2)
            print(driver.find_element_by_css_selector("""#listingStatus""").text)  #sold price
            #closes the pop up after getting the data
            driver.find_element_by_css_selector('.modal-xl > div:nth-child(1) > div:nth-child(1) > button:nth-child(1)').click()
            time.sleep(1)
        #clicks next page button for the table
        driver.find_element_by_xpath("""//*[@id="listViewNextPaginate"]""").click()

if __name__ == "__main__":
    data_collector()
The code loops through all the rows in the first table (currently set to 3 for testing) and clicks on each row: a pop-up shows up, the script grabs the information, and the pop-up is closed. But when it clicks through to the next page, it doesn't click on any of the rows of the second page. It doesn't show an error for not finding the row XPath either; instead it shows an error for the pop-up window's close button, because the pop-up never opened (the row was never actually clicked).
How do I make it click the rows when the table flips to the next page?
For table reference: https://www.bungol.ca/map/location/toronto/?
close the property slider on the left
click Tools -> Open List

In my browser I also can't open the pop-up when I click on a row on the second page, so I think this may be a fault of the website.
If you want to check whether an element exists, you can use this code:
from selenium.common.exceptions import NoSuchElementException

def check_exists_by_xpath(xpath, driver):
    try:
        driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return True
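A minimal usage sketch, assuming the row XPath from the question, so the click is skipped when the row isn't there:

row_xpath = '/html/body/div[8]/table/tbody/tr[1]/td[1]'  # row XPath from the question
if check_exists_by_xpath(row_xpath, driver):
    driver.find_element_by_xpath(row_xpath).click()
else:
    print('row not found on this page; skipping')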

Try this. My understanding is that your script goes through the listings, opens each listing, grabs the listing status, closes the listing, and does the same for all the listings.
If my understanding is correct, the code below may help you. It's better to change the implicit wait and the time.sleep() calls to explicit waits, and to clean up the functions (a sketch of the explicit-wait pattern follows the code).
Having said that, I did not fully test the code, but it did navigate to more than one page of listings and collected data.
from selenium.webdriver import Firefox
from selenium.webdriver.support.select import Select
import time

driver = Firefox(executable_path=r'path to geckodriver.exe')
driver.get('https://www.bungol.ca/')
driver.maximize_window()
driver.implicitly_wait(10)

# Select toronto by default
driver.find_element_by_css_selector('#locationChoice button[type="submit"]').click()
sold_in_the_last = Select(driver.find_element_by_id('soldInTheLast'))
sold_in_the_last.select_by_visible_text('2 Years')
driver.find_element_by_id('activeListings').click()

# opening sold listing in that area
driver.find_element_by_css_selector('#leftSidebarClose>i').click()
driver.find_element_by_id('navbarDropdown').click()
driver.find_element_by_id('listViewToggle').click()

def get_listings():
    listings_table = driver.find_element_by_id('listViewTableBody')
    listings_table_rows = listings_table.find_elements_by_tag_name('tr')
    return listings_table_rows

def get_sold_price(listing):
    listing.find_element_by_css_selector('td:nth-child(1)').click()
    time.sleep(2)
    sold_price = driver.find_element_by_id('listingStatus').text
    time.sleep(2)
    close = driver.find_elements_by_css_selector('.modal-content>.modal-body>button[class="close"]')
    close[2].click()
    time.sleep(2)
    return sold_price

def data_collector():
    data = []
    time.sleep(2)
    next = driver.find_element_by_id('listViewNextPaginate')
    # get all the listings prior to the last page
    while next.is_displayed():
        listings = get_listings()
        for listing in listings:
            data.append(get_sold_price(listing))
        next.click()
    # get listings from last page
    listings = get_listings()
    for listing in listings:
        data.append(get_sold_price(listing))
    return data

if __name__ == '__main__':
    from pprint import pprint
    data = data_collector()
    pprint(data)
    print(len(data))
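As mentioned above, the fixed time.sleep() calls are better replaced with explicit waits. A minimal sketch of that pattern, reusing the element IDs from the code above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)

# wait until the status text is actually visible instead of sleeping
sold_price = wait.until(EC.visibility_of_element_located((By.ID, 'listingStatus'))).text

# wait until the next-page button is clickable before clicking it
wait.until(EC.element_to_be_clickable((By.ID, 'listViewNextPaginate'))).click()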

Related

How to extract all the Google reviews from Google Maps

I need to scrape all the Google reviews. There are 90,564 reviews on my page, but the code I wrote can scrape only the top 9 reviews; the others are not scraped.
The code is given below:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# specify the url of the business page on Google
url = 'https://www.google.com/maps/place/ISKCON+temple+Bangalore/@13.0098328,77.5510964,15z/data=!4m7!3m6!1s0x0:0x7a7fb24a41a6b2b3!8m2!3d13.0098328!4d77.5510964!9m1!1b1'
# create an instance of the Chrome driver
driver = webdriver.Chrome()
# navigate to the specified url
driver.get(url)
# Wait for the reviews to load
wait = WebDriverWait(driver, 20) # increased the waiting time
review_elements = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'wiI7pd')))
# extract the text of each review
reviews = [element.text for element in review_elements]
# print the reviews
print(reviews)
# close the browser
driver.quit()
What should I edit/modify in the code to extract all the reviews?
Here is the working code for you, to run after launching the URL:

from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

totalRev = "div div.fontBodySmall"
username = ".d4r55"
reviews = "wiI7pd"

wait = WebDriverWait(driver, 20)
totalRevCount = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, totalRev))).get_attribute("textContent").split(' ')[0].replace(',', '').replace('.', '')
print("totalRevCount - ", totalRevCount)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, totalRev))).click()

mydict = {}
found = 0
while found < int(totalRevCount):
    review_elements = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, reviews)))
    reviewer_names = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, username)))
    found = len(mydict)
    for rev, name in zip(review_elements, reviewer_names):
        mydict[name.text] = rev.text
        if len(rev.text) == 0:
            found = int(totalRevCount) + 1
            break
    for i in range(8):
        ActionChains(driver).key_down(Keys.ARROW_DOWN).perform()
    print("found - ", found)
    print(mydict)
    time.sleep(2)
Explanation -
Get the locators for the user name and the review, since we are going to create a key-value pair; this is useful for producing a result without duplicates.
You need to first get the total number of reviews/ratings present for the given location.
Get the username and review for the "visible" part of the webpage and store them in the dictionary.
Scroll down the page and wait a few seconds.
Get the username and review again and add them to the dictionary; only new ones will be added.
As soon as a review with no text (only a rating) is encountered, the loop exits and you have your results.
NOTE - If you want all reviews irrespective of whether review text is present or not, you can remove the "if" block.
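The key-value dedup idea in isolation, as a tiny sketch (the names here are illustrative, not from the site):

mydict = {}
# re-reading the same visible reviews is harmless: an existing reviewer key is
# simply overwritten, so only genuinely new reviews grow the dictionary
for name, review in [('alice', 'great'), ('bob', 'ok'), ('alice', 'great')]:
    mydict[name] = review
print(len(mydict))  # 2, the duplicate was absorbed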
I think you'll need to scroll down first, and then get all the reviews.
scroll_value = 230
driver.execute_script('window.scrollBy(0, ' + str(scroll_value) + ')')  # to scroll by value

# to get the current scroll value on the y axis
scroll_Y = driver.execute_script('return window.scrollY')
That might be because the elements don't get loaded otherwise.
Since there are over 90,000 reviews, you might consider scrolling down a little, then getting the reviews, and repeating.
Resource: https://stackoverflow.com/a/74508235/20443541
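A rough sketch of that scroll-collect-repeat loop, using the review class name from the question; the stopping condition (the page no longer scrolls) is an assumption:

import time
from selenium.webdriver.common.by import By

collected = set()
while True:
    # grab whatever reviews are currently rendered
    for el in driver.find_elements(By.CLASS_NAME, 'wiI7pd'):
        collected.add(el.text)
    before = driver.execute_script('return window.scrollY')
    driver.execute_script('window.scrollBy(0, 230)')
    time.sleep(1)  # give newly loaded reviews a moment to render
    if driver.execute_script('return window.scrollY') == before:
        break  # the page no longer scrolls; assume we reached the end
print(len(collected))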

How to stop selenium scraper from redirecting to another internal weblink of the scraped website?

I was wondering if anyone knows of a way to instruct a Selenium script to avoid visiting/redirecting to an internal page that wasn't part of the code. Essentially, my code opens up this page:
https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20
It keeps clicking on the show-more button until there's none (at the end of the page), by which point it should have collected the links of all the products listed on the page, and then it should visit each one respectively.
What happens instead is that it successfully clicks show more until the end of the page, but then it visits a weird promotion page of the same website instead of following each of the gathered links and scraping further data points from each of those newly opened pages.
In a nutshell, I would incredibly appreciate it if someone could explain how to avoid this automated redirection! And this is the code, in case someone can gratefully nudge me in the right direction :)
from selenium.webdriver import Chrome
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException
import json
import selenium.common.exceptions as exception
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium import webdriver

webdriver = '/Users/karimnabil/projects/selenium_js/chromedriver-1'
driver = Chrome(webdriver)
driver.implicitly_wait(5)
url = 'https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20'
driver.get(url)

links_list = []
coins_list = []
all_names = []
all_cryptos = []
all_links = []
all_twitter = []
all_locations = []
all_categories = []
all_categories2 = []

wait = WebDriverWait(driver, 2)
sign_in = driver.find_element_by_xpath("//li[@class='nav-item nav-guest']/a")
sign_in.click()
time.sleep(2)
user_name = wait.until(EC.presence_of_element_located((By.XPATH, "//input[@name='login']")))
user_name.send_keys("karimnsaber95@gmail.com")
password = wait.until(EC.presence_of_element_located((By.XPATH, "//input[@name='password']")))
password.send_keys("PleomaxCW#2")
signIn_Leave = driver.find_element_by_xpath("//div[@class='form-group text-center']/button")
signIn_Leave.click()
time.sleep(3)

while True:
    try:
        loadMoreButton = driver.find_element_by_xpath("//button[@class='btn btn-outline-primary']")
        time.sleep(2)
        loadMoreButton.click()
        time.sleep(2)
    except exception.StaleElementReferenceException:
        print('stale element')
        break
print('no more elements to show')

try:
    company_links = driver.find_elements_by_xpath("//div[@class='companies-list items-infinity']/div[position() > 3]/div[@class='media-body']/div[@class='title']/a")
    for link in company_links:
        links_list.append(link.get_attribute('href'))
except:
    pass

try:
    with open("links_list.json", "w") as f:
        json.dump(links_list, f)
    with open("links_list.json", "r") as f:
        links_list = json.load(f)
except:
    pass

try:
    for link in links_list:
        driver.get(link)
        name = driver.find_element_by_xpath("//div[@class='title']/h1").text
        try:
            show_more_coins = driver.find_element_by_xpath("//a[@data-original-title='Show more']")
            show_more_coins.click()
            time.sleep(1)
        except:
            pass
        try:
            categories = driver.find_elements_by_xpath("//div[contains(@class, 'categories-list')]/a")
            categories_list = []
            for category in categories:
                categories_list.append(category.text)
        except:
            pass
        try:
            top_page_categories = driver.find_elements_by_xpath("//ol[@class='breadcrumb']/li/a")
            top_page_categories_list = []
            for category in top_page_categories:
                top_page_categories_list.append(category.text)
        except:
            pass
        coins_links = driver.find_elements_by_xpath("//div[contains(@class, 'company-coins')]/a")
        all_coins = []
        for coin in coins_links:
            all_coins.append(coin.get_attribute('href'))
        try:
            location = driver.find_element_by_xpath("//div[@class='addresses mt-3']/div/div/div/div/a").text
        except:
            pass
        try:
            twitter = driver.find_element_by_xpath("//div[@class='links mt-2']/a[2]").get_attribute('href')
        except:
            pass
        try:
            print('-----------')
            print('Company name is: {}'.format(name))
            print('Potential Categories are: {}'.format(categories_list))
            print('Potential top page categories are: {}'.format(top_page_categories_list))
            print('Supporting Crypto is:{}'.format(all_coins))
            print('Registered location is: {}'.format(location))
            print('Company twitter profile is: {}'.format(twitter))
            time.sleep(1)
        except:
            pass
        all_names.append(name)
        all_categories.append(categories_list)
        all_categories2.append(top_page_categories_list)
        all_cryptos.append(all_coins)
        all_twitter.append(twitter)
        all_locations.append(location)
except:
    pass

df = pd.DataFrame(list(zip(all_names, all_categories, all_categories2, all_cryptos, all_twitter, all_locations)), columns=['Company name', 'Categories1', 'Categories2', 'Supporting Crypto', 'Twitter Handle', 'Registered Location'])
CryptoWerk_Data = df.to_csv('CryptoWerk4.csv', index=False)
Redirects happen for two reasons. In your case it is either some JavaScript code executed when you click the load-more button for the last time, or an HTTP 3xx response code, which is the least likely here.
So you need to identify when this JavaScript code is executed, send an ESC key before it loads, and then execute the rest of your script.
You could also scrape the links and append them to your list before clicking the load-more button; each time it is clicked, add an if statement that verifies the link of the page you're on: if it is that of the promotion page, execute the rest of your code, else click load more.
while page_is_same:
    scrape_elements_add_to_list()
    click_load_more()
    verify_current_page_link()
    if current_link_is_same != link_of_scraped_page:
        page_is_same = False
        # rest of the code here
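A concrete sketch of that check using driver.current_url; the link-collecting XPath reuses the one from the question, and the redirect handling (navigating back and stopping) is an assumption:

import time
from selenium.common.exceptions import NoSuchElementException

start_url = driver.current_url  # the companies listing page
links_list = []

while True:
    # collect the links that are currently visible before clicking again
    for link in driver.find_elements_by_xpath("//div[@class='media-body']/div[@class='title']/a"):
        links_list.append(link.get_attribute('href'))
    try:
        driver.find_element_by_xpath("//button[@class='btn btn-outline-primary']").click()
        time.sleep(2)
    except NoSuchElementException:
        break  # no more show-more button: we reached the end normally
    if driver.current_url != start_url:
        # the click redirected us (e.g. to the promotion page): go back and stop
        driver.get(start_url)
        break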

How to periodically fetch records from a website using selenium?

I have a small script that fetches company data from a website. This website gets regularly updated with new company information. How can I update my csv with new records on a periodic basis? Also as you can see in the code I have used an explicit range for the pages, what other solutions are possible?
The following is the code -
from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from time import sleep
import csv

#navigate to the ystory companies page
#start collecting data from ystory
START_URL = 'https://yourstory.com/companies/search?page=1&hitsPerPage=30'

#when the collection populates 30 elements then click on next page
class CompDeetz():
    def __init__(self):
        self.browser = Firefox()
        self.browser.get(START_URL)
        sleep(20)
        self.browser.find_element_by_xpath('/html/body/div[12]/div/div/button').click()
        sleep(5)
        self.browser.find_element_by_xpath('/html/body/div[1]/div[4]').click()
        self.database = []

    def write_row(self, record):
        with open('test.csv', 'a') as t:
            writer = csv.writer(t)
            writer.writerows(record)

    def get_everything(self):
        all_list = [a.text for a in self.browser.find_elements_by_xpath('//tr[@class="hit"]')]
        all_records = []
        for company in all_list:
            record = company.split('\n')
            all_records.append(record)
        self.write_row(all_records)

    def next_page(self):
        self.browser.find_element_by_xpath('//ul[@class="ais-Pagination-list"]/li[7]/a').click()
        sleep(20)

def main():
    t = CompDeetz()
    t.get_everything()
    for i in range(33):
        t.next_page()
        t.get_everything()

if __name__ == "__main__":
    main()
Instead of having two different methods, get_everything and next_page, and calling them multiple times, you can have one method, get_everything, and call it once:
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def get_everything(self):
    all_records = []
    nextPage = True
    while nextPage:
        all_list = [a.text for a in self.browser.find_elements_by_xpath('//tr[@class="hit"]')]
        for company in all_list:
            record = company.split('\n')
            all_records.append(record)
        try:
            nextPagelink = WebDriverWait(self.browser, 10).until(EC.element_to_be_clickable((By.XPATH, "//a[@aria-label='Next page']")))
            self.browser.execute_script("arguments[0].scrollIntoView();", nextPagelink)
            self.browser.execute_script("arguments[0].click();", nextPagelink)
            sleep(5)  # for the next page to load
        except TimeoutException:
            # on the last page the next-page link is not available, so the wait times out
            nextPage = False
    self.write_row(all_records)
Note: take care of the pop-up that comes up on the page. I hope you already have a mechanism to handle it.
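If not, a minimal dismissal sketch; the close-button XPath is the one from the question's __init__ and may well change:

from selenium.common.exceptions import NoSuchElementException

def dismiss_popup(self):
    try:
        # close-button XPath taken from the question's __init__ (an assumption)
        self.browser.find_element_by_xpath('/html/body/div[12]/div/div/button').click()
    except NoSuchElementException:
        pass  # no pop-up this time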

How to save data from multiple pages using webdriver into a single csv

So I'm trying to save data from Google Scholar using Selenium (webdriver). So far I can print the data that I want, but when I save it into a CSV it only saves the first page.
from selenium import webdriver
from selenium.webdriver.common.by import By
# Import statements for explicit wait
from selenium.webdriver.support.ui import WebDriverWait as W
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
from csv import writer

exec_path = r"C:\Users\gvste\Desktop\proyecto\chromedriver.exe"
URL = r"https://scholar.google.com/citations?view_op=view_org&hl=en&authuser=2&org=8337597745079551909"

button_locators = ['//*[@id="gsc_authors_bottom_pag"]/div/button[2]', '//*[@id="gsc_authors_bottom_pag"]/div/button[2]', '//*[@id="gsc_authors_bottom_pag"]/div/button[2]']
wait_time = 3
driver = webdriver.Chrome(executable_path=exec_path)
driver.get(URL)
wait = W(driver, wait_time)
#driver.maximize_window()

for j in range(len(button_locators)):
    button_link = wait.until(EC.element_to_be_clickable((By.XPATH, button_locators[j])))
    address = driver.find_elements_by_class_name("gsc_1usr")
    #for post in address:
    #    print(post.text)
    time.sleep(4)
    with open('post.csv', 'a') as s:
        for i in range(len(address)):
            addresst = address
            #if addresst == 'NONE':
            #    addresst = str(address)
            #else:
            addresst = address[i].text.replace('\n', ',')
            s.write(addresst + '\n')
    button_link.click()
    time.sleep(4)

#driver.quit()
You only get the first page's data because your program stops after it clicks the next-page button. You have to put all of that in a for loop.
Notice I wrote range(7) because I know there are 7 pages to open; in reality we should never do that. Imagine if we had thousands of pages. We should add some logic to check whether the "next page button" exists and loop until it doesn't (see the sketch after the code below).
exec_path = r"C:\Users\gvste\Desktop\proyecto\chromedriver.exe"
URL = r"https://scholar.google.com/citations?view_op=view_org&hl=en&authuser=2&org=8337597745079551909"

button_locators = "/html/body/div/div[8]/div[2]/div/div[12]/div/button[2]"
wait_time = 3
driver = webdriver.Chrome(executable_path=exec_path)
driver.get(URL)
wait = W(driver, wait_time)
time.sleep(4)

# 7 pages. In reality, we should get this number programmatically
for page in range(7):
    # read data from new page
    address = driver.find_elements_by_class_name("gsc_1usr")
    # write to file
    with open('post.csv', 'a') as s:
        for i in range(len(address)):
            addresst = address[i].text.replace('\n', ',')
            s.write(addresst + '\n')
    # find and click next page button
    button_link = wait.until(EC.element_to_be_clickable((By.XPATH, button_locators)))
    button_link.click()
    time.sleep(4)
Also, in the future you should look to change all these time.sleep() calls to wait.until(). Sometimes your page loads quicker and the program could do its job faster; or, even worse, your network might lag, and that would break your script.
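A rough sketch of that "loop until the next-page button is gone" idea, reusing wait and button_locators from the code above; treating a wait timeout as the last page is an assumption:

from selenium.common.exceptions import TimeoutException

while True:
    # read data from the current page
    address = driver.find_elements_by_class_name("gsc_1usr")
    with open('post.csv', 'a') as s:
        for entry in address:
            s.write(entry.text.replace('\n', ',') + '\n')
    try:
        # if the button never becomes clickable, assume this was the last page
        button_link = wait.until(EC.element_to_be_clickable((By.XPATH, button_locators)))
    except TimeoutException:
        break
    button_link.click()
    time.sleep(4)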

How to scroll down in an instagram pop-up frame with Selenium

I have a python script using selenium to go to a given Instagram profile and iterate over the user's followers. On the instagram website when one clicks to see the list of followers, a pop-up opens with the accounts listed (here's a screenshot of the site)
However both visually and in the html, only 12 accounts are shown. In order to see more one has to scroll down, so I tried doing this with the Keys.PAGE_DOWN input.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
...
username = 'Username'
password = 'Password'
message = 'blahblah'
tryTime = 2
#create driver and log in
driver = webdriver.Chrome()
logIn(driver, username, password, tryTime)
#gets rid of preference pop-up
a = driver.find_elements_by_class_name("HoLwm")
a[0].click()
#go to profile
driver.get("https://www.instagram.com/{}/".format(username))
#go to followers list
followers = driver.find_element_by_xpath("//a[#href='/{}/followers/']".format(username))
followers.click()
time.sleep(tryTime)
#find all li elements in list
fBody = driver.find_element_by_xpath("//div[#role='dialog']")
fBody.send_keys(Keys.PAGE_DOWN)
fList = fBody.find_elements_by_tag_name("li")
print("fList len is {}".format(len(fList)))
time.sleep(tryTime)
print("ended")
driver.quit()
When I try to run this I get the following error:
Message: unknown error: cannot focus element
I know this is probably because I'm using the wrong element for fBody, but I don't know which would be the right one. Does anybody know which element I should send the PAGE_DOWN key to, or if there is another way to load the accounts?
Any help is much appreciated!
The element you're looking for is //div[@class='isgrP'], and Keys.PAGE_DOWN does not work for a scrollable div.
Also, your variable fList holds the old value; you need to find the elements again after scrolling.
#find all li elements in list
fBody = driver.find_element_by_xpath("//div[@class='isgrP']")

scroll = 0
while scroll < 5:  # scroll 5 times
    driver.execute_script('arguments[0].scrollTop = arguments[0].scrollTop + arguments[0].offsetHeight;', fBody)
    time.sleep(tryTime)
    scroll += 1

fList = driver.find_elements_by_xpath("//div[@class='isgrP']//li")
print("fList len is {}".format(len(fList)))

print("ended")
#driver.quit()
The above code works fine if you add iteration (a for loop) with range:
for i in range(1, 4):
    try:
        #find all li elements in list
        fBody = self.driver.find_element_by_xpath("//div[@class='isgrP']")
        scroll = 0
        while scroll < 5:  # scroll 5 times
            self.driver.execute_script('arguments[0].scrollTop = arguments[0].scrollTop + arguments[0].offsetHeight;', fBody)
            time.sleep(2)
            scroll += 1
        fList = self.driver.find_elements_by_xpath("//div[@class='isgrP']//li")
        print("fList len is {}".format(len(fList)))
    except Exception as e:
        print(e, "cannot scroll")
    try:
        #get tags with a
        hrefs_in_view = self.driver.find_elements_by_tag_name('a')
        # finding relevant hrefs
        hrefs_in_view = [elem.get_attribute('title') for elem in hrefs_in_view]
        [pic_hrefs.append(title) for title in hrefs_in_view if title not in pic_hrefs]
        print("Check: pic href length " + str(len(pic_hrefs)))
    except Exception as tag:
        print(tag, "cannot find tag")
So the for loop makes it possible to keep scrolling even if the while loop misses.
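Instead of a fixed number of passes, a sketch that keeps scrolling until the li count stops growing; the stopping condition is an assumption about when Instagram stops loading more followers:

fBody = driver.find_element_by_xpath("//div[@class='isgrP']")
prev_count = -1
fList = []
while len(fList) != prev_count:
    prev_count = len(fList)
    driver.execute_script('arguments[0].scrollTop = arguments[0].scrollTop + arguments[0].offsetHeight;', fBody)
    time.sleep(tryTime)  # let the next batch of followers render
    fList = driver.find_elements_by_xpath("//div[@class='isgrP']//li")
print("final fList len is {}".format(len(fList)))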
