I used Selenium to scrape a scrolling website and wrote the code below:
import requests
from bs4 import BeautifulSoup
import csv
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import unittest
import time
import re

output_file = open("Kijubi.csv", "w", newline='')

class Crawling(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.Firefox()
        self.driver.set_window_size(1024, 768)
        self.base_url = "http://www.viatorcom.de/"
        self.accept_next_alert = True

    def test_sel(self):
        driver = self.driver
        delay = 3
        driver.get(self.base_url + "de/7132/Seoul/d973-allthingstodo")
        for i in range(1, 1):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
        html_source = driver.page_source
        data = html_source.encode("utf-8")
My next step was to extract specific information from the website, like the price.
Hence, I added the following code:
all_spans = driver.find_elements_by_xpath("/html/body/div[5]/div/div[3]/div[2]/div[2]/div[1]/div[1]/div")
print(all_spans)
for price in all_spans:
    Header = driver.find_elements_by_xpath("/html/body/div[5]/div/div[3]/div[2]/div[2]/div[1]/div[1]/div/div[2]/div[2]/span[2]")
    for span in Header:
        print(span.text)
But I get just one price instead of all of them. Could you give me feedback on what I could improve in my code? Thanks :)
EDIT
Thanks to you guys, I managed to get it running. Here is the additional code:
elements = driver.find_elements_by_xpath("//div[@id='productList']/div/div")
innerElements = 15
outerElements = len(elements) / innerElements
print(innerElements, "\t", outerElements, "\t", len(elements))
for j in range(1, int(outerElements)):
    for i in range(1, int(innerElements)):
        headline = driver.find_element_by_xpath("//div[@id='productList']/div[" + str(j) + "]/div[" + str(i) + "]/div/div[2]/h2/a").text
        price = driver.find_element_by_xpath("//div[@id='productList']/div[" + str(j) + "]/div[" + str(i) + "]/div/div[2]/div[2]/span[2]").text
        deeplink = driver.find_element_by_xpath("//div[@id='productList']/div[" + str(j) + "]/div[" + str(i) + "]/div/div[2]/h2/a").get_attribute("href")
        print("Header: " + headline + " | " + "Price: " + price + " | " + "Deeplink: " + deeplink)
Now my last issue is that I still do not get the last 20 prices back, the ones that have an English description. I only get back the prices that have a German description. The English ones do not get fetched, although they share the same HTML structure.
E.g. the HTML structure for the English items:
headline = driver.find_element_by_xpath("//div[@id='productList']/div[6]/div[1]/div/div[2]/h2/a")
Do you guys know what I have to modify? Any feedback is appreciated:)
To grab all the prices on that page you should use an XPath like this:
Header = driver.find_elements_by_xpath("//span[contains(concat(' ', normalize-space(@class), ' '), 'price-amount')]")
which means: find all span elements whose class contains price-amount. As for why it is so complex, see here.
But a simpler way to find the same elements is with a CSS locator:
.price-amount
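For illustration, a minimal sketch (reusing the driver from the code above and the same legacy Selenium API as the question) that collects every price with that CSS locator:

# Assumes the page has already been fully scrolled/loaded by the code above.
all_prices = driver.find_elements_by_css_selector(".price-amount")
for price in all_prices:
    print(price.text)  # one line per matching span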
Related
I'm quite new to Python and have written a script using Selenium to scrape a website. I've tried everything but can't get the loop to cycle through pages; it currently just repeats the data on the first page five times. I want to scrape all the pages for 'BR1'. Any help would be great.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

with open('rightmove.csv', 'w') as file:
    file.write('PropertyCardcontent \n')

PATH = ("/usr/local/bin/chromedriver")
driver = webdriver.Chrome(PATH)
driver.get("https://www.rightmove.co.uk/house-prices.html")
print(driver.title)

elem = driver.find_element(By.NAME, 'searchLocation')  # Find the search box
elem.send_keys('BR1' + Keys.RETURN)

try:
    content = WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.ID, 'content'))
    )
finally:
    time.sleep(3)

for p in range(5):
    sold = content.find_elements(By.CLASS_NAME, 'sold-prices-content-wrapper ')
    for solds in sold:
        address = solds.find_elements(By.CLASS_NAME, 'sold-prices-content ')
        for addresses in address:
            result = addresses.find_elements(By.CLASS_NAME, 'results ')
            for results in result:
                card = results.find_elements(By.CLASS_NAME, 'propertyCard')
                for propertyCard in card:
                    header = propertyCard.find_elements(By.CLASS_NAME, 'propertyCard-content')
                    for propertyCardcontent in header:
                        road = propertyCardcontent.find_elements(By.CLASS_NAME, 'title')
                    for propertyCardcontent in header:
                        road = propertyCardcontent.find_elements(By.CLASS_NAME, 'subTitle')
                        for subtitle in road:
                            bed = subtitle.find_elements(By.CLASS_NAME, 'propertyType')
    with open('rightmove.csv', 'a') as file:
        for i in range(len(result)):
            file.write(header[i].text + '\n')
    button = driver.find_element(By.XPATH, '//*[@id="content"]/div[2]/div[2]/div[4]/div[27]/div[3]/div')
    button.click()
    file.close()
    time.sleep(3)
driver.quit()
Since the website link has the page number in it, I recommend you set the base URL to "https://www.rightmove.co.uk/house-prices/br1.html?page=1" and loop through the pages, changing the last index of the URL with something like an f-string.
One other thing: you don't need to implement all those for loops; you can simply assign each variable its specific value, since everything you need is inside one HTML block that is easy to navigate.
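For example, a minimal sketch of that URL idea, reusing the driver from your script (the page count is hard-coded here purely for illustration):

for page in range(1, 6):  # assume 5 pages, just for the sketch
    driver.get(f"https://www.rightmove.co.uk/house-prices/br1.html?page={page}")
    # ... scrape the property cards for this page here ...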
Update:
I'm sorry for being late; I had some unexpected stuff (...).
I've made some changes since I use Brave, so make sure you select your browser (Chrome, I believe); the chromedriver (ver. 102) stays the same (or depends on your Chrome version).
I've also grabbed the Price and Date and stored them in a tuple.
Every record is stored as a list: [Title, propertyType, tuples of (Price, Date)].
At the end, it creates a CSV and stores everything inside with ";" as the delimiter.
You can split the price and date for later use if you prefer, up to you.
Note: this looping method only applies to websites where the page number is included in the URL. In this case, both the search key and the page number are included in the URL.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import time
import random
import itertools

options = Options()
options.binary_location = r'C:\Program Files\BraveSoftware\Brave-Browser\Application\brave.exe'
driver = webdriver.Chrome(options=options, service=Service("chromedriver.exe"))

key_word = "BR1".lower()
base_url = f"https://www.rightmove.co.uk/house-prices/{key_word}.html?page=1"
driver.get(base_url)

# Number of pages
pages = driver.find_element(By.XPATH, '//span[@class="pagination-label"][2]').text
pages = int(pages.strip('of'))

WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'results '))
)

data = []
pc = 0
for p in range(1, pages + 1):
    driver.get(f"https://www.rightmove.co.uk/house-prices/{key_word}.html?page={p}")
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//div//div[@class="propertyCard"]'))
    )
    propertyCards = driver.find_elements(By.XPATH, '//div//div[@class="propertyCard"]')
    for propertyCard in propertyCards:
        title = propertyCard.find_element(By.CLASS_NAME, 'title').text
        propertyType = propertyCard.find_element(By.CLASS_NAME, 'propertyType').text
        price_list = propertyCard.find_elements(By.CLASS_NAME, 'price')
        date_list = propertyCard.find_elements(By.CLASS_NAME, 'date-sold')
        data.append([title, propertyType])
        for p, d in itertools.zip_longest(price_list, date_list, fillvalue=None):
            try:
                price = p.text
                date = d.text
                data[pc].append((price, date))
            except Exception as e:
                print(e)
        pc += 1
    time.sleep(random.randint(1, 4))

print(data)

with open('rightmove.csv', 'w') as file:
    header = "Title;propertyType;Price_Date\n"
    file.write(header)
    for record in data:
        file.write("{};{};{}\n".format(record[0], record[1], record[2:]))

driver.quit()
You don't have to go down the DOM element by element; you can just use an XPath or a class name (if it's unique, otherwise an XPath or CSS selector is better) and get the item you are looking for.
Anyway, follow this:
import time
import selenium.webdriver as webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome("/usr/local/bin/chromedriver")
driver.get("https://www.rightmove.co.uk/house-prices.html")

# send query
query = "BR1"
search_bar = driver.find_element(By.XPATH, '//input[@class="searchBox ac_input"]')
search_bar.send_keys(query)
search_bar.send_keys(Keys.ENTER)

# wait for the results to be loaded
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'propertyCard'))
)

# get the number of pages
pages = driver.find_element(By.XPATH, '//span[@class="pagination-label"][2]').text
pages = int(pages.replace('of ', ''))

data = []
i = 1
while i <= pages:
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//div[contains(text(), "Next")]'))
    ).click()
    # wait for the page to load its results
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//div//div[@class="propertyCard"]'))
    )
    propertyCards = driver.find_elements(By.XPATH, '//div//div[@class="propertyCard"]')
    # loop over the results and store the data
    for propertyCard in propertyCards:
        title = propertyCard.find_element(By.CLASS_NAME, 'title').text
        propertyType = propertyCard.find_element(By.CLASS_NAME, 'propertyType').text
        data.append((title, propertyType))
    time.sleep(1)
    i += 1

print("you reached the last page")
print(data)
driver.close()
I use a list of tuples because in your example you want to store two items; if you want to store more data you can use a dict and then convert it into a CSV with DictWriter directly. Enjoy.
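For illustration, a minimal sketch of the DictWriter variant (the field names and the example record here are assumptions, not values from the site):

import csv

# Hypothetical records: build one dict per property card in the loop above.
records = [{"title": "Example Road, Bromley", "propertyType": "Terraced"}]

with open("rightmove.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["title", "propertyType"], delimiter=";")
    writer.writeheader()
    writer.writerows(records)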
I am trying to build a web scraper that will go through a website's pages and download the Excel files from a dropdown menu at the bottom of the page.
The webpages only allow me to download the 50 locations that are displayed on each page, and I cannot download all of them at once.
I am able to download the first page's Excel file, but the following pages yield nothing else.
I get the following output after running the code I have provided below.
Skipped a page
No more pages.
If I exclude the lines where it asks to download the pages, it is able to go through each page until the end successfully.
I'll provide an example below of what I am trying to accomplish.
I would appreciate any help and advice! Thank you!
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

state = 'oklahoma'
rent_to_own = 'rent to own'

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
driver.get('https://www.careeronestop.org/toolkit/jobs/find-businesses.aspx')

industry = driver.find_element(By.ID, "txtKeyword")
industry.send_keys(rent_to_own)

location = driver.find_element(By.ID, "txtLocation")
location.send_keys(state)

driver.find_element(By.ID, "btnSubmit").click()
driver.implicitly_wait(3)

def web_scrape():
    more_drawer = driver.find_element(By.XPATH, "//div[@class='more-drawer']//a[@href='/toolkit/jobs/find-businesses.aspx?keyword="+rent_to_own+"&ajax=0&location="+state+"&lang=en&Desfillall=y#Des']")
    more_drawer.click()
    driver.implicitly_wait(5)
    get_50 = Select(driver.find_element(By.ID, 'ViewPerPage'))
    get_50.select_by_value('50')
    driver.implicitly_wait(5)
    filter_description = driver.find_element(By.XPATH, "//ul[@class='filters-list']//a[@href='/toolkit/jobs/find-businesses.aspx?keyword="+rent_to_own+"&ajax=0&location="+state+"&lang=en&Desfillall=y&pagesize=50&currentpage=1&descfilter=Furniture~B~Renting ~F~ Leasing']")
    filter_description.click()
    while True:
        try:
            download_excel = Select(driver.find_element(By.ID, 'ResultsDownload'))
            download_excel.select_by_value('Excel')
            driver.implicitly_wait(20)
            first_50 = driver.find_element(By.XPATH, "//div[@id='relatedOccupations']//a[@onclick='hideMoreRelatedOccupations()']")
            first_50.click()
            driver.implicitly_wait(20)
            next_page = driver.find_element(By.XPATH, "//div[@class='pagination-wrap']//div//a[@class='next-page']")
            next_page.click()
            driver.implicitly_wait(20)
            print("Skipped a page.")
        except:
            print("No more pages.")
            return

web_scrape()
Below is something that works. Again, I would think the way I went about this could be improved. I stuck with Selenium, but you don't really even need to open the webpage; you can just scrape it using the correct URL params with Beautiful Soup. Also, the fastest way is probably not to write every item into Excel one at a time, but it works; a better way would probably be to use pandas and then create an Excel workbook at the end. Anyway, if you have any questions let me know.
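As an aside, the pandas suggestion could look roughly like this (a sketch with assumed column names and an invented example record, not the working code that follows):

import pandas as pd

rows = []  # inside the scraping loop, append one dict per table row
rows.append({"name": "Example Rent-To-Own LLC", "city": "Tulsa"})  # hypothetical record
pd.DataFrame(rows).to_excel("test123.xlsx", index=False)  # single write at the end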
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import openpyxl as xl
import os
import math

cwd = os.getcwd()  # Or whatever dir you want
filename = '\\test123.xlsx'  # escaped backslash; a bare '\t' would be read as a tab character
location = 'oklahoma'
keyword = 'rent to own'

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
driver.get('https://www.careeronestop.org/toolkit/jobs/find-businesses.aspx?keyword=' + keyword + '&ajax=0&location=' + location + '&radius=50&pagesize=50&currentpage=1&lang=en')
driver.implicitly_wait(3)

wb = xl.Workbook()
ws = wb.worksheets[0]

# get the number of pages
ret = driver.find_element(By.ID, 'recordNumber')
lp = math.ceil(float(ret.text) / 50)

r = 1
for i in range(1, lp):
    print(i)
    driver.get('https://www.careeronestop.org/toolkit/jobs/find-businesses.aspx?keyword=' + keyword + '&ajax=0&location=' + location + '&radius=50&pagesize=50&currentpage=' + str(i) + '&lang=en')
    table_id = driver.find_elements(By.CLASS_NAME, 'res-table')[0]
    rows = table_id.find_elements(By.TAG_NAME, "tr")
    for count, row in enumerate(rows, start=1):
        if count >= 0:
            cols = row.find_elements(By.TAG_NAME, "td")
            refs = row.find_elements(By.TAG_NAME, "a")
            for c, ref in enumerate(refs, start=1):
                ws.cell(row=r, column=c).value = '=HYPERLINK("{}", "{}")'.format(ref.get_attribute("href"), ref.text)
            for c, col in enumerate(cols, start=1):
                if c > 1:
                    ws.cell(row=r, column=c).value = col.text
            r += 1

wb.save(cwd + filename)
print('done')
This returns an Excel file with 750+ rows of data, with links included.
I have placed below the code of a complete and properly functioning scraper of mine. It successfully scrapes all the elements on the page.
However, I would like to scrape only a small, limited section of the page that contains the same kind of elements. This limited section is already scraped correctly along with everything else on the page, but I would like to scrape only it, and not "everything plus it". The link is here.
There are 4 tables on the page, but I would like to scrape just one: the table called "Programma", i.e. the HTML section "event-summary event" or "leagues-static event-summary-leagues". And of this section, only the elements of the last round (Matchday 14). Matchday 14 only, not round 15. So, obviously, with each update of the page's rounds, the last round should always be the one scraped.
So I would need to insert something that makes the scraper understand that it should download only the elements (which it already scrapes) of that section and of the last round.
The code is already complete and works fine, so I'm not looking for code services, but for a little hint on how to limit the scraping to just the section mentioned above. The scraping is in Selenium, and I would like to stick with Selenium and my code as it is, since it is already functional and complete. Thanks
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get("url")
driver.implicitly_wait(12)
#driver.minimize_window()
wait = WebDriverWait(driver, 10)

all_rows = driver.find_elements(By.CSS_SELECTOR, "div[class^='event__round'],div[class^='event__match']")

current_round = '?'
for bundesliga in all_rows:
    classes = bundesliga.get_attribute('class')
    #print(classes)
    if 'event__round' in classes:
        #round = row.find_elements(By.CSS_SELECTOR, "[class^='event__round event__round--static']")
        #current_round = row.text  # full text `Round 20`
        current_round = bundesliga.text.split(" ")[-1]  # only `20` without `Round`
    else:
        datetime = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__time']")
        # split the date and the time
        date, time = datetime.text.split(" ")
        date = date.rstrip('.')  # right-strip to remove `.` at the end of the date
        team_home = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__participant event__participant--home']")
        team_away = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__participant event__participant--away']")
        score_home = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--home']")
        score_away = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--away']")
        bundesliga = [current_round, date, time, team_home.text, team_away.text, score_home.text, score_away.text]
        bundesliga.append(bundesliga)
        print(bundesliga)
I think all you need to do is limit the all_rows variable. One way to do this is to find the tab you are looking for by its text and then get its parent elements.
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

driver = webdriver.Firefox()
driver.get("https://www.someurl/some/other/page")
driver.implicitly_wait(12)
#driver.minimize_window()
wait = WebDriverWait(driver, 10)
# all_rows = driver.find_elements(By.CSS_SELECTOR, "div[class^='event__round'],div[class^='event__match']")

############### UPDATE ####################
def parent_element(element):
    return element.find_element(By.XPATH, './..')

programma_element = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located((By.XPATH, "//div[text()='Programma']")))

programma_element_p1 = parent_element(programma_element)
programma_element_p2 = parent_element(programma_element_p1)
programma_element_p3 = parent_element(programma_element_p2)

all_rows = programma_element_p3.find_elements(By.CSS_SELECTOR, "div[class^='event__round'],div[class^='event__match']")

filter_rows = []
for row in all_rows:
    if "event__match--last" in row.get_attribute('class'):
        filter_rows.append(row)
        break
    else:
        filter_rows.append(row)
############### UPDATE ####################

current_round = '?'
for bundesliga in filter_rows:
    classes = bundesliga.get_attribute('class')
    #print(classes)
    if 'event__round' in classes:
        #round = row.find_elements(By.CSS_SELECTOR, "[class^='event__round event__round--static']")
        #current_round = row.text  # full text `Round 20`
        current_round = bundesliga.text.split(" ")[-1]  # only `20` without `Round`
    else:
        datetime = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__time']")
        # split the date and the time
        date, time = datetime.text.split(" ")
        date = date.rstrip('.')  # right-strip to remove `.` at the end of the date
        team_home = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__participant event__participant--home']")
        team_away = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__participant event__participant--away']")
        try:
            score_home = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--home']")
        except (TimeoutException, NoSuchElementException):
            MyObject = type('MyObject', (object,), {})
            score_home = MyObject()
            score_home.text = "-"
        try:
            score_away = bundesliga.find_element(By.CSS_SELECTOR, "[class^='event__score event__score--away']")
        except (TimeoutException, NoSuchElementException):
            MyObject = type('MyObject', (object,), {})
            score_away = MyObject()
            score_away.text = "-"
        bundesliga = [current_round, date, time, team_home.text, team_away.text, score_home.text, score_away.text]
        bundesliga.append(bundesliga)
        print(bundesliga)
I've tried to write a web scraper for https://www.waug.com/area/?idx=15:
#!/usr/bin/env python3
#_*_coding:utf8_*_
import requests
from bs4 import BeautifulSoup

url = requests.get('https://www.abcd.com/area/?abc=15')
html = url.text
soup = BeautifulSoup(html, 'html.parser')

count = 1
names = soup.select('#good_{} > div > div.class_name > div > div'.format(count))
prices = soup.select('#good_{} > div > div.class_name > div.class_name'.format(count))

for name in names:
    while count < 45:
        print(name.text)
        count = count + 1
for price in prices:
    while count < 45:
        print(price.text)
        count = count + 1
The output is just the first item name 45 times, and no prices. How can I get all the item names and prices? I want each item name and price on the same line. (I've changed the URL and some of the class names, just in case.)
In order to be sure to get the right name with the right price, I'd grab the whole "item-good" class.
Then using a for loop allows me to be sure that the title I am getting matches its price.
Here's an example of how to parse a website with BeautifulSoup:
#!/usr/bin/env python3
#_*_coding:utf8_*_
import requests
from bs4 import BeautifulSoup

url = requests.get('https://www.waug.com/area/?idx=15')
html = url.text
soup = BeautifulSoup(html, 'html.parser')

count = 1
items = soup.findAll("div", {"class": "item-good"})

for item in items:
    item_title = item.find("div", {"class": "good-title-text"})
    item_price = item.find("div", {"class": "price-selling"})
    print(item_title.text + " " + item_price.text)
    # If you get encoding errors, delete the row above and uncomment the one below
    #print(item_title.text.encode("utf-8") + " " + item_price.text.encode("utf-8"))
As per the OP's request, this is not enough, because there is a "more" button to push on the webpage in order to retrieve all the results.
This can be done using Selenium WebDriver.
=== IMPORTANT NOTE ===
In order to make this work, you'll also need to copy the "chromedriver" file into your script folder.
You can download it from this Google website.
Here's the script:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

browser = webdriver.Chrome()
browser.get('https://www.waug.com/area/?idx=15')

for number in range(10):
    try:
        WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.ID, "more_good")))
        more_button = browser.find_element_by_id('more_good')
        more_button.click()
        time.sleep(10)
    except:
        print("Scrolling is now complete!")

source = browser.page_source
# This source variable should be used as input for BeautifulSoup
print(source)
Now it is time to merge the two explained solutions in order to get the final requested result.
Please keep in mind that this is just a quick'n'dirty hack that needs proper error handling and polishing, but it should be enough to get you started:
#!/usr/bin/env python3
#_*_coding:utf8_*_
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

browser = webdriver.Chrome()
browser.get('https://www.waug.com/area/?idx=15')

def is_page_load_complete():
    close_button = browser.find_element_by_id('close_good')
    return close_button.is_displayed()

while True:
    WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.ID, "more_good")))
    time.sleep(10)
    more_button = browser.find_element_by_id('more_good')
    if more_button.is_displayed():
        more_button.click()
    else:
        if is_page_load_complete():
            break

source = browser.page_source
soup = BeautifulSoup(source, 'html.parser')

items = soup.findAll("div", {"class": "item-good"})

for item in items:
    item_title = item.find("div", {"class": "good-title-text"})
    item_price = item.find("div", {"class": "price-selling"})
    print(item_title.text + " " + item_price.text)
    # If you get encoding errors, comment the row above and uncomment the one below
    #print(item_title.text.encode("utf-8") + " " + item_price.text.encode("utf-8"))

print("Total items found: " + str(len(items)))
Actually I want to get the values from here. Getting the product hyperlinks is working fine. I want to get the product information, price, etc. from the above link in the same for loop. How do I put the resulting data into a CSV file? Please help me.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
import time

chrome_path = r"C:\Users\Venkatesh\AppData\Local\Programs\Python\Python35\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)
driver.get("https://www.flipkart.com/mobiles")
search = driver.find_element_by_xpath("""//*[@id="container"]/div/div[2]/div/div[2]/div/div/div[1]/section/div[3]/div/div/a""").click()

delay = 20  # seconds
try:
    WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.XPATH, "//*[@id='container']/div/div[2]/div[2]/div/div[2]/div/div[3]/div[1]/div/div[1]/a/div[2]/div[1]/div[1]")))
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    print("Page is ready")
except TimeoutException:
    print("Loading took too much time")

time.sleep(10)
for post in driver.find_elements_by_class_name("_1UoZlX"):
    print(post.get_attribute("href"))

time.sleep(2)
driver.quit()
Output:
Page is ready
https://www.flipkart.com/moto-g5-plus-fine-gold-32-gb/p/itmes2zjvwfncxxr?pid=MOBEQHMGED7F9CZ2&srno=b_1_1&otracker=browse&lid=LSTMOBEQHMGED7F9CZ2KHTBI8
https://www.flipkart.com/moto-g5-plus-lunar-grey-32-gb/p/itmes2zjvwfncxxr?pid=MOBEQHMGMAUXS5BF&srno=b_1_2&otracker=browse&lid=LSTMOBEQHMGMAUXS5BFVCF0ZO
https://www.flipkart.com/moto-e3-power-black-16-gb/p/itmekgt2fbywqgcv?pid=MOBEKGT2HGDGADFW&srno=b_1_3&otracker=browse&lid=LSTMOBEKGT2HGDGADFWP5NHBY
https://www.flipkart.com/micromax-bolt-q381-plus-coffee-16-gb/p/itmeskgycnfghsat?pid=MOBESAMDG2GNUBC5&srno=b_1_4&otracker=browse&lid=LSTMOBESAMDG2GNUBC5KRPH8Q
https://www.flipkart.com/lenovo-k6-power-grey-dark-grey-32-gb/p/itmezenfhm4mvptw?pid=MOBEZENFZBPW8UMF&srno=b_1_5&otracker=browse&lid=LSTMOBEZENFZBPW8UMF7P8NY0
https://www.flipkart.com/lenovo-k6-power-gold-32-gb/p/itmezenfhm4mvptw?pid=MOBEZEMYH7FQBGBQ&srno=b_1_6&otracker=browse&lid=LSTMOBEZEMYH7FQBGBQRHVU0S
https://www.flipkart.com/lenovo-k6-power-silver-32-gb/p/itmezenfhm4mvptw?pid=MOBEZEMX6CZHCJVY&srno=b_1_7&otracker=browse&lid=LSTMOBEZEMX6CZHCJVYOIBM0E
https://www.flipkart.com/lenovo-vibe-k5-note-grey-64-gb/p/itmepcfqfdx9bdxs?pid=MOBEPCFQRJ6KFYZS&srno=b_1_8&otracker=browse&lid=LSTMOBEPCFQRJ6KFYZSI4DRRB
https://www.flipkart.com/lenovo-vibe-k5-note-gold-64-gb/p/itmepcfqfdx9bdxs?pid=MOBEPCFQ3ZSYTRUZ&srno=b_1_9&otracker=browse&lid=LSTMOBEPCFQ3ZSYTRUZGFSZCU
https://www.flipkart.com/samsung-galaxy-nxt-gold-32-gb/p/itmemzd4gepexjya?pid=MOBEMZD4KHRF5VZX&srno=b_1_10&otracker=browse&lid=LSTMOBEMZD4KHRF5VZX7FNU5S
https://www.flipkart.com/moto-e3-power-white-16-gb/p/itmekgt23fgwdgkg?pid=MOBEKGT2SVHPAHTM&srno=b_1_11&otracker=browse&lid=LSTMOBEKGT2SVHPAHTMJA8RQ1
https://www.flipkart.com/lenovo-k6-power-silver-32-gb/p/itmezenfghddrfmc?pid=MOBEZENFKXZ4HSCG&srno=b_1_12&otracker=browse&lid=LSTMOBEZENFKXZ4HSCGC1OOAM
https://www.flipkart.com/lenovo-k6-power-gold-32-gb/p/itmezenfghddrfmc?pid=MOBEZENFSZGTQGWF&srno=b_1_13&otracker=browse&lid=LSTMOBEZENFSZGTQGWFUR1LY1
https://www.flipkart.com/lenovo-k6-power-dark-gray-32-gb/p/itmezenfghddrfmc?pid=MOBEZENFG8BPDPSU&srno=b_1_14&otracker=browse&lid=LSTMOBEZENFG8BPDPSUUANLO6
https://www.flipkart.com/lava-arc-blue/p/itmezgyfszhmwfzt?pid=MOBEF6D24ZT6YHFJ&srno=b_1_15&otracker=browse&lid=LSTMOBEF6D24ZT6YHFJZ6N7XC
https://www.flipkart.com/lenovo-vibe-k5-plus-3-gb-silver-16-gb/p/itmektn3t9rg9hnn?pid=MOBEKEF8ATFZZ8GN&srno=b_1_16&otracker=browse&lid=LSTMOBEKEF8ATFZZ8GNY7WZBU
https://www.flipkart.com/lenovo-vibe-k5-plus-3-gb-gold-16-gb/p/itmektn3t9rg9hnn?pid=MOBEKEF8JYGKZCTF&srno=b_1_17&otracker=browse&lid=LSTMOBEKEF8JYGKZCTFUTCYS4
https://www.flipkart.com/lenovo-vibe-k5-plus-3-gb-dark-grey-16-gb/p/itmektn3t9rg9hnn?pid=MOBEKEF86VVUE8G2&srno=b_1_18&otracker=browse&lid=LSTMOBEKEF86VVUE8G2YCW5OP
https://www.flipkart.com/samsung-galaxy-nxt-black-32-gb/p/itmemzd4byrufyu7?pid=MOBEMZD4G83T5HKZ&srno=b_1_19&otracker=browse&lid=LSTMOBEMZD4G83T5HKZVMFKK6
https://www.flipkart.com/samsung-galaxy-on8-gold-16-gb/p/itmemvarkqg5dyay?pid=MOBEMJR2NDM4EAHQ&srno=b_1_20&otracker=browse&lid=LSTMOBEMJR2NDM4EAHQ8BMJIN
https://www.flipkart.com/samsung-galaxy-on7-black-8-gb/p/itmedhx3jgmu2gps?pid=MOBECCA5SMRSKCNY&srno=b_1_21&otracker=browse&lid=LSTMOBECCA5SMRSKCNYWC8DYC
https://www.flipkart.com/samsung-galaxy-on7-gold-8-gb/p/itmedhx3jgmu2gps?pid=MOBECCA5Y5HBYR3Q&srno=b_1_22&otracker=browse&lid=LSTMOBECCA5Y5HBYR3QPDPGLJ
https://www.flipkart.com/samsung-galaxy-on5-gold-8-gb/p/itmedhx3uy3qsfks?pid=MOBECCA5FHQD43KA&srno=b_1_23&otracker=browse&lid=LSTMOBECCA5FHQD43KAFXOZYB
https://www.flipkart.com/lenovo-p2-gold-32-gb/p/itmeq5ygvgq9vyfn?pid=MOBEZFHHURMWYSFN&srno=b_1_24&otracker=browse&lid=LSTMOBEZFHHURMWYSFNBBG6L0
https://www.flipkart.com/asus-zenfone-max-black-32-gb/p/itmege3d5pjpmknc?pid=MOBEGE3DYZM3ZYWB&srno=b_1_25&otracker=browse&lid=LSTMOBEGE3DYZM3ZYWBPCOZHP
https://www.flipkart.com/lenovo-vibe-k5-note-grey-32-gb/p/itmejj6kmhh2khk9?pid=MOBEJJ6KYARZGWJC&srno=b_1_26&otracker=browse&lid=LSTMOBEJJ6KYARZGWJCCV4LRX
https://www.flipkart.com/swipe-elite-sense-4g-volte/p/itmeh6yfycypxfdz?pid=MOBEH6YFZYZZNCZK&srno=b_1_27&otracker=browse&lid=LSTMOBEH6YFZYZZNCZKWVY6ES
https://www.flipkart.com/swipe-elite-sense-4g-volte/p/itmeh6yfycypxfdz?pid=MOBEH6YFZRTEMDBG&srno=b_1_28&otracker=browse&lid=LSTMOBEH6YFZRTEMDBGYJNCJI
https://www.flipkart.com/xolo-era-1x-4g-volte-black-gun-metal-8-gb/p/itmerhq8uhtehukg?pid=MOBEHMEKGCZCGMB8&srno=b_1_29&otracker=browse&lid=LSTMOBEHMEKGCZCGMB8DCWHIY
https://www.flipkart.com/swipe-konnect-grand-black-8-gb/p/itmeqcgxvkyfzsgj?pid=MOBEQCGXN6HTZE2C&srno=b_1_30&otracker=browse&lid=LSTMOBEQCGXN6HTZE2CXUT5W1
https://www.flipkart.com/lenovo-vibe-k5-note-gold-32-gb/p/itmejj6kczvxej4g?pid=MOBEJJ6K5A3GQ9SU&srno=b_1_31&otracker=browse&lid=LSTMOBEJJ6K5A3GQ9SUZERSAR
https://www.flipkart.com/lyf-water-f1-black-32-gb/p/itmezh76z9jqsa8z?pid=MOBEZH76AFWSZVNH&srno=b_1_32&otracker=browse&lid=LSTMOBEZH76AFWSZVNHOOBURN
https://www.flipkart.com/samsung-galaxy-j5-6-new-2016-edition-black-16-gb/p/itmegmrnzqjcpfg9?pid=MOBEG4XWHJDWMQDF&srno=b_1_33&otracker=browse&lid=LSTMOBEG4XWHJDWMQDFZIWO93
https://www.flipkart.com/samsung-galaxy-j5-6-new-2016-edition-white-16-gb/p/itmegmrnzqjcpfg9?pid=MOBEG4XWJG7F9A6Z&srno=b_1_34&otracker=browse&lid=LSTMOBEG4XWJG7F9A6ZHJOVBG
https://www.flipkart.com/samsung-galaxy-j5-6-new-2016-edition-gold-16-gb/p/itmegmrnzqjcpfg9?pid=MOBEG4XWFTBRMMBY&srno=b_1_35&otracker=browse&lid=LSTMOBEG4XWFTBRMMBYZPYEGS
https://www.flipkart.com/moto-m-grey-64-gb/p/itmenqavgcezzk2y?pid=MOBENQATHQTKG7AV&srno=b_1_36&otracker=browse&lid=LSTMOBENQATHQTKG7AVGFQI4N
https://www.flipkart.com/moto-m-gold-64-gb/p/itmenqavgcezzk2y?pid=MOBENQAVANRMEGAP&srno=b_1_37&otracker=browse&lid=LSTMOBENQAVANRMEGAPHWU47I
https://www.flipkart.com/moto-m-silver-64-gb/p/itmenqavgcezzk2y?pid=MOBENQAVFTG6FPXX&srno=b_1_38&otracker=browse&lid=LSTMOBENQAVFTG6FPXXHZBIGV
https://www.flipkart.com/apple-iphone-6-silver-16-gb/p/itme8dvfeuxxbm4r?pid=MOBEYHZ2NUZGCHKN&srno=b_1_39&otracker=browse&lid=LSTMOBEYHZ2NUZGCHKN7PMDIN
https://www.flipkart.com/samsung-galaxy-on8-black-16-gb/p/itmemvarprh8hegn?pid=MOBEMJRFZXZBESQW&srno=b_1_40&otracker=browse&lid=LSTMOBEMJRFZXZBESQWCFHWJ0
https://www.flipkart.com/panasonic-eluga-tapp-silver-grey-16-gb/p/itmezf54ey3gf8ne?pid=MOBENRHGWZWKEGGF&srno=b_1_41&otracker=browse&lid=LSTMOBENRHGWZWKEGGFMJELY2
https://www.flipkart.com/panasonic-eluga-tapp-champagne-gold-16-gb/p/itmezf54ey3gf8ne?pid=MOBENRHGEQEJHSZM&srno=b_1_42&otracker=browse&lid=LSTMOBENRHGEQEJHSZMD8R5FE
https://www.flipkart.com/apple-iphone-6s-rose-gold-32-gb/p/itmen2yymnfcrxsz?pid=MOBEN2XYK8WFEGM8&srno=b_1_43&otracker=browse&lid=LSTMOBEN2XYK8WFEGM8QJW5XA
https://www.flipkart.com/lenovo-p2-grey-graphite-grey-32-gb/p/itmeq5ygvgq9vyfn?pid=MOBEZFHH2JYGXSNF&srno=b_1_44&otracker=browse&lid=LSTMOBEZFHH2JYGXSNFNWKEAD
https://www.flipkart.com/forme-n1/p/itmeff8s2hdrfhyg?pid=MOBEFF8SHZPYKCRY&srno=b_1_45&otracker=browse&lid=LSTMOBEFF8SHZPYKCRYEKQPPR
https://www.flipkart.com/forme-n1/p/itmeff8s2hdrfhyg?pid=MOBEFF8SSZNHCUND&srno=b_1_46&otracker=browse&lid=LSTMOBEFF8SSZNHCUNDRC6GLT
https://www.flipkart.com/samsung-galaxy-on5-black-8-gb/p/itmekszmsqgpgygy?pid=MOBECCA5BJUVUGNP&srno=b_1_47&otracker=browse&lid=LSTMOBECCA5BJUVUGNPRKEGMG
https://www.flipkart.com/lenovo-p2-grey-graphite-grey-32-gb/p/itmeq5ygebzgqgfb?pid=MOBEZFHHVD8KXE7G&srno=b_1_48&otracker=browse&lid=LSTMOBEZFHHVD8KXE7GB0OS6I
https://www.flipkart.com/lenovo-p2-gold-32-gb/p/itmeq5ygebzgqgfb?pid=MOBEZFHHGE2RXQUY&srno=b_1_49&otracker=browse&lid=LSTMOBEZFHHGE2RXQUY2XDB97
https://www.flipkart.com/samsung-galaxy-j7-gold-16-gb/p/itmeafbfjhsydbpw?pid=MOBE93GWSMGZHFSK&srno=b_1_50&otracker=browse&lid=LSTMOBE93GWSMGZHFSKT6OZOB
https://www.flipkart.com/samsung-z2-gold-8-gb/p/itmenkygvprd5dwt?pid=MOBENKYGHFUHT6BH&srno=b_1_51&otracker=browse&lid=LSTMOBENKYGHFUHT6BHVSHMDE
https://www.flipkart.com/leeco-le-2-grey-32-gb/p/itmejeucxaxmnk8k?pid=MOBEJFTH4C9Z2YZR&srno=b_1_52&otracker=browse&lid=LSTMOBEJFTH4C9Z2YZRVVL0EL
https://www.flipkart.com/lyf-water-10-black-16-gb/p/itmemj7d8qfkfu4r?pid=MOBEMJ7C7YMDMVDQ&srno=b_1_53&otracker=browse&lid=LSTMOBEMJ7C7YMDMVDQPCFALX
https://www.flipkart.com/micromax-canvas-nitro-2-grey-silver-16-gb/p/itme7nhzw56hv2ga?pid=MOBE7NHZP7GHZ7SG&srno=b_1_54&otracker=browse&lid=LSTMOBE7NHZP7GHZ7SGCYGNI3
https://www.flipkart.com/moto-g-turbo-white-16-gb/p/itmecc4uhbue7ve6?pid=MOBECC4UQTJ5QZFR&srno=b_1_55&otracker=browse&lid=LSTMOBECC4UQTJ5QZFR9CAUPO
https://www.flipkart.com/moto-g-turbo-black-16-gb/p/itmecc4uhbue7ve6?pid=MOBECC4UZTSGKWWZ&srno=b_1_56&otracker=browse&lid=LSTMOBECC4UZTSGKWWZOQKAIZ
https://www.flipkart.com/apple-iphone-6-space-grey-16-gb/p/itme8dvfeuxxbm4r?pid=MOBEYHZ2YAXZMF2J&srno=b_1_57&otracker=browse&lid=LSTMOBEYHZ2YAXZMF2JEVWVNC
https://www.flipkart.com/yu-yunicorn-rush-silver-32-gb/p/itmenffyjfp8ubyg?pid=MOBEJ3MFUQAF8XJS&srno=b_1_58&otracker=browse&lid=LSTMOBEJ3MFUQAF8XJSBPC8L4
https://www.flipkart.com/yu-yunicorn-gold-rush-32-gb/p/itmenffyjfp8ubyg?pid=MOBEJ3MF23Q9MGMH&srno=b_1_59&otracker=browse&lid=LSTMOBEJ3MF23Q9MGMHZ49MG2
https://www.flipkart.com/micromax-canvas-nitro-2-white-gold-16-gb/p/itme7nhzw56hv2ga?pid=MOBE8TJBHGQYHNPT&srno=b_1_60&otracker=browse&lid=LSTMOBE8TJBHGQYHNPTVL3HS0
I used openpyxl to create a file for each run, named with the filename plus a timestamp. The links that are fetched are eventually written to that file.
I couldn't find the exact links that were given, so I chose my own links, which are similar in nature. This code has different links per se, but the solution scales up to be the same for your case, @venkatesh.
One more thing: try to keep XPaths as relative as possible, and classes with gibberish such as _13oc-S will not hold up well, as they tend to change dynamically on each DOM refresh or each browser instance. An example follows below.
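For example (a sketch reusing the driver from the code below; the stable attribute shown is the one that code uses):

# Brittle: tied to an auto-generated class that changes between builds.
# driver.find_elements(By.CSS_SELECTOR, "._13oc-S a")
# More stable: anchored on a semantic attribute instead.
posts = driver.find_elements(By.XPATH, "//a[@rel='noopener noreferrer']")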
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import openpyxl

current_time = time.strftime('%Y%m%d%H%M%S')
xlpath = "linktracker" + current_time + ".csv"

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://www.flipkart.com/mobiles")
driver.maximize_window()

# Searches for a certain brand of phones (POCO). Inefficient way of locator finding, though.
search = driver.find_element(By.XPATH, "(//*[@alt='Shop Now'])[2]").click()
time.sleep(10)  # bad practice, but used for now; WebDriverWait should be used instead

each_element = "//a[@rel='noopener noreferrer']"  # locates each desired element on the search page (each phone block)
posts = driver.find_elements(By.XPATH, each_element)
print(len(posts))

ls = []
for post in range(len(posts) - 1):  # len-1 because the last item is a footer, not a desired link
    # concatenates the subscript to the element XPath, e.g.: (//*[@element = 'ele'])[1] ... (//*[@element = 'ele'])[n]
    each_post = driver.find_element(By.XPATH, '(' + each_element + ')' + '[' + str(post + 1) + ']')
    each_link = each_post.get_attribute("href")
    ls.append(each_link)

wb = openpyxl.Workbook()  # creates a workbook
sheet = wb.active

c = 0
# loop through the created list and write the values to the created workbook
for i in ls:
    sheet.cell(row=c + 1, column=1).value = i
    c += 1  # increment the row for each iteration of i

wb.save(xlpath)  # save the workbook with the name given in the xlpath variable above
driver.quit()
The resulting CSV is shown in the linked image.
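Since the question also asked about CSV specifically, here is a minimal sketch using the standard csv module instead (note that openpyxl's wb.save() writes xlsx-format data even when the filename ends in .csv); it assumes the ls list of links built above:

import csv

with open(xlpath, "w", newline="") as f:  # reusing the timestamped filename
    writer = csv.writer(f)
    writer.writerow(["link"])  # header row
    writer.writerows([link] for link in ls)  # one link per row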