I'm quite new to Python and have written a script using Selenium to scrape a website. I've tried everything but can't get the loop to cycle through pages: it currently just repeats the data on the first page 5 times. I want to scrape all the pages for 'BR1' — any help would be great.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Scrape the sold-price property cards for 'BR1' on rightmove.co.uk,
# appending one CSV row per property card across successive result pages.

with open('rightmove.csv', 'w') as file:
    file.write('PropertyCardcontent \n')

PATH = "/usr/local/bin/chromedriver"
driver = webdriver.Chrome(PATH)
driver.get("https://www.rightmove.co.uk/house-prices.html")
print(driver.title)

elem = driver.find_element(By.NAME, 'searchLocation')  # Find the search box
elem.send_keys('BR1' + Keys.RETURN)

try:
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.ID, 'content'))
    )
finally:
    time.sleep(3)

for p in range(5):
    # BUG FIX: re-locate the property cards on EVERY iteration. The original
    # fetched them once from a `content` element captured before the loop, so
    # the same first-page elements were written five times.
    cards = driver.find_elements(By.CLASS_NAME, 'propertyCard-content')
    with open('rightmove.csv', 'a') as file:
        for card in cards:
            file.write(card.text + '\n')

    # BUG FIX: XPath attribute tests use '@', not '#'. Rather than a brittle
    # absolute path, click the pagination control containing the text "Next".
    button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//div[contains(text(), "Next")]'))
    )
    button.click()
    time.sleep(3)  # give the next page time to render

driver.quit()
Since the website link has page number on it, I recommend you put the base url as "https://www.rightmove.co.uk/house-prices/br1.html?page=1", and loop through the pages while changing the last index of the url with methods like format string.
One other thing, you don't need to implement all those for loops, you can simply assign each variable to its specific value since everything you need is inside an html block which is easy to navigate on it.
Update:
I'm sorry for being late, had unexpected stuff(...).
I've made some changes as I use Brave, so make sure you select your browser, Chrome I believe, the chromedriver(ver:102) stays the same (or depending your Chrome version).
I've also got the Price and Date and stored them in a tuple.
Every record is stored in a list[Title, propertyType, tupleof(Price_Date)]
At the end, it creates a csv and stores everything inside with a ";" as delimiter.
You can if you prefer split the price and date for later use, up to you.
Note: This looping method only applies to websites in which the page number is included within the URL. In this case, both the key and the page number are included in the URL.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import time
import random
import itertools

# Visit every results page for the given key by rewriting the ?page=N query
# parameter, collect [title, propertyType, (price, date), ...] records, then
# write everything to a ';'-delimited CSV.

options = Options()
options.binary_location = r'C:\Program Files\BraveSoftware\Brave-Browser\Application\brave.exe'
driver = webdriver.Chrome(options=options, service=Service("chromedriver.exe"))

key_word = "BR1".lower()
base_url = f"https://www.rightmove.co.uk/house-prices/{key_word}.html?page=1"
driver.get(base_url)

# Number of pages, shown as e.g. "of 50" in the second pagination label.
# (BUG FIX: XPath attribute tests use '@', not '#'.)
pages = driver.find_element(By.XPATH, '//span[@class="pagination-label"][2]').text
# BUG FIX: str.strip('of') strips the CHARACTERS 'o'/'f' from both ends, which
# only works by accident; remove the literal prefix instead.
pages = int(pages.replace('of', '').strip())

WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'results '))
)

data = []
pc = 0  # index of the record currently being extended with (price, date) tuples
for page in range(1, pages + 1):
    driver.get(f"https://www.rightmove.co.uk/house-prices/{key_word}.html?page={page}")
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//div//div[@class="propertyCard"]'))
    )
    propertyCards = driver.find_elements(By.XPATH, '//div//div[@class="propertyCard"]')
    for propertyCard in propertyCards:
        title = propertyCard.find_element(By.CLASS_NAME, 'title').text
        propertyType = propertyCard.find_element(By.CLASS_NAME, 'propertyType').text
        price_list = propertyCard.find_elements(By.CLASS_NAME, 'price')
        date_list = propertyCard.find_elements(By.CLASS_NAME, 'date-sold')
        data.append([title, propertyType])
        # BUG FIX: the inner loop variable was also named 'p', shadowing the
        # page counter of the outer loop; renamed for clarity.
        for price_el, date_el in itertools.zip_longest(price_list, date_list, fillvalue=None):
            try:
                price = price_el.text
                date = date_el.text
                data[pc].append((price, date))
            except Exception as e:
                # zip_longest pads the shorter list with None; .text then raises.
                print(e)
        pc += 1
    time.sleep(random.randint(1, 4))  # polite, randomised delay between pages

print(data)

with open('rightmove.csv', 'w') as file:
    header = "Title;propertyType;Price_Date\n"
    file.write(header)
    for record in data:
        file.write("{};{};{}\n".format(record[0], record[1], record[2:]))

driver.quit()
You don't have to walk down the DOM element by element; you can just use an XPath or class name (if it's unique — otherwise an XPath or CSS selector is better) and get the item you are looking for.
Anyway follow this:
import time
import selenium.webdriver as webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Search rightmove for a query, read the page count from the pagination
# label, then scrape (title, propertyType) from every results page by
# clicking "Next" between pages.

driver = webdriver.Chrome("/usr/local/bin/chromedriver")
driver.get("https://www.rightmove.co.uk/house-prices.html")

# send query
query = "BR1"
# BUG FIX: XPath attribute tests use '@', not '#'.
search_bar = driver.find_element(By.XPATH, '//input[@class="searchBox ac_input"]')
search_bar.send_keys(query)
search_bar.send_keys(Keys.ENTER)

# wait for results to load
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'propertyCard'))
)

# get amount of pages (label reads e.g. "of 50")
pages = driver.find_element(By.XPATH, '//span[@class="pagination-label"][2]').text
pages = int(pages.replace('of ', ''))

data = []
i = 1
while i <= pages:
    # BUG FIX: scrape BEFORE clicking "Next". The original clicked first,
    # which skipped page 1 entirely and failed on the last page where no
    # "Next" control exists.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//div//div[@class="propertyCard"]'))
    )
    propertyCards = driver.find_elements(By.XPATH, '//div//div[@class="propertyCard"]')
    # loop over results and store data
    for propertyCard in propertyCards:
        title = propertyCard.find_element(By.CLASS_NAME, 'title').text
        propertyType = propertyCard.find_element(By.CLASS_NAME, 'propertyType').text
        data.append((title, propertyType))

    if i < pages:  # no "Next" button on the final page
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//div[contains(text(), "Next")]'))
        ).click()
    time.sleep(1)
    i += 1

print("you reach the last page")
# BUG FIX: was printf(data) — printf does not exist in Python (NameError).
print(data)
driver.close()
I use a list of tuples because in your example you want to store 2 items; if you want to store more data you can use a dict and then convert it into csv with DictWriter directly. Enjoy.
Related
I have the following code that scrapes some information I need from a website. However, there are 61 pages I need to go through and scrape the same data that requires me to click on the 'Next' button to go to the next page with the url remaining the same.
I know it is possible to use driver.find_element_by_link_text('Next').click() to go to the next page but I am not sure how to include this in my code.
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Open each facility's "View on Map" popup on the current page and collect
# its name and details into a DataFrame.

driver = webdriver.Chrome()
driver.get('https://mspotrace.org.my/Sccs_list')
time.sleep(20)

# Get list of elements. (BUG FIX: XPath attribute tests use '@', not '#'.)
elements = WebDriverWait(driver, 20).until(
    EC.presence_of_all_elements_located((By.XPATH, "//a[@title='View on Map']")))

# Loop through element popups and pull details of facilities into DF
pos = 0
df = pd.DataFrame(columns=['facility_name', 'other_details'])
for element in elements:
    try:
        data = []
        element.click()
        time.sleep(10)
        facility_name = driver.find_element_by_xpath('//h4[@class="modal-title"]').text
        other_details = driver.find_element_by_xpath('//div[@class="modal-body"]').text
        time.sleep(5)
        data.append(facility_name)
        data.append(other_details)
        df.loc[pos] = data
        # close popup window
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Close'] > span"))).click()
        print("Scraping info for", facility_name, "")
        time.sleep(15)
        pos += 1
    except Exception:
        # A facility without geo information raises a JS alert instead of
        # opening the popup; accept it and move on.
        alert = driver.switch_to.alert
        print("No geo location information")
        alert.accept()

print(df)
Answering your question, "I don't know how I would put it in my code":
Counter iii is used to repeat your existing code 60 times.
I cannot test the entire code, but I tested the loops.
For the sake of simplicity, in the code below I removed the element scraping so I could focus the test on repeating the clicks in the Next button, which is your question.
If you are going to test on your side, ensure you replace
print('your stuff would stay here!')
with the actual element scraping block that you have in your original code.
Hope it helps!
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Skeleton showing how to repeat the per-page scraping block 60 times,
# clicking "Next" between pages. Replace the print() below with the real
# element-scraping code from the original script.

driver = webdriver.Chrome()
driver.get('https://mspotrace.org.my/Sccs_list')
time.sleep(20)

# Get list of elements. (BUG FIX: XPath attribute tests use '@', not '#'.)
elements = WebDriverWait(driver, 20).until(
    EC.presence_of_all_elements_located((By.XPATH, "//a[@title='View on Map']")))

# Loop through element popups and pull details of facilities into DF
pos = 0
df = pd.DataFrame(columns=['facility_name', 'other_details'])
for iii in range(1, 60):
    for element in elements:
        print('your stuff would stay here!')

    # Click "Next" via JavaScript after scrolling it into view; a direct
    # .click() can fail when the button is outside the viewport.
    btnNext = driver.find_element(By.XPATH, '//*[@id="dTable_next"]/a')
    driver.execute_script("arguments[0].scrollIntoView();", btnNext)
    driver.execute_script("arguments[0].click();", btnNext)
    time.sleep(5)

    # print current df. You may want to store it and print in the end only?
    print(df)

    # Get list of elements again for the freshly loaded page
    elements = WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located((By.XPATH, "//a[@title='View on Map']")))

    # Resetting vars again
    pos = 0
    df = pd.DataFrame(columns=['facility_name', 'other_details'])
I am currently learning Python in order to webscrape and am running into an issue with my current script. After closing the pop-up on Page 2 of Indeed and cycling through the pages, the script only returns one page into the data frame to CSV. However, it does print out each page in my terminal area. It also on occasion only returns part of the data from a page. EX page 2 will return info for the first 3 jobs as part of my print(df_da), but nothing for the next 12. Additionally, it seems to take a very long time to run the script (averaging around 6 minutes and 45 seconds for the 5 pages, around 1 minute to 1.5 minutes per page). Any suggestions? I've attached my script and can also attach the return I get from my Print(df_da) if needed below. Thank you in advance!
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

options = Options()
options.add_argument("window-size=1400,1400")
PATH = "C://Program Files (x86)//chromedriver.exe"
driver = webdriver.Chrome(PATH)

# Walk the first five Indeed result pages via the 'start' query parameter
# (0, 10, ..., 40) and scrape title/company/location/snippet per job card.
for i in range(0,50,10):
    driver.get('https://www.indeed.com/jobs?q=chemical%20engineer&l=united%20states&start='+str(i))
    driver.implicitly_wait(5)
    # NOTE(review): these accumulators are re-created on EVERY page, so only
    # the last page's rows reach the DataFrame below — this is the defect the
    # question describes. Hoisting them above the loop keeps all pages.
    jobtitles = []
    companies = []
    locations = []
    descriptions = []
    jobs = driver.find_elements_by_class_name("slider_container")
    for job in jobs:
        jobtitle = job.find_element_by_class_name('jobTitle').text.replace("new", "").strip()
        jobtitles.append(jobtitle)
        company = job.find_element_by_class_name('companyName').text.replace("new", "").strip()
        companies.append(company)
        location = job.find_element_by_class_name('companyLocation').text.replace("new", "").strip()
        locations.append(location)
        description = job.find_element_by_class_name('job-snippet').text.replace("new", "").strip()
        descriptions.append(description)
    # Best-effort close of the email pop-up that appears on page 2.
    try:
        WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "button.popover-x-button-close.icl-CloseButton"))).click()
    except:
        pass
    # NOTE(review): the DataFrame is also rebuilt inside the page loop, so
    # the CSV written below holds a single page only.
    df_da=pd.DataFrame()
    df_da['JobTitle']=jobtitles
    df_da['Company']=companies
    df_da['Location']=locations
    df_da['Description']=descriptions
    print(df_da)
    df_da.to_csv('C:/Users/Dan/Desktop/AZNext/file_name1.csv')
You are defining the df_da inside the outer for loop so that the df_da will contain the data from the last page only.
You should define it out of the loops and put the total data there only after all the data have been collected.
I guess you are getting not all the jobs on the second page because of the pop-up. So, you should close it before collecting the job details on that page.
Also, you can reduce waiting for the pop-up element from all the loop iterations and leave it for the second loop iteration only.
Your code can be something like this:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

options = Options()
options.add_argument("window-size=1400,1400")
PATH = "C://Program Files (x86)//chromedriver.exe"
driver = webdriver.Chrome(PATH)

# Accumulators live OUTSIDE the page loop so rows from every page survive.
jobtitles = []
companies = []
locations = []
descriptions = []

# Indeed paginates with the 'start' query parameter in steps of 10.
for i in range(0,50,10):
    driver.get('https://www.indeed.com/jobs?q=chemical%20engineer&l=united%20states&start='+str(i))
    driver.implicitly_wait(5)
    jobs = driver.find_elements_by_class_name("slider_container")
    for idx, job in enumerate(jobs):
        # The pop-up shows up while the second card is being read; wait for
        # and close it once there, instead of on every iteration.
        if(idx == 1):
            try:
                WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "button.popover-x-button-close.icl-CloseButton"))).click()
            except:
                pass
        # "new" badges are embedded in the card text; strip them out.
        jobtitle = job.find_element_by_class_name('jobTitle').text.replace("new", "").strip()
        jobtitles.append(jobtitle)
        company = job.find_element_by_class_name('companyName').text.replace("new", "").strip()
        companies.append(company)
        location = job.find_element_by_class_name('companyLocation').text.replace("new", "").strip()
        locations.append(location)
        description = job.find_element_by_class_name('job-snippet').text.replace("new", "").strip()
        descriptions.append(description)

# Build and write the DataFrame once, after ALL pages have been collected.
df_da=pd.DataFrame()
df_da['JobTitle']=jobtitles
df_da['Company']=companies
df_da['Location']=locations
df_da['Description']=descriptions
print(df_da)
df_da.to_csv('C:/Users/Dan/Desktop/AZNext/file_name1.csv')
I am extracting google reviews of a resturant. I am interested in extracting reviewer name, rating given by reviewer, and text of the review. I used following code for the extraction:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time

# Load all Google reviews for the restaurant (scrolling until the advertised
# review count is reached), then print reviewer names, ratings and texts.

# BUG FIX: was webdriver.Chrome('') — an empty executable path is invalid;
# let Selenium locate the driver itself.
driver = webdriver.Chrome()
base_url = 'https://www.google.com/search?tbs=lf:1,lf_ui:9&tbm=lcl&sxsrf=AOaemvJFjYToqQmQGGnZUovsXC1CObNK1g:1633336974491&q=10+famous+restaurants+in+Dunedin&rflfq=1&num=10&sa=X&ved=2ahUKEwiTsqaxrrDzAhXe4zgGHZPODcoQjGp6BAgKEGo&biw=1280&bih=557&dpr=2#lrd=0xa82eac0dc8bdbb4b:0x4fc9070ad0f2ac70,1,,,&rlfi=hd:;si:5749134142351780976,l,CiAxMCBmYW1vdXMgcmVzdGF1cmFudHMgaW4gRHVuZWRpbiJDUjEvZ2VvL3R5cGUvZXN0YWJsaXNobWVudF9wb2kvcG9wdWxhcl93aXRoX3RvdXJpc3Rz2gENCgcI5Q8QChgFEgIIFkiDlJ7y7YCAgAhaMhAAEAEQAhgCGAQiIDEwIGZhbW91cyByZXN0YXVyYW50cyBpbiBkdW5lZGluKgQIAxACkgESaXRhbGlhbl9yZXN0YXVyYW50mgEkQ2hkRFNVaE5NRzluUzBWSlEwRm5TVU56ZW5WaFVsOUJSUkFCqgEMEAEqCCIEZm9vZCgA,y,2qOYUvKQ1C8;mv:[[-45.8349553,170.6616387],[-45.9156414,170.4803685]]'
driver.get(base_url)

# Sort reviews by newest first.
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Newest']]"))).click()

# Total advertised review count, e.g. "123 reviews".
# (BUG FIX: XPath attribute tests use '@', not '#'.)
total_reviews_text = driver.find_element_by_xpath("//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']").text
num_reviews = int(total_reviews_text.split()[0])

all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
time.sleep(2)
total_reviews = len(all_reviews)

# Keep scrolling the last loaded review into view until all are loaded.
while total_reviews < num_reviews:
    driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
    WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
    time.sleep(5)
    all_reviews = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
    print(total_reviews)
    total_reviews += 5  # presumably reviews load in batches of five — TODO confirm

person_info = driver.find_elements_by_xpath("//div[@id='reviewSort']//div[contains(@class,'google-review')]")
rating_info = driver.find_elements_by_xpath("//div[@class='PuaHbe']")
review_text = driver.find_elements_by_xpath("//div[@class='Jtu6Td']")

for person in person_info:
    name = person.find_element_by_xpath("./div/div/div/a").text
    print(name)

for rating in rating_info:
    # NOTE(review): this reads from 'person' (the last reviewer element left
    # over from the loop above), not 'rating' — likely it should be
    # rating.find_element_by_xpath(...); verify before relying on it.
    rating_txt = person.find_element_by_xpath("./g-review-stars/span").get_attribute('aria-label')
    print(rating_txt)

for text in review_text:
    texts = text.find_element_by_xpath("./span").text
    print(texts)
The above code worked as per expectations. I want to make slight change in above code. Instead of using three loops to display name, rating, and review_text. I wanted to extract the same information using one loop. So I made following changes in the above code:
# Extract name, rating and review text with a single loop over each review
# container instead of three separate loops.
reviews_info = driver.find_elements_by_xpath("//div[@class='jxjCjc']")

for review_info in reviews_info:
    name = review_info.find_element_by_xpath("./div/div/a").text
    # BUG FIX: an XPath beginning with '//' searches the WHOLE document even
    # when called on an element, which is why every iteration returned the
    # FIRST review's rating and text. Prefix with '.' so the search is
    # relative to review_info. (Also '@', not '#', in attribute tests.)
    rating = review_info.find_element_by_xpath(".//div[@class='PuaHbe']//g-review-stars//span").get_attribute('aria-label')
    text = review_info.find_element_by_xpath(".//div[@class='Jtu6Td']//span").text
    print(name)
    print(rating)
    print(text)
    print()
The problem with a change in code is that it displays the same information (i.e. rating and text) for all reviewers names. I am not sure where am I making the mistake. Any help to fix the issue would be really appreciated.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Print the full text of the first ten reviews, addressing each one directly
# with an indexed XPath.

driver = webdriver.Chrome()
base_url = 'https://www.google.com/search?tbs=lf:1,lf_ui:9&tbm=lcl&sxsrf=AOaemvJFjYToqQmQGGnZUovsXC1CObNK1g:1633336974491&q=10+famous+restaurants+in+Dunedin&rflfq=1&num=10&sa=X&ved=2ahUKEwiTsqaxrrDzAhXe4zgGHZPODcoQjGp6BAgKEGo&biw=1280&bih=557&dpr=2#lrd=0xa82eac0dc8bdbb4b:0x4fc9070ad0f2ac70,1,,,&rlfi=hd:;si:5749134142351780976,l,CiAxMCBmYW1vdXMgcmVzdGF1cmFudHMgaW4gRHVuZWRpbiJDUjEvZ2VvL3R5cGUvZXN0YWJsaXNobWVudF9wb2kvcG9wdWxhcl93aXRoX3RvdXJpc3Rz2gENCgcI5Q8QChgFEgIIFkiDlJ7y7YCAgAhaMhAAEAEQAhgCGAQiIDEwIGZhbW91cyByZXN0YXVyYW50cyBpbiBkdW5lZGluKgQIAxACkgESaXRhbGlhbl9yZXN0YXVyYW50mgEkQ2hkRFNVaE5NRzluUzBWSlEwRm5TVU56ZW5WaFVsOUJSUkFCqgEMEAEqCCIEZm9vZCgA,y,2qOYUvKQ1C8;mv:[[-45.8349553,170.6616387],[-45.9156414,170.4803685]]'
driver.get(base_url)

WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Newest']]"))).click()

# (BUG FIX throughout: XPath attribute tests use '@', not '#'.)
total_reviews_text = driver.find_element_by_xpath("//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']").text
num_reviews = int(total_reviews_text.split()[0])
print("NUm reviews=", num_reviews)

all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
time.sleep(2)
total_reviews = len(all_reviews)
print("Total reviews=", total_reviews)

# Build the indexed XPath fresh each iteration with an f-string. This
# replaces the original's str.replace() index-bumping, which was fragile
# (it replaced the first matching digit substring anywhere in the path).
for idx in range(1, 11):  # review indices 1..10, as in the original
    s = f"(//div[@id='reviewSort']//div[contains(@class,'google-review')])[{idx}]"
    WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
    time.sleep(5)
    all_reviews = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
    total_reviews += 1
    Info = driver.find_element_by_xpath(s).text
    print(Info)
    print("<------------------------------------------------------>\n\n")
Output:-
Click Here to See Program Output
I am trying to scrape the contact data of companies from this website:
https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=4
I can do this with the following Code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Click through the first two companies in the statista search results and
# collect each one's contact box text into company_list.

company_list = []  # create empty list
driver = webdriver.Chrome('/Users/rieder/Anaconda3/chromedriver_win32/chromedriver.exe')  # define driver
driver.get('https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1')  # open Website
driver.find_element_by_id("cookiesNotificationConfirm").click()  # accept cookies

# Click on the first company name link.
# (BUG FIX: XPath attribute tests use '@', not '#'.)
driver.find_element_by_xpath("//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/table/tr[2]/td[1]/a").click()

# Get the contact data from the company chosen before.
contact_data = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "/html/body/div[3]/div[4]/section[6]/div/div[2]/div[2]/div/div")))
for cn in contact_data:
    company_list.append(cn.text)  # this stores the text in the list

driver.back()  # navigate to previous site
time.sleep(5)  # wait for the pop-up window to appear
driver.find_element_by_xpath("/html/body/div[15]/div[3]/div[3]/div[1]/button[1]").click()  # deny the website's popup
time.sleep(5)  # wait for the popup to vanish

# Click on the next company name link (row tr[3] this time).
driver.find_element_by_xpath("//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/table/tr[3]/td[1]/a").click()
contact_data2 = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "/html/body/div[3]/div[4]/section[6]/div/div[2]/div[2]/div/div")))
for cn in contact_data2:
    company_list.append(cn.text)  # this stores the text in the list

print(company_list)  # show the list
My Output is this:
['GUTex GmbH\nGerhard-Unland-Str. 1\n26683\nSaterland\nDeutschland', 'Robert Bosch GmbH\nRobert-Bosch-Platz 1\n70839\nGerlingen\nDeutschland']
Problem:
I want, that my code does this to the whole list on page 1 and then goes on on the next page and do it again. This shall go on until I have for example 100 adresses in the list. I would do this with a "while loop" but my xpaths for finding the adress are too specified, so it would always loop the same companies.
Thanks a lot in advance
Try below code for one page data extract. Update the code for iterating over the next page records.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Extract each company's name and contact box from page 1 of the statista
# search: click through to the company page, read the box, navigate back.

company_list = []  # create empty list
driver = webdriver.Chrome()  # define driver
driver.get('https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1')  # open Website
if len(driver.find_elements_by_id("cookiesNotificationConfirm")) > 0:
    driver.find_element_by_id("cookiesNotificationConfirm").click()  # accept cookies

# (BUG FIX throughout: XPath attribute tests use '@', not '#'.)
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//table[@class="zebraTable zebraTable--companies"]//td[1]')))
elementsSize = len(driver.find_elements_by_xpath('//table[@class="zebraTable zebraTable--companies"]//td[1]'))

# To iterate over the company list and click on the company name then capture
# the address on the navigated page, come back to the previous page and
# repeat the same.
for i in range(elementsSize):
    WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, '//table[@class="zebraTable zebraTable--companies"]//td[1]')))
    # Re-find the links each pass: navigating back stales the old references.
    elements = driver.find_elements_by_xpath('//table[@class="zebraTable zebraTable--companies"]//td[1]/a')
    company_name = elements[i].text
    elements[i].click()  # click on the company name link
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH,
        '//*[@id="contactInformation"]//div[@class="companyContactBox"]')))  # get the contact data from the company chosen before
    contact_data = driver.execute_script("return document.getElementsByClassName('companyContactBox')[0].innerText")
    # print(contact_data)
    company_list.append(company_name + " : " + contact_data)
    driver.back()  # navigate to previous site

print(company_list)
Thanks to Dilip Meghwal's comment above I could finish my code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Collect company contact boxes page by page until 1000 records are
# gathered, then write them to an Excel file. (Removed the unused
# 'count = 25' variable from the original.)

company_list = []  # create empty list
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications": 2}  # block notification pop-ups
chrome_options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome('/Users/rieder/Anaconda3/chromedriver_win32/chromedriver.exe', chrome_options=chrome_options)  # define driver
driver.get('https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1')  # open Website
if len(driver.find_elements_by_id("cookiesNotificationConfirm")) > 0:
    driver.find_element_by_id("cookiesNotificationConfirm").click()  # accept cookies

while len(company_list) < 1000:
    # (BUG FIX throughout: XPath attribute tests use '@', not '#'.)
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//table[@class="zebraTable zebraTable--companies"]//td[1]')))
    elementsSize = len(driver.find_elements_by_xpath('//table[@class="zebraTable zebraTable--companies"]//td[1]'))
    # To iterate over the company list and click on the company name then
    # capture the address on the navigated page, come back to the previous
    # page and repeat the same.
    for i in range(elementsSize):
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//table[@class="zebraTable zebraTable--companies"]//td[1]')))
        elements = driver.find_elements_by_xpath('//table[@class="zebraTable zebraTable--companies"]//td[1]/a')
        company_name = elements[i].text
        elements[i].click()  # click on the company name link
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="contactInformation"]//div[@class="companyContactBox"]')))  # get the contact data from the company chosen before
        contact_data = driver.execute_script("return document.getElementsByClassName('companyContactBox')[0].innerText")
        # print(contact_data)
        company_list.append(contact_data)
        driver.back()  # navigate to previous site
    time.sleep(5)
    # Advance to the next results page.
    driver.find_element_by_xpath("//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/div[2]/div/button[2]").click()

company_list = [w.replace('\n', ', ') for w in company_list]
print(company_list)
df_company_name = pd.DataFrame(company_list, columns=['Name'])
df_company_name.to_excel("company_name.xlsx")
Actually I want to get the values from here. Getting the product hyperlinks is working fine. I want to get the product information, price etc. from the above links in the same for loop. How do I put the result data into a CSV file? Please help me.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
import time

# Open the Flipkart mobiles category, scroll to the bottom, and print the
# href of every product card link on the page.

chrome_path = r"C:\Users\Venkatesh\AppData\Local\Programs\Python\Python35\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)
driver.get("https://www.flipkart.com/mobiles")

# Click the first category tile. (BUG FIX: '@id', not '#id', in XPath.)
search = driver.find_element_by_xpath("""//*[@id="container"]/div/div[2]/div/div[2]/div/div/div[1]/section/div[3]/div/div/a""").click()

delay = 20  # seconds to wait for the product grid
try:
    WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.XPATH, "//*[@id='container']/div/div[2]/div[2]/div/div[2]/div/div[3]/div[1]/div/div[1]/a/div[2]/div[1]/div[1]")))
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    print("Page is ready")
except TimeoutException:
    print("Loading took too much time")

time.sleep(10)
# Each product card anchor carries the class _1UoZlX; print its link.
for post in driver.find_elements_by_class_name("_1UoZlX"):
    print(post.get_attribute("href"))
time.sleep(2)
driver.quit()
Output:
Page is ready
https://www.flipkart.com/moto-g5-plus-fine-gold-32-gb/p/itmes2zjvwfncxxr?pid=MOBEQHMGED7F9CZ2&srno=b_1_1&otracker=browse
&lid=LSTMOBEQHMGED7F9CZ2KHTBI8
https://www.flipkart.com/moto-g5-plus-lunar-grey-32-gb/p/itmes2zjvwfncxxr?pid=MOBEQHMGMAUXS5BF&srno=b_1_2&otracker=brows
e&lid=LSTMOBEQHMGMAUXS5BFVCF0ZO
https://www.flipkart.com/moto-e3-power-black-16-gb/p/itmekgt2fbywqgcv?pid=MOBEKGT2HGDGADFW&srno=b_1_3&otracker=browse&li
d=LSTMOBEKGT2HGDGADFWP5NHBY
https://www.flipkart.com/micromax-bolt-q381-plus-coffee-16-gb/p/itmeskgycnfghsat?pid=MOBESAMDG2GNUBC5&srno=b_1_4&otracke
r=browse&lid=LSTMOBESAMDG2GNUBC5KRPH8Q
https://www.flipkart.com/lenovo-k6-power-grey-dark-grey-32-gb/p/itmezenfhm4mvptw?pid=MOBEZENFZBPW8UMF&srno=b_1_5&otracke
r=browse&lid=LSTMOBEZENFZBPW8UMF7P8NY0
https://www.flipkart.com/lenovo-k6-power-gold-32-gb/p/itmezenfhm4mvptw?pid=MOBEZEMYH7FQBGBQ&srno=b_1_6&otracker=browse&l
id=LSTMOBEZEMYH7FQBGBQRHVU0S
https://www.flipkart.com/lenovo-k6-power-silver-32-gb/p/itmezenfhm4mvptw?pid=MOBEZEMX6CZHCJVY&srno=b_1_7&otracker=browse
&lid=LSTMOBEZEMX6CZHCJVYOIBM0E
https://www.flipkart.com/lenovo-vibe-k5-note-grey-64-gb/p/itmepcfqfdx9bdxs?pid=MOBEPCFQRJ6KFYZS&srno=b_1_8&otracker=brow
se&lid=LSTMOBEPCFQRJ6KFYZSI4DRRB
https://www.flipkart.com/lenovo-vibe-k5-note-gold-64-gb/p/itmepcfqfdx9bdxs?pid=MOBEPCFQ3ZSYTRUZ&srno=b_1_9&otracker=brow
se&lid=LSTMOBEPCFQ3ZSYTRUZGFSZCU
https://www.flipkart.com/samsung-galaxy-nxt-gold-32-gb/p/itmemzd4gepexjya?pid=MOBEMZD4KHRF5VZX&srno=b_1_10&otracker=brow
se&lid=LSTMOBEMZD4KHRF5VZX7FNU5S
https://www.flipkart.com/moto-e3-power-white-16-gb/p/itmekgt23fgwdgkg?pid=MOBEKGT2SVHPAHTM&srno=b_1_11&otracker=browse&l
id=LSTMOBEKGT2SVHPAHTMJA8RQ1
https://www.flipkart.com/lenovo-k6-power-silver-32-gb/p/itmezenfghddrfmc?pid=MOBEZENFKXZ4HSCG&srno=b_1_12&otracker=brows
e&lid=LSTMOBEZENFKXZ4HSCGC1OOAM
https://www.flipkart.com/lenovo-k6-power-gold-32-gb/p/itmezenfghddrfmc?pid=MOBEZENFSZGTQGWF&srno=b_1_13&otracker=browse&
lid=LSTMOBEZENFSZGTQGWFUR1LY1
https://www.flipkart.com/lenovo-k6-power-dark-gray-32-gb/p/itmezenfghddrfmc?pid=MOBEZENFG8BPDPSU&srno=b_1_14&otracker=br
owse&lid=LSTMOBEZENFG8BPDPSUUANLO6
https://www.flipkart.com/lava-arc-blue/p/itmezgyfszhmwfzt?pid=MOBEF6D24ZT6YHFJ&srno=b_1_15&otracker=browse&lid=LSTMOBEF6
D24ZT6YHFJZ6N7XC
https://www.flipkart.com/lenovo-vibe-k5-plus-3-gb-silver-16-gb/p/itmektn3t9rg9hnn?pid=MOBEKEF8ATFZZ8GN&srno=b_1_16&otrac
ker=browse&lid=LSTMOBEKEF8ATFZZ8GNY7WZBU
https://www.flipkart.com/lenovo-vibe-k5-plus-3-gb-gold-16-gb/p/itmektn3t9rg9hnn?pid=MOBEKEF8JYGKZCTF&srno=b_1_17&otracke
r=browse&lid=LSTMOBEKEF8JYGKZCTFUTCYS4
https://www.flipkart.com/lenovo-vibe-k5-plus-3-gb-dark-grey-16-gb/p/itmektn3t9rg9hnn?pid=MOBEKEF86VVUE8G2&srno=b_1_18&ot
racker=browse&lid=LSTMOBEKEF86VVUE8G2YCW5OP
https://www.flipkart.com/samsung-galaxy-nxt-black-32-gb/p/itmemzd4byrufyu7?pid=MOBEMZD4G83T5HKZ&srno=b_1_19&otracker=bro
wse&lid=LSTMOBEMZD4G83T5HKZVMFKK6
https://www.flipkart.com/samsung-galaxy-on8-gold-16-gb/p/itmemvarkqg5dyay?pid=MOBEMJR2NDM4EAHQ&srno=b_1_20&otracker=brow
se&lid=LSTMOBEMJR2NDM4EAHQ8BMJIN
https://www.flipkart.com/samsung-galaxy-on7-black-8-gb/p/itmedhx3jgmu2gps?pid=MOBECCA5SMRSKCNY&srno=b_1_21&otracker=brow
se&lid=LSTMOBECCA5SMRSKCNYWC8DYC
https://www.flipkart.com/samsung-galaxy-on7-gold-8-gb/p/itmedhx3jgmu2gps?pid=MOBECCA5Y5HBYR3Q&srno=b_1_22&otracker=brows
e&lid=LSTMOBECCA5Y5HBYR3QPDPGLJ
https://www.flipkart.com/samsung-galaxy-on5-gold-8-gb/p/itmedhx3uy3qsfks?pid=MOBECCA5FHQD43KA&srno=b_1_23&otracker=brows
e&lid=LSTMOBECCA5FHQD43KAFXOZYB
https://www.flipkart.com/lenovo-p2-gold-32-gb/p/itmeq5ygvgq9vyfn?pid=MOBEZFHHURMWYSFN&srno=b_1_24&otracker=browse&lid=LS
TMOBEZFHHURMWYSFNBBG6L0
https://www.flipkart.com/asus-zenfone-max-black-32-gb/p/itmege3d5pjpmknc?pid=MOBEGE3DYZM3ZYWB&srno=b_1_25&otracker=brows
e&lid=LSTMOBEGE3DYZM3ZYWBPCOZHP
https://www.flipkart.com/lenovo-vibe-k5-note-grey-32-gb/p/itmejj6kmhh2khk9?pid=MOBEJJ6KYARZGWJC&srno=b_1_26&otracker=bro
wse&lid=LSTMOBEJJ6KYARZGWJCCV4LRX
https://www.flipkart.com/swipe-elite-sense-4g-volte/p/itmeh6yfycypxfdz?pid=MOBEH6YFZYZZNCZK&srno=b_1_27&otracker=browse&
lid=LSTMOBEH6YFZYZZNCZKWVY6ES
https://www.flipkart.com/swipe-elite-sense-4g-volte/p/itmeh6yfycypxfdz?pid=MOBEH6YFZRTEMDBG&srno=b_1_28&otracker=browse&
lid=LSTMOBEH6YFZRTEMDBGYJNCJI
https://www.flipkart.com/xolo-era-1x-4g-volte-black-gun-metal-8-gb/p/itmerhq8uhtehukg?pid=MOBEHMEKGCZCGMB8&srno=b_1_29&o
tracker=browse&lid=LSTMOBEHMEKGCZCGMB8DCWHIY
https://www.flipkart.com/swipe-konnect-grand-black-8-gb/p/itmeqcgxvkyfzsgj?pid=MOBEQCGXN6HTZE2C&srno=b_1_30&otracker=bro
wse&lid=LSTMOBEQCGXN6HTZE2CXUT5W1
https://www.flipkart.com/lenovo-vibe-k5-note-gold-32-gb/p/itmejj6kczvxej4g?pid=MOBEJJ6K5A3GQ9SU&srno=b_1_31&otracker=bro
wse&lid=LSTMOBEJJ6K5A3GQ9SUZERSAR
https://www.flipkart.com/lyf-water-f1-black-32-gb/p/itmezh76z9jqsa8z?pid=MOBEZH76AFWSZVNH&srno=b_1_32&otracker=browse&li
d=LSTMOBEZH76AFWSZVNHOOBURN
https://www.flipkart.com/samsung-galaxy-j5-6-new-2016-edition-black-16-gb/p/itmegmrnzqjcpfg9?pid=MOBEG4XWHJDWMQDF&srno=b
_1_33&otracker=browse&lid=LSTMOBEG4XWHJDWMQDFZIWO93
https://www.flipkart.com/samsung-galaxy-j5-6-new-2016-edition-white-16-gb/p/itmegmrnzqjcpfg9?pid=MOBEG4XWJG7F9A6Z&srno=b
_1_34&otracker=browse&lid=LSTMOBEG4XWJG7F9A6ZHJOVBG
https://www.flipkart.com/samsung-galaxy-j5-6-new-2016-edition-gold-16-gb/p/itmegmrnzqjcpfg9?pid=MOBEG4XWFTBRMMBY&srno=b_
1_35&otracker=browse&lid=LSTMOBEG4XWFTBRMMBYZPYEGS
https://www.flipkart.com/moto-m-grey-64-gb/p/itmenqavgcezzk2y?pid=MOBENQATHQTKG7AV&srno=b_1_36&otracker=browse&lid=LSTMO
BENQATHQTKG7AVGFQI4N
https://www.flipkart.com/moto-m-gold-64-gb/p/itmenqavgcezzk2y?pid=MOBENQAVANRMEGAP&srno=b_1_37&otracker=browse&lid=LSTMO
BENQAVANRMEGAPHWU47I
https://www.flipkart.com/moto-m-silver-64-gb/p/itmenqavgcezzk2y?pid=MOBENQAVFTG6FPXX&srno=b_1_38&otracker=browse&lid=LST
MOBENQAVFTG6FPXXHZBIGV
https://www.flipkart.com/apple-iphone-6-silver-16-gb/p/itme8dvfeuxxbm4r?pid=MOBEYHZ2NUZGCHKN&srno=b_1_39&otracker=browse
&lid=LSTMOBEYHZ2NUZGCHKN7PMDIN
https://www.flipkart.com/samsung-galaxy-on8-black-16-gb/p/itmemvarprh8hegn?pid=MOBEMJRFZXZBESQW&srno=b_1_40&otracker=bro
wse&lid=LSTMOBEMJRFZXZBESQWCFHWJ0
https://www.flipkart.com/panasonic-eluga-tapp-silver-grey-16-gb/p/itmezf54ey3gf8ne?pid=MOBENRHGWZWKEGGF&srno=b_1_41&otra
cker=browse&lid=LSTMOBENRHGWZWKEGGFMJELY2
https://www.flipkart.com/panasonic-eluga-tapp-champagne-gold-16-gb/p/itmezf54ey3gf8ne?pid=MOBENRHGEQEJHSZM&srno=b_1_42&o
tracker=browse&lid=LSTMOBENRHGEQEJHSZMD8R5FE
https://www.flipkart.com/apple-iphone-6s-rose-gold-32-gb/p/itmen2yymnfcrxsz?pid=MOBEN2XYK8WFEGM8&srno=b_1_43&otracker=br
owse&lid=LSTMOBEN2XYK8WFEGM8QJW5XA
https://www.flipkart.com/lenovo-p2-grey-graphite-grey-32-gb/p/itmeq5ygvgq9vyfn?pid=MOBEZFHH2JYGXSNF&srno=b_1_44&otracker
=browse&lid=LSTMOBEZFHH2JYGXSNFNWKEAD
https://www.flipkart.com/forme-n1/p/itmeff8s2hdrfhyg?pid=MOBEFF8SHZPYKCRY&srno=b_1_45&otracker=browse&lid=LSTMOBEFF8SHZP
YKCRYEKQPPR
https://www.flipkart.com/forme-n1/p/itmeff8s2hdrfhyg?pid=MOBEFF8SSZNHCUND&srno=b_1_46&otracker=browse&lid=LSTMOBEFF8SSZN
HCUNDRC6GLT
https://www.flipkart.com/samsung-galaxy-on5-black-8-gb/p/itmekszmsqgpgygy?pid=MOBECCA5BJUVUGNP&srno=b_1_47&otracker=brow
se&lid=LSTMOBECCA5BJUVUGNPRKEGMG
https://www.flipkart.com/lenovo-p2-grey-graphite-grey-32-gb/p/itmeq5ygebzgqgfb?pid=MOBEZFHHVD8KXE7G&srno=b_1_48&otracker
=browse&lid=LSTMOBEZFHHVD8KXE7GB0OS6I
https://www.flipkart.com/lenovo-p2-gold-32-gb/p/itmeq5ygebzgqgfb?pid=MOBEZFHHGE2RXQUY&srno=b_1_49&otracker=browse&lid=LS
TMOBEZFHHGE2RXQUY2XDB97
https://www.flipkart.com/samsung-galaxy-j7-gold-16-gb/p/itmeafbfjhsydbpw?pid=MOBE93GWSMGZHFSK&srno=b_1_50&otracker=brows
e&lid=LSTMOBE93GWSMGZHFSKT6OZOB
https://www.flipkart.com/samsung-z2-gold-8-gb/p/itmenkygvprd5dwt?pid=MOBENKYGHFUHT6BH&srno=b_1_51&otracker=browse&lid=LS
TMOBENKYGHFUHT6BHVSHMDE
https://www.flipkart.com/leeco-le-2-grey-32-gb/p/itmejeucxaxmnk8k?pid=MOBEJFTH4C9Z2YZR&srno=b_1_52&otracker=browse&lid=L
STMOBEJFTH4C9Z2YZRVVL0EL
https://www.flipkart.com/lyf-water-10-black-16-gb/p/itmemj7d8qfkfu4r?pid=MOBEMJ7C7YMDMVDQ&srno=b_1_53&otracker=browse&li
d=LSTMOBEMJ7C7YMDMVDQPCFALX
https://www.flipkart.com/micromax-canvas-nitro-2-grey-silver-16-gb/p/itme7nhzw56hv2ga?pid=MOBE7NHZP7GHZ7SG&srno=b_1_54&o
tracker=browse&lid=LSTMOBE7NHZP7GHZ7SGCYGNI3
https://www.flipkart.com/moto-g-turbo-white-16-gb/p/itmecc4uhbue7ve6?pid=MOBECC4UQTJ5QZFR&srno=b_1_55&otracker=browse&li
d=LSTMOBECC4UQTJ5QZFR9CAUPO
https://www.flipkart.com/moto-g-turbo-black-16-gb/p/itmecc4uhbue7ve6?pid=MOBECC4UZTSGKWWZ&srno=b_1_56&otracker=browse&li
d=LSTMOBECC4UZTSGKWWZOQKAIZ
https://www.flipkart.com/apple-iphone-6-space-grey-16-gb/p/itme8dvfeuxxbm4r?pid=MOBEYHZ2YAXZMF2J&srno=b_1_57&otracker=br
owse&lid=LSTMOBEYHZ2YAXZMF2JEVWVNC
https://www.flipkart.com/yu-yunicorn-rush-silver-32-gb/p/itmenffyjfp8ubyg?pid=MOBEJ3MFUQAF8XJS&srno=b_1_58&otracker=brow
se&lid=LSTMOBEJ3MFUQAF8XJSBPC8L4
https://www.flipkart.com/yu-yunicorn-gold-rush-32-gb/p/itmenffyjfp8ubyg?pid=MOBEJ3MF23Q9MGMH&srno=b_1_59&otracker=browse
&lid=LSTMOBEJ3MF23Q9MGMHZ49MG2
https://www.flipkart.com/micromax-canvas-nitro-2-white-gold-16-gb/p/itme7nhzw56hv2ga?pid=MOBE8TJBHGQYHNPT&srno=b_1_60&ot
racker=browse&lid=LSTMOBE8TJBHGQYHNPTVL3HS0
I used openpyxl to create an output file for each run, named with the filename plus a timestamp. The fetched links are eventually written to that file.
I couldn't find the exact links that were given, so I chose my own links, which are a similar case. This code uses different links per se, but the same solution scales to your case, @venkatesh.
One more thing: try to keep XPaths as relative as possible, and avoid classes with gibberish names such as _13oc-S — they do not hold up well, since they tend to change dynamically on each DOM refresh and in each browser instance.
"""Collect product links from a Flipkart search page into a workbook.

Fixes relative to the original paste:
- openpyxl always writes xlsx content, so the output is named .xlsx; the
  original saved xlsx bytes under a .csv extension, producing a mislabeled
  file that spreadsheet apps warn about.
- The manual row counter is replaced with enumerate().
"""
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import openpyxl

current_time = time.strftime('%Y%m%d%H%M%S')
# One output file per run, stamped with the start time.
xlpath = "linktracker" + current_time + ".xlsx"

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://www.flipkart.com/mobiles")
driver.maximize_window()

# Searches for a certain brand of phones (POCO). Inefficient locator, but
# it works for now.
driver.find_element(By.XPATH, "(//*[@alt='Shop Now'])[2]").click()
time.sleep(10)  # bad practice, but used for now. WebDriverWait would be better.

# Locates each desired element on the search page (each phone block).
each_element = "//a[@rel='noopener noreferrer']"
posts = driver.find_elements(By.XPATH, each_element)
print(len(posts))

links = []
# len-1 because the last match is a footer link, not a product link.
for post in range(len(posts) - 1):
    # Concatenates a positional subscript onto the element XPath:
    # (//*[@element='ele'])[1] ... (//*[@element='ele'])[n]
    each_post = driver.find_element(
        By.XPATH, '(' + each_element + ')' + '[' + str(post + 1) + ']')
    links.append(each_post.get_attribute("href"))

# Write the collected links into column 1, one per row.
wb = openpyxl.Workbook()
sheet = wb.active
for row, link in enumerate(links, start=1):
    sheet.cell(row=row, column=1).value = link
wb.save(xlpath)  # save under the timestamped name chosen above

driver.quit()
Result in csv - image