So I am working on a custom web scraper for e-commerce sites. I want it to scrape the names and prices of the listings on a site and export them to CSV, but it exports only one (name, price) pair and repeats it on every line of the CSV. I couldn't find a good solution for this; I hope I'm not asking something extremely stupid, although I suspect the fix is easy. I hope someone will read my code and help me, thank you!
###imports
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import pandas as pd
#driver path
driver = webdriver.Firefox(executable_path=r"D:\Programy\geckoDriver\geckodriver.exe")
#init + search
driver.get("https://pc.bazos.sk/pc/")
time.sleep(1)
nazov = driver.find_element_by_name("hledat")
nazov.send_keys("xeon")
cenamin = driver.find_element_by_name("cenaod")
cenamin.send_keys("")
cenamax = driver.find_element_by_name("cenado")
cenamax.send_keys("300")
driver.find_element_by_name("Submit").click()
##cookie acceptor
driver.find_element_by_xpath("/html/body/div[1]/button").click()
##main
x = 3
for i in range(x):
    try:
        main = WebDriverWait(driver, 7).until(
            EC.presence_of_element_located((By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]"))
        )
        ##find listings in table
        inzeraty = main.find_elements_by_class_name("vypis")
        for vypis in inzeraty:
            nadpis = vypis.find_element_by_class_name("nadpis")
            ##print listings to check correctness
            nadpist = nadpis.text
            print(nadpist)
        ##find the price and print
        for vypis in inzeraty:
            cena = vypis.find_element_by_class_name("cena")
            cenat = cena.text
            print(cenat)
        ##export to csv - not working
        time.sleep(1)
        print("Writing to csv")
        d = {"Nazov": [nadpist]*20*x,"Cena": [cenat]*20*x}
        df = pd.DataFrame(data=d)
        df.to_csv("bobo.csv")
        time.sleep(1)
        print("Writing to csv done !")
        ##next page
        dalsia = driver.find_element_by_link_text("Ďalšia")
        dalsia.click()
    except:
        driver.quit()
I want the CSV to look like:
name,price
name2,price2
It would be great if the CSV had only two columns and as many rows as there are listings.
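(The answers below converge on collecting one row per listing and writing the file once after the loop; a minimal sketch of that pattern, reusing the selectors and the inzeraty loop from the code above:)

import csv

rows = []                                   # one (name, price) tuple per listing
for vypis in inzeraty:                      # `inzeraty` as found in the code above
    nazov = vypis.find_element_by_class_name("nadpis").text
    cena = vypis.find_element_by_class_name("cena").text
    rows.append((nazov, cena))

with open("bobo.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Nazov", "Cena"])      # header
    writer.writerows(rows)                  # one CSV line per listing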
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
#driver path
driver = webdriver.Chrome()
#init + search
driver.get("https://pc.bazos.sk/pc/")
time.sleep(1)
nazov = driver.find_element_by_name("hledat")
nazov.send_keys("xeon")
cenamin = driver.find_element_by_name("cenaod")
cenamin.send_keys("")
cenamax = driver.find_element_by_name("cenado")
cenamax.send_keys("300")
driver.find_element_by_name("Submit").click()
##cookie acceptor
time.sleep(10)
driver.find_element_by_xpath("/html/body/div[1]/button").click()
##main
x = 3
d = []
for i in range(x):
    try:
        main = WebDriverWait(driver, 7).until(
            EC.presence_of_element_located(
                (By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]")))
        ##find listings in table
        inzeraty = main.find_elements_by_class_name("vypis")
        for vypis in inzeraty:
            d.append({"Nazov": vypis.find_element_by_class_name("nadpis").text,
                      "Cena": vypis.find_element_by_class_name("cena").text
                      })
        ##next page
        dalsia = driver.find_element_by_link_text("Ďalšia")
        dalsia.click()
    except:
        driver.quit()
time.sleep(1)
print("Writing to csv")
df = pd.DataFrame(data=d)
df.to_csv("bobo.csv",index=False)
This gives me 59 items with prices: each listing is first put into a dict, the dicts are appended to a list, and the list is then passed to pandas.
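The same pattern in isolation, with made-up values just to show the shape of the data pandas receives:

import pandas as pd

# Hypothetical values, only to illustrate the list-of-dicts shape.
d = [
    {"Nazov": "Xeon E5-2670", "Cena": "45 €"},
    {"Nazov": "Xeon workstation", "Cena": "250 €"},
]
df = pd.DataFrame(data=d)       # one row per dict, columns taken from the keys
df.to_csv("bobo.csv", index=False)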
All you need to do is create two empty lists, nadpist_l and cenat_l, append the data to those lists, and finally save the lists as a DataFrame.
UPDATED as per the comment
Check if this works
###imports
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
#driver path
driver = webdriver.Chrome()
#init + search
driver.get("https://pc.bazos.sk/pc/")
time.sleep(1)
nazov = driver.find_element_by_name("hledat")
nazov.send_keys("xeon")
cenamin = driver.find_element_by_name("cenaod")
cenamin.send_keys("")
cenamax = driver.find_element_by_name("cenado")
cenamax.send_keys("300")
driver.find_element_by_name("Submit").click()
##cookie acceptor
time.sleep(10)
driver.find_element_by_xpath("/html/body/div[1]/button").click()
##main
x = 3
d = {}
nadpist_l = []
cenat_l = []
for i in range(x):
    try:
        main = WebDriverWait(driver, 7).until(
            EC.presence_of_element_located(
                (By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]")))
        ##find listings in table
        inzeraty = main.find_elements_by_class_name("vypis")
        for vypis in inzeraty:
            nadpis = vypis.find_element_by_class_name("nadpis")
            ##collect the listing titles
            nadpist = nadpis.text
            nadpist_l.append(nadpist)
            # print(nadpist)
        ##find the prices
        for vypis in inzeraty:
            cena = vypis.find_element_by_class_name("cena")
            cenat = cena.text
            cenat_l.append(cenat)
        print(len(cenat_l))
        ##export to csv - one column per list, one row per listing
        d.update({"Nazov": nadpist_l, "Cena": cenat_l})
        ##next page
        dalsia = driver.find_element_by_link_text("Ďalšia")
        dalsia.click()
    except:
        driver.quit()
time.sleep(1)
print("Writing to csv")
df = pd.DataFrame(data=d)
df.to_csv("bobo.csv")
time.sleep(1)
print("Writing to csv done !")
Related
I'm quite new to Python and have written a script using Selenium to scrape a website. I've tried everything but can't get the loop to cycle through the pages; it currently just repeats the data from the first page 5 times. I want to scrape all the pages for 'BR1'. Any help would be great.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
with open('rightmove.csv', 'w') as file:
    file.write('PropertyCardcontent \n')

PATH = ("/usr/local/bin/chromedriver")
driver = webdriver.Chrome(PATH)
driver.get("https://www.rightmove.co.uk/house-prices.html")
print(driver.title)

elem = driver.find_element(By.NAME, 'searchLocation')  # Find the search box
elem.send_keys('BR1' + Keys.RETURN)

try:
    content = WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.ID, 'content'))
    )
finally:
    time.sleep(3)

for p in range(5):
    sold = content.find_elements(By.CLASS_NAME, 'sold-prices-content-wrapper ')
    for solds in sold:
        address = solds.find_elements(By.CLASS_NAME, 'sold-prices-content ')
        for addresses in address:
            result = addresses.find_elements(By.CLASS_NAME, 'results ')
            for results in result:
                card = results.find_elements(By.CLASS_NAME, 'propertyCard')
                for propertyCard in card:
                    header = propertyCard.find_elements(By.CLASS_NAME, 'propertyCard-content')
                    for propertyCardcontent in header:
                        road = propertyCardcontent.find_elements(By.CLASS_NAME, 'title')
                    for propertyCardcontent in header:
                        road = propertyCardcontent.find_elements(By.CLASS_NAME, 'subTitle')
                        for subtitle in road:
                            bed = subtitle.find_elements(By.CLASS_NAME, 'propertyType')
    with open('rightmove.csv', 'a') as file:
        for i in range(len(result)):
            file.write(header[i].text + '\n')
    button = driver.find_element(By.XPATH, '//*[@id="content"]/div[2]/div[2]/div[4]/div[27]/div[3]/div')
    button.click()
    file.close()
    time.sleep(3)

driver.quit()
Since the website link has the page number in it, I recommend you set the base URL to "https://www.rightmove.co.uk/house-prices/br1.html?page=1" and loop through the pages while changing the last index of the URL, for example with an f-string or str.format.
One other thing: you don't need all those nested for loops. You can simply assign each variable its specific value, since everything you need is inside one HTML block that is easy to navigate.
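For instance, a minimal sketch of the URL-based paging (assuming pages has already been read from the pagination label, as in the full code below):

# Sketch: walk the result pages by editing the URL instead of clicking "Next".
base_url = "https://www.rightmove.co.uk/house-prices/br1.html?page={}"
for p in range(1, pages + 1):
    driver.get(base_url.format(p))
    # ... collect the propertyCard data for this page here ...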
Update:
I'm sorry for being late, had unexpected stuff(...).
I've made some changes as I use Brave, so make sure you select your browser (Chrome, I believe); the chromedriver (ver. 102) stays the same (or whichever matches your Chrome version).
I've also grabbed the Price and Date and stored them in a tuple.
Every record is stored as a list [Title, propertyType, tuple_of(Price_Date)].
At the end, it creates a CSV and stores everything in it with ";" as the delimiter.
You can, if you prefer, split the price and date for later use; up to you.
Note: this looping method only applies to websites where the page number is included in the URL. In this case, both the search key and the page number are part of the URL.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import time
import random
import itertools
options = Options()
options.binary_location = r'C:\Program Files\BraveSoftware\Brave-Browser\Application\brave.exe'
driver = webdriver.Chrome(options = options, service = Service("chromedriver.exe"))
key_word = "BR1".lower()
base_url = f"https://www.rightmove.co.uk/house-prices/{key_word}.html?page=1"
driver.get(base_url)
#Number of pages
pages = driver.find_element(By.XPATH, '//span[@class="pagination-label"][2]').text
pages = int(pages.strip('of'))

WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'results '))
)

data = []
pc = 0
for p in range(1, pages + 1):
    driver.get(f"https://www.rightmove.co.uk/house-prices/{key_word}.html?page={p}")
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//div//div[@class="propertyCard"]'))
    )
    propertyCards = driver.find_elements(By.XPATH, '//div//div[@class="propertyCard"]')
    for propertyCard in propertyCards:
        title = propertyCard.find_element(By.CLASS_NAME, 'title').text
        propertyType = propertyCard.find_element(By.CLASS_NAME, 'propertyType').text
        price_list = propertyCard.find_elements(By.CLASS_NAME, 'price')
        date_list = propertyCard.find_elements(By.CLASS_NAME, 'date-sold')
        data.append([title, propertyType])
        for p, d in itertools.zip_longest(price_list, date_list, fillvalue=None):
            try:
                price = p.text
                date = d.text
                data[pc].append((price, date))
            except Exception as e:
                print(e)
        pc += 1
    time.sleep(random.randint(1, 4))

print(data)
with open('rightmove.csv', 'w') as file:
    header = "Title;propertyType;Price_Date\n"
    file.write(header)
    for record in data:
        file.write("{};{};{}\n".format(record[0], record[1], record[2:]))

driver.quit()
You don't have to go down the DOM element by element; you can just use an XPath or a class name (if it's unique, otherwise an XPath or CSS selector is better) and get the item you are looking for.
Anyway follow this:
import time
import selenium.webdriver as webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome("/usr/local/bin/chromedriver")
driver.get("https://www.rightmove.co.uk/house-prices.html")
# send query
query = "BR1"
search_bar = driver.find_element(By.XPATH, '//input[@class="searchBox ac_input"]')
search_bar.send_keys(query)
search_bar.send_keys(Keys.ENTER)

# wait for the results to load
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'propertyCard'))
)

#get amount of pages
pages = driver.find_element(By.XPATH, '//span[@class="pagination-label"][2]').text
pages = int(pages.replace('of ', ''))

data = []
i = 1
while i <= pages:
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//div[contains(text(), "Next")]'))
    ).click()
    # wait for the page to load its results
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//div//div[@class="propertyCard"]'))
    )
    propertyCards = driver.find_elements(By.XPATH, '//div//div[@class="propertyCard"]')
    # loop over the results and store the data
    for propertyCard in propertyCards:
        title = propertyCard.find_element(By.CLASS_NAME, 'title').text
        propertyType = propertyCard.find_element(By.CLASS_NAME, 'propertyType').text
        data.append((title, propertyType))
    time.sleep(1)
    i += 1

print("you reached the last page")
#get number of results
print(data)
driver.close()
I used a list of tuples because in your example you want to store 2 items; if you want to store more data, you can use a dict and then convert it to CSV directly with csv.DictWriter. Enjoy.
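For example, a minimal DictWriter sketch (records and field names are hypothetical, just to show the shape):

import csv

# Hypothetical records, one dict per scraped card.
data = [
    {"title": "3 Example Road, Bromley", "propertyType": "Terraced"},
    {"title": "5 Example Road, Bromley", "propertyType": "Flat"},
]
with open("rightmove.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["title", "propertyType"])
    writer.writeheader()
    writer.writerows(data)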
I am trying to scrape each product page from this website: https://www.aliexpress.com/wholesale?catId=0&initiative_id=SB_20220315022920&SearchText=bluetooth+earphones
In particular, I want to get the comments and customer countries, as shown in the photo.
The main issue is that my code does not target the right elements, and that is what I am struggling with.
First, I tried my scraping on this product : https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000027213624098%22%7D&pdp_pi=-1%3B40.81%3B-1%3B-1%40salePrice%3BMAD%3Bsearch-mainSearch
Here is my code :
from selenium import webdriver
from selenium.webdriver.common.by import By
from lxml import html
import cssselect
from time import sleep
from itertools import zip_longest
import csv
driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")
url = "https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000027213624098%22%7D&pdp_pi=-1%3B40.81%3B-1%3B-1%40salePrice%3BMAD%3Bsearch-mainSearch"
with open ("data.csv", "w", encoding="utf-8") as csvfile:
    wr = csv.writer(csvfile)
    wr.writerow(["Comment", "Custumer country"])

driver.get(url)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')

review_buttom = driver.find_element_by_xpath('//li[@ae_button_type="tab_feedback"]')
review_buttom.click()

html_source = driver.find_element_by_xpath('//div[@id="transction-feedback"]')
tree = html.fromstring(html_source)
#tree = html.fromstring(driver.page_source)

for rvw in tree.xpath('//div[@class="feedback-item clearfix"]'):
    country = rvw.xpath('//div[@class="user-country"]//b/text()')
    if country:
        country = country[0]
    else:
        country = ''
    print('country:', country)
    comment = rvw.xpath('//dt[@id="buyer-feedback"]//span/text()')
    if comment:
        comment = comment[0]
    else:
        comment = ''
    print('comment:', comment)

driver.close()
Thank you !!
What happens?
There is one main issue: the feedback you are looking for is inside an iframe, so you won't get your information by calling the elements directly.
How to fix?
Scroll the element that holds the iframe into view, navigate to the iframe's src, and interact with its pagination to get all the feedback.
Example
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url = 'https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000027213624098%22%7D&pdp_pi=-1%3B40.81%3B-1%3B-1%40salePrice%3BMAD%3Bsearch-mainSearch'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
driver.get(url)
wait = WebDriverWait(driver, 10)
driver.execute_script("arguments[0].scrollIntoView();", wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.tab-content'))))
driver.get(wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#product-evaluation'))).get_attribute('src'))
data=[]
while True:
    for e in driver.find_elements(By.CSS_SELECTOR, 'div.feedback-item'):
        try:
            country = e.find_element(By.CSS_SELECTOR, '.user-country > b').text
        except:
            country = None
        try:
            comment = e.find_element(By.CSS_SELECTOR, '.buyer-feedback span').text
        except:
            comment = None
        data.append({
            'country': country,
            'comment': comment
        })
    try:
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#complex-pager a.ui-pagination-next'))).click()
    except:
        break

pd.DataFrame(data).to_csv('filename.csv', index=False)
I am extracting Google reviews of a restaurant. I am interested in extracting the reviewer name, the rating given by the reviewer, and the text of the review. I used the following code for the extraction:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
driver = webdriver.Chrome('')
base_url = 'https://www.google.com/search?tbs=lf:1,lf_ui:9&tbm=lcl&sxsrf=AOaemvJFjYToqQmQGGnZUovsXC1CObNK1g:1633336974491&q=10+famous+restaurants+in+Dunedin&rflfq=1&num=10&sa=X&ved=2ahUKEwiTsqaxrrDzAhXe4zgGHZPODcoQjGp6BAgKEGo&biw=1280&bih=557&dpr=2#lrd=0xa82eac0dc8bdbb4b:0x4fc9070ad0f2ac70,1,,,&rlfi=hd:;si:5749134142351780976,l,CiAxMCBmYW1vdXMgcmVzdGF1cmFudHMgaW4gRHVuZWRpbiJDUjEvZ2VvL3R5cGUvZXN0YWJsaXNobWVudF9wb2kvcG9wdWxhcl93aXRoX3RvdXJpc3Rz2gENCgcI5Q8QChgFEgIIFkiDlJ7y7YCAgAhaMhAAEAEQAhgCGAQiIDEwIGZhbW91cyByZXN0YXVyYW50cyBpbiBkdW5lZGluKgQIAxACkgESaXRhbGlhbl9yZXN0YXVyYW50mgEkQ2hkRFNVaE5NRzluUzBWSlEwRm5TVU56ZW5WaFVsOUJSUkFCqgEMEAEqCCIEZm9vZCgA,y,2qOYUvKQ1C8;mv:[[-45.8349553,170.6616387],[-45.9156414,170.4803685]]'
driver.get(base_url)
WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"//div[./span[text()='Newest']]"))).click()
total_reviews_text = driver.find_element_by_xpath("//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']").text
num_reviews = int (total_reviews_text.split()[0])
all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
time.sleep(2)
total_reviews = len(all_reviews)
while total_reviews < num_reviews:
    driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
    WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
    #all_reviews = driver.find_elements_by_css_selector('div.gws-localreviews__google-review')
    time.sleep(5)
    all_reviews = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
    print(total_reviews)
    total_reviews += 5

person_info = driver.find_elements_by_xpath("//div[@id='reviewSort']//div[contains(@class,'google-review')]")
rating_info = driver.find_elements_by_xpath("//div[@class='PuaHbe']")
review_text = driver.find_elements_by_xpath("//div[@class='Jtu6Td']")

for person in person_info:
    name = person.find_element_by_xpath("./div/div/div/a").text
    print(name)

for rating in rating_info:
    rating_txt = person.find_element_by_xpath("./g-review-stars/span").get_attribute('aria-label')
    print(rating_txt)

for text in review_text:
    texts = text.find_element_by_xpath("./span").text
    print(texts)
The above code worked as expected. I want to make a slight change: instead of using three loops to display the name, rating, and review text, I want to extract the same information using one loop. So I made the following changes to the above code:
reviews_info = driver.find_elements_by_xpath("//div[@class='jxjCjc']")

for review_info in reviews_info:
    name = review_info.find_element_by_xpath("./div/div/a").text
    rating = review_info.find_element_by_xpath("//div[@class='PuaHbe']//g-review-stars//span").get_attribute('aria-label')
    text = review_info.find_element_by_xpath("//div[@class='Jtu6Td']//span").text
    print(name)
    print(rating)
    print(text)
    print()
The problem with this change is that it displays the same information (i.e. rating and text) for every reviewer name. I am not sure where I am making the mistake. Any help to fix the issue would be really appreciated.
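(For context: an XPath that starts with // searches the whole document even when it is called on an element, so every iteration returns the first match on the page. Anchoring the path with a leading dot keeps the search relative to review_info; a sketch of that variant, reusing the question's selectors:)

for review_info in reviews_info:
    name = review_info.find_element_by_xpath("./div/div/a").text
    # The leading "." restricts the search to the current review element.
    rating = review_info.find_element_by_xpath(".//div[@class='PuaHbe']//g-review-stars//span").get_attribute('aria-label')
    text = review_info.find_element_by_xpath(".//div[@class='Jtu6Td']//span").text
    print(name, rating, text)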
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
driver = webdriver.Chrome()
base_url = 'https://www.google.com/search?tbs=lf:1,lf_ui:9&tbm=lcl&sxsrf=AOaemvJFjYToqQmQGGnZUovsXC1CObNK1g:1633336974491&q=10+famous+restaurants+in+Dunedin&rflfq=1&num=10&sa=X&ved=2ahUKEwiTsqaxrrDzAhXe4zgGHZPODcoQjGp6BAgKEGo&biw=1280&bih=557&dpr=2#lrd=0xa82eac0dc8bdbb4b:0x4fc9070ad0f2ac70,1,,,&rlfi=hd:;si:5749134142351780976,l,CiAxMCBmYW1vdXMgcmVzdGF1cmFudHMgaW4gRHVuZWRpbiJDUjEvZ2VvL3R5cGUvZXN0YWJsaXNobWVudF9wb2kvcG9wdWxhcl93aXRoX3RvdXJpc3Rz2gENCgcI5Q8QChgFEgIIFkiDlJ7y7YCAgAhaMhAAEAEQAhgCGAQiIDEwIGZhbW91cyByZXN0YXVyYW50cyBpbiBkdW5lZGluKgQIAxACkgESaXRhbGlhbl9yZXN0YXVyYW50mgEkQ2hkRFNVaE5NRzluUzBWSlEwRm5TVU56ZW5WaFVsOUJSUkFCqgEMEAEqCCIEZm9vZCgA,y,2qOYUvKQ1C8;mv:[[-45.8349553,170.6616387],[-45.9156414,170.4803685]]'
driver.get(base_url)
WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"//div[./span[text()='Newest']]"))).click()
total_reviews_text = driver.find_element_by_xpath("//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']").text
num_reviews = int (total_reviews_text.split()[0])
print("NUm reviews=", num_reviews)
all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
time.sleep(2)
total_reviews = len(all_reviews)
print("Total reviews=", total_reviews)
s = "(//div[#id='reviewSort']//div[contains(#class,'google-review')])[0]"
b = '0'
a = 1 # Index of Review button
for i in range(10):
    c = str(a)
    s = s.replace(b, c)  # Update the XPath's index on every loop so that it focuses on a new review each time.
    b = str(a)
    a = a + 1
    WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
    time.sleep(5)
    all_reviews = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
    total_reviews += 1
    Info = driver.find_element_by_xpath(s).text
    print(Info)
    print("<------------------------------------------------------>\n\n")
I am trying to use Selenium in Python to pull information from a city's public website, looping through a CSV in which each row is a different address, date, and city. Ideally I would download the associated PDF, but I am stuck on how to loop through the CSV using pandas. I have pasted the code I have so far.
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
df = pd.read_csv(r'C:\\Users\\xxx\\Desktop\\uof_ex.csv')
driver = webdriver.Chrome('C:\\Users\\xxx\\Desktop\\chromedriver_win32\\chromedriver.exe')
driver.get('https://p2c.highpointnc.gov/EventSearch')
wait = WebDriverWait(driver, 20)
wait_implicit = driver.implicitly_wait(5)
action = ActionChains(driver)
pop_element = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="disclaimerDialog"]/md-dialog-actions/button[2]'))).click()

i = 0
while i == 0:
    a = 0
    address = df.address
    city = df.City
    date = df.date_occu
    search_element = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="byReportInformation-card"]/md-card-title/md-card-title-text/span[1]'))).click()
    time.sleep(2)
    address_element = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="address-input"]')))
    address_element.click()
    address_element.clear()
    address_element.send_keys(address[a])
    time.sleep(2)
    city_element = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="city-select"]'))).click()
    city_element_choose = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="select_option_24"]'))).click()
    time.sleep(2)
    stdate_element = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="input_6"]'))).click()
    stdate_element_clear = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="input_6"]'))).clear()
    time.sleep(2)
    enddate_element = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="input_8"]'))).click()
    enddate_element_clear = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="input_8"]'))).clear()
    time.sleep(2)
    stdate_element_choose = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="input_6"]'))).send_keys(date[a])
    enddate_element_choose = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="input_8"]'))).send_keys(date[a])
    search_element = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="search-button"]'))).click()
    time.sleep(2)
    back_element = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="back-button"]'))).click()
    time.sleep(2)
    a = a + 1
You can loop over the rows of the pandas DataFrame, either as named tuples:
df = pd.read_csv(r'C:\\Users\\xxx\\Desktop\\uof_ex.csv')
for row in df.itertuples():
    address = row.address
    city = row.City
    date = row.date_occu
Now address contains the data from the currently iterated row, so you can use address directly instead of address[i] (see the usage sketch after the two variants below).
or as dictionaries
for index, row in df.iterrows():
    address = row['address']
    city = row['City']
    date = row['date_occu']
Read here for more options: https://www.dataindependent.com/pandas/pandas-iterate-over-rows/
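A short usage sketch tying either variant back to the question's form-filling steps (the element lookups would need to be repeated inside the loop; the names follow the question's code):

for row in df.itertuples():
    # Re-locate the form field for each CSV row, then fill it from the row.
    address_element = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="address-input"]')))
    address_element.clear()
    address_element.send_keys(row.address)
    # ... set the city and the start/end dates from row.City and row.date_occu,
    #     click search, save the PDF, then go back for the next row ...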
I am new to web scraping. I am trying to extract table data from the Forbes Top Multinational Performers list. I was able to extract some data successfully, but I only got the top 10 from the list; the table contains ads in between the rows. How can I get all the data?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
driver = webdriver.Chrome(r'C:/Users/Shirly.Ang3/Desktop/BUSINESS STAT/GGR/chromedriver_win32/chromedriver.exe')
url = "https://www.forbes.com/top-multinational-performers/list/"
driver.get(url)
wait_row = WebDriverWait(driver, 30)
rows = wait_row.until(EC.presence_of_all_elements_located((By.XPATH,
                      './/*[@id="the_list"]/tbody[@id="list-table-body"]')))

data = []
for row in rows:
    for i in row.find_elements_by_class_name("data"):
        try:
            if i.is_displayed():
                row_dict = {}
                row_dict['Rank'] = i.find_element_by_xpath('.//td[2]').text
                row_dict['Link'] = i.find_element_by_xpath('.//td[3]/a[@href]').get_attribute("href")
                row_dict['Company'] = i.find_element_by_xpath('.//td[3]').text
                row_dict['Industry'] = i.find_element_by_xpath('.//td[4]').text
                row_dict['Country'] = i.find_element_by_xpath('.//td[5]').text
                data.append(row_dict)
        except:
            continue
driver.close()
df = pd.DataFrame(data)
df.to_csv("Forbes_TEST.csv", sep=",", index=False)
To get all 250 records, you just need to add code that scrolls to the bottom of the page to your existing script. So add:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(5)
before:
data = []
and add import time to your imports.
That said, your code is really slow. Even with wait_row set to 3, it took 1m5.933s to run on my machine. The following code took 0m12.978s:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
import csv
driver = webdriver.Chrome(r'C:/Users/Shirly.Ang3/Desktop/BUSINESS STAT/GGR/chromedriver_win32/chromedriver.exe')
url = "https://www.forbes.com/top-multinational-performers/list/"
driver.get(url)
wait_row = WebDriverWait(driver, 3)
rows = wait_row.until(EC.presence_of_all_elements_located((By.XPATH, './/*[@id="the_list"]/tbody[@id="list-table-body"]')))
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(5)
ranks = []
links = []
companies = []
industries = []
countries = []
soup = BeautifulSoup(driver.page_source, "lxml")
table = soup.find("table", {"id": "the_list"})
for tr in table.find_all("tr", {"class": "data"}):
    tds = tr.find_all("td")
    ranks.append(tds[1].text)
    links.append(tds[2].find('a')['href'])
    companies.append(tds[2].text)
    industries.append(tds[3].text)
    countries.append(tds[4].text)

data = zip(ranks, links, companies, industries, countries)

with open('Forbes_TEST_02.csv', 'w') as csvfile:
    csv_out = csv.writer(csvfile)
    csv_out.writerow(['Rank', 'Link', 'Company', 'Industry', 'Country'])
    csv_out.writerows(data)
driver.close()