How to get Selenium data into Excel or CSV? - Python

Here is my full code. I want to get the output data into a CSV or Excel spreadsheet, with the title, price and every other field in its own column. My code goes to the details page of each product and collects the necessary information such as the product title, price, etc.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

# argument for incognito Chrome
option = Options()
option.add_argument("--incognito")

browser = webdriver.Chrome(options=option)
browser.get("https://www.daraz.com.bd/consumer-electronics/?spm=a2a0e.pdp.breadcrumb.1.4d20110bzkC0bn")

# Wait 20 seconds for page to load
timeout = 20
try:
    WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, "//div[@class='c16H9d']")))
except TimeoutException:
    print("Timed out waiting for page to load")
    browser.quit()

# getting link of each product
soup = BeautifulSoup(browser.page_source, "html.parser")
product_items = soup.find_all("div", attrs={"data-qa-locator": "product-item"})
for item in product_items:
    item_url = f"https:{item.find('a')['href']}"
    print(item_url)

    browser.get(item_url)

    # scrape details page information
    itm_soup = BeautifulSoup(browser.page_source, "html.parser")
    container_box = itm_soup.find_all("div", {"id": "container"})

    # Use the itm_soup to find details about the item from its url.
    for itm in container_box:
        product_title_element = itm.find("span", class_="pdp-mod-product-badge-title")
        product_title = product_title_element.get_text() if product_title_element else "No title"
        print(product_title)

browser.quit()
How will I get the product title into a CSV or Excel spreadsheet?

You can use the csv writer module to accomplish this.
from csv import writer

def AddToCSV(List):
    with open("Output.csv", "a+", newline='') as output_file:
        csv_writer = writer(output_file)
        csv_writer.writerow(List)

# this can be used within your for loop
row_list = [item_url, product_title, price]  # add any other columns you collect
AddToCSV(row_list)
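If you would rather end up with an Excel spreadsheet, another common approach is to collect every product into a list of dicts and write it out once at the end with pandas. A minimal sketch that plugs into the question's loop (it reuses product_items and browser from the code above, and assumes pandas plus openpyxl are installed for the .xlsx output):

import pandas as pd
from bs4 import BeautifulSoup

rows = []
for item in product_items:
    item_url = f"https:{item.find('a')['href']}"
    browser.get(item_url)
    itm_soup = BeautifulSoup(browser.page_source, "html.parser")
    title_el = itm_soup.find("span", class_="pdp-mod-product-badge-title")
    rows.append({
        "title": title_el.get_text() if title_el else "No title",
        "url": item_url,
    })

df = pd.DataFrame(rows)
df.to_csv("output.csv", index=False)      # CSV
df.to_excel("output.xlsx", index=False)   # Excel; needs openpyxl

Writing everything once at the end also avoids reopening the file for every row.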

Related

Write data in csv file but data are overwritten

I am trying to scrape the data, but it gets overwritten and the CSV file ends up containing the data from only one of the two pages. How can I fix this? I think the data is being overwritten because of the for loop. Any suggestions are welcome, thank you. This is the page link: https://www.askgamblers.com/online-casinos/countries/ca/
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from csv import writer

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)

for page in range(1, 3):
    URL = 'https://www.askgamblers.com/online-casinos/countries/ca/{page}'.format(page=page)
    driver.get(URL)
    time.sleep(2)

    urls = []
    data = []
    page_links = driver.find_elements(By.XPATH, "//div[@class='card__desc']//a[starts-with(@href, '/online')]")
    for link in page_links:
        href = link.get_attribute("href")
        urls.append(href)

    with open('product.csv', 'w', newline='', encoding='utf-8') as csvfile:
        thewriter = writer(csvfile)
        header = ['name', 'url', 'website_link', 'company', 'rating']
        thewriter.writerow(header)

        for url in urls:
            driver.get(url)
            time.sleep(1)
            try:
                name = driver.find_element(By.CSS_SELECTOR, "h1.review-intro__title").text
            except:
                pass
            try:
                company = driver.find_element(By.XPATH, "//p[span[contains(.,'Company')]]/following-sibling::div").text
            except:
                pass
            try:
                link = driver.find_element(By.XPATH, "//p[span[contains(.,'Website')]]/following-sibling::div").text
            except:
                pass
            try:
                rate = driver.find_element(By.CSS_SELECTOR, "span.rating-ring__number").text
            except:
                pass

            jobinfo = [name, url, link, company, rate]
            thewriter.writerow(jobinfo)
You open the same file for (over)writing with 'w' on every pass, but you loop over two pages, so the second page overwrites the first. Use a different file name per page, or use 'a' (append) instead, but with the current structure you will then also get the header row written twice.
Better would be to open the file for writing outside the for page loop, write the header once, then write the rows inside the for page loop.
Basically:
with open('product.csv', 'w', newline='', encoding='utf-8') as csvfile:
    thewriter = writer(csvfile)
    header = ['name', 'url', 'website_link', 'company', 'rating']
    thewriter.writerow(header)

    for page in range(1, 3):
        ...  # compute the row info
        jobinfo = [name, url, link, company, rate]
        thewriter.writerow(jobinfo)
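A slightly fuller, self-contained sketch of the same restructuring, keeping the selectors from the question (whether they still match the site is an assumption) and resetting each field per URL so a failed lookup never carries over the previous row's value:

from csv import writer
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

with open('product.csv', 'w', newline='', encoding='utf-8') as csvfile:
    thewriter = writer(csvfile)
    thewriter.writerow(['name', 'url', 'website_link', 'company', 'rating'])

    for page in range(1, 3):
        driver.get(f'https://www.askgamblers.com/online-casinos/countries/ca/{page}')
        time.sleep(2)

        # collect the detail-page links for this page
        page_links = driver.find_elements(
            By.XPATH, "//div[@class='card__desc']//a[starts-with(@href, '/online')]")
        urls = [link.get_attribute("href") for link in page_links]

        for url in urls:
            driver.get(url)
            time.sleep(1)
            # reset to '' so a missing field never reuses the previous row's value
            name = company = site = rate = ''
            try:
                name = driver.find_element(By.CSS_SELECTOR, "h1.review-intro__title").text
            except Exception:
                pass
            try:
                company = driver.find_element(
                    By.XPATH, "//p[span[contains(.,'Company')]]/following-sibling::div").text
            except Exception:
                pass
            try:
                site = driver.find_element(
                    By.XPATH, "//p[span[contains(.,'Website')]]/following-sibling::div").text
            except Exception:
                pass
            try:
                rate = driver.find_element(By.CSS_SELECTOR, "span.rating-ring__number").text
            except Exception:
                pass

            thewriter.writerow([name, url, site, company, rate])

driver.quit()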

Data are overwritten using beautifulsoup

I am trying to scrape the data, but it gets overwritten and the CSV file ends up with the data from only one of the two pages. How can I fix this? I think the for loop is overwriting the data. I have already searched for an answer here and spent a long time on Google, but nothing. I've already tried opening the file with 'w' instead of 'r' or 'a', but I still can't get my code to work.
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import requests
from csv import writer

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)

url = 'https://mergr.com/login'
driver.get(url)

email = driver.find_element(By.CSS_SELECTOR, "input#username")
email.send_keys("timgr8@outlook.com")
password = driver.find_element(By.CSS_SELECTOR, "input#password")
password.send_keys("Cosmos1990$$$$$$$")
login = driver.find_element(By.CSS_SELECTOR, "button.btn").click()

for page in range(1, 3):
    URL = 'https://mergr.com/firms/search/employees?page={page}&firm%5BactiveInvestor%5D=2&sortColumn=employee_weight&sortDirection=asc'.format(page=page)
    driver.get(URL)

    added_urls = []
    product = []

    soup = BeautifulSoup(driver.page_source, "lxml")
    details = soup.select("tbody tr")
    for detail in details:
        try:
            t1 = detail.select_one("h5.profile-title a").text
        except:
            # pass  # then you'll just be using the previous row's t1
            # [also, if this happens in the first loop, it will raise an error]
            t1 = 'MISSING'  # '' #
        wev = {
            'Name': t1,
        }

        href = detail.select_one("h5.profile-title + p a[href]")
        if href and href.get("href", '').startswith('http'):
            wev['page_link'] = href.get("href")
            added_urls.append(href.get("href"))

        product.append(wev)

    ### IF YOU WANT ROWS THAT CAN'T BE CONNECTED TO NAMES ###
    page_links = driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
    for link in page_links:
        if href in added_urls: continue  # skip links that are already added
        href = link.get_attribute("href")
        # urls.append(href)
        added_urls.append(href)
        product.append({"page_link": href})
    ##########################################################

    for pi, prod in enumerate(product):
        if "page_link" not in prod or not prod["page_link"]: continue  ## missing link
        url = prod["page_link"]
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, "lxml")
        try:
            website = soup.select_one("p.adress-info a[target='_blank']").text
        except:
            website = ''
        del product[pi]["page_link"]  ## REMOVE this line IF you want a page_link column in csv
        # data = {'website': website}
        # product.append(data)
        product[pi]['website'] = website

df = pd.DataFrame(product)
df.to_csv('firm.csv')
Currently, you're clearing the product list at the beginning of each page loop. Either move the product=[] line to before for page in range(1,3), OR indent the last two lines so they sit inside the page loop (using append mode: df.to_csv('firm.csv', mode='a')); i.e., the product=[] line and the df... lines should be at the SAME indent level.
(I don't recommend append mode, by the way; it's a bit annoying. If you use header=False, you won't have any headers at all, unless you write extra code to initialize the csv with them, like in saveScrawlSess in this crawler; and if you don't, the header row keeps getting repeated every few rows.)
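A minimal, self-contained sketch of the first option (move product=[] above the page loop and write the CSV once, after the loop); the example dict here only stands in for the real scraping from the question:

import pandas as pd

product = []  # accumulate rows across ALL pages

for page in range(1, 3):
    # ... scrape this page and append one dict per row, as in the question ...
    product.append({'Name': f'example row from page {page}', 'website': ''})

# write once, after every page has been collected
df = pd.DataFrame(product)
df.to_csv('firm.csv', index=False)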

How to get selenium to webscrape the 2nd page of results in a popup window

I am trying to webscrape various pages of results. The first page works fine, but when I switch to the next page it just webscrapes the first page of results again. The results don't return a new URL, so that approach doesn't work; instead, the results appear in a window on top of the already opened page. I also can't figure out how to append the results of the first page to those of the second page; they come out as separate lists. Below is the code I have.
from selenium import webdriver
import time
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
#original webscraping code to get the names of locations from page 1
url = r'https://autochek.africa/en/ng/fix-your-car/service/scheduled-car-service'
driver = webdriver.Chrome()
driver.get(url)
xpath_get_locations = r'/html/body/div[1]/div/div[2]/div/div[1]/div/div[2]/div[2]/div/div/form/div[7]/div/label'
driver.find_element_by_xpath(xpath_get_locations).click()
soup = BeautifulSoup(driver.page_source, 'html.parser')
location_results = [i.text for i in soup.find_all('div', {'class': 'jsx-1642469937 state'})]
print(location_results)
time.sleep(3)
#finished page 1, finding the next button to go to page 2
xpath_find_next_button = r'/html/body/div[1]/div/div[2]/div/div[1]/div/div[2]/div[2]/div[2]/div[2]/div/div/div[3]/ul/li[13]'
driver.find_element_by_xpath(xpath_find_next_button).click()
#getting the locations from page 2
second_page_results = [i.text for i in soup.find_all('div', {'class': 'jsx-1642469937 state'})]
print(second_page_results)
time.sleep(2)
After loading a new page or running some JavaScript code on the page, you have to run
soup = BeautifulSoup(driver.page_source, 'html.parser')
again to work with the new HTML.
Or skip BeautifulSoup and do it all in Selenium.
Use find_elements_... with the letter s in the word elements.
items = driver.find_elements_by_xpath('//div[@class="jsx-1642469937 state"]')
location_result = [i.text for i in items]
By the way:
(the XPath doesn't need the prefix r because it doesn't use \ )
A shorter and more readable XPath:
#xpath_get_locations = r'/html/body/div[1]/div/div[2]/div/div[1]/div/div[2]/div[2]/div/div/form/div[7]/div/label'
xpath_get_locations = '//label[text()="Drop-off at Autochek location"]'
And it would be simpler to use the Next > button instead of searching for buttons 2, 3, etc.
xpath_find_next_button = '//li[@class="next-li"]/a'
EDIT:
Full working code which uses a while-loop to visit all pages.
I added the module webdriver_manager, which automatically downloads a (fresh) driver for the browser.
I use find_elements(By.XPATH, ...) because find_elements_by_xpath(...) is deprecated.
from selenium import webdriver
from selenium.webdriver.common.by import By
#from selenium.webdriver.common.keys import Keys
#from selenium.webdriver.support.ui import WebDriverWait
#from selenium.webdriver.support import expected_conditions as EC
#from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time
#from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
#from webdriver_manager.firefox import GeckoDriverManager

driver = webdriver.Chrome(executable_path=ChromeDriverManager().install())
#driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())

# ---

url = 'https://autochek.africa/en/ng/fix-your-car/service/scheduled-car-service'
driver.get(url)

#xpath_get_locations = r'/html/body/div[1]/div/div[2]/div/div[1]/div/div[2]/div[2]/div/div/form/div[7]/div/label'
xpath_get_locations = '//label[text()="Drop-off at Autochek location"]'
driver.find_element(By.XPATH, xpath_get_locations).click()

# ---

all_locations = []

while True:
    # --- get locations on page

    time.sleep(1)  # sometimes `JavaScript` may need time to add new items (and you can't catch it with `WebDriverWait`)

    #items = soup.find_all('div', {'class': 'jsx-1642469937 state'})
    items = driver.find_elements(By.XPATH, '//div[@class="jsx-1642469937 state"]')
    #soup = BeautifulSoup(driver.page_source, 'html.parser')

    locations = [i.text for i in items]
    print(locations)
    print('-------')

    all_locations += locations

    # --- find button `next >` and try to click it

    #xpath_find_next_button = r'/html/body/div[1]/div/div[2]/div/div[1]/div/div[2]/div[2]/div[2]/div[2]/div/div/div[3]/ul/li[13]'
    xpath_find_next_button = '//li[@class="next-li"]/a'
    try:
        driver.find_element(By.XPATH, xpath_find_next_button).click()
    except:
        break  # exit loop

# ---

#driver.close()

Google Maps: some XPath selectors return data, some do not (Selenium, Python)

I was trying to scrape Google Maps. The phone and hours variables are not returning any data; the other variables work fine and return data. The XPath is correct, and I am not sure what the issue is here.
Here is the LINK
The other selectors, like name, address, title and website, return the data fine, but phone and hours do not return any data.
Hoping for some answers.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from scrapy.selector import Selector
import csv
from tqdm import tqdm
import time

driver = webdriver.Firefox()

linksFile = open("links.txt", 'r')
allLinks = linksFile.readlines()

for link in tqdm(allLinks):
    try:
        driver.get(link)
    except Exception:
        print('Something went wrong with the URL: ')
    # time.sleep(15)
    while True:
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, '//div[contains(text(), "Directions")] | //div[contains(text(), "Website")]'))
        )
        results = driver.find_elements_by_xpath('//div[contains(text(), "Directions")] | //div[contains(text(), "Website")]')
        for result in results:
            # writing to the CSV file
            outFile = open("data.csv", 'a+', newline="")
            writer = csv.writer(outFile)

            business = driver.find_element_by_xpath('//div[@role="heading"]/div')
            business.click()

            # waiting for the page to load
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, '//div[@class="immersive-container"]'))
            )

            # parsing the response with the scrapy selector
            response = Selector(text=driver.page_source)

            name = response.xpath('//h2[@data-attrid="title"]/span/text()').get()
            title = response.xpath('(//span[contains(text(), "Google reviews")])/parent::a/parent::span/parent::span/parent::div/parent::div/parent::div/following-sibling::div/div/span/span/text()').get()
            address = response.xpath('//a[contains(text(), "Address")]/parent::span/following-sibling::span/text()').get()
            website = response.xpath('(//a[contains(text(), "Website")])/@href').get()
            phone = response.xpath('//a[contains(text(), "Phone")]/parent::span/following-sibling::span/a/span/text()').get()
            hours = response.xpath('//a[contains(text(), "Hours")]/parent::span/following-sibling::div/label/span//btext()').get()
            total_reviews = response.xpath('(//span[contains(text(), "Google reviews")])[1]/text()').get()
            total_rating = response.xpath('(//span[contains(text(), "Google reviews")])/parent::a/parent::span/parent::span/parent::div/span/text()').get()

            input('Check: ')

            outFile = open("data.csv", 'a+', newline="")
            writer = csv.writer(outFile)
            vals = [name, title, address, website, phone, hours, total_reviews, total_rating]
            writer.writerow(vals)
            outFile.close()
Can you use the JavaScript outerHTML instead of page_source?
response = Selector(text=driver.execute_script("return document.documentElement.outerHTML"))
Also, there is an issue in the XPath for Hours:
hours = response.xpath('//a[contains(text(), "Hours")]/parent::span/following-sibling::div/label/span//b/text()').get()
Try a Google Maps link and not a Google search link: https://www.google.com/maps/place/Leduc+Plumbing+and+Heating/@53.274672,-113.5486679,17z/data=!3m1!4b1!4m5!3m4!1s0x539ff9a5d31a87c9:0xf494d91aafd55e55!8m2!3d53.2746688!4d-113.5464739
It should be more stable.
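Putting the two suggestions together, a minimal sketch of the parsing step, assuming the driver is already sitting on the business page as in the loop above:

from scrapy.selector import Selector

# parse the live DOM via outerHTML instead of driver.page_source
html = driver.execute_script("return document.documentElement.outerHTML")
response = Selector(text=html)

# corrected Hours XPath: '//b/text()' instead of the typo '//btext()'
hours = response.xpath(
    '//a[contains(text(), "Hours")]/parent::span/following-sibling::div/label/span//b/text()'
).get()
print(hours)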

bs4: extract data from a website and save it to a text file

I am learning how to extract data from websites and have managed to get a lot of information. However, for my next website I am failing for some unknown reason: nothing is saved to the text file, nor do I get any output in print. Here is my piece of code:
import json
import urllib.request
from bs4 import BeautifulSoup
import requests

url = 'https://www.jaffari.org/'
request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
response = urllib.request.urlopen(request)
html = response.read()

soup = BeautifulSoup(html.decode("utf-8"), "html.parser")
table = soup.find('div', attrs={"class": "textwidget"})
name = table.text.encode('utf-8').strip()

with open('/home/pi/test.txt', 'w') as outfile:
    json.dump(name, outfile)

print(name)
Can anyone help please?
The prayer times are rendered by JavaScript, so you need a browser automation tool like Selenium to load the page, and then BeautifulSoup to get the data.
You need to download a compatible ChromeDriver from this link and pass in the chromedriver path as I have done.
Here is code to fetch the names and prayer times and save them in a text file.
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import re

options = Options()
# Runs Chrome in headless mode.
options.add_argument("--headless")

# path of the chrome driver
driver = webdriver.Chrome(executable_path="D:\Software\chromedriver.exe", chrome_options=options)
driver.headless = True
driver.get('https://www.jaffari.org/')
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.sidebar-widget.widget_text>div>table')))
print("Data rendered successfully!!!")

# Get the page source
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
# Close the driver
driver.close()

with open('testPrayers.txt', 'w') as outfile:
    for row in soup.select("div.sidebar-widget.widget_text>div>table tr"):
        name = row.select("td")[0].text.strip()
        time = re.findall('(\d{1,2}:?\d{1,2}\W[A|P]M$)', row.select("td")[1].text.strip())
        outfile.write(name + " " + time[0] + "\n")
        print(name + " " + time[0])

outfile.close()
print('Done')
Updated code that saves each entry to a separate file named after it.
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import re

options = Options()
# Runs Chrome in headless mode.
options.add_argument("--headless")

# path of the chrome driver
driver = webdriver.Chrome(executable_path="D:\Software\chromedriver.exe", chrome_options=options)
driver.headless = True
driver.get('https://www.jaffari.org/')
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.sidebar-widget.widget_text>div>table')))
print("Data rendered successfully!!!")

# Get the page source
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
# Close the driver
driver.close()

for row in soup.select("div.sidebar-widget.widget_text>div>table tr"):
    name = row.select("td")[0].text.strip()
    time = re.findall('(\d{1,2}:?\d{1,2}\W[A|P]M$)', row.select("td")[1].text.strip())
    print(name + " " + time[0])
    with open(name + '.txt', 'w') as outfile:
        outfile.write(time[0])
        outfile.close()

print('Done')
The name variable needs to be a string rather than a bytes object. Try with
with open('/home/pi/test.txt', 'w') as outfile:
    json.dump(name.decode(), outfile)

print(name.decode())
Hope it helps.
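Alternatively, since the widget content is plain text anyway, you could skip the encode/decode round trip (and json) entirely and just write the string; a minimal sketch reusing the soup object from the question:

table = soup.find('div', attrs={"class": "textwidget"})
name = table.text.strip()  # keep it as a str, no .encode()

with open('/home/pi/test.txt', 'w', encoding='utf-8') as outfile:
    outfile.write(name)

print(name)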
