# 뉴스 크롤링.py
#######################################'사용후핵연료' 키워드 검색##################################################
import sys, os
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import selenium
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime, timedelta
from pandas import DataFrame
import time
from openpyxl.workbook import Workbook
sleep_sec = 0.5
wb = Workbook()
# User-Agent를 입력해주세요.
headers = {'User-Agent' : '________________'}
query = 'spent nuclear fuel'
yesterday = (datetime.today() - timedelta(1)).strftime("%Y.%m.%d")
def news_crawling():
service = Service(executable_path=ChromeDriverManager().install())
browser = webdriver.Chrome(service=service)
print('브라우저를 실행시킵니다(자동 제어)\n')
news_url = 'https://www.google.com/search?q={0}&tbm=nws&source-news]'.format(query, yesterday)
browser.get(news_url)
time.sleep(sleep_sec)
print('\n크롤링을 시작합니다.')
#####동적 제어로 페이지 넘어가며 크롤링
news_dict = {}
idx = 1
cur_page = 1
news_num = 1000000
while True:
table = browser.find_element("xpath",'.//div[#data-hveid="CBAQAA"]')
li_list = table.find_elements("xpath",'.//li[contains(#class="vJOb1e aIfcHf Hw13jc"]')
area_list = [li.find_element("xpath",'.//div[#class="mCBkyc y355M ynAwRc MBeuO nDgy9d"]') for li in li_list]
for a in area_list[:min(len(area_list), news_num-idx+1)]:
n = a.find_element("xpath",'.//div[#role="heading"]')
n_url = n.get_attribute('href')
try:
img = a.find_element(By.CSS_SELECTOR,'img#dimg_').find_element(By.CSS_SELECTOR, 'img')
img = img.get_attribute('src')
except:
img = " "
news_dict[idx] = {'Title' : n.get_attribute('title'),
'url' : n_url,
'thumbnail': img}
idx += 1
try:
next_btn = browser.find_element(By.CSS_SELECTOR, 'a#pnnext')
next_btn.click()
cur_page +=1
# pages = browser.find_element("xpath",'//div[#class="sc_page_inner"]')
# next_page_url = [p for p in pages.find_elements("xpath",'.//a') if p.text == str(cur_page)][0].get_attribute('href')
pages = browser.find_element("xpath",'//table[#class="fl"]')
next_page_url = [p for p in pages.find_elements("xpath",'.//a') if p.text == str(cur_page)][0].get_attribute('aria-lable')
browser.get(next_page_url)
time.sleep(sleep_sec)
except:
print('\n브라우저를 종료합니다.\n' + '=' * 100)
time.sleep(0.7)
browser.close()
break
########################################################여기까지 수정 완료################################################################
# 엑셀파일 추출
print('데이터프레임 변환\n')
news_df = DataFrame(news_dict).T
folder_path = os.getcwd()
xlsx_file_name = '{}_{}.xlsx'.format(query, yesterday)
news_df.to_excel(xlsx_file_name, index=False)
print('엑셀 저장 완료 | 경로 : {}\\{}\n'.format(folder_path, xlsx_file_name))
news_crawling()
this is my code. I use it on Korean website and it works well. But after I modified it for google search, it wouldn't work.
I want to search something on google and then get the news titles into a xlsx file.
I before used it in Korean website, so I changed the part below
table = browser.find_element("xpath",'.//div[#data-hveid="CBAQAA"]')
li_list = table.find_elements("xpath",'.//li[contains(#class="vJOb1e aIfcHf Hw13jc"]')
area_list = [li.find_element("xpath",'.//div[#class="mCBkyc y355M ynAwRc MBeuO nDgy9d"]') for li in li_list]
and when I run the code, it only gives me an empty xlsx file.
can anyone help with this please? I would be so appreciate.
Here is one possible solution:
from openpyxl import Workbook
from datetime import datetime
from datetime import timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
def get_url(query: str, min_date: str, max_date: str) -> str:
return f'https://www.google.com/search?q={query}&tbm=nws&source-news&tbs=cdr:1,cd_min:{min_date},cd_max:{max_date}'
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 5)
yesterday = (datetime.now() - timedelta(1)).strftime("%m.%d.%Y")
driver.get(get_url('spent nuclear fuel', yesterday, yesterday))
url_locator = (By.CSS_SELECTOR, '#rso a')
title_locator = (By.CSS_SELECTOR, 'a div[role="heading"]')
thumbnail_locator = (By.CSS_SELECTOR, '#rso a>div>div:first-child img')
workbook = Workbook()
worksheet = workbook.active
page = 1
while True:
print(f'Current page: {page}')
url_web_elements = wait.until(EC.visibility_of_all_elements_located(url_locator))
title_web_elements = wait.until(EC.presence_of_all_elements_located(title_locator))
thumbnail_web_elements = wait.until(EC.visibility_of_all_elements_located(thumbnail_locator))
titles = [title.text.replace(',', '.') for title in title_web_elements]
urls = [link.get_attribute('href') for link in url_web_elements]
thumbnails = [thumbnail.get_attribute('src') for thumbnail in thumbnail_web_elements]
for data in zip(titles, urls, thumbnails):
news = {
'title' : data[0],
'url' : data[1],
'thumbnail': data[2]
}
worksheet.append(list(news.values()))
try:
page += 1
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#pnnext'))).click()
except TimeoutException:
break
workbook.save(f'google_news_{yesterday}.xlsx')
driver.quit()
Output is xlsx file google_news_11.10.2022.xlsx
In the get_url function, you can pass a range of dates for which the news will be displayed. For example get_url('spent nuclear fuel', 01.11.2022, 11.11.2022)
You can also save data to csv using this solution:
import csv
from datetime import datetime
from datetime import timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
def get_url(query: str, min_date: str, max_date: str) -> str:
return f'https://www.google.com/search?q={query}&tbm=nws&source-news&tbs=cdr:1,cd_min:{min_date},cd_max:{max_date}'
def save_to_csv(data: list) -> None:
with open(file='google_news.csv', mode='a', encoding="utf-8") as f:
writer = csv.writer(f, lineterminator='\n')
writer.writerow([*data])
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 5)
yesterday = (datetime.now() - timedelta(1)).strftime("%m.%d.%Y")
driver.get(get_url('spent nuclear fuel', yesterday, yesterday))
url_locator = (By.CSS_SELECTOR, '#rso a')
title_locator = (By.CSS_SELECTOR, 'a div[role="heading"]')
thumbnail_locator = (By.CSS_SELECTOR, '#rso a>div>div:first-child img')
page = 1
while True:
print(f'Current page: {page}')
url_web_elements = wait.until(EC.visibility_of_all_elements_located(url_locator))
title_web_elements = wait.until(EC.presence_of_all_elements_located(title_locator))
thumbnail_web_elements = wait.until(EC.visibility_of_all_elements_located(thumbnail_locator))
titles = [title.text.replace(',', '.') for title in title_web_elements]
urls = [link.get_attribute('href') for link in url_web_elements]
thumbnails = [thumbnail.get_attribute('src') for thumbnail in thumbnail_web_elements]
for data in zip(titles, urls, thumbnails):
news = {
'title' : data[0],
'url' : data[1],
'thumbnail': data[2]
}
save_to_csv(news.values())
try:
page += 1
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#pnnext'))).click()
except TimeoutException:
break
driver.quit()
Output is csv file google_news.csv:
COP27: nuclear boss doesn't expect surge in waste recycling,https://news.yahoo.com/cop27-nuclear-boss-doesnt-expect-072631885.html,""
UN Nuclear Chief Says Recycling Nuclear Waste 'Difficult ...,https://www.theepochtimes.com/un-nuclear-chief-says-recycling-nuclear-waste-difficult-after-biden-looks-to-fund-reprocessing-projects_4855151.html,""
COP27: UN nuclear chief says radioactive waste recycling is 'difficult' technology,https://www.deccanherald.com/international/world-news-politics/cop27-un-nuclear-chief-says-radioactive-waste-recycling-is-difficult-technology-1161036.html,""
Tested on Python 3.9.10. Used Selenium 4.5.0, openpyxl 3.0.10
Related
Try to scrape the data but data are overwrite and they will give the data of only 2 page in the csv file kindly recommend any solution for that I an waiting for your response How can I fix this? is there any way then suggest me I think due to for loop they overwrite data Thank you.these is the page link https://www.askgamblers.com/online-casinos/countries/uk/
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from csv import writer
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)
for page in range(1,3):
URL = 'https://www.askgamblers.com/online-casinos/countries/uk/{page}'.format(page=page)
driver.get(URL)
time.sleep(2)
urls= []
data = []
page_links =driver.find_elements(By.XPATH, "//div[#class='card__desc']//a[starts-with(#href, '/online')]")
for link in page_links:
href=link.get_attribute("href")
urls.append(href)
product=[]
for url in urls:
wev={}
driver.get(url)
time.sleep(1)
try:
title=driver.find_element(By.CSS_SELECTOR,"h1.review-intro__title").text
except:
pass
wev['Title']=title
soup = BeautifulSoup(driver.page_source,"lxml")
pays=soup.select("div#tabPayments")
for pay in pays:
try:
t1=pay.select_one(".review-details-wrapper:nth-child(1) .review-details__item:nth-child(1) .review-details__text").get_text(' ',strip=True)
except:
pass
wev['deposit_method']=t1
try:
t2=pay.select_one(".review-details-wrapper:nth-child(1) .review-details__item+ .review-details__item .review-details__text").get_text(' ',strip=True)
except:
pass
wev['curriences']=t2
try:
t3=pay.select_one(" .review-details-wrapper+ .review-details-wrapper .review-details__item:nth-child(1) .review-details__text").get_text(' ',strip=True)
except:
pass
wev['with_drawl method']=t3
try:
t4 = pay.select_one(" .review-details-wrapper+ .review-details-wrapper .review-details__item:nth-child(2) .review-details__text")
t4 = [i.replace("\n", "") for i in t4 if i.text]
except:
pass
wev['with_drawl_time']=t4
product.append(wev)
df=pd.DataFrame(product)
df.to_csv('casino.csv')
All result in 1 file :
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from csv import writer
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)
product=[]
for page in range(1,4):
URL = 'https://www.askgamblers.com/online-casinos/countries/uk/{page}'.format(page=page)
driver.get(URL)
time.sleep(2)
urls= []
data = []
page_links =driver.find_elements(By.XPATH, "//div[#class='card__desc']//a[starts-with(#href, '/online')]")
for link in page_links:
href=link.get_attribute("href")
urls.append(href)
for url in urls:
wev={}
driver.get(url)
time.sleep(1)
try:
title=driver.find_element(By.CSS_SELECTOR,"h1.review-intro__title").text
except:
pass
wev['Title']=title
soup = BeautifulSoup(driver.page_source,"lxml")
pays=soup.select("div#tabPayments")
for pay in pays:
try:
t1=pay.select_one(".review-details-wrapper:nth-child(1) .review-details__item:nth-child(1) .review-details__text").get_text(' ',strip=True)
except:
pass
wev['deposit_method']=t1
try:
t2=pay.select_one(".review-details-wrapper:nth-child(1) .review-details__item+ .review-details__item .review-details__text").get_text(' ',strip=True)
except:
pass
wev['curriences']=t2
try:
t3=pay.select_one(" .review-details-wrapper+ .review-details-wrapper .review-details__item:nth-child(1) .review-details__text").get_text(' ',strip=True)
except:
pass
wev['with_drawl method']=t3
try:
t4 = pay.select_one(" .review-details-wrapper+ .review-details-wrapper .review-details__item:nth-child(2) .review-details__text")
t4 = [i.replace("\n", "") for i in t4 if i.text]
except:
pass
wev['with_drawl_time']=t4
product.append(wev)
df=pd.DataFrame(product)
df.to_csv('casino.csv')
In first loop its running only 2 times :
Change it to 1,4 as below then it will give you [1,2,3]:
for page in range(1,4):
Then data getting overwritten because output file name is same:
change file name as below:
df.to_csv(f'casino_{page}.csv')
With every iteration through the loop, the previously extracted data is overwritten. How can I solve this problem?
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
# url='https://www.amazon.com/dp/B00M0DWQYI?th=1'
# url='https://www.amazon.com/dp/B010RWD4GM?th=1'
PATH="C:\Program Files (x86)\chromedriver.exe"
driver =webdriver.Chrome(PATH)
df_urls = pd.read_csv('D:/selenium/inputs/amazone-asin.csv',encoding='utf-8')
list_dicts_urls =df_urls.to_dict('records')
item=dict()
product=[]
for url in list_dicts_urls:
product_url = 'https://' + url['MARKETPLACE'] + '/dp/' + url['ASIN']
driver.get(product_url)
try:
item['title'] = driver.find_element(By.CSS_SELECTOR,'span#productTitle').text
except:
item['title'] = ''
try:
item['brand'] = driver.find_element(By.CSS_SELECTOR,'a#bylineInfo').text.replace('Visit the','').replace('Store','').strip()
except:
item['brand'] = ''
try:
rating = driver.find_element(By.CSS_SELECTOR,'span#acrCustomerReviewText').text.replace('ratings','').strip()
rating = int(rating.replace(',', ''))
item['rating'] = rating
except:
item['rating'] = ''
time.sleep(2)
try:
p1=driver.find_element(By.XPATH, '//span[#class="a-price-whole"]').text
p2= driver.find_element(By.XPATH, '//span[#class="a-price-fraction"]').text
item['price']=p1+p2
except:
item['price']=''
product.append(item)
df=pd.DataFrame(product)
df.to_csv("ama.csv")
I think you need to define item=dict() inside the for loop. Otherwise this is the same, single item object used in all the loop iterations.
Try this:
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
# url='https://www.amazon.com/dp/B00M0DWQYI?th=1'
# url='https://www.amazon.com/dp/B010RWD4GM?th=1'
PATH="C:\Program Files (x86)\chromedriver.exe"
driver =webdriver.Chrome(PATH)
df_urls = pd.read_csv('D:/selenium/inputs/amazone-asin.csv',encoding='utf-8')
list_dicts_urls =df_urls.to_dict('records')
product=[]
for url in list_dicts_urls:
item=dict()
product_url = 'https://' + url['MARKETPLACE'] + '/dp/' + url['ASIN']
driver.get(product_url)
try:
item['title'] = driver.find_element(By.CSS_SELECTOR,'span#productTitle').text
except:
item['title'] = ''
try:
item['brand'] = driver.find_element(By.CSS_SELECTOR,'a#bylineInfo').text.replace('Visit the','').replace('Store','').strip()
except:
item['brand'] = ''
try:
rating = driver.find_element(By.CSS_SELECTOR,'span#acrCustomerReviewText').text.replace('ratings','').strip()
rating = int(rating.replace(',', ''))
item['rating'] = rating
except:
item['rating'] = ''
time.sleep(2)
try:
p1=driver.find_element(By.XPATH, '//span[#class="a-price-whole"]').text
p2= driver.find_element(By.XPATH, '//span[#class="a-price-fraction"]').text
item['price']=p1+p2
except:
item['price']=''
product.append(item)
df=pd.DataFrame(product)
df.to_csv("ama.csv")
I am trying to scrape each product page from this website: https://www.aliexpress.com/wholesale?catId=0&initiative_id=SB_20220315022920&SearchText=bluetooth+earphones
Especially I want to get comments and custumer countries as I mentionned in the photo:
enter image description here
The main issue is that my code does not inspect the right elements and this is what I am struggling with .
First, I tried my scraping on this product : https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000027213624098%22%7D&pdp_pi=-1%3B40.81%3B-1%3B-1%40salePrice%3BMAD%3Bsearch-mainSearch
Here is my code :
from selenium import webdriver
from selenium.webdriver.common.by import By
from lxml import html
import cssselect
from time import sleep
from itertools import zip_longest
import csv
driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")
url = "https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000027213624098%22%7D&pdp_pi=-1%3B40.81%3B-1%3B-1%40salePrice%3BMAD%3Bsearch-mainSearch"
with open ("data.csv", "w", encoding="utf-8") as csvfile:
wr = csv.writer(csvfile)
wr.writerow(["Comment","Custumer country"])
driver.get(url)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
review_buttom = driver.find_element_by_xpath('//li[#ae_button_type="tab_feedback"]')
review_buttom.click()
html_source = driver.find_element_by_xpath('//div[#id="transction-feedback"]')
tree = html.fromstring(html_source)
#tree = html.fromstring(driver.page_source)
for rvw in tree.xpath('//div[#class="feedback-item clearfix"]'):
country = rvw.xpath('//div[#class="user-country"]//b/text()')
if country:
country = country[0]
else:
country = ''
print('country:', country)
comment = rvw.xpath('//dt[#id="buyer-feedback"]//span/text()')
if comment:
comment = comment[0]
else:
comment = ''
print('comment:', comment)
driver.close()
Thank you !!
What happens?
There is one main issue, the feedback you are looking for is in an iframe, so you wont get your information by calling the elements directly.
How to fix?
Scroll into view of element that holds the iframe navigate to its source and interact with its pagination to get all the feedbacks.
Example
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url = 'https://www.aliexpress.com/item/1005003801507855.html?spm=a2g0o.productlist.0.0.1e951bc72xISfE&algo_pvid=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad&algo_exp_id=6d3ed61e-f378-43d0-a429-5f6cddf3d6ad-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000027213624098%22%7D&pdp_pi=-1%3B40.81%3B-1%3B-1%40salePrice%3BMAD%3Bsearch-mainSearch'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
driver.get(url)
wait = WebDriverWait(driver, 10)
driver.execute_script("arguments[0].scrollIntoView();", wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.tab-content'))))
driver.get(wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#product-evaluation'))).get_attribute('src'))
data=[]
while True:
for e in driver.find_elements(By.CSS_SELECTOR, 'div.feedback-item'):
try:
country = e.find_element(By.CSS_SELECTOR, '.user-country > b').text
except:
country = None
try:
comment = e.find_element(By.CSS_SELECTOR, '.buyer-feedback span').text
except:
comment = None
data.append({
'country':country,
'comment':comment
})
try:
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#complex-pager a.ui-pagination-next'))).click()
except:
break
pd.DataFrame(data).to_csv('filename.csv',index=False)
So i am working on a custom web scraper for any kind of ecommerce site, i want it to scrape names and prices of listings on a site and then export them to csv, but the problem is it exports only one line of (name, price) and it prints it on every line of csv, i couldnt find a good solution for this, i hope im not asking an extremely stupid thing, although i think the fix is easy. I hope someone will read my code and help me, thank you !
###imports
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import pandas as pd
#driver path
driver = webdriver.Firefox(executable_path="D:\Programy\geckoDriver\geckodriver.exe")
#init + search
driver.get("https://pc.bazos.sk/pc/")
time.sleep(1)
nazov = driver.find_element_by_name("hledat")
nazov.send_keys("xeon")
cenamin = driver.find_element_by_name("cenaod")
cenamin.send_keys("")
cenamax = driver.find_element_by_name("cenado")
cenamax.send_keys("300")
driver.find_element_by_name("Submit").click()
##cookie acceptor
driver.find_element_by_xpath("/html/body/div[1]/button").click()
##main
x = 3
for i in range(x):
try:
main = WebDriverWait(driver, 7).until(
EC.presence_of_element_located((By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]"))
)
##find listings in table
inzeraty = main.find_elements_by_class_name("vypis")
for vypis in inzeraty:
nadpis = vypis.find_element_by_class_name("nadpis")
##print listings to check correctness
nadpist = nadpis.text
print(nadpist)
##find the price and print
for vypis in inzeraty:
cena = vypis.find_element_by_class_name("cena")
cenat = cena.text
print(cenat)
##export to csv - not working
time.sleep(1)
print("Writing to csv")
d = {"Nazov": [nadpist]*20*x,"Cena": [cenat]*20*x}
df = pd.DataFrame(data=d)
df.to_csv("bobo.csv")
time.sleep(1)
print("Writing to csv done !")
##next page
dalsia = driver.find_element_by_link_text("Ďalšia")
dalsia.click()
except:
driver.quit()
i want the csv to look like:
name,price
name2, price2
it would be great is the csv had only two columns and x rows depending on the number of listings
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
#driver path
driver = webdriver.Chrome()
#init + search
driver.get("https://pc.bazos.sk/pc/")
time.sleep(1)
nazov = driver.find_element_by_name("hledat")
nazov.send_keys("xeon")
cenamin = driver.find_element_by_name("cenaod")
cenamin.send_keys("")
cenamax = driver.find_element_by_name("cenado")
cenamax.send_keys("300")
driver.find_element_by_name("Submit").click()
##cookie acceptor
time.sleep(10)
driver.find_element_by_xpath("/html/body/div[1]/button").click()
##main
x = 3
d = []
for i in range(x):
try:
main = WebDriverWait(driver, 7).until(
EC.presence_of_element_located(
(By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]")))
##find listings in table
inzeraty = main.find_elements_by_class_name("vypis")
for vypis in inzeraty:
d.append({"Nazov": vypis.find_element_by_class_name("nadpis").text,
"Cena": vypis.find_element_by_class_name("cena").text
})
##next page
dalsia = driver.find_element_by_link_text("Ďalšia")
dalsia.click()
except:
driver.quit()
time.sleep(1)
print("Writing to csv")
df = pd.DataFrame(data=d)
df.to_csv("bobo.csv",index=False)
this gives me 59 items with price. first added to dict then to list, then send that to pandas.
All you need to do is create two empty lists nadpist_l, cenat_l and append data to that lists, finally save the lists as a dataframe.
UPDATED as per the comment
Check if this works
###imports
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
#driver path
driver = webdriver.Chrome()
#init + search
driver.get("https://pc.bazos.sk/pc/")
time.sleep(1)
nazov = driver.find_element_by_name("hledat")
nazov.send_keys("xeon")
cenamin = driver.find_element_by_name("cenaod")
cenamin.send_keys("")
cenamax = driver.find_element_by_name("cenado")
cenamax.send_keys("300")
driver.find_element_by_name("Submit").click()
##cookie acceptor
time.sleep(10)
driver.find_element_by_xpath("/html/body/div[1]/button").click()
##main
x = 3
d = {}
for i in range(x):
try:
main = WebDriverWait(driver, 7).until(
EC.presence_of_element_located(
(By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]")))
##find listings in table
inzeraty = main.find_elements_by_class_name("vypis")
nadpist_l = []
for vypis in inzeraty:
nadpis = vypis.find_element_by_class_name("nadpis")
##print listings to check correctness
nadpist = nadpis.text
nadpist_l.append(nadpist)
# print(nadpist)
##find the price and print
cenat_l = []
for vypis in inzeraty:
cena = vypis.find_element_by_class_name("cena")
cenat = cena.text
cenat_l.append(cenat)
print(len(cenat_l))
##export to csv - not working
d.update({"Nazov": [nadpist_l] * 20 * x, "Cena": [cenat_l] * 20 * x})
##next page
dalsia = driver.find_element_by_link_text("Ďalšia")
dalsia.click()
except:
driver.quit()
time.sleep(1)
print("Writing to csv")
df = pd.DataFrame(data=d)
df.to_csv("bobo.csv")
time.sleep(1)
print("Writing to csv done !")
I wrote a code to get the following value "Exam Code", "Exam Name" and "Total Question". The issue is that in the put CSV file I am getting the wrong value in the "Exam Code" column. I am getting the same value as "Exam Name". The xPath looks fine to me. I don't know where is the issue happening.
Following is the code:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import time
option = Options()
option.add_argument("--disable-infobars")
option.add_argument("start-maximized")
option.add_argument("--disable-extensions")
option.add_experimental_option("excludeSwitches", ['enable-automation'])
# Pass the argument 1 to allow and 2 to block
# option.add_experimental_option("prefs", {
# "profile.default_content_setting_values.notifications": 1
# })
driver = webdriver.Chrome(chrome_options=option, executable_path='C:\\Users\\Awais\\Desktop\\web crawling\\chromedriver.exe')
url = ["https://www.marks4sure.com/210-060-exam.html",
"https://www.marks4sure.com/210-065-exam.html",
"https://www.marks4sure.com/200-355-exam.html",
"https://www.marks4sure.com/9A0-127-exam.html",
"https://www.marks4sure.com/300-470-exam.html",]
driver.implicitly_wait(0.5)
na = "N/A"
# text = 'Note: This exam is available on Demand only. You can Pre-Order this Exam and we will arrange this for you.'
links = []
exam_code = []
exam_name = []
total_q = []
for items in range(0, 5):
driver.get(url[items])
# if driver.find_element_by_xpath("//div[contains(#class, 'alert') and contains(#class, 'alert-danger')]") == text:
# continue
items += 1
try:
c_url = driver.current_url
links.append(c_url)
except:
pass
try:
codes = driver.find_element_by_xpath('''//div[contains(#class, 'col-sm-6') and contains(#class, 'exam-row-data') and position() = 2]''')
exam_code.append(codes.text)
except:
exam_code.append(na)
try:
names = driver.find_element_by_xpath('//*[#id="content"]/div/div[1]/div[2]/div[3]/div[2]/a')
exam_name.append(names.text)
except:
exam_name.append(na)
try:
question = driver.find_element_by_xpath('//*[#id="content"]/div/div[1]/div[2]/div[4]/div[2]/strong')
total_q.append(question.text)
except:
total_q.append(na)
continue
all_info = list(zip(links, exam_name, exam_name, total_q))
print(all_info)
df = pd.DataFrame(all_info, columns=["Links", "Exam Code", "Exam Name", "Total Question"])
df.to_csv("data5.csv", index=False)
driver.close()
You are getting the exam name in there twice, and instead of exam codes because that's what you are telling it to do (minor typo here with having exam_name in there twice):
all_info = list(zip(links, exam_name, exam_name, total_q))
change to: all_info = list(zip(links, exam_code, exam_name, total_q))
Few things I'm confused about.
1) Why use Selnium? There is no need for selenium as the data is returned in the initial request in the html source. So I would just use requests as it would speed up the processing.
2) The link and the exam code are already in the url you are iterating through. I would just split or use regex to that string to get the link and the code. You only really need to get the exam name and number of questions then.
With that being said, I adjusted it slightly to just get exam name and number of questions:
import requests
from bs4 import BeautifulSoup
import pandas as pd
urls = ["https://www.marks4sure.com/210-060-exam.html",
"https://www.marks4sure.com/210-065-exam.html",
"https://www.marks4sure.com/200-355-exam.html",
"https://www.marks4sure.com/9A0-127-exam.html",
"https://www.marks4sure.com/300-470-exam.html",]
links = []
exam_code = []
exam_name = []
total_q = []
for url in urls:
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
links.append(url)
exam_code.append(url.rsplit('-exam')[0].split('/')[-1])
exam_row = soup.select('div[class*="exam-row-data"]')
for exam in exam_row:
if exam.text == 'Exam Name: ':
exam_name.append(exam.find_next_sibling("div").text)
continue
if 'Questions' in exam.text and 'Total Questions' not in exam.text:
total_q.append(exam.text.strip())
continue
all_info = list(zip(links, exam_code, exam_name, total_q))
print(all_info)
df = pd.DataFrame(all_info, columns=["Links", "Exam Code", "Exam Name", "Total Question"])
df.to_csv("data5.csv", index=False)
Hi to get the exam code I think it is better to work with regex and get it from URL itself.
Also below code gives me the exam codes correctly except for 4th link which has a different structure as compared to others.
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 6 14:48:00 2020
#author: prakh
"""
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import time
option = Options()
option.add_argument("--disable-infobars")
option.add_argument("start-maximized")
option.add_argument("--disable-extensions")
option.add_experimental_option("excludeSwitches", ['enable-automation'])
# Pass the argument 1 to allow and 2 to block
# option.add_experimental_option("prefs", {
# "profile.default_content_setting_values.notifications": 1
# })
driver = webdriver.Chrome(executable_path='C:/Users/prakh/Documents/PythonScripts/chromedriver.exe')
url = ["https://www.marks4sure.com/210-060-exam.html",
"https://www.marks4sure.com/210-065-exam.html",
"https://www.marks4sure.com/200-355-exam.html",
"https://www.marks4sure.com/9A0-127-exam.html",
"https://www.marks4sure.com/300-470-exam.html",]
driver.implicitly_wait(0.5)
na = "N/A"
# text = 'Note: This exam is available on Demand only. You can Pre-Order this Exam and we will arrange this for you.'
links = []
exam_code = []
exam_name = []
total_q = []
for items in range(0, 5):
driver.get(url[items])
# if driver.find_element_by_xpath("//div[contains(#class, 'alert') and contains(#class, 'alert-danger')]") == text:
# continue
items += 1
try:
c_url = driver.current_url
links.append(c_url)
except:
pass
try:
codes = driver.find_element_by_xpath('//*[#id="content"]/div/div[1]/div[2]/div[2]/div[2]')
exam_code.append(codes.text)
except:
exam_code.append(na)
try:
names = driver.find_element_by_xpath('//*[#id="content"]/div/div[1]/div[2]/div[3]/div[2]/a')
exam_name.append(names.text)
except:
exam_name.append(na)
try:
question = driver.find_element_by_xpath('//*[#id="content"]/div/div[1]/div[2]/div[4]/div[2]/strong')
total_q.append(question.text)
except:
total_q.append(na)
continue
all_info = list(zip(links, exam_code, exam_name, total_q))
print(all_info)
df = pd.DataFrame(all_info, columns=["Links", "Exam Code", "Exam Name", "Total Question"])
df.to_csv("data5.csv", index=False)
driver.close()
You don't need selenium because the source code contains the info you need without using JavaScript.
Also, most pages redirect to marks4sure.com/200-301-exam.html, so you'll get the same results. Only marks4sure.com/300-470-exam.html don't.
import requests
from bs4 import BeautifulSoup
urls = ["https://www.marks4sure.com/210-060-exam.html",
"https://www.marks4sure.com/210-065-exam.html",
"https://www.marks4sure.com/200-355-exam.html",
"https://www.marks4sure.com/9A0-127-exam.html",
"https://www.marks4sure.com/300-470-exam.html",]
with open("output.csv", "w") as f:
f.write("exam_code,exam_name,exam_quest\n")
for url in urls:
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html5lib')
for n, v in enumerate(soup.find_all(class_ = "col-sm-6 exam-row-data")):
if n == 1:
exam_code = v.text.strip()
if n == 3:
exam_name = v.text.strip()
if n == 5:
exam_quest = v.text.strip()
f.write(f"{exam_code},{exam_name},{exam_quest}\n")