Data are overwritten in DataFrame - python

Try to scrape the data but data are overwrite and they will give the data of only 2 page in the csv file kindly recommend any solution for that I an waiting for your response How can I fix this? is there any way then suggest me I think due to for loop they overwrite data Thank you.these is the page link https://www.askgamblers.com/online-casinos/countries/uk/
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from csv import writer
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)
for page in range(1,3):
URL = 'https://www.askgamblers.com/online-casinos/countries/uk/{page}'.format(page=page)
driver.get(URL)
time.sleep(2)
urls= []
data = []
page_links =driver.find_elements(By.XPATH, "//div[#class='card__desc']//a[starts-with(#href, '/online')]")
for link in page_links:
href=link.get_attribute("href")
urls.append(href)
product=[]
for url in urls:
wev={}
driver.get(url)
time.sleep(1)
try:
title=driver.find_element(By.CSS_SELECTOR,"h1.review-intro__title").text
except:
pass
wev['Title']=title
soup = BeautifulSoup(driver.page_source,"lxml")
pays=soup.select("div#tabPayments")
for pay in pays:
try:
t1=pay.select_one(".review-details-wrapper:nth-child(1) .review-details__item:nth-child(1) .review-details__text").get_text(' ',strip=True)
except:
pass
wev['deposit_method']=t1
try:
t2=pay.select_one(".review-details-wrapper:nth-child(1) .review-details__item+ .review-details__item .review-details__text").get_text(' ',strip=True)
except:
pass
wev['curriences']=t2
try:
t3=pay.select_one(" .review-details-wrapper+ .review-details-wrapper .review-details__item:nth-child(1) .review-details__text").get_text(' ',strip=True)
except:
pass
wev['with_drawl method']=t3
try:
t4 = pay.select_one(" .review-details-wrapper+ .review-details-wrapper .review-details__item:nth-child(2) .review-details__text")
t4 = [i.replace("\n", "") for i in t4 if i.text]
except:
pass
wev['with_drawl_time']=t4
product.append(wev)
df=pd.DataFrame(product)
df.to_csv('casino.csv')

All result in 1 file :
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from csv import writer
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)
product=[]
for page in range(1,4):
URL = 'https://www.askgamblers.com/online-casinos/countries/uk/{page}'.format(page=page)
driver.get(URL)
time.sleep(2)
urls= []
data = []
page_links =driver.find_elements(By.XPATH, "//div[#class='card__desc']//a[starts-with(#href, '/online')]")
for link in page_links:
href=link.get_attribute("href")
urls.append(href)
for url in urls:
wev={}
driver.get(url)
time.sleep(1)
try:
title=driver.find_element(By.CSS_SELECTOR,"h1.review-intro__title").text
except:
pass
wev['Title']=title
soup = BeautifulSoup(driver.page_source,"lxml")
pays=soup.select("div#tabPayments")
for pay in pays:
try:
t1=pay.select_one(".review-details-wrapper:nth-child(1) .review-details__item:nth-child(1) .review-details__text").get_text(' ',strip=True)
except:
pass
wev['deposit_method']=t1
try:
t2=pay.select_one(".review-details-wrapper:nth-child(1) .review-details__item+ .review-details__item .review-details__text").get_text(' ',strip=True)
except:
pass
wev['curriences']=t2
try:
t3=pay.select_one(" .review-details-wrapper+ .review-details-wrapper .review-details__item:nth-child(1) .review-details__text").get_text(' ',strip=True)
except:
pass
wev['with_drawl method']=t3
try:
t4 = pay.select_one(" .review-details-wrapper+ .review-details-wrapper .review-details__item:nth-child(2) .review-details__text")
t4 = [i.replace("\n", "") for i in t4 if i.text]
except:
pass
wev['with_drawl_time']=t4
product.append(wev)
df=pd.DataFrame(product)
df.to_csv('casino.csv')

In first loop its running only 2 times :
Change it to 1,4 as below then it will give you [1,2,3]:
for page in range(1,4):
Then data getting overwritten because output file name is same:
change file name as below:
df.to_csv(f'casino_{page}.csv')

Related

No data table with Python3 Selenium

I need to improve this script to extract daily data from this site. However, I am not getting any data except for the "Spot" column!
Thanks for the help!
UPD. Now i can't change the date(
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs
import pandas as pd
from selenium_stealth import stealth
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url ="https://www.eex.com/en/market-data/natural-gas/spot"
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--headless")
chrome_options.add_argument("start-maximized")
browser = webdriver.Chrome(executable_path="chromedriver1/chromedriver", options=chrome_options)
browser.get("https://www.eex.com/en/market-data/natural-gas/spot")
time.sleep(10)
date_picker = WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.XPATH, '//*[#id="symbolheader_ngs"]/div/div/div/input')))
date_picker.send_keys("2023-01-23")
time.sleep(20)
page_source = browser.page_source
s = bs(page_source)
table = s.select('table')[1]
final_list = []
for row in table.select('tr'):
final_list.append([x.text for x in row.find_all(['td', 'th'])])
final_df = pd.DataFrame(final_list[2:], columns = final_list[:1])
final_df.columns = ['Spot', 'Last Price', 'Last Volume', 'End of Day Index', 'Volume Exchange','del']
df=final_df.drop('del',axis=1)
browser.quit()
df.to_excel('final_df.xlsx', index = False)
little tweaks so that all columns can be extracted. main idea is that extract logic need to be checked with how HTML dom is.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs
import pandas as pd
def get_df(page_source):
soup = bs(page_source, 'html.parser')
table = soup.select('table')[1]
table_header=table.find("tr", {"class": "mv-quote-header-row"})
table_body=table.select('tbody')
result={}
for e_header in table_header.find_all('th'):
if e_header.text:
result[e_header.text]=[]
for e_r in table_body[0].find_all('tr'):
r1=[e.text for e in e_r.find_all('td',{'class':not ['mv-quote-button']})]
result['Spot'].append(r1[0])
result['Last Price'].append(r1[1])
result['Last Volume'].append(r1[2])
result['End of Day Index'].append(r1[3])
result['Volume Exchange'].append(r1[4])
#result
df=pd.DataFrame(result)
return df
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
#chrome_options.add_argument("--headless")
chrome_options.add_argument("start-maximized")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
#webdriver_service = Service()
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
browser.get("https://www.eex.com/en/market-data/natural-gas/spot")
#soup = BeautifulSoup(browser.page_source, 'html5lib')
page_source=browser.page_source
#table = soup.select('table')[1]
final_df=get_df(browser.page_source)
browser.quit()
final_df.to_excel('final_df.xlsx', index = False)

something is wrong google crawler. please

# 뉴스 크롤링.py
#######################################'사용후핵연료' 키워드 검색##################################################
import sys, os
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import selenium
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime, timedelta
from pandas import DataFrame
import time
from openpyxl.workbook import Workbook
sleep_sec = 0.5
wb = Workbook()
# User-Agent를 입력해주세요.
headers = {'User-Agent' : '________________'}
query = 'spent nuclear fuel'
yesterday = (datetime.today() - timedelta(1)).strftime("%Y.%m.%d")
def news_crawling():
service = Service(executable_path=ChromeDriverManager().install())
browser = webdriver.Chrome(service=service)
print('브라우저를 실행시킵니다(자동 제어)\n')
news_url = 'https://www.google.com/search?q={0}&tbm=nws&source-news]'.format(query, yesterday)
browser.get(news_url)
time.sleep(sleep_sec)
print('\n크롤링을 시작합니다.')
#####동적 제어로 페이지 넘어가며 크롤링
news_dict = {}
idx = 1
cur_page = 1
news_num = 1000000
while True:
table = browser.find_element("xpath",'.//div[#data-hveid="CBAQAA"]')
li_list = table.find_elements("xpath",'.//li[contains(#class="vJOb1e aIfcHf Hw13jc"]')
area_list = [li.find_element("xpath",'.//div[#class="mCBkyc y355M ynAwRc MBeuO nDgy9d"]') for li in li_list]
for a in area_list[:min(len(area_list), news_num-idx+1)]:
n = a.find_element("xpath",'.//div[#role="heading"]')
n_url = n.get_attribute('href')
try:
img = a.find_element(By.CSS_SELECTOR,'img#dimg_').find_element(By.CSS_SELECTOR, 'img')
img = img.get_attribute('src')
except:
img = " "
news_dict[idx] = {'Title' : n.get_attribute('title'),
'url' : n_url,
'thumbnail': img}
idx += 1
try:
next_btn = browser.find_element(By.CSS_SELECTOR, 'a#pnnext')
next_btn.click()
cur_page +=1
# pages = browser.find_element("xpath",'//div[#class="sc_page_inner"]')
# next_page_url = [p for p in pages.find_elements("xpath",'.//a') if p.text == str(cur_page)][0].get_attribute('href')
pages = browser.find_element("xpath",'//table[#class="fl"]')
next_page_url = [p for p in pages.find_elements("xpath",'.//a') if p.text == str(cur_page)][0].get_attribute('aria-lable')
browser.get(next_page_url)
time.sleep(sleep_sec)
except:
print('\n브라우저를 종료합니다.\n' + '=' * 100)
time.sleep(0.7)
browser.close()
break
########################################################여기까지 수정 완료################################################################
# 엑셀파일 추출
print('데이터프레임 변환\n')
news_df = DataFrame(news_dict).T
folder_path = os.getcwd()
xlsx_file_name = '{}_{}.xlsx'.format(query, yesterday)
news_df.to_excel(xlsx_file_name, index=False)
print('엑셀 저장 완료 | 경로 : {}\\{}\n'.format(folder_path, xlsx_file_name))
news_crawling()
this is my code. I use it on Korean website and it works well. But after I modified it for google search, it wouldn't work.
I want to search something on google and then get the news titles into a xlsx file.
I before used it in Korean website, so I changed the part below
table = browser.find_element("xpath",'.//div[#data-hveid="CBAQAA"]')
li_list = table.find_elements("xpath",'.//li[contains(#class="vJOb1e aIfcHf Hw13jc"]')
area_list = [li.find_element("xpath",'.//div[#class="mCBkyc y355M ynAwRc MBeuO nDgy9d"]') for li in li_list]
and when I run the code, it only gives me an empty xlsx file.
can anyone help with this please? I would be so appreciate.
Here is one possible solution:
from openpyxl import Workbook
from datetime import datetime
from datetime import timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
def get_url(query: str, min_date: str, max_date: str) -> str:
return f'https://www.google.com/search?q={query}&tbm=nws&source-news&tbs=cdr:1,cd_min:{min_date},cd_max:{max_date}'
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 5)
yesterday = (datetime.now() - timedelta(1)).strftime("%m.%d.%Y")
driver.get(get_url('spent nuclear fuel', yesterday, yesterday))
url_locator = (By.CSS_SELECTOR, '#rso a')
title_locator = (By.CSS_SELECTOR, 'a div[role="heading"]')
thumbnail_locator = (By.CSS_SELECTOR, '#rso a>div>div:first-child img')
workbook = Workbook()
worksheet = workbook.active
page = 1
while True:
print(f'Current page: {page}')
url_web_elements = wait.until(EC.visibility_of_all_elements_located(url_locator))
title_web_elements = wait.until(EC.presence_of_all_elements_located(title_locator))
thumbnail_web_elements = wait.until(EC.visibility_of_all_elements_located(thumbnail_locator))
titles = [title.text.replace(',', '.') for title in title_web_elements]
urls = [link.get_attribute('href') for link in url_web_elements]
thumbnails = [thumbnail.get_attribute('src') for thumbnail in thumbnail_web_elements]
for data in zip(titles, urls, thumbnails):
news = {
'title' : data[0],
'url' : data[1],
'thumbnail': data[2]
}
worksheet.append(list(news.values()))
try:
page += 1
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#pnnext'))).click()
except TimeoutException:
break
workbook.save(f'google_news_{yesterday}.xlsx')
driver.quit()
Output is xlsx file google_news_11.10.2022.xlsx
In the get_url function, you can pass a range of dates for which the news will be displayed. For example get_url('spent nuclear fuel', 01.11.2022, 11.11.2022)
You can also save data to csv using this solution:
import csv
from datetime import datetime
from datetime import timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
def get_url(query: str, min_date: str, max_date: str) -> str:
return f'https://www.google.com/search?q={query}&tbm=nws&source-news&tbs=cdr:1,cd_min:{min_date},cd_max:{max_date}'
def save_to_csv(data: list) -> None:
with open(file='google_news.csv', mode='a', encoding="utf-8") as f:
writer = csv.writer(f, lineterminator='\n')
writer.writerow([*data])
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 5)
yesterday = (datetime.now() - timedelta(1)).strftime("%m.%d.%Y")
driver.get(get_url('spent nuclear fuel', yesterday, yesterday))
url_locator = (By.CSS_SELECTOR, '#rso a')
title_locator = (By.CSS_SELECTOR, 'a div[role="heading"]')
thumbnail_locator = (By.CSS_SELECTOR, '#rso a>div>div:first-child img')
page = 1
while True:
print(f'Current page: {page}')
url_web_elements = wait.until(EC.visibility_of_all_elements_located(url_locator))
title_web_elements = wait.until(EC.presence_of_all_elements_located(title_locator))
thumbnail_web_elements = wait.until(EC.visibility_of_all_elements_located(thumbnail_locator))
titles = [title.text.replace(',', '.') for title in title_web_elements]
urls = [link.get_attribute('href') for link in url_web_elements]
thumbnails = [thumbnail.get_attribute('src') for thumbnail in thumbnail_web_elements]
for data in zip(titles, urls, thumbnails):
news = {
'title' : data[0],
'url' : data[1],
'thumbnail': data[2]
}
save_to_csv(news.values())
try:
page += 1
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#pnnext'))).click()
except TimeoutException:
break
driver.quit()
Output is csv file google_news.csv:
COP27: nuclear boss doesn't expect surge in waste recycling,https://news.yahoo.com/cop27-nuclear-boss-doesnt-expect-072631885.html,""
UN Nuclear Chief Says Recycling Nuclear Waste 'Difficult ...,https://www.theepochtimes.com/un-nuclear-chief-says-recycling-nuclear-waste-difficult-after-biden-looks-to-fund-reprocessing-projects_4855151.html,""
COP27: UN nuclear chief says radioactive waste recycling is 'difficult' technology,https://www.deccanherald.com/international/world-news-politics/cop27-un-nuclear-chief-says-radioactive-waste-recycling-is-difficult-technology-1161036.html,""
Tested on Python 3.9.10. Used Selenium 4.5.0, openpyxl 3.0.10

Data are overwrite how to solve that

With every iteration through the loop, the previously extracted data is overwritten. How can I solve this problem?
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
# url='https://www.amazon.com/dp/B00M0DWQYI?th=1'
# url='https://www.amazon.com/dp/B010RWD4GM?th=1'
PATH="C:\Program Files (x86)\chromedriver.exe"
driver =webdriver.Chrome(PATH)
df_urls = pd.read_csv('D:/selenium/inputs/amazone-asin.csv',encoding='utf-8')
list_dicts_urls =df_urls.to_dict('records')
item=dict()
product=[]
for url in list_dicts_urls:
product_url = 'https://' + url['MARKETPLACE'] + '/dp/' + url['ASIN']
driver.get(product_url)
try:
item['title'] = driver.find_element(By.CSS_SELECTOR,'span#productTitle').text
except:
item['title'] = ''
try:
item['brand'] = driver.find_element(By.CSS_SELECTOR,'a#bylineInfo').text.replace('Visit the','').replace('Store','').strip()
except:
item['brand'] = ''
try:
rating = driver.find_element(By.CSS_SELECTOR,'span#acrCustomerReviewText').text.replace('ratings','').strip()
rating = int(rating.replace(',', ''))
item['rating'] = rating
except:
item['rating'] = ''
time.sleep(2)
try:
p1=driver.find_element(By.XPATH, '//span[#class="a-price-whole"]').text
p2= driver.find_element(By.XPATH, '//span[#class="a-price-fraction"]').text
item['price']=p1+p2
except:
item['price']=''
product.append(item)
df=pd.DataFrame(product)
df.to_csv("ama.csv")
I think you need to define item=dict() inside the for loop. Otherwise this is the same, single item object used in all the loop iterations.
Try this:
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
# url='https://www.amazon.com/dp/B00M0DWQYI?th=1'
# url='https://www.amazon.com/dp/B010RWD4GM?th=1'
PATH="C:\Program Files (x86)\chromedriver.exe"
driver =webdriver.Chrome(PATH)
df_urls = pd.read_csv('D:/selenium/inputs/amazone-asin.csv',encoding='utf-8')
list_dicts_urls =df_urls.to_dict('records')
product=[]
for url in list_dicts_urls:
item=dict()
product_url = 'https://' + url['MARKETPLACE'] + '/dp/' + url['ASIN']
driver.get(product_url)
try:
item['title'] = driver.find_element(By.CSS_SELECTOR,'span#productTitle').text
except:
item['title'] = ''
try:
item['brand'] = driver.find_element(By.CSS_SELECTOR,'a#bylineInfo').text.replace('Visit the','').replace('Store','').strip()
except:
item['brand'] = ''
try:
rating = driver.find_element(By.CSS_SELECTOR,'span#acrCustomerReviewText').text.replace('ratings','').strip()
rating = int(rating.replace(',', ''))
item['rating'] = rating
except:
item['rating'] = ''
time.sleep(2)
try:
p1=driver.find_element(By.XPATH, '//span[#class="a-price-whole"]').text
p2= driver.find_element(By.XPATH, '//span[#class="a-price-fraction"]').text
item['price']=p1+p2
except:
item['price']=''
product.append(item)
df=pd.DataFrame(product)
df.to_csv("ama.csv")

Scrape only 1 page I want to scrape multiple pages with selenium

I am trying to scrape multiple pages with selenium but they will scrape only 1 page what mistake I will do is there any solution then provide us this is the page link https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters%5Brechtsgebieden%5D=%5B%5D&filters%5Bspecialisatie%5D=0&filters%5Btoevoegingen%5D=0&locatie%5Badres%5D=Holland&locatie%5Bgeo%5D%5Blat%5D=52.132633&locatie%5Bgeo%5D%5Blng%5D=5.291266&locatie%5Bstraal%5D=56&locatie%5Bhash%5D=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina=1
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
chrome_driver = webdriver.Chrome(
service=Service(ChromeDriverManager().install()),
options=options
)
def supplyvan_scraper():
with chrome_driver as driver:
driver.implicitly_wait(15)
URL = 'https://zoekeenadvocaat.advocatenorde.nl/zoeken?qvrtqca=&filters%5Brechtsgebieden%5D=%5B%5D&ypb=&locatie%5Badres%5D=Holland&locatie%5Bgeo%5D=%7B%22lat%22%3A%2252.132633%22%2C%22lng%22%3A%225.291266%22%7D&locatie%5Bstraal%5D=56&filters%5Bspecialisatie%5D=0&filters%5Btoevoegingen%5D=0&locatie%5Bhash%5D='
driver.get(URL)
time.sleep(3)
page=1
page_links = [element.get_attribute('href') for element in
driver.find_elements(By.XPATH, "//span[#class='h4 no-margin-bottom']//a")]
data=[]
for link in page_links:
wev={}
driver.get(link)
time.sleep(2)
try:
title = driver.find_element(By.CSS_SELECTOR, '.title h3').text
except:
pass
wev['title']=title
try:
advocaten=driver.find_element(By.CSS_SELECTOR,".secondary").text
except:
pass
wev['advocaten']=advocaten
details=driver.find_elements(By.XPATH,"//section[#class='lawyer-info']")
for detail in details:
try:
address=detail.find_element_by_xpath("//div[#class='column medium-6']").text.strip()
except:
pass
wev['address']=address
try:
email=detail.find_element(By.XPATH, "//div[#class='row'][3]//div[#class='column small-9']//a").get_attribute('href')
except:
pass
wev['email']=email
try:
website=detail.find_element(By.XPATH, "//div[#class='row'][4]//div[#class='column small-9']//a").get_attribute('href')
except:
pass
wev['website']=website
data.append(wev)
if len(driver.find_elements_by_xpath("//a[#class='button next']")) > 0:
url = "https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters%5Brechtsgebieden%5D=%5B%5D&filters%5Bspecialisatie%5D=0&filters%5Btoevoegingen%5D=0&locatie%5Badres%5D=Holland&locatie%5Bgeo%5D%5Blat%5D=52.132633&locatie%5Bgeo%5D%5Blng%5D=5.291266&locatie%5Bstraal%5D=56&locatie%5Bhash%5D=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina={}".format(page)
driver.get(url)
page += 1
if int(page)>5:
break
else:
break
df=pd.DataFrame(data)
print(df)
You can make the pagination in starting url using for loop as follows:
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
chrome_driver = webdriver.Chrome(
service=Service(ChromeDriverManager().install()),
options=options)
data=[]
def supplyvan_scraper():
with chrome_driver as driver:
driver.implicitly_wait(15)
URL = 'https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters%5Brechtsgebieden%5D=%5B%5D&filters%5Bspecialisatie%5D=0&filters%5Btoevoegingen%5D=0&locatie%5Badres%5D=Holland&locatie%5Bgeo%5D%5Blat%5D=52.132633&locatie%5Bgeo%5D%5Blng%5D=5.291266&locatie%5Bstraal%5D=56&locatie%5Bhash%5D=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina={page}'
for page in range(1,11):
driver.get(URL.format(page=page))
time.sleep(3)
page_links = [element.get_attribute('href') for element in driver.find_elements(By.XPATH, "//span[#class='h4 no-margin-bottom']//a")]
for link in page_links:
wev={}
driver.get(link)
time.sleep(2)
try:
title = driver.find_element(By.CSS_SELECTOR, '.title h3').text
except:
pass
wev['title']=title
try:
advocaten=driver.find_element(By.CSS_SELECTOR,".secondary").text
except:
pass
wev['advocaten']=advocaten
details=driver.find_elements(By.XPATH,"//section[#class='lawyer-info']")
for detail in details:
try:
address=detail.find_element_by_xpath("//div[#class='column medium-6']").text.strip()
except:
pass
wev['address']=address
try:
email=detail.find_element(By.XPATH, "//div[#class='row'][3]//div[#class='column small-9']//a").get_attribute('href')
except:
pass
wev['email']=email
try:
website=detail.find_element(By.XPATH, "//div[#class='row'][4]//div[#class='column small-9']//a").get_attribute('href')
except:
pass
wev['website']=website
data.append(wev)
df=pd.DataFrame(data)
print(df)
You also can try:
URL = 'https://zoekeenadvocaat.advocatenorde.nl/zoeken?q=&type=advocaten&limiet=10&sortering=afstand&filters%5Brechtsgebieden%5D=%5B%5D&filters%5Bspecialisatie%5D=0&filters%5Btoevoegingen%5D=0&locatie%5Badres%5D=Holland&locatie%5Bgeo%5D%5Blat%5D=52.132633&locatie%5Bgeo%5D%5Blng%5D=5.291266&locatie%5Bstraal%5D=56&locatie%5Bhash%5D=67eb2b8d0aab60ec69666532ff9527c9&weergave=lijst&pagina={page}'
for page in range(1,11):
url=URL.format(page=page)
driver.get(url)

dynamic values web scraping

Hello guys I've been trying to web scrape some pages that contain values that change all the time, but I'm not able to get the prices so far. Can anybody help me, this is where I reached so far!
import requests
import bs4
from urllib.request import Request, urlopen as uReq
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
my_url = 'https://www.cryptocompare.com/'
binary = FirefoxBinary('C:/Program Files/Mozilla Firefox/firefox.exe')
options = Options()
options.set_headless(headless=True)
options.binary = binary
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = True
driver = webdriver.Firefox(firefox_options=options, capabilities=cap, executable_path="C:/Users/Genti/AppData/Local/Programs/Python/Python36-32/Lib/site-packages/selenium/geckodriver.exe")
browser = webdriver.Firefox(firefox_binary=binary)
browser.get(my_url)
html = browser.execute_script("return document.documentElement.outerHTML")
sel_soup = soup(html, 'html.parser')
prices = sel_soup.findAll("td", {"class":"price"})
print(prices)
You can try below code to get currency names, prices
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import datetime
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
my_url = 'https://www.cryptocompare.com/'
binary = FirefoxBinary('C:/Program Files/Mozilla Firefox/firefox.exe')
options = Options()
options.set_headless(headless=True)
options.binary = binary
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = True
driver = webdriver.Firefox(firefox_options=options, capabilities=cap, executable_path="C:/Users/Genti/AppData/Local/Programs/Python/Python36-32/Lib/site-packages/selenium/geckodriver.exe")
driver.get(my_url)
names = [name.text.split('\n')[0] for name in WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'desktop-name')))]
prices = [price.text for price in WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'current-price-value')))]
print(datetime.datetime.now())
for name, price in zip(names, prices):
print(name + " - " + price)
In case, if you want all the 10 prices. you'd have to store all the prices in a list, like this :
all_prices = driver.find_elements_by_css_selector("td[class='price'] div")
then just iterate through a loop to get the values :
for price in all_prices:
print(price.text)
let me know, if you are facing any difficulties.
If you want to use BS and not Selenium Webdriver:
prices = sel_soup.select("td[class^='price'] > div")

Categories

Resources