With every iteration through the loop, the previously extracted data is overwritten. How can I solve this problem?
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
# Scrape title, brand, rating count and price for every ASIN listed in the
# input CSV and write the collected rows to ama.csv.
# url='https://www.amazon.com/dp/B00M0DWQYI?th=1'
# url='https://www.amazon.com/dp/B010RWD4GM?th=1'
# Raw string so the backslashes in the Windows path are never read as escapes.
PATH = r"C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH)
df_urls = pd.read_csv('D:/selenium/inputs/amazone-asin.csv', encoding='utf-8')
list_dicts_urls = df_urls.to_dict('records')
# NOTE: `item` is created once, outside the loop, so every pass mutates and
# re-appends the very same dict object. All entries of `product` therefore
# end up showing the values of the last product scraped -- this is the
# overwriting behaviour the question is about.
item = dict()
product = []
for url in list_dicts_urls:
    product_url = 'https://' + url['MARKETPLACE'] + '/dp/' + url['ASIN']
    driver.get(product_url)
    # Each field is best-effort: a missing element yields an empty string.
    try:
        item['title'] = driver.find_element(By.CSS_SELECTOR, 'span#productTitle').text
    except Exception:
        item['title'] = ''
    try:
        item['brand'] = driver.find_element(By.CSS_SELECTOR, 'a#bylineInfo').text.replace('Visit the', '').replace('Store', '').strip()
    except Exception:
        item['brand'] = ''
    try:
        rating = driver.find_element(By.CSS_SELECTOR, 'span#acrCustomerReviewText').text.replace('ratings', '').strip()
        rating = int(rating.replace(',', ''))
        item['rating'] = rating
    except Exception:
        item['rating'] = ''
    time.sleep(2)
    try:
        p1 = driver.find_element(By.XPATH, '//span[@class="a-price-whole"]').text
        p2 = driver.find_element(By.XPATH, '//span[@class="a-price-fraction"]').text
        item['price'] = p1 + p2
    except Exception:
        item['price'] = ''
    product.append(item)
df = pd.DataFrame(product)
df.to_csv("ama.csv")
You need to create item = dict() inside the for loop. Otherwise the same single item object is reused in every iteration, so each append adds another reference to that one dict and every row ends up with the values of the last product.
Try this:
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
# Scrape title, brand, rating count and price for every ASIN listed in the
# input CSV and write one row per product to ama.csv.
# url='https://www.amazon.com/dp/B00M0DWQYI?th=1'
# url='https://www.amazon.com/dp/B010RWD4GM?th=1'
# Raw string so the backslashes in the Windows path are never read as escapes.
PATH = r"C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH)
df_urls = pd.read_csv('D:/selenium/inputs/amazone-asin.csv', encoding='utf-8')
list_dicts_urls = df_urls.to_dict('records')
product = []
for url in list_dicts_urls:
    # A fresh dict per product, so earlier rows are not overwritten.
    item = dict()
    product_url = 'https://' + url['MARKETPLACE'] + '/dp/' + url['ASIN']
    driver.get(product_url)
    # Each field is best-effort: a missing element yields an empty string.
    try:
        item['title'] = driver.find_element(By.CSS_SELECTOR, 'span#productTitle').text
    except Exception:
        item['title'] = ''
    try:
        item['brand'] = driver.find_element(By.CSS_SELECTOR, 'a#bylineInfo').text.replace('Visit the', '').replace('Store', '').strip()
    except Exception:
        item['brand'] = ''
    try:
        rating = driver.find_element(By.CSS_SELECTOR, 'span#acrCustomerReviewText').text.replace('ratings', '').strip()
        item['rating'] = int(rating.replace(',', ''))
    except Exception:
        item['rating'] = ''
    time.sleep(2)
    try:
        p1 = driver.find_element(By.XPATH, '//span[@class="a-price-whole"]').text
        p2 = driver.find_element(By.XPATH, '//span[@class="a-price-fraction"]').text
        item['price'] = p1 + p2
    except Exception:
        item['price'] = ''
    product.append(item)
# Release the browser before writing the results.
driver.quit()
df = pd.DataFrame(product)
df.to_csv("ama.csv")
Related
I am trying to scrape data, but the data is overwritten and the CSV file ends up containing only the data from page 2. I think the for loop is overwriting the data. How can I fix this? Any suggestion is welcome. Thank you. This is the page link: https://www.askgamblers.com/online-casinos/countries/uk/
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from csv import writer
# Visit the askgamblers.com UK casino listing pages, open every linked review
# and store the payment details of each casino in casino.csv.
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
# NOTE: `options` is configured above but is not passed to webdriver.Chrome.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)
for page in range(1, 3):
    URL = 'https://www.askgamblers.com/online-casinos/countries/uk/{page}'.format(page=page)
    driver.get(URL)
    time.sleep(2)
    urls = []
    data = []
    page_links = driver.find_elements(By.XPATH, "//div[@class='card__desc']//a[starts-with(@href, '/online')]")
    for link in page_links:
        href = link.get_attribute("href")
        urls.append(href)
    # NOTE: `product` is reset for every listing page, so rows collected from
    # earlier pages are discarded -- this is the overwriting the question is
    # about.
    product = []
    for url in urls:
        wev = {}
        driver.get(url)
        time.sleep(1)
        try:
            title = driver.find_element(By.CSS_SELECTOR, "h1.review-intro__title").text
        except Exception:
            pass
        wev['Title'] = title
        soup = BeautifulSoup(driver.page_source, "lxml")
        pays = soup.select("div#tabPayments")
        for pay in pays:
            try:
                t1 = pay.select_one(".review-details-wrapper:nth-child(1) .review-details__item:nth-child(1) .review-details__text").get_text(' ', strip=True)
            except Exception:
                pass
            wev['deposit_method'] = t1
            try:
                t2 = pay.select_one(".review-details-wrapper:nth-child(1) .review-details__item+ .review-details__item .review-details__text").get_text(' ', strip=True)
            except Exception:
                pass
            wev['curriences'] = t2
            try:
                t3 = pay.select_one(" .review-details-wrapper+ .review-details-wrapper .review-details__item:nth-child(1) .review-details__text").get_text(' ', strip=True)
            except Exception:
                pass
            wev['with_drawl method'] = t3
            try:
                t4 = pay.select_one(" .review-details-wrapper+ .review-details-wrapper .review-details__item:nth-child(2) .review-details__text")
                t4 = [i.replace("\n", "") for i in t4 if i.text]
            except Exception:
                pass
            wev['with_drawl_time'] = t4
        product.append(wev)
    # NOTE(review): the original indentation was lost; the export is presumed
    # to run once per listing page, rewriting casino.csv each time.
    df = pd.DataFrame(product)
    df.to_csv('casino.csv')
To collect all results in one file, create the product list once, before the page loop, and write the CSV after the loop:
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from csv import writer
# Visit the askgamblers.com UK casino listing pages, open every linked review
# and store the payment details of all casinos in a single casino.csv.
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
# NOTE: `options` is configured above but is not passed to webdriver.Chrome.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)

# CSS selectors (relative to div#tabPayments) for the plain-text fields,
# keyed by the output column they fill.
payment_selectors = {
    'deposit_method': ".review-details-wrapper:nth-child(1) .review-details__item:nth-child(1) .review-details__text",
    'curriences': ".review-details-wrapper:nth-child(1) .review-details__item+ .review-details__item .review-details__text",
    'with_drawl method': " .review-details-wrapper+ .review-details-wrapper .review-details__item:nth-child(1) .review-details__text",
}
withdrawal_time_selector = " .review-details-wrapper+ .review-details-wrapper .review-details__item:nth-child(2) .review-details__text"

# Created once, before the page loop, so rows from every page accumulate.
product = []
for page in range(1, 4):
    URL = 'https://www.askgamblers.com/online-casinos/countries/uk/{page}'.format(page=page)
    driver.get(URL)
    time.sleep(2)
    page_links = driver.find_elements(By.XPATH, "//div[@class='card__desc']//a[starts-with(@href, '/online')]")
    # Read the hrefs up front: the elements go stale once we navigate away.
    urls = [link.get_attribute("href") for link in page_links]
    for url in urls:
        wev = {}
        driver.get(url)
        time.sleep(1)
        # A missing field becomes an empty value instead of silently reusing
        # the value scraped for the previous casino.
        try:
            wev['Title'] = driver.find_element(By.CSS_SELECTOR, "h1.review-intro__title").text
        except Exception:
            wev['Title'] = ''
        soup = BeautifulSoup(driver.page_source, "lxml")
        for pay in soup.select("div#tabPayments"):
            for column, selector in payment_selectors.items():
                node = pay.select_one(selector)
                wev[column] = node.get_text(' ', strip=True) if node is not None else ''
            time_node = pay.select_one(withdrawal_time_selector)
            try:
                wev['with_drawl_time'] = [i.replace("\n", "") for i in time_node if i.text]
            except (AttributeError, TypeError):
                # No such node, or a child that does not support replace().
                wev['with_drawl_time'] = []
        product.append(wev)
driver.quit()
# Written once, after all pages, so nothing is overwritten.
df = pd.DataFrame(product)
df.to_csv('casino.csv')
The first loop runs only two times, because range(1,3) yields [1, 2].
Change it to range(1,4), as below, and it will give you [1, 2, 3]:
for page in range(1,4):
The data is then overwritten because the output file name is the same for every page.
Give each page its own file name, as below:
df.to_csv(f'casino_{page}.csv')
I have a code that does the following.
Enter the website
Log in
Click on a link
Click on a date
and select an item from the dropdown.
I want it to run this search for the first dropdown option, which would be:
wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="ctl00_ctl00_Content_Content_ddlVagasTerminalEmpresa"]/option[1]'))).click()
add the result to my CSV file, and then do the same for option 2 and option 3. How can I do this?
Below is my code
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox import options
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import Select
import pandas as pd
import json
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
# Log in to the Ecopatio extranet, open the schedule for the requested day,
# pick one dropdown entry and append the links of the free slots to testa.csv.
# `webdriver` is used below but is not among the imports above this script.
from selenium import webdriver

options = Options()
options.headless = False
# Zero-pad the day to two characters, e.g. "5" -> "05".
dia = '{:0>2}'.format(input("Qual o dia do agendamento ? = "))
navegador = webdriver.Firefox(options=options)
wait = WebDriverWait(navegador, 30)
link = 'https://extranet.ecopatio.com.br/'
navegador.get(url=link)
# CSS selector for the calendar link of the chosen day (month is hard-coded).
inicio_str_dia = "a[title='"
final_str_dia = " de dezembro']"
diadoagenda = (inicio_str_dia + dia + final_str_dia)
wait.until(EC.element_to_be_clickable((By.ID, "ctl00_ctl00_Content_Content_txtLogin"))).send_keys('*********')
wait.until(EC.element_to_be_clickable((By.ID, "ctl00_ctl00_Content_Content_txtSenha"))).send_keys('*********')
wait.until(EC.element_to_be_clickable((By.ID, "ctl00_ctl00_Content_Content_btnEnviar"))).click()
wait.until(EC.element_to_be_clickable((By.ID, "ctl00_ctl00_Content_Content_TreeView2t8"))).click()
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, diadoagenda))).click()
wait.until(EC.element_to_be_clickable((By.ID, "ctl00_ctl00_Content_Content_ddlVagasTerminalEmpresa"))).click()
wait.until(EC.element_to_be_clickable((By.ID, "ctl00_ctl00_Content_Content_ddlVagasTerminalEmpresa"))).click()
# Only the third dropdown entry is selected here.
wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="ctl00_ctl00_Content_Content_ddlVagasTerminalEmpresa"]/option[3]'))).click()
wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="divScroll"]')))
sleep(3)
teste = navegador.find_element(By.XPATH, '//*[@id="divScroll"]').get_attribute('innerHTML')
soup = BeautifulSoup(teste, "html.parser")
Vagas = soup.find_all(title="Vaga disponível.")
# Collect the onclick handler of every free slot.
temp = [vaga.get('onclick') for vaga in Vagas]
texto = str(temp)
# Strip the HTML-escaped ampersand residue and turn the relative path into an
# absolute URL.
b = {'amp;': '', 'Cadastro': 'https://extranet.ecopatio.com.br/agendamento/Cadastro'}
for x, y in b.items():
    texto = texto.replace(x, y)
# Every sixth quote-delimited piece, starting at the second, is kept.
achado2 = texto.split('\'')[1::6]
#achado2_series = pd.Series(achado2)
df = pd.DataFrame(achado2)
df.to_csv('testa.csv', mode='a', header=False, index=False)
# 뉴스 크롤링.py
#######################################'사용후핵연료' 키워드 검색##################################################
import sys, os
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import selenium
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime, timedelta
from pandas import DataFrame
import time
from openpyxl.workbook import Workbook
sleep_sec = 0.5
wb = Workbook()
# User-Agent를 입력해주세요.
headers = {'User-Agent' : '________________'}
query = 'spent nuclear fuel'
yesterday = (datetime.today() - timedelta(1)).strftime("%Y.%m.%d")
def news_crawling():
    """Collect Google News results for the module-level ``query`` into an xlsx file.

    Opens Chrome, walks the result pages, stores title, url and thumbnail of
    each entry in a dict keyed by a running index, and finally writes the
    dict to ``'{query}_{yesterday}.xlsx'`` in the current working directory.
    Reads the module-level names ``query``, ``yesterday`` and ``sleep_sec``.
    """
    service = Service(executable_path=ChromeDriverManager().install())
    browser = webdriver.Chrome(service=service)
    print('브라우저를 실행시킵니다(자동 제어)\n')
    news_url = 'https://www.google.com/search?q={0}&tbm=nws&source-news]'.format(query, yesterday)
    browser.get(news_url)
    time.sleep(sleep_sec)
    print('\n크롤링을 시작합니다.')
    # Crawl while moving through the result pages under browser control.
    news_dict = {}
    idx = 1
    cur_page = 1
    news_num = 1000000
    while True:
        table = browser.find_element("xpath", './/div[@data-hveid="CBAQAA"]')
        # NOTE: the `contains(` call in this expression is never closed, so it
        # is not a well-formed XPath; it is kept exactly as the question
        # posted it.
        li_list = table.find_elements("xpath", './/li[contains(@class="vJOb1e aIfcHf Hw13jc"]')
        area_list = [li.find_element("xpath", './/div[@class="mCBkyc y355M ynAwRc MBeuO nDgy9d"]') for li in li_list]
        # Never collect more than `news_num` entries in total.
        for a in area_list[:min(len(area_list), news_num - idx + 1)]:
            n = a.find_element("xpath", './/div[@role="heading"]')
            n_url = n.get_attribute('href')
            try:
                img = a.find_element(By.CSS_SELECTOR, 'img#dimg_').find_element(By.CSS_SELECTOR, 'img')
                img = img.get_attribute('src')
            except Exception:
                img = " "
            news_dict[idx] = {'Title': n.get_attribute('title'),
                              'url': n_url,
                              'thumbnail': img}
            idx += 1
        try:
            next_btn = browser.find_element(By.CSS_SELECTOR, 'a#pnnext')
            next_btn.click()
            cur_page += 1
            # pages = browser.find_element("xpath",'//div[@class="sc_page_inner"]')
            # next_page_url = [p for p in pages.find_elements("xpath",'.//a') if p.text == str(cur_page)][0].get_attribute('href')
            pages = browser.find_element("xpath", '//table[@class="fl"]')
            next_page_url = [p for p in pages.find_elements("xpath", './/a') if p.text == str(cur_page)][0].get_attribute('aria-lable')
            browser.get(next_page_url)
            time.sleep(sleep_sec)
        except Exception:
            # Any failure to advance is treated as the end of the results.
            print('\n브라우저를 종료합니다.\n' + '=' * 100)
            time.sleep(0.7)
            browser.close()
            break
    # ---- modified up to this point ----
    # Export to an Excel file.
    print('데이터프레임 변환\n')
    news_df = DataFrame(news_dict).T
    folder_path = os.getcwd()
    xlsx_file_name = '{}_{}.xlsx'.format(query, yesterday)
    news_df.to_excel(xlsx_file_name, index=False)
    print('엑셀 저장 완료 | 경로 : {}\\{}\n'.format(folder_path, xlsx_file_name))
news_crawling()
This is my code. I used it on a Korean website and it worked well, but after I modified it for Google search it no longer works.
I want to search for something on Google and then save the news titles into an xlsx file.
Since it was written for the Korean website, I changed the part below:
table = browser.find_element("xpath",'.//div[@data-hveid="CBAQAA"]')
li_list = table.find_elements("xpath",'.//li[contains(@class="vJOb1e aIfcHf Hw13jc"]')
area_list = [li.find_element("xpath",'.//div[@class="mCBkyc y355M ynAwRc MBeuO nDgy9d"]') for li in li_list]
When I run the code, it only gives me an empty xlsx file.
Can anyone help with this, please? I would really appreciate it.
Here is one possible solution:
from openpyxl import Workbook
from datetime import datetime
from datetime import timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
def get_url(query: str, min_date: str, max_date: str) -> str:
    """Build the Google News search URL for *query* limited to a date range.

    The query is URL-encoded so that spaces and reserved characters such as
    ``&`` or ``#`` cannot break the query string. ``min_date`` and
    ``max_date`` are inserted unchanged into the ``cd_min``/``cd_max``
    fields of the ``tbs`` parameter.
    """
    from urllib.parse import quote_plus

    return (
        f'https://www.google.com/search?q={quote_plus(query)}&tbm=nws&source-news'
        f'&tbs=cdr:1,cd_min:{min_date},cd_max:{max_date}'
    )
# Walk through yesterday's Google News results for the query and store
# title, url and thumbnail of every entry in an xlsx workbook.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 5)
yesterday = (datetime.now() - timedelta(1)).strftime("%m.%d.%Y")
url_locator = (By.CSS_SELECTOR, '#rso a')
title_locator = (By.CSS_SELECTOR, 'a div[role="heading"]')
thumbnail_locator = (By.CSS_SELECTOR, '#rso a>div>div:first-child img')
workbook = Workbook()
worksheet = workbook.active
page = 1
try:
    driver.get(get_url('spent nuclear fuel', yesterday, yesterday))
    while True:
        print(f'Current page: {page}')
        url_web_elements = wait.until(EC.visibility_of_all_elements_located(url_locator))
        title_web_elements = wait.until(EC.presence_of_all_elements_located(title_locator))
        thumbnail_web_elements = wait.until(EC.visibility_of_all_elements_located(thumbnail_locator))
        titles = [title.text.replace(',', '.') for title in title_web_elements]
        urls = [link.get_attribute('href') for link in url_web_elements]
        thumbnails = [thumbnail.get_attribute('src') for thumbnail in thumbnail_web_elements]
        # One worksheet row per result: title, url, thumbnail.
        for title, url, thumbnail in zip(titles, urls, thumbnails):
            worksheet.append([title, url, thumbnail])
        try:
            wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#pnnext'))).click()
        except TimeoutException:
            # No "next" link appeared: this was the last page.
            break
        page += 1
finally:
    # Keep whatever was collected and always release the browser, even if a
    # wait above timed out.
    workbook.save(f'google_news_{yesterday}.xlsx')
    driver.quit()
Output is xlsx file google_news_11.10.2022.xlsx
In the get_url function, you can pass a range of dates for which the news will be displayed. The dates are passed as strings, for example get_url('spent nuclear fuel', '01.11.2022', '11.11.2022')
You can also save data to csv using this solution:
import csv
from datetime import datetime
from datetime import timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
def get_url(query: str, min_date: str, max_date: str) -> str:
    """Build the Google News search URL for *query* limited to a date range.

    The query is URL-encoded so that spaces and reserved characters such as
    ``&`` or ``#`` cannot break the query string. ``min_date`` and
    ``max_date`` are inserted unchanged into the ``cd_min``/``cd_max``
    fields of the ``tbs`` parameter.
    """
    from urllib.parse import quote_plus

    return (
        f'https://www.google.com/search?q={quote_plus(query)}&tbm=nws&source-news'
        f'&tbs=cdr:1,cd_min:{min_date},cd_max:{max_date}'
    )
def save_to_csv(data: list) -> None:
    """Append *data* as one row to ``google_news.csv`` in the working directory.

    *data* may be any iterable of field values (the caller passes a dict
    values view). The file is opened with ``newline=''`` as the csv module
    requires, so the writer alone controls line endings.
    """
    with open(file='google_news.csv', mode='a', encoding="utf-8", newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(list(data))
# Walk through yesterday's Google News results for the query and append
# title, url and thumbnail of every entry to google_news.csv.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
service = Service(executable_path="path/to/your/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 5)
yesterday = (datetime.now() - timedelta(1)).strftime("%m.%d.%Y")
url_locator = (By.CSS_SELECTOR, '#rso a')
title_locator = (By.CSS_SELECTOR, 'a div[role="heading"]')
thumbnail_locator = (By.CSS_SELECTOR, '#rso a>div>div:first-child img')
page = 1
try:
    driver.get(get_url('spent nuclear fuel', yesterday, yesterday))
    while True:
        print(f'Current page: {page}')
        url_web_elements = wait.until(EC.visibility_of_all_elements_located(url_locator))
        title_web_elements = wait.until(EC.presence_of_all_elements_located(title_locator))
        thumbnail_web_elements = wait.until(EC.visibility_of_all_elements_located(thumbnail_locator))
        titles = [title.text.replace(',', '.') for title in title_web_elements]
        urls = [link.get_attribute('href') for link in url_web_elements]
        thumbnails = [thumbnail.get_attribute('src') for thumbnail in thumbnail_web_elements]
        # One CSV row per result: title, url, thumbnail.
        for title, url, thumbnail in zip(titles, urls, thumbnails):
            save_to_csv([title, url, thumbnail])
        try:
            wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '#pnnext'))).click()
        except TimeoutException:
            # No "next" link appeared: this was the last page.
            break
        page += 1
finally:
    # Always release the browser, even if a wait above timed out.
    driver.quit()
Output is csv file google_news.csv:
COP27: nuclear boss doesn't expect surge in waste recycling,https://news.yahoo.com/cop27-nuclear-boss-doesnt-expect-072631885.html,""
UN Nuclear Chief Says Recycling Nuclear Waste 'Difficult ...,https://www.theepochtimes.com/un-nuclear-chief-says-recycling-nuclear-waste-difficult-after-biden-looks-to-fund-reprocessing-projects_4855151.html,""
COP27: UN nuclear chief says radioactive waste recycling is 'difficult' technology,https://www.deccanherald.com/international/world-news-politics/cop27-un-nuclear-chief-says-radioactive-waste-recycling-is-difficult-technology-1161036.html,""
Tested on Python 3.9.10. Used Selenium 4.5.0, openpyxl 3.0.10
I have a scraper to get movies from IMDB, it seems to work fine, but in some cases it gets the wrong data. And that doesn't always happen, but for example, it opens the page of the movie and takes the image of that movie, then the window is closed and a new one opens with another movie, but it ends up saving the image of the previous movie and in some cases does not get all the data. But that doesn't always happen, so I have no idea what might be happening. Can anyone give me an idea of what could be happening?
My code is this:
import re
from time import sleep
import csv
import sqlite3
import pickle
from turtle import title
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from os.path import exists
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from requests_html import HTML
import pandas
from slugify import slugify
da=[]
def _first_text(driver, xpath, default=''):
    """Return the text of the first element matching *xpath*, or *default*."""
    try:
        return driver.find_element(By.XPATH, xpath).text
    except Exception:
        # Every field is best-effort: a missing element is not an error.
        return default


def _all_texts(driver, *xpaths):
    """Return the texts of all elements matching the XPaths, in order; '' on error."""
    try:
        return [element.text
                for xpath in xpaths
                for element in driver.find_elements(By.XPATH, xpath)]
    except Exception:
        return ''


def _credit_xpath(label):
    """Return the XPath of the principal-credit entries listed under *label*."""
    return ('//*[@data-testid="title-pc-wide-screen"]'
            '//*[@data-testid="title-pc-principal-credit"]'
            '//*[contains(text(),"' + label + '")]/parent::li/div/ul/li')


def parse_details(driver,asin,data_id):
    """Scrape the title page currently open in *driver* and record one row.

    The row is appended to the module-level list ``da``; afterwards the
    complete list is rewritten to ``f'{file_name}.csv'`` (``file_name`` is a
    module-level name), printed, and the current browser window is closed.

    Args:
        driver: Selenium driver focused on the title's window.
        asin: Not used by this function.
        data_id: Identifier stored in the ``id`` column.
    """
    sleep(2)
    nulo = 'NULL'
    titulo_filme = _first_text(driver, '//h1[@data-testid="hero-title-block__title"]', default=' ')
    # Always derived, so the row can be built even when the title is missing.
    slug_titulo = slugify(titulo_filme)
    nota_imdb = _first_text(driver, '//div[@data-testid="hero-rating-bar__aggregate-rating__score"]/span[1]', default=' ')
    # find_elements returns an empty list instead of raising, so the fallback
    # location has to be tried explicitly when nothing was found.
    genero = _all_texts(driver, '//*[@data-testid="genres"]/div/a')
    if not genero:
        genero = _all_texts(driver, '//*[@data-testid="storyline-genres"]/div/ul/li/a')
    data_lancamento = _first_text(driver, '//*[@data-testid="title-details-releasedate"]/div')
    tempo_duracao = _first_text(driver, '//*[@data-testid="title-techspec_runtime"]/div')
    idioma = _all_texts(driver, '//*[@data-testid="title-details-languages"]/div/*/li/a')
    empresa_produtora = _all_texts(driver, '//*[@data-testid="title-details-companies"]/div/*/li/a')
    tbm_conhecido_como = _first_text(driver, '//*[@data-testid="title-details-akas"]/div')
    pais_origem = _all_texts(driver, '//*[@data-testid="title-details-origin"]/div/*/li/a')
    codec_som = _all_texts(driver, '//*[@data-testid="title-techspec_soundmix"]/div/*/li/a')
    elenco_principal = _all_texts(driver, '//*[@data-testid="title-cast-item__actor"]')
    diretor = _all_texts(driver, _credit_xpath('Direção'), _credit_xpath('Criação'))
    roteiristas = _all_texts(driver, _credit_xpath('Roteiristas'), _credit_xpath('Roteirista'))
    artistas_principal = _all_texts(driver, _credit_xpath('Artistas'))
    try:
        classif_indicativa = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//*[@data-testid='storyline-certificate']/div/ul/li"))).text
    except Exception:
        classif_indicativa = ''
    slogan = _first_text(driver, '//*[@data-testid="storyline-taglines"]/div')
    try:
        img = WebDriverWait(driver, 2).until(EC.presence_of_element_located((By.XPATH, "//img[@class='ipc-image']"))).get_attribute('srcset')
    except Exception:
        img = 'testeeee.jpg'
    try:
        des = driver.find_element(By.XPATH, '//meta[@name="description"]').get_attribute('content')
    except Exception:
        des = ''
    # Anything not labelled as a series or TV special is recorded as 'Filme'.
    tipo = _first_text(
        driver,
        '//*[@data-testid="hero-title-block__metadata"]//*[contains(text(),"Minissérie")] | //*[@data-testid="hero-title-block__metadata"]//*[contains(text(),"Série de TV")] | //*[@data-testid="hero-title-block__metadata"]//*[contains(text(),"Especial de TV")]',
        default='Filme')
    try:
        prime_video_link = driver.find_element(By.XPATH, '//*[@data-testid="tm-box-pwo-btn"]//div[contains(text(),"Prime Video")]/ancestor::a').get_attribute('href')
    except Exception:
        # No direct button: open the "more options" dialog and look there.
        try:
            driver.find_element(By.XPATH, '//*[@data-testid="tm-box-mwo-btn"]//*[contains(text(),"Mais opções")]').click()
            sleep(1)
            prime_video_link = driver.find_element(By.XPATH, '//*[@data-testid="promptable"]//*[@data-focus-lock-disabled="false"]//*[contains(text(),"RENT/BUY")]/parent::div/ul/a').get_attribute('href')
        except Exception:
            prime_video_link = ''
    da.append([nulo, data_id, titulo_filme, slug_titulo, tipo, nota_imdb, prime_video_link, data_lancamento, img, des, genero, tempo_duracao, idioma, empresa_produtora, tbm_conhecido_como, pais_origem, codec_som, elenco_principal, diretor, roteiristas, artistas_principal, classif_indicativa, slogan])
    df = pandas.DataFrame(da, columns=['nulo','id','titulo_filme', 'slug_titulo', 'tipo', 'nota_imdb', 'prime_video_link', 'data_lancamento', 'imagem','descricao', 'genero', 'tempo_duracao', 'idioma', 'empresa_produtora', 'tbm_conhecido_como', 'pais_origem', 'codec_som', 'elenco_principal', 'diretor', 'roteiristas', 'artistas_principal', 'classif_indicativa', 'slogan'])
    # The whole file is rewritten after every title, so progress survives a crash.
    df.to_csv(f'{file_name}.csv', index=False)
    print(da)
    driver.close()
def collecting_links(linkss):
    """Open every link of *linkss* in a new window and scrape it.

    Uses the module-level ``driver``. For each anchor element the href is
    opened in a new window, handed to ``parse_details`` (which closes that
    window), and focus is returned to the first window.

    Args:
        linkss: Iterable of anchor WebElements from the listing page.
    """
    for link in linkss:
        id_link = link.get_attribute('href')
        #asi = id_link.split("/")[5]
        # The second-to-last path segment of the href is used as the row id.
        dat_id = id_link.split('/')[-2]
        # Pass the URL as a script argument rather than splicing it into the
        # JavaScript source, so quotes in the URL cannot break the script.
        driver.execute_script("window.open(arguments[0])", id_link)
        driver.switch_to.window(driver.window_handles[-1])
        try:
            parse_details(driver, asin='asi', data_id=dat_id)
        finally:
            # Return to the listing window even if parsing failed.
            driver.switch_to.window(driver.window_handles[0])
def main(driver):
    """Scrape every title of the listing at ``input_url``, page by page.

    Loads the module-level ``input_url``, processes the title links of the
    current page with ``collecting_links`` and follows the 'Next »' link
    until it can no longer be clicked.

    Args:
        driver: Selenium driver used for navigation.
    """
    driver.get(input_url)
    #change_location()
    while True:
        links = driver.find_elements(By.XPATH, '//h3[@class="lister-item-header"]/a')
        collecting_links(linkss=links)
        try:
            driver.find_element(By.LINK_TEXT, 'Next »').click()
        except Exception:
            # No usable "next" link: the last page has been processed.
            break
    print("Finalizado ---- Links:")
    try:
        print(driver.find_element(By.LINK_TEXT, 'Previous').get_attribute('href'))
    except NoSuchElementException:
        # No 'Previous' link on the final page; nothing to report.
        pass
if __name__ == "__main__":
    # Ask for the listing URL and the output name, log the request, then run
    # the scraper in a maximized Chrome window.
    print("Starting Scraper")
    daa = []
    input_url = input('Enter your url: ')
    file_name = input('Enter your file name: ')
    daa.append([input_url, str(file_name) + '.csv'])
    # Append the (url, output file) pair to a running log of scraped URLs.
    pandas.DataFrame(daa, columns=['', '']).to_csv('scraped_url.txt', index=False, header=False, mode='a')
    # Install the driver once and hand it to Chrome through the Service.
    s = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=s)
    driver.maximize_window()
    try:
        main(driver)
        sleep(2)
    finally:
        # Always shut the browser down, even when scraping fails.
        driver.quit()
I am working on a custom web scraper for e-commerce sites. I want it to scrape the names and prices of the listings on a site and then export them to CSV. The problem is that it exports only one (name, price) pair and repeats it on every line of the CSV. I couldn't find a good solution for this. I hope I'm not asking an extremely stupid question; I think the fix is easy. I hope someone will read my code and help me, thank you!
###imports
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import pandas as pd
# Search pc.bazos.sk for "xeon" up to a price of 300, print the title and
# price of every listing on three result pages and export to bobo.csv.
#driver path
# Raw string so the backslashes in the Windows path are never read as escapes.
driver = webdriver.Firefox(executable_path=r"D:\Programy\geckoDriver\geckodriver.exe")
#init + search
driver.get("https://pc.bazos.sk/pc/")
time.sleep(1)
nazov = driver.find_element_by_name("hledat")
nazov.send_keys("xeon")
cenamin = driver.find_element_by_name("cenaod")
cenamin.send_keys("")
cenamax = driver.find_element_by_name("cenado")
cenamax.send_keys("300")
driver.find_element_by_name("Submit").click()
##cookie acceptor
driver.find_element_by_xpath("/html/body/div[1]/button").click()
##main
x = 3
for i in range(x):
    try:
        main = WebDriverWait(driver, 7).until(
            EC.presence_of_element_located((By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]"))
        )
        ##find listings in table
        inzeraty = main.find_elements_by_class_name("vypis")
        for vypis in inzeraty:
            nadpis = vypis.find_element_by_class_name("nadpis")
            ##print listings to check correctness
            nadpist = nadpis.text
            print(nadpist)
        ##find the price and print
        for vypis in inzeraty:
            cena = vypis.find_element_by_class_name("cena")
            cenat = cena.text
            print(cenat)
        ##export to csv - not working
        # NOTE: after the loops above `nadpist` and `cenat` hold only the last
        # listing of the page, so the frame below repeats that single pair on
        # every row -- this is the problem the question describes.
        time.sleep(1)
        print("Writing to csv")
        d = {"Nazov": [nadpist]*20*x,"Cena": [cenat]*20*x}
        df = pd.DataFrame(data=d)
        df.to_csv("bobo.csv")
        time.sleep(1)
        print("Writing to csv done !")
        ##next page
        dalsia = driver.find_element_by_link_text("Ďalšia")
        dalsia.click()
    except Exception:
        driver.quit()
i want the csv to look like:
name,price
name2, price2
It would be great if the CSV had only two columns and x rows, depending on the number of listings.
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
# Search pc.bazos.sk for "xeon" up to a price of 300 and export the title and
# price of every listing on up to three result pages to bobo.csv.
#driver path
driver = webdriver.Chrome()
#init + search
driver.get("https://pc.bazos.sk/pc/")
time.sleep(1)
driver.find_element(By.NAME, "hledat").send_keys("xeon")
driver.find_element(By.NAME, "cenaod").send_keys("")
driver.find_element(By.NAME, "cenado").send_keys("300")
driver.find_element(By.NAME, "Submit").click()
##cookie acceptor
# Wait (up to 10 s) until the consent button can be clicked instead of
# sleeping for a fixed time.
WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "/html/body/div[1]/button"))).click()
##main
x = 3
# One dict per listing, accumulated across all pages.
d = []
for i in range(x):
    try:
        main = WebDriverWait(driver, 7).until(
            EC.presence_of_element_located(
                (By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]")))
        ##find listings in table
        for vypis in main.find_elements(By.CLASS_NAME, "vypis"):
            d.append({"Nazov": vypis.find_element(By.CLASS_NAME, "nadpis").text,
                      "Cena": vypis.find_element(By.CLASS_NAME, "cena").text
                      })
        ##next page
        driver.find_element(By.LINK_TEXT, "Ďalšia").click()
    except Exception:
        # No further page (or the page failed to load): stop paging and keep
        # what has been collected so far.
        break
driver.quit()
print("Writing to csv")
df = pd.DataFrame(data=d)
df.to_csv("bobo.csv", index=False)
This gives me 59 items with prices. Each listing is first stored in a dict, the dicts are collected in a list, and that list is then passed to pandas.
All you need to do is create two empty lists, nadpist_l and cenat_l, append the data to those lists, and finally save the lists as a dataframe.
UPDATED as per the comment
Check if this works
###imports
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
# Search pc.bazos.sk for "xeon" up to a price of 300 and export the title and
# price of every listing on up to three result pages to bobo.csv.
#driver path
driver = webdriver.Chrome()
#init + search
driver.get("https://pc.bazos.sk/pc/")
time.sleep(1)
driver.find_element(By.NAME, "hledat").send_keys("xeon")
driver.find_element(By.NAME, "cenaod").send_keys("")
driver.find_element(By.NAME, "cenado").send_keys("300")
driver.find_element(By.NAME, "Submit").click()
##cookie acceptor
time.sleep(10)
driver.find_element(By.XPATH, "/html/body/div[1]/button").click()
##main
x = 3
# Both lists live outside the page loop so they grow across pages; entry i of
# each list belongs to the same listing.
nadpist_l = []
cenat_l = []
for i in range(x):
    try:
        main = WebDriverWait(driver, 7).until(
            EC.presence_of_element_located(
                (By.XPATH, "/html/body/div[1]/table/tbody/tr/td[2]")))
        ##find listings in table
        inzeraty = main.find_elements(By.CLASS_NAME, "vypis")
        for vypis in inzeraty:
            # Read title and price together so the two lists stay aligned.
            nadpist = vypis.find_element(By.CLASS_NAME, "nadpis").text
            cenat = vypis.find_element(By.CLASS_NAME, "cena").text
            nadpist_l.append(nadpist)
            cenat_l.append(cenat)
        print(len(cenat_l))
        ##next page
        dalsia = driver.find_element(By.LINK_TEXT, "Ďalšia")
        dalsia.click()
    except Exception:
        # No further page (or the page failed to load): stop paging and keep
        # what has been collected so far.
        break
driver.quit()
time.sleep(1)
print("Writing to csv")
# One row per listing, two columns.
d = {"Nazov": nadpist_l, "Cena": cenat_l}
df = pd.DataFrame(data=d)
df.to_csv("bobo.csv")
time.sleep(1)
print("Writing to csv done !")