Web scraping with Python Selenium: loop and save problem

Hi, I want to save the data I scraped as csv and txt, but I couldn't. Also, how can I repeat this process multiple times?

The problem lines:

nextInput = driver.find_element("xpath", '//*[@id="pnnext"]/span[2]').click()
result = driver.find_elements(By.CSS_SELECTOR, ".GyAeWb cite.iUh30")

Code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import pandas as pd
import time
import csv
import re

driver = webdriver.Chrome()
url = "http://google.com"
driver.get(url)

searchInput = driver.find_element("xpath", '/html/body/div[1]/div[3]/form/div[1]/div[1]/div[1]/div/div[2]/input')
time.sleep(1)
searchInput.send_keys("dişçi")
time.sleep(2)
searchInput.send_keys(Keys.ENTER)
time.sleep(2)

result = driver.page_source
result = driver.find_elements(By.CSS_SELECTOR, ".GyAeWb cite.iUh30")
for index, element in enumerate(result):
    print(index + 1, element.text)

result = []
result = list(set(result))
time.sleep(2)

nextInput = driver.find_element("xpath", '//*[@id="pnnext"]/span[2]').click()
result = driver.find_elements(By.CSS_SELECTOR, ".GyAeWb cite.iUh30")
for index, element in enumerate(result):
    print(index + 1, element.text)

count = 1
with open("siteler.txt", "w", encoding="UTF-8") as file:
    for item in result:
        file.write(f"{count}-{item}\n")
        count += 1

driver.close()

try:
    while 1:
        nextInput = driver.find_element("xpath", '//*[@id="pnnext"]/span[2]').click()
        result = driver.find_elements(By.CSS_SELECTOR, ".GyAeWb cite.iUh30")
        for index, element in enumerate(result):
            print(index + 1, element.text)
        count = 1
        with open("siteler.txt", "w", encoding="UTF-8") as file:
            for item in result:
                file.write(f"{count}-{item}\n")
                count += 1
except Exception as e:
    print(e)
finally:
    print("there is no element with '//*[@id='pnnext']/span[2]' XPATH")

Related

Scraped data is not saving to csv file as it keeps returning a blank csv file

My scraper calls the website, hits each of the 44 pages, and creates a csv file, but the csv file is empty. I return from each of the functions and save the data to a csv at the end of the scraper.
Can anyone see what is wrong with my code?
Code:
import pandas, requests, bs4, time
from seleniumwire import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import datetime

TODAY = datetime.datetime.today().strftime("%Y%m%d")
SAVE_FILENAME = "/Users/180284/jupyter-1.0.0/pssi_jobs-" + TODAY + ".csv"

driver = webdriver.Chrome('~/Desktop/chromedriver_mac64')
driver.implicitly_wait(30)

URL_BASE = "https://jobs.pssi.com/us/en/search-resultskeywords=%22food%20safety%20team%20member%22&s=1"
MAX_PAGE = 44
HEADERS = {
    'From': 'myemail'
}

def interceptor(request):
    del request.headers['From']
    request.headers['From'] = HEADERS["From"]

driver.request_interceptor = interceptor

def parse_job_post_div(div_html):
    soup = bs4.BeautifulSoup(div_html)
    job_ls = soup.findAll("div", {"class": "information"})
    job_data = []
    for job in job_ls:
        job_listing = job.find("div", {"class": "information"}).get_text(separator=", ").strip()
        title = job.find("span", {"role": "heading"}).get_text(separator=", ").strip()
        job_location = job.find("p", {"class": "job-info"}).get_text(separator=", ").strip()
        new_row = {"job_listing": job, "title": title, "job_location": job_location}
        job_data.append(new_row)
    return job_data

def get_data(wd):
    job_postings = driver.find_element(By.CLASS_NAME, "information")
    html = job_postings.get_attribute("innerHTML")
    parsed = parse_job_post_div(html)
    return pandas.DataFrame(parsed)

def process_page(url):
    driver.get(url)
    master_data = []
    i = 0
    while True:
        df = get_data(driver)
        master_data.append(df)
        if i == (MAX_PAGE - 1):
            break
        driver.find_element(By.XPATH, "//span[@class='icon icon-arrow-right']").click()
        time.sleep(10)
        print(i)
        i += 1
    return pandas.concat(master_data, ignore_index=True)

data = process_page(URL_BASE)
data.to_csv(SAVE_FILENAME)
I have tried the above code.
The first problem I found in your code is that job_ls is an empty list, i.e. soup.findAll("div",{"class":"information"}) doesn't find anything.
Moreover, job_postings contains only one webelement (i.e. the first job of the list) instead of all 10 jobs shown on the page, because you used .find_element instead of .find_elements. As a result of these and other problems, process_page(URL_BASE) returns an empty dataframe.
In this case you can speed up the process and use less code by using selenium directly instead of bs4:
driver.get(URL_BASE)
driver.implicitly_wait(30)

MAX_PAGE = 4
titles, locations, descriptions = [], [], []

for i in range(MAX_PAGE):
    print('current page:', i + 1, end='\r')
    titles += [title.text for title in driver.find_elements(By.CSS_SELECTOR, '.information > span[role=heading]')]
    locations += [loc.text.replace('\n', ', ') for loc in driver.find_elements(By.CSS_SELECTOR, '.information > p[class=job-info]')]
    descriptions += [title.text for title in driver.find_elements(By.CSS_SELECTOR, '.information > p[data-ph-at-id=jobdescription-text]')]
    if i < MAX_PAGE - 1:
        driver.find_element(By.XPATH, "//span[@class='icon icon-arrow-right']").click()
    else:
        break

df = pandas.DataFrame({'title': titles, 'location': locations, 'description': descriptions})
df.to_csv(SAVE_FILENAME, index=False)
and df will be a dataframe with one row per job and columns title, location and description.

Selenium getting wrong data

I have a scraper that gets movies from IMDB. It seems to work fine, but in some cases it picks up the wrong data, and it doesn't happen consistently. For example, it opens a movie's page and takes that movie's image, then the window is closed and a new one opens with another movie, but it ends up saving the previous movie's image, and in some cases it doesn't get all the data. Since it doesn't always happen, I have no idea what might be going on. Can anyone give me an idea of what could be happening?
My code is this:
import re
from time import sleep
import csv
import sqlite3
import pickle
from turtle import title
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from os.path import exists
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from requests_html import HTML
import pandas
from slugify import slugify

da = []

def parse_details(driver, asin, data_id):
    sleep(2)
    nulo = 'NULL'
    try:
        titulo_filme = driver.find_element(By.XPATH, '//h1[@data-testid="hero-title-block__title"]').text
        slug_titulo = slugify(titulo_filme)
    except:
        titulo_filme = ' '
    try:
        nota_imdb = driver.find_element(By.XPATH, '//div[@data-testid="hero-rating-bar__aggregate-rating__score"]/span[1]').text
    except:
        nota_imdb = ' '
    try:
        #genum = driver.find_element(By.XPATH, '//*[@data-testid="storyline-genres"]/div/ul/li[1]').text
        #gendois = driver.find_element(By.XPATH, '//*[@data-testid="storyline-genres"]/div/ul/li[2]').text
        #gentres = driver.find_element(By.XPATH, '//*[@data-testid="storyline-genres"]/div/ul/li[3]').text
        #genquatro = driver.find_element(By.XPATH, '//*[@data-testid="storyline-genres"]/div/ul/li[4]').text
        #genero = genum + '- ' + gendois + '- ' + gentres + '- ' + genquatro
        get_gen = driver.find_elements(By.XPATH, '//*[@data-testid="genres"]/div/a')
        lista_gen = []
        for gene in get_gen:
            lista_gen.append(gene.text)
        genero = lista_gen
    except:
        get_gen = driver.find_elements(By.XPATH, '//*[@data-testid="storyline-genres"]/div/ul/li/a')
        lista_gen = []
        for gene in get_gen:
            lista_gen.append(gene.text)
        genero = lista_gen
    try:
        data_lancamento = driver.find_element(By.XPATH, '//*[@data-testid="title-details-releasedate"]/div').text
    except:
        data_lancamento = ''
    try:
        tempo_duracao = driver.find_element(By.XPATH, '//*[@data-testid="title-techspec_runtime"]/div').text
    except:
        tempo_duracao = ''
    try:
        idioma = driver.find_elements(By.XPATH, '//*[@data-testid="title-details-languages"]/div/*/li/a')
        lista_idioma = []
        for idiom in idioma:
            lista_idioma.append(idiom.text)
        idioma = lista_idioma
    except:
        idioma = ''
    try:
        empresa = driver.find_elements(By.XPATH, '//*[@data-testid="title-details-companies"]/div/*/li/a')
        lista_empresa = []
        for empres in empresa:
            lista_empresa.append(empres.text)
        empresa_produtora = lista_empresa
    except:
        empresa_produtora = ''
    try:
        tbm_conhecido_como = driver.find_element(By.XPATH, '//*[@data-testid="title-details-akas"]/div').text
    except:
        tbm_conhecido_como = ''
    try:
        pais = driver.find_elements(By.XPATH, '//*[@data-testid="title-details-origin"]/div/*/li/a')
        lista_pais = []
        for pai in pais:
            lista_pais.append(pai.text)
        pais_origem = lista_pais
    except:
        pais_origem = ''
    try:
        som = driver.find_elements(By.XPATH, '//*[@data-testid="title-techspec_soundmix"]/div/*/li/a')
        lista_som = []
        for so in som:
            lista_som.append(so.text)
        codec_som = lista_som
    except:
        codec_som = ''
    try:
        geet = driver.find_elements(By.XPATH, '//*[@data-testid="title-cast-item__actor"]')
        lista = []
        for gen in geet:
            lista.append(gen.text)
        elenco_principal = lista
    except:
        elenco_principal = ''
    try:
        diretor_get = driver.find_elements(By.XPATH, '//*[@data-testid="title-pc-wide-screen"]//*[@data-testid="title-pc-principal-credit"]//*[contains(text(),"Direção")]/parent::li/div/ul/li')
        diretor_get_criacao = driver.find_elements(By.XPATH, '//*[@data-testid="title-pc-wide-screen"]//*[@data-testid="title-pc-principal-credit"]//*[contains(text(),"Criação")]/parent::li/div/ul/li')
        lista_diretor = []
        for diret in diretor_get:
            lista_diretor.append(diret.text)
        for diret_criacao in diretor_get_criacao:
            lista_diretor.append(diret_criacao.text)
        diretor = lista_diretor
    except:
        diretor = ''
    try:
        roteiristas_get = driver.find_elements(By.XPATH, '//*[@data-testid="title-pc-wide-screen"]//*[@data-testid="title-pc-principal-credit"]//*[contains(text(),"Roteiristas")]/parent::li/div/ul/li')
        roteiristas_get_single = driver.find_elements(By.XPATH, '//*[@data-testid="title-pc-wide-screen"]//*[@data-testid="title-pc-principal-credit"]//*[contains(text(),"Roteirista")]/parent::li/div/ul/li')
        lista_roteiristas = []
        for roteiro in roteiristas_get:
            lista_roteiristas.append(roteiro.text)
        for roteiro_single in roteiristas_get_single:
            lista_roteiristas.append(roteiro_single.text)
        roteiristas = lista_roteiristas
    except:
        roteiristas = ''
    try:
        artistas_get = driver.find_elements(By.XPATH, '//*[@data-testid="title-pc-wide-screen"]//*[@data-testid="title-pc-principal-credit"]//*[contains(text(),"Artistas")]/parent::li/div/ul/li')
        lista_artistas_principal = []
        for artist in artistas_get:
            lista_artistas_principal.append(artist.text)
        artistas_principal = lista_artistas_principal
    except:
        artistas_principal = ''
    try:
        classif_indicativa = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//*[@data-testid='storyline-certificate']/div/ul/li"))).text
    except:
        classif_indicativa = ''
    try:
        slogan = driver.find_element(By.XPATH, '//*[@data-testid="storyline-taglines"]/div').text
    except:
        slogan = ''
    try:
        img = WebDriverWait(driver, 2).until(EC.presence_of_element_located((By.XPATH, "//img[@class='ipc-image']"))).get_attribute('srcset')
    except:
        img = 'testeeee.jpg'
    try:
        des = driver.find_element(By.XPATH, '//meta[@name="description"]').get_attribute('content')
    except:
        des = ''
    try:
        tipo = driver.find_element(By.XPATH, '//*[@data-testid="hero-title-block__metadata"]//*[contains(text(),"Minissérie")] | //*[@data-testid="hero-title-block__metadata"]//*[contains(text(),"Série de TV")] | //*[@data-testid="hero-title-block__metadata"]//*[contains(text(),"Especial de TV")]').text
    except:
        tipo = 'Filme'
    try:
        prime_video_link = driver.find_element(By.XPATH, '//*[@data-testid="tm-box-pwo-btn"]//div[contains(text(),"Prime Video")]/ancestor::a').get_attribute('href')
    except:
        try:
            driver.find_element(By.XPATH, '//*[@data-testid="tm-box-mwo-btn"]//*[contains(text(),"Mais opções")]').click()
            sleep(1)
            prime_video_link = driver.find_element(By.XPATH, '//*[@data-testid="promptable"]//*[@data-focus-lock-disabled="false"]//*[contains(text(),"RENT/BUY")]/parent::div/ul/a').get_attribute('href')
        except:
            prime_video_link = ''
    da.append([nulo, data_id, titulo_filme, slug_titulo, tipo, nota_imdb, prime_video_link, data_lancamento, img, des, genero, tempo_duracao, idioma, empresa_produtora, tbm_conhecido_como, pais_origem, codec_som, elenco_principal, diretor, roteiristas, artistas_principal, classif_indicativa, slogan])
    df = pandas.DataFrame(da, columns=['nulo', 'id', 'titulo_filme', 'slug_titulo', 'tipo', 'nota_imdb', 'prime_video_link', 'data_lancamento', 'imagem', 'descricao', 'genero', 'tempo_duracao', 'idioma', 'empresa_produtora', 'tbm_conhecido_como', 'pais_origem', 'codec_som', 'elenco_principal', 'diretor', 'roteiristas', 'artistas_principal', 'classif_indicativa', 'slogan'])
    df.to_csv(f'{file_name}.csv', index=False)
    print(da)
    driver.close()

def collecting_links(linkss):
    for link in linkss:
        id_link = link.get_attribute('href')
        #asi = id_link.split("/")[5]
        dat_id = id_link.split('/')[-2]
        driver.execute_script(f"window.open('{id_link}')")
        driver.switch_to.window(driver.window_handles[-1])
        parse_details(driver, asin='asi', data_id=dat_id)
        driver.switch_to.window(driver.window_handles[0])

def main(driver):
    sub_category_link = []
    driver.get(input_url)
    #change_location()
    links = driver.find_elements(By.XPATH, '//h3[@class="lister-item-header"]/a')
    collecting_links(linkss=links)
    while True:
        try:
            driver.find_element(By.LINK_TEXT, 'Next »').click()
            links = driver.find_elements(By.XPATH, '//h3[@class="lister-item-header"]/a')
            collecting_links(linkss=links)
        except:
            print("Finalizado ---- Links:")
            print(driver.find_element(By.LINK_TEXT, 'Previous').get_attribute('href'))
            break

if __name__ == "__main__":
    print("Starting Scraper")
    daa = []
    input_url = input('Enter your url: ')
    file_name = input('Enter your file name: ')
    daa.append([input_url, str(file_name) + '.csv'])
    df = pandas.DataFrame(daa, columns=['', '']).to_csv('scraped_url.txt', index=False, header=False, mode='a')
    s = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.maximize_window()
    main(driver)
    sleep(2)
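One thing worth ruling out: driver.window_handles is not guaranteed to be in opening order, so switching to window_handles[-1] right after window.open may not land on the tab that was just opened, and parse_details would then read whatever page that handle points to. A hedged sketch of making the switch deterministic and waiting for the new movie's own title block before parsing; it reuses parse_details and the imports above, and the data-testid is the one already used in the question:

def open_and_parse(driver, id_link, dat_id, main_handle):
    old_handles = set(driver.window_handles)
    driver.execute_script(f"window.open('{id_link}')")
    # wait until the new handle actually exists, then compute it explicitly
    WebDriverWait(driver, 10).until(lambda d: len(d.window_handles) > len(old_handles))
    new_handle = (set(driver.window_handles) - old_handles).pop()
    driver.switch_to.window(new_handle)
    # block until this movie's own title block is present before scraping anything
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//h1[@data-testid="hero-title-block__title"]'))
    )
    parse_details(driver, asin='asi', data_id=dat_id)  # parse_details closes this tab
    driver.switch_to.window(main_handle)               # back to the results list

collecting_links could capture main_handle = driver.current_window_handle once before its loop and pass it in, instead of switching back by index.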

I am very new to scraping, so please bear with me; this is my first project. I am trying to scrape a site using Selenium.

Problem lines:

for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
radio_label_list = for_tariff_loop[i].find_element_by_css_selector('span[class="phx-radio__label"]')
print(radio_label_list)
time.sleep(1)

The website I'm scraping: https://www.telekom.de/unterwegs/apple/apple-iphone-13-pro/graphit-512gb
I was not able to print the radio button's label for the checked button. I don't know what the mistake is or where I made it; could anyone help with this? It would help me learn. The change-tariff flow is in the code below:
import xlwt
from selenium import webdriver
import re
import time
from datetime import date
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

class telekommobiles:
    def __init__(self):
        self.url = "https://www.telekom.de/mobilfunk/geraete/smartphone?page=1&pageFilter=promotion"
        self.country = 'DE'
        self.currency = 'GBP'
        self.VAT = 'Included'
        self.shipping = 'free shipping within 3-4 weeks'
        self.Pre_PromotionPrice = 'N/A'
        self.color = 'N/A'

    def telekom(self):
        #try:
        driver = webdriver.Chrome()
        driver.maximize_window()
        driver.get(self.url)
        today = date.today()
        time.sleep(5)
        cookies = driver.find_element_by_css_selector('button.cl-btn.cl-btn--accept-all').click()
        print("cookies accepted")
        links_prod_check = []
        prod_models = []
        prod_manufacturer = []
        prod_memorys = []
        product_colors = []
        product_price_monthly_payments = []
        product_price_one_time_payments = []
        product_links = []
        containers = driver.find_elements_by_css_selector('div[class="styles_item__12Aw4"]')
        i = 1
        for container in containers:
            p_links = container.find_element_by_tag_name('a').get_attribute('href')
            i = i + 1
            product_links.append(p_links)
            #print(p_links)
        for links in product_links:
            driver.get(links)
            #time.sleep(5)
            #print(driver.current_url)
            #links_prod_check.append(driver.current_url)
            coloroptions = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH, "//li[@data-qa='list_ColorVariant']")))
            #print(coloroptions)
            for i in range(len(coloroptions)):
                coloroption = driver.find_elements_by_xpath("//li[@data-qa='list_ColorVariant']")
                coloroption[i].click()
                #print(coloroption[i])
                time.sleep(3)
                memoryoptions = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH, "//span[@class='phx-radio__element']")))
                for i in range(len(memoryoptions)):
                    memoryoption = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
                    try:
                        memoryoption[i].click()
                    except:
                        pass
                    time.sleep(5)
                    change_traiff = driver.find_element_by_css_selector('button[class="phx-link phx-list-of-links__link js-mod tracking-added"]').click()
                    time.sleep(3)
                    # looping for each section
                    section_loops = driver.find_elements_by_css_selector('section[class="tariff-catalog--layer"]')
                    #print(len(section_loops))
                    for section_loop in section_loops:
                        #print(section_loop)
                        time.sleep(5)
                        # Headings
                        heading_1 = section_loop.find_element_by_css_selector('h2[class="page-title page-title--lowercase"]').text
                        print(heading_1)
                        # looping for each separate box
                        each_box_subcontainers = section_loop.find_elements_by_css_selector('.phx-tariff-box__section')
                        #print(len(each_box_subcontainers))
                        for subcontainer in each_box_subcontainers:
                            #print(subcontainer)
                            looping_for_tariff = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//span[@class='phx-radio__element']")))
                            #print(looping_for_tariff)
                            for i in range(len(looping_for_tariff)):
                                #print(i)
                                try:
                                    for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
                                    for_tariff_loop[i].click()
                                    time.sleep(3)
                                except:
                                    pass
                                for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
                                radio_label_list = for_tariff_loop[i].find_element_by_css_selector('span[class="phx-radio__label"]')
                                print(radio_label_list)
                                time.sleep(1)
                    change_traiff_close_button = driver.find_element_by_css_selector('span[class="icon-after-yellow-close right close popup-close-tr js-popup-close"]').click()

telekom_de = telekommobiles()
telekom_de.telekom()
You are trying to find an element within an element. When you locate radio_label_list from for_tariff_loop[i], the effective xpath for radio_label_list becomes:

//span[@class='phx-radio__element']//span[@class='phx-radio__label']

which does not exist in the DOM.
I tried the last part of the code and was able to print the memory size as shown below; do try and confirm. I replaced the css-selector for radio_label_list with the xpath ./following-sibling::span:
looping_for_tariff = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//span[@class='phx-radio__element']")))
# print(looping_for_tariff)
for i in range(len(looping_for_tariff)):
    # print(i)
    try:
        for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
        for_tariff_loop[i].click()
        time.sleep(3)
    except:
        pass
    for_tariff_loop = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
    radio_label_list = for_tariff_loop[i].find_element_by_xpath("./following-sibling::span").text
    print(radio_label_list)
    time.sleep(1)
As per the comments, check this code:
driver.get("https://www.telekom.de/unterwegs/apple/apple-iphone-13-pro/graphit-512gb")
wait = WebDriverWait(driver,30)
wait.until(EC.element_to_be_clickable((By.XPATH,"//button[text()='Accept All']"))).click()
wait.until(EC.element_to_be_clickable((By.XPATH,"//ul[contains(#class,'phx-tariff-notification-box-new__element--desktop-tablet')]/li[2]/button"))).click()
length = len(driver.find_elements_by_class_name("phx-tariff-box__section"))
for i in range(length):
print("----------------------------------------------------------------------------------------------------------")
options = driver.find_elements_by_class_name("phx-tariff-box__section")
datas = options[i].find_element_by_xpath(".//div[contains(#class,'phx-tariff-box__volume')]").get_attribute("innerText")
print("data: {}".format(datas))
len_types = len(options[i].find_elements_by_xpath(".//div[#class='phx-tariff-box__radios-inner']//label"))
types = options[i].find_elements_by_xpath(".//div[#class='phx-tariff-box__radios-inner']//label")
if len(types) == 0:
price = options[i].find_element_by_xpath(".//p[#data-qa='block_TariffPrice']").get_attribute("innerText")
print(price)
else:
for j in range(len_types):
types[j].click()
time.sleep(2)
options = driver.find_elements_by_class_name("phx-tariff-box__section")
types = options[i].find_elements_by_xpath(".//div[#class='phx-tariff-box__radios-inner']//label")
try:
types[j].find_element_by_xpath("./input[#checked]")
type = types[j].find_element_by_xpath("./span[2]").get_attribute("innerText")
price = options[i].find_element_by_xpath(".//p[#data-qa='block_TariffPrice']").get_attribute("innerText")
print(f"{type}: {price}")
except:
pass

Scraped data is not saving to csv file as it keeps returning a blank csv file.

Can anyone see what's wrong with this code?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait
import csv

def races(main_url):
    driver = webdriver.Chrome()
    driver.get(main_url)
    driver.implicitly_wait(2)
    races = driver.find_elements_by_class_name('time-location')
    races = [race.text[:5] for race in races]
    races = [race.replace(':', '') for race in races]
    driver.close()
    return races

def scrape(url):
    driver = webdriver.Chrome()
    driver.get(url)
    driver.implicitly_wait(2)
    driver.find_elements_by_class_name('racecard-ajax-link')[1].click()
    WebDriverWait(driver, 5).until(expected_conditions.presence_of_element_located((By.XPATH, '//*[@id="tab-racecard-sectional-times"]/div/div[1]/div[1]/div[2]/div/button')))
    for horse in driver.find_elements_by_class_name('card-item'):
        horseName = horse.find_element_by_class_name('form-link').text
        times = horse.find_elements_by_class_name('sectionals-time')
        times = [time.text for time in times]
        print('{}: {}'.format(horseName, times))
        print()
    driver.close()
At the next point below I am trying to save the data to df, but the file is blank when opened. Shouldn't df = open('jan1.csv', 'w+') store the scraped data in the csv file? I'm obviously missing something but can't see what.
def main():
    df = open('jan1.csv', 'w+')
    df.close()
    date = '1-January-2018'
    main_url = 'http://www.attheraces.com/racecard/Southwell/' + date
    for race in races(main_url):
        url = main_url + '/' + race
        print(url)
        scrape(url)

if __name__ == '__main__':
    main()
Your code seems broken in several places and even with fixing it I get Timeout errors.
Try these steps:
Add pandas for easy data handling:
import pandas as pd

def scrape(url):
    driver = webdriver.Chrome()
    driver.get(url)
    driver.implicitly_wait(2)
    driver.find_elements_by_class_name('racecard-ajax-link')[1].click()
    WebDriverWait(driver, 5).until(expected_conditions.presence_of_element_located((By.XPATH, '//*[@id="tab-racecard-sectional-times"]/div/div[1]/div[1]/div[2]/div/button')))
    # add empty list to save scraped data
    data = []
    for horse in driver.find_elements_by_class_name('card-item'):
        horseName = horse.find_element_by_class_name('form-link').text
        times = horse.find_elements_by_class_name('sectionals-time')
        times = [time.text for time in times]
        print('{}: {}'.format(horseName, times))
        data.append([horseName, times])
        print()
    driver.close()
    # return your data!
    return data
Then change this in your main function:
def main():
    date = '1-January-2018'
    main_url = 'http://www.attheraces.com/racecard/Southwell/' + date
    tmp = []
    for race in races(main_url):
        url = main_url + '/' + race
        print(url)
        tmp.append(scrape(url))
    df = pd.DataFrame(tmp)
    df.to_csv("jan1.csv")
Or if you want to stick to csv only (no pandas):
with open("jan1.csv", "w+") as file:
file.write(your_data_var_here)
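For instance, a hedged csv-only version of main(), assuming scrape() has been changed as above to return the [horseName, times] rows and races() is the function from the question:

import csv

def main():
    date = '1-January-2018'
    main_url = 'http://www.attheraces.com/racecard/Southwell/' + date
    with open("jan1.csv", "w+", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["horse", "sectional_times"])  # header row
        for race in races(main_url):
            url = main_url + '/' + race
            print(url)
            for horse_name, times in scrape(url):
                # join the list of split times into a single cell
                writer.writerow([horse_name, ", ".join(times)])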

Selenium Python webscraper really slow

I'm a newbie getting into web scrapers. I've made something that works, but it takes hours and hours to get everything I need. I read something about using parallel processes to process the URLs but I have no clue how to go about it and incorporate it in what I already have. Help is much appreciated!
Here is my, still extremely messy, code. I'm still learning :)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException
import time
import random
import pprint
import itertools
import csv
import pandas as pd

start_url = "https://www.nationalevacaturebank.nl/vacature/zoeken?query=&location=&distance=city&limit=100&sort=relevance&filters%5BcareerLevel%5D%5B%5D=Starter&filters%5BeducationLevel%5D%5B%5D=MBO"

driver = webdriver.Firefox()
driver.set_page_load_timeout(20)
driver.get(start_url)
driver.find_element_by_xpath('//*[@id="form_save"]').click()  # accepts cookies

wait = WebDriverWait(driver, random.randint(1500, 3200) / 1000.0)
j = random.randint(1500, 3200) / 1000.0
time.sleep(j)

num_jobs = int(driver.find_element_by_xpath('/html/body/div[3]/div/main/div[2]/div[3]/div/header/h2/span').text)
num_pages = int(num_jobs / 102)

urls = []
list_of_links = []
for i in range(num_pages + 1):
    try:
        elements = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="search-results-container"]//article/job/a')))
        for i in elements:
            list_of_links.append(i.get_attribute('href'))
        j = random.randint(1500, 3200) / 1000.0
        time.sleep(j)
        if 'page=3' not in driver.current_url:
            driver.find_element_by_xpath('//html/body/div[3]/div/main/div[2]/div[3]/div/paginator/div/nav[1]/ul/li[6]/a').click()
        else:
            driver.find_element_by_xpath('//html/body/div[3]/div/main/div[2]/div[3]/div/paginator/div/nav[1]/ul/li[5]/a').click()
        url = driver.current_url
        if url not in urls:
            print(url)
            urls.append(url)
        else:
            break
    except:
        continue

set_list_of_links = list(set(list_of_links))
print(len(set_list_of_links), "results")
driver.close()

def grouper(n, iterable):
    it = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(it, n))
        if not chunk:
            return
        yield chunk

def remove_empty_lists(l):
    keep_going = True
    prev_l = l
    while keep_going:
        new_l = remover(prev_l)
        # are they identical objects?
        if new_l == prev_l:
            keep_going = False
        # set prev to new
        prev_l = new_l
    # return the result
    return new_l

def remover(l):
    newlist = []
    for i in l:
        if isinstance(i, list) and len(i) != 0:
            newlist.append(remover(i))
        if not isinstance(i, list):
            newlist.append(i)
    return newlist

vacatures = []
chunks = grouper(100, set_list_of_links)
chunk_count = 0
for chunk in chunks:
    chunk_count += 1
    print(chunk_count)
    j = random.randint(1500, 3200) / 1000.0
    time.sleep(j)
    for url in chunk:
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)
        try:
            driver.get(url)
            driver.find_element_by_xpath('//*[@id="form_save"]').click()  # accepts cookies
            vacature = []
            vacature.append(url)
            j = random.randint(1500, 3200) / 1000.0
            time.sleep(j)
            elements = driver.find_elements_by_tag_name('dl')
            p_elements = driver.find_elements_by_tag_name('p')
            li_elements = driver.find_elements_by_tag_name('li')
            for i in elements:
                if "Salaris:" not in i.text:
                    vacature.append(i.text)
            running_text = list()
            for p in p_elements:
                running_text.append(p.text)
            text = [''.join(running_text)]
            remove_ls = ['vacatures', 'carrièretips', 'help', 'inloggen', 'inschrijven', 'Bezoek website', 'YouTube',
                         'Over Nationale Vacaturebank', 'Werken bij de Persgroep', 'Persberichten', 'Autotrack', 'Tweakers',
                         'Tweakers Elect', 'ITBanen', 'Contact', 'Carrière Mentors', 'Veelgestelde vragen',
                         'Vacatures, stages en bijbanen', 'Bruto Netto Calculator', 'Salariswijzer', 'Direct vacature plaatsen',
                         'Kandidaten zoeken', 'Bekijk de webshop', 'Intermediair', 'Volg ons op Facebook']
            for li in li_elements:
                if li.text not in remove_ls:
                    text.append(li.text)
            text = ''.join(text)
            vacature.append(text)
            vacatures.append(vacature)
            driver.close()
        except TimeoutException as ex:
            isrunning = 0
            print("Exception has been thrown. " + str(ex))
            driver.close()
        except NoSuchElementException:
            continue
Python Selenium webdriver is not thread-safe, which means your browser cannot correctly consume asynchronous calls from multiple threads. Try scraping websites with requests and bs4 + lxml instead; it's much faster than Selenium. This answer can be helpful.
You're using Firefox, which is slower than Chrome in almost all real-life applications.
XPath is the slowest selector; match by id or class, and fall back to CSS if that is not possible.
Use headless mode and don't load images unless you need them (a small sketch of this setup is below).
You can also use Scrapy, which is much faster and more flexible than anything else here. See the link for more information.
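A hedged sketch of the headless / no-images setup, assuming Chrome rather than the Firefox used above (the image-blocking preference is Chrome-specific):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")  # run without a visible browser window
# ask Chrome not to download images at all
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})

driver = webdriver.Chrome(options=options)
driver.get(start_url)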
