I can't get all of the data into the CSV, only the last record. When scraping finishes, only the last item scraped is saved to the CSV file, but I want to save the data from all pages.
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
driver = webdriver.Chrome()
mainurl = 'https://austin.craigslist.org/search/cta?s=0'
driver.get(mainurl)
res = driver.execute_script("return document.documentElement.outerHTML")
page_soup = BeautifulSoup(res, 'html.parser')
lnk_opener = driver.find_element_by_xpath('//*[@id="sortable-results"]/ul/li[1]/p/a').click()
time.sleep(4)

records = []
i = 1
while i < 3:
    i += 1
    try:
        print(driver.current_url)
    except Exception:
        print('Internet Error Detected')
    try:
        title = driver.find_element_by_xpath('//*[@id="titletextonly"]').text
        print(title)
    except Exception:
        print('No Title Given')
    try:
        price = driver.find_element_by_xpath('/html/body/section/section/h2/span/span[2]').text
        print(price)
    except Exception:
        print('No Price Given')
    try:
        phone = driver.find_element_by_xpath('//*[@id="postingbody"]/h2[1]/big').text
        print(phone)
        records.append((phone))
    except Exception:
        print('No Mobile number avalible')
    try:
        loc = driver.find_element_by_xpath('/html/body/section/section/section/div[1]/div/div[2]').text
        print(loc)
    except Exception:
        print('No Location Data Avalible')
    try:
        img = page_soup.find('img')
        immg = print(img.get('src','\n'))
    except Exception:
        print('No img Found')
    nxtpg = driver.find_element_by_xpath('/html/body/section/section/header/div[1]/div/a[3]')
    nxtpg.click()
    time.sleep(4)
    url = driver.find_element_by_xpath("/html/body/section/section/header/div[1]/div/a[3]").get_attribute("href")
    if url == None:
        bckbtn = driver.find_element_by_class_name('backup').click()
        time.sleep(5)
        nextbuttton = driver.find_element_by_xpath('//*[@id="searchform"]/div[3]/div[3]/span[2]/a[3]').click()
        time.sleep(6)

print(records)
records.append((driver.current_url, title, price, loc, immg))
df = pd.DataFrame(records, columns=['Product Url', 'Title/Model/Make', 'Price', 'GM Location', 'Image Link'])
print(df)
df.to_csv('zzz.csv')
time.sleep(4)
driver.quit()
I think this line
records.append((driver.current_url, title, price, loc, immg))
should be inside the while loop. Also, move i += 1 to the end of the loop body, otherwise you're skipping the first iteration.
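A minimal skeleton of what that change looks like (the per-listing scraping from the question is elided as a comment; same variables and columns assumed):

records = []
i = 1
while i < 3:
    # ... scrape title, price, loc, immg for the current listing here ...
    records.append((driver.current_url, title, price, loc, immg))
    # navigate to the next listing, then increment last so no iteration is skipped
    i += 1

df = pd.DataFrame(records, columns=['Product Url', 'Title/Model/Make', 'Price', 'GM Location', 'Image Link'])
df.to_csv('zzz.csv')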
This code worked perfectly fine the other day for scraping a website, but when I ran it today it failed. I have spent literally days trying to figure it out, to no avail. I have also completely uninstalled and reinstalled Python and all required libraries, also to no avail. Any help would be appreciated; below is the code and the error.
Thanks for any help you can provide.
The error is: KeyError: 'date'
import shutil
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
import pandas as pd
from bs4 import BeautifulSoup
import warnings
import json
import shutil
import os
from difflib import get_close_matches
import time
import imdb
import datetime
# creating an instance of the IMDB()
##ia = imdb.IMDb()
ia = imdb.Cinemagoer()
# Using the Search movie method
warnings.filterwarnings("ignore")
def get_movie_name(name):
    try:
        items = []
        while len(items) == 0 and len(name) > 0:
            time.sleep(2)
            items = ia.search_movie(name)
            if len(items) == 0:
                name = name.split(' ')
                name = name[:-1]
                name = ' '.join(name)
        closest_match = get_close_matches(name, [str(i) for i in items])
        if len(closest_match) == 0:
            return False
        else:
            return closest_match[0]
    except Exception as e:
        print(e)
        return False

def read_inputs():
    f = open('config.json')
    data = json.load(f)
    return data
data = read_inputs()
download_folder = 'C:\\Users\\jerem\\Downloads'
date = data["date"]
username = data["username"]
password = data["password"]
# initialize the Chrome driver
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-logging",'--headless','--disable-gpu'])
driver = webdriver.Chrome(options = options,executable_path = "chromedriver")
driver.get("http://www.bd25.eu/index.php")
#username
##driver.find_element_by_xpath("//input[@class='lista'][@type='text']").send_keys(username)
driver.find_element(By.XPATH, "//input[@type='text']").send_keys(username)
##driver.find_element('type',"text").send_keys(username)
#password
driver.find_element(By.XPATH, "//input[@type='password']").send_keys(password)
#submit
driver.find_element(By.XPATH, "//input[@type='submit']").click()
#list of nzbs
driver.find_element(By.LINK_TEXT, 'List of NZBs').click()
movie_df,final_df = pd.DataFrame(),pd.DataFrame()
get_url = driver.current_url
for page in range(1,int(data['page_number_max'])):
    try:
        if page > 1:
            add_on = '&order=3&by=2&pages='+str(page)
            driver.get(get_url+add_on)
        body = driver.find_element(By.XPATH, "/html/body/table/tbody/tr")
        all_text = body.text
        y = all_text.split('\n')
        for i in range(330,430):
            movie_df = movie_df.append({'text':y[i]},ignore_index=True)
            value_list = y[i].split(' ')
            name = value_list[1:-7]
            name = ' '.join(name)
            if value_list[-3] != 'Upload':
                final_df = final_df.append({'date':value_list[-3],
                                            'name':name},ignore_index=True)
            value_list = []
        print(str(page)+' page is done')
    except:
        break
#final_df = final_df[1:]
date = datetime.datetime.strptime(date,'%d/%m/%Y')
final_df['Upload_date'] = pd.to_datetime(final_df['date'],dayfirst=True)
final_df = final_df.loc[final_df['Upload_date'] > date]
for i in range(len(final_df)):
    try:
        driver.find_element(By.LINK_TEXT, 'List of NZBs').click()
        driver.find_element(By.ID, "searchinput").send_keys(final_df['name'].iloc[i])
        driver.find_element(By.XPATH, "//input[@type='submit']").click()
        driver.find_element(By.XPATH, "//img[@src='images/download.gif']").click()
        time.sleep(2)
        space_list = final_df['name'].iloc[i].split(' ')
        foldername = '.'.join(space_list)
        try:
            list_of_folder = [x for x in os.listdir(download_folder)]
            closest_match = get_close_matches(foldername, list_of_folder)
            src = closest_match[0]
        except:
            time.sleep(2)
            print('waiting for download')
            list_of_folder = [x for x in os.listdir(download_folder)]
            closest_match = get_close_matches(foldername, list_of_folder)
            src = closest_match[0]
        ##click on link to open password
        driver.find_element(By.PARTIAL_LINK_TEXT, final_df['name'].iloc[i]).click()
        time.sleep(10)
        ##driver.find_element(By.ID, 'ty').click()
        driver.find_element(By.XPATH, "//input[@type='button']").click()
        password_to_save = driver.find_element(By.XPATH, "//div[@id='thanks_div']").text
        while len(password_to_save)==0:
            password_to_save = driver.find_element(By.XPATH, "//div[@id='thanks_div']").text
            print('getting password')
            time.sleep(2)
        print('got password')
        movie_name = get_movie_name(final_df['name'].iloc[i])
        if movie_name == False:
            movie_name = final_df['name'].iloc[i]
        print(movie_name)
        try:
            # Destination
            try:
                dest = movie_name + ' password='+password_to_save+'.rar'
                # Renaming the file
                os.rename(download_folder+src, download_folder+dest)
            except:
                try:
                    dest = movie_name+'.rar'
                    os.rename(download_folder+src, download_folder+dest)
                except:
                    try:
                        movie_name = ''.join(e for e in movie_name if e.isalnum())
                        dest = movie_name + ' password='+password_to_save+'.rar'
                        #dest = movie_name+'.rar'
                        os.rename(download_folder+src, download_folder+dest)
                    except:
                        movie_name = ''.join(e for e in movie_name if e.isalnum())
                        dest = movie_name + '.rar'
                        os.rename(download_folder+src, download_folder+dest)
        except Exception as e:
            print(e)
        try:
            print('renamed file')
            shutil.move( download_folder+dest, data['target_directory_file'])
            # get description
            body = driver.find_element(By.XPATH, "/html/body/table/tbody/tr")
            a = body.text
            soup = BeautifulSoup(a)
            description = (soup.prettify())
            desc_index = description.find('Description')
            screen_index = description.find('Screenshots')
            final_description = description[desc_index:screen_index]
            #create_info_file(movie_name,description)
            os.chdir(data['target_directory_info_file'])
            with open(movie_name+'.txt', 'w') as f:
                f.write(final_description)
                f.write('password = '+password_to_save)
            print(i)
        except:
            print('already exists')
    except Exception as e:
        print(e)
======================
Below is the error:
Traceback (most recent call last):
File "C:\Python310\lib\site-packages\pandas\core\indexes\base.py", line 3800, in get_loc
return self._engine.get_loc(casted_key)
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 165, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 5753, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'date'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\jerem\Desktop\Final\BD25 Scraper\project2.py", line 104, in <module>
final_df['Upload_date'] = pd.to_datetime(final_df['date'],dayfirst=True)
File "C:\Python310\lib\site-packages\pandas\core\frame.py", line 3805, in __getitem__
indexer = self.columns.get_loc(key)
File "C:\Python310\lib\site-packages\pandas\core\indexes\base.py", line 3802, in get_loc
raise KeyError(key) from err
KeyError: 'date'
I am expecting the output to let me scrape all entries after a date I put in.
This code worked a few days ago; I have made no edits, and now it doesn't work.
Would someone be able to help me figure out why it cannot find 'date' when I have it defined?
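One possibility worth checking (an assumption on my part; the traceback only proves the column is missing at that line): if the scraping loop hits its bare except and breaks on the very first page, final_df is still an empty DataFrame with no columns, so final_df['date'] raises exactly this KeyError. A hypothetical guard around the failing line that makes that case visible:

# Hypothetical guard, assuming final_df may come back empty when the page scrape fails
if final_df.empty or 'date' not in final_df.columns:
    print('No rows were scraped; columns present:', list(final_df.columns))
else:
    final_df['Upload_date'] = pd.to_datetime(final_df['date'], dayfirst=True)
    final_df = final_df.loc[final_df['Upload_date'] > date]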
I have a scraper that gets movies from IMDb. It seems to work fine, but in some cases it collects the wrong data, and it doesn't happen every time. For example, it opens a movie's page and takes that movie's image, then the window is closed and a new one opens with another movie, but it ends up saving the previous movie's image, and in some cases it doesn't get all the data. Since it doesn't always happen, I have no idea what might be going on. Can anyone give me an idea of what could be happening?
My code is this:
import re
from time import sleep
import csv
import sqlite3
import pickle
from turtle import title
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from os.path import exists
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from requests_html import HTML
import pandas
from slugify import slugify
da=[]
def parse_details(driver, asin, data_id):
    sleep(2)
    nulo = 'NULL'
    try:
        titulo_filme = driver.find_element(By.XPATH,'//h1[@data-testid="hero-title-block__title"]').text
        slug_titulo = slugify(titulo_filme)
    except:
        titulo_filme = ' '
    try:
        nota_imdb = driver.find_element(By.XPATH,'//div[@data-testid="hero-rating-bar__aggregate-rating__score"]/span[1]').text
    except:
        nota_imdb = ' '
    try:
        #genum=driver.find_element(By.XPATH,'//*[@data-testid="storyline-genres"]/div/ul/li[1]').text
        #gendois=driver.find_element(By.XPATH,'//*[@data-testid="storyline-genres"]/div/ul/li[2]').text
        #gentres=driver.find_element(By.XPATH,'//*[@data-testid="storyline-genres"]/div/ul/li[3]').text
        #genquatro=driver.find_element(By.XPATH,'//*[@data-testid="storyline-genres"]/div/ul/li[4]').text
        #genero = genum + '- ' + gendois + '- ' + gentres + '- ' + genquatro
        get_gen = driver.find_elements(By.XPATH,'//*[@data-testid="genres"]/div/a')
        lista_gen = []
        for gene in get_gen:
            lista_gen.append(gene.text)
        genero = lista_gen
    except:
        get_gen = driver.find_elements(By.XPATH,'//*[@data-testid="storyline-genres"]/div/ul/li/a')
        lista_gen = []
        for gene in get_gen:
            lista_gen.append(gene.text)
        genero = lista_gen
    try:
        data_lancamento = driver.find_element(By.XPATH,'//*[@data-testid="title-details-releasedate"]/div').text
    except:
        data_lancamento = ''
    try:
        tempo_duracao = driver.find_element(By.XPATH,'//*[@data-testid="title-techspec_runtime"]/div').text
    except:
        tempo_duracao = ''
    try:
        idioma = driver.find_elements(By.XPATH,'//*[@data-testid="title-details-languages"]/div/*/li/a')
        lista_idioma = []
        for idiom in idioma:
            lista_idioma.append(idiom.text)
        idioma = lista_idioma
    except:
        idioma = ''
    try:
        empresa = driver.find_elements(By.XPATH,'//*[@data-testid="title-details-companies"]/div/*/li/a')
        lista_empresa = []
        for empres in empresa:
            lista_empresa.append(empres.text)
        empresa_produtora = lista_empresa
    except:
        empresa_produtora = ''
    try:
        tbm_conhecido_como = driver.find_element(By.XPATH,'//*[@data-testid="title-details-akas"]/div').text
    except:
        tbm_conhecido_como = ''
    try:
        pais = driver.find_elements(By.XPATH,'//*[@data-testid="title-details-origin"]/div/*/li/a')
        lista_pais = []
        for pai in pais:
            lista_pais.append(pai.text)
        pais_origem = lista_pais
    except:
        pais_origem = ''
    try:
        som = driver.find_elements(By.XPATH,'//*[@data-testid="title-techspec_soundmix"]/div/*/li/a')
        lista_som = []
        for so in som:
            lista_som.append(so.text)
        codec_som = lista_som
    except:
        codec_som = ''
    try:
        geet = driver.find_elements(By.XPATH,'//*[@data-testid="title-cast-item__actor"]')
        lista = []
        for gen in geet:
            lista.append(gen.text)
        elenco_principal = lista
    except:
        elenco_principal = ''
    try:
        diretor_get = driver.find_elements(By.XPATH,'//*[@data-testid="title-pc-wide-screen"]//*[@data-testid="title-pc-principal-credit"]//*[contains(text(),"Direção")]/parent::li/div/ul/li')
        diretor_get_criacao = driver.find_elements(By.XPATH,'//*[@data-testid="title-pc-wide-screen"]//*[@data-testid="title-pc-principal-credit"]//*[contains(text(),"Criação")]/parent::li/div/ul/li')
        lista_diretor = []
        for diret in diretor_get:
            lista_diretor.append(diret.text)
        for diret_criacao in diretor_get_criacao:
            lista_diretor.append(diret_criacao.text)
        diretor = lista_diretor
    except:
        diretor = ''
    try:
        roteiristas_get = driver.find_elements(By.XPATH,'//*[@data-testid="title-pc-wide-screen"]//*[@data-testid="title-pc-principal-credit"]//*[contains(text(),"Roteiristas")]/parent::li/div/ul/li')
        roteiristas_get_single = driver.find_elements(By.XPATH,'//*[@data-testid="title-pc-wide-screen"]//*[@data-testid="title-pc-principal-credit"]//*[contains(text(),"Roteirista")]/parent::li/div/ul/li')
        lista_roteiristas = []
        for roteiro in roteiristas_get:
            lista_roteiristas.append(roteiro.text)
        for roteiro_single in roteiristas_get_single:
            lista_roteiristas.append(roteiro_single.text)
        roteiristas = lista_roteiristas
    except:
        roteiristas = ''
    try:
        artistas_get = driver.find_elements(By.XPATH,'//*[@data-testid="title-pc-wide-screen"]//*[@data-testid="title-pc-principal-credit"]//*[contains(text(),"Artistas")]/parent::li/div/ul/li')
        lista_artistas_principal = []
        for artist in artistas_get:
            lista_artistas_principal.append(artist.text)
        artistas_principal = lista_artistas_principal
    except:
        artistas_principal = ''
    try:
        classif_indicativa = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//*[@data-testid='storyline-certificate']/div/ul/li"))).text
    except:
        classif_indicativa = ''
    try:
        slogan = driver.find_element(By.XPATH,'//*[@data-testid="storyline-taglines"]/div').text
    except:
        slogan = ''
    try:
        img = WebDriverWait(driver, 2).until(EC.presence_of_element_located((By.XPATH, "//img[@class='ipc-image']"))).get_attribute('srcset')
    except:
        img = 'testeeee.jpg'
    try:
        des = driver.find_element(By.XPATH,'//meta[@name="description"]').get_attribute('content')
    except:
        des = ''
    try:
        tipo = driver.find_element(By.XPATH,'//*[@data-testid="hero-title-block__metadata"]//*[contains(text(),"Minissérie")] | //*[@data-testid="hero-title-block__metadata"]//*[contains(text(),"Série de TV")] | //*[@data-testid="hero-title-block__metadata"]//*[contains(text(),"Especial de TV")]').text
    except:
        tipo = 'Filme'
    try:
        prime_video_link = driver.find_element(By.XPATH,'//*[@data-testid="tm-box-pwo-btn"]//div[contains(text(),"Prime Video")]/ancestor::a').get_attribute('href')
    except:
        try:
            driver.find_element(By.XPATH,'//*[@data-testid="tm-box-mwo-btn"]//*[contains(text(),"Mais opções")]').click()
            sleep(1)
            prime_video_link = driver.find_element(By.XPATH,'//*[@data-testid="promptable"]//*[@data-focus-lock-disabled="false"]//*[contains(text(),"RENT/BUY")]/parent::div/ul/a').get_attribute('href')
        except:
            prime_video_link = ''
    da.append([nulo, data_id, titulo_filme, slug_titulo, tipo, nota_imdb, prime_video_link, data_lancamento, img, des, genero, tempo_duracao, idioma, empresa_produtora, tbm_conhecido_como, pais_origem, codec_som, elenco_principal, diretor, roteiristas, artistas_principal, classif_indicativa, slogan])
    df = pandas.DataFrame(da, columns=['nulo','id','titulo_filme', 'slug_titulo', 'tipo', 'nota_imdb', 'prime_video_link', 'data_lancamento', 'imagem','descricao', 'genero', 'tempo_duracao', 'idioma', 'empresa_produtora', 'tbm_conhecido_como', 'pais_origem', 'codec_som', 'elenco_principal', 'diretor', 'roteiristas', 'artistas_principal', 'classif_indicativa', 'slogan'])
    df.to_csv(f'{file_name}.csv', index=False)
    print(da)
    driver.close()
def collecting_links(linkss):
    for link in linkss:
        id_link = link.get_attribute('href')
        #asi = id_link.split("/")[5]
        dat_id = id_link.split('/')[-2]
        driver.execute_script(f"window.open('{id_link}')")
        driver.switch_to.window(driver.window_handles[-1])
        parse_details(driver, asin='asi', data_id=dat_id)
        driver.switch_to.window(driver.window_handles[0])

def main(driver):
    sub_category_link = []
    driver.get(input_url)
    #change_location()
    links = driver.find_elements(By.XPATH, '//h3[@class="lister-item-header"]/a')
    collecting_links(linkss=links)
    while True:
        try:
            driver.find_element(By.LINK_TEXT,'Next »').click()
            links = driver.find_elements(By.XPATH, '//h3[@class="lister-item-header"]/a')
            collecting_links(linkss=links)
        except:
            print("Finalizado ---- Links:")
            print(driver.find_element(By.LINK_TEXT,'Previous').get_attribute('href'))
            break

if __name__ == "__main__":
    print("Starting Scraper")
    daa = []
    input_url = input('Enter your url: ')
    file_name = input('Enter your file name: ')
    daa.append([input_url, str(file_name)+'.csv'])
    df = pandas.DataFrame(daa, columns=['','']).to_csv('scraped_url.txt', index=False, header=False, mode='a')
    s = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.maximize_window()
    main(driver)
    sleep(2)
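One hardening idea for the symptom described above, offered as a sketch rather than a confirmed fix: wait for the freshly opened title page to actually render before parse_details reads anything from it, so no field is captured from a page that has not finished loading. It reuses the hero-title-block__title locator and the WebDriverWait/EC imports already in the script:

def wait_for_title_page(driver, timeout=15):
    # Hypothetical helper: block until the movie title of the new tab is present.
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located(
            (By.XPATH, '//h1[@data-testid="hero-title-block__title"]')
        )
    )

# In collecting_links, after switching to the new window:
#     driver.switch_to.window(driver.window_handles[-1])
#     wait_for_title_page(driver)
#     parse_details(driver, asin='asi', data_id=dat_id)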
Hi, I want to save the data I scraped as CSV and txt, but I couldn't.
Moreover, how can I repeat this process multiple times?
nextInput = driver.find_element("xpath",'//*[@id="pnnext"]/span[2]').click()
result = driver.find_elements(By.CSS_SELECTOR, ".GyAeWb cite.iUh30")
Code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import pandas as pd
import time
import csv
import re
driver = webdriver.Chrome()
url ="http://google.com"
driver.get(url)
searchInput = driver.find_element("xpath",'/html/body/div[1]/div[3]/form/div[1]/div[1]/div[1]/div/div[2]/input')
time.sleep(1)
searchInput.send_keys("dişçi")
time.sleep(2)
searchInput.send_keys(Keys.ENTER)
time.sleep(2)
result = driver.page_source
result = driver.find_elements(By.CSS_SELECTOR, ".GyAeWb cite.iUh30")
for index,element in enumerate (result):
    print(index+1,element.text)
result = []
result = list(set(result))
time.sleep(2)
nextInput = driver.find_element("xpath",'//*[@id="pnnext"]/span[2]').click()
result = driver.find_elements(By.CSS_SELECTOR, ".GyAeWb cite.iUh30")
for index,element in enumerate (result):
    print(index+1,element.text)
count = 1
with open("siteler.txt","w",encoding="UTF-8") as file:
    for item in result:
        file.write(f"{count}-{item}\n")
        count+=1
driver.close()
try:
    while 1:
        nextInput = driver.find_element("xpath",'//*[@id="pnnext"]/span[2]').click()
        result = driver.find_elements(By.CSS_SELECTOR, ".GyAeWb cite.iUh30")
        for index,element in enumerate (result):
            print(index+1,element.text)
        count = 1
        with open("siteler.txt","w",encoding="UTF-8") as file:
            for item in result:
                file.write(f"{count}-{item}\n")
                count+=1
except Exception as e:
    print(e)
finally:
    print("there is no element with '//*[@id='pnnext']/span[2]' XPATH")
I am trying to scrape Instagram post data (number of likes, caption, hashtags, mentions, and number of comments) from a collection of links in a .csv for data analysis to put towards my Master's thesis. However, I am running into an error where the XPath or element cannot be found. Here is the error message:
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="react-root"]/section/main/div/div/article/div[2]/section[2]/div/div/button"}
Here is the code block I have written using Selenium:
def scrape_post_data():
    influencerpostsdata = []
    # Specify the path to chromedriver.exe
    chromedriver_path = r"C:\\Users\\stuar\\Instagram Scraper\\ChromeDrivers\chromedriver.exe"
    driver = webdriver.Chrome(executable_path=chromedriver_path)
    time.sleep(2)
    # Open the webpage
    url = "https://www.instagram.com"
    driver.get(url)
    time.sleep(3)
    # Alert number 1
    time.sleep(5)
    alert = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Accept All")]'))).click()
    # Target Username Entry
    username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='username']")))
    password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='password']")))
    # Enter Username and Password
    login_username = str(enter_loginusername_entry.get())
    login_password = str(enter_password_entry.get())
    username.clear()
    username.send_keys(login_username)
    password.clear()
    password.send_keys(login_password)
    button = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']"))).click()
    # Alert number 2
    time.sleep(5)
    alert2 = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Not Now")]'))).click()
    # Alert number 3
    time.sleep(5)
    alert3 = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Not Now")]'))).click()
    with open(r"C:\\Users\\stuar\\Instagram Scraper\\SourceCode/influencerpostlinks1.csv",'r') as csv_file:
        csv_reading = csv.reader(csv_file)
        for line in csv_reading:
            links = line[1]
            try:
                Page = driver.get(links)
            except Exception as e:
                Page = None
            time.sleep(20)
            try:
                # This captures the standard like count.
                likes = driver.find_element_by_xpath("""//*[@id="react-root"]/section/main/div/div/article/div[2]/section[2]/div/div/button""").text.split()[0]
                post_type = 'photo'
            except:
                # This captures the like count for videos which is stored
                likes = driver.find_element_by_xpath("""//*[@id="react-root"]/section/main/div/div/article/div[2]/section[2]/div/span""").text.split()[0]
                post_type = 'video'
            age = driver.find_element_by_css_selector('a time').text
            comment = driver.find_element_by_xpath("""//*[@id="react-root"]/section/main/div/div/article/div[2]/div[1]/ul/div/li/div/div/div[2]/span""").text
            hashtags = find_hashtags(comment)
            mentions = find_mentions(comment)
            post_details = {'link': url, 'type': post_type, 'likes/views': likes,
                            'age': age, 'comment': comment, 'hashtags': hashtags,
                            'mentions': mentions}
            time.sleep(10)
            #turning data into a .csv file
            influencerpostsdata.append(post_details)
    df = pd.DataFrame(influencerposts)
    print(df)
    df.to_csv('influencerpostsdata.csv')
    driver.close()
Not to worry, I have resolved the problem:
with open(r"C:\\Users\\stuar\\Instagram Scraper\\SourceCode/influencerpostlinks1.csv",'r') as csv_file:
csv_reading = csv.reader(csv_file)
for line in csv_reading:
links = line[1]
try:
Page = driver.get(links)
except Exception as e:
Page = None
time.sleep(20)
try:
likes = driver.find_element_by_xpath('/html/body/div[1]/section/main/div/div[1]/article/div[3]/section[2]/div/div/a/span')
except Exception as e:
likes = None
try:
likes2 = likes.text
except Exception as e:
likes2 = None
time.sleep(20)
try:
age = driver.find_element_by_xpath('/html/body/div[1]/section/main/div/div[1]/article/div[3]/div[2]/a/time')
except Exception as e:
age = None
try:
age2 = age.text
except Exception as e:
age2 = None
time.sleep(20)
try:
caption = driver.find_element_by_xpath('/html/body/div[1]/section/main/div/div[1]/article/div[3]/div[1]/ul/div/li/div/div/div[2]/span')
except Exception as e:
caption = None
try:
caption2 = caption.text
except Exception as e:
caption2 = None
time.sleep(20)
try:
AccountName = driver.find_element_by_xpath('/html/body/div[1]/section/main/div/div[1]/article/header/div[2]/div[1]/div/span/a')
except Exception as e:
AccountName = None
try:
AccountName2 = AccountName.text
except Exception as e:
AccountName2 = None
time.sleep(20)
post_details = {'Username': AccountName2,'Caption': caption2, 'Likes/Views': likes2,
'Age': age2 }
#turning data into a .csv file
influencerpostsdata.append(post_details)
df = pd.DataFrame(influencerpostsdata)
print(df)
df.to_csv('influencerpostsdata.csv')
driver.close()
I'm trying to close a popover while scraping Glassdoor for jobs (it keeps popping up from time to time, and I need to close it every time). I've tried quite a few things.
I tried closing it by looking for the close button. Please help!
driver.find_element_by_class_name("SVG_Inline modal_closeIcon").click()
I also tried catching an ElementClickInterceptedException when the bot couldn't click on the next company, and around every other click:
element = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.CLASS_NAME, "SVG_Inline-svg modal_closeIcon-svg")))
element.click()
This is the website:
https://www.glassdoor.co.uk/Job/web-developer-jobs-SRCH_KO0,13.htm
This is the complete code:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def get_jobs(keyword, num_jobs, verbose, place):
    '''Gathers jobs as a dataframe, scraped from Glassdoor'''

    #Initializing the webdriver
    options = webdriver.ChromeOptions()

    #Uncomment the line below if you'd like to scrape without a new Chrome window every time.
    #options.add_argument('headless')

    #Change the path to where chromedriver is in your home folder.
    #driver = webdriver.Chrome(executable_path="/Users/omersakarya/Documents/GitHub/scraping-glassdoor-selenium/chromedriver", options=options)
    driver = webdriver.Chrome()
    driver.set_window_size(1120, 1000)

    url = "https://www.glassdoor.co.uk/Job/web-developer-jobs-SRCH_KO0,13.htm"
    driver.get(url)
    jobs = []
    time.sleep(3)
    driver.find_element_by_id("onetrust-accept-btn-handler").click()
    time.sleep(3)

    while len(jobs) < num_jobs:  #If true, should be still looking for new jobs.
        job_buttons = driver.find_elements_by_class_name("jl")  #jl for Job Listing. These are the buttons we're going to click.
        try:
            for job_button in job_buttons:
                if len(jobs) >= num_jobs:
                    break
                print("Progress: {}".format("" + str(len(jobs)) + "/" + str(num_jobs)))
                job_button.click()

                collected_successfully = False
                while not collected_successfully:
                    try:
                        company_name = driver.find_element_by_xpath('.//div[@class="employerName"]').text
                        location = driver.find_element_by_xpath('.//div[@class="location"]').text
                        job_title = driver.find_element_by_xpath('.//div[contains(@class, "title")]').text
                        job_description = driver.find_element_by_xpath('.//div[@class="jobDescriptionContent desc"]').text
                        collected_successfully = True
                    except:
                        time.sleep(5)

                try:
                    #salary_estimate = driver.find_element_by_xpath('.//span[@class="gray salary"]').text
                    salary_estimate = driver.find_element_by_xpath('//*[@id="HeroHeaderModule"]/div[3]/div[1]/div[4]/span').text
                except NoSuchElementException:
                    salary_estimate = -1  #You need to set a "not found value. It's important."
                try:
                    rating = driver.find_element_by_xpath('.//span[@class="rating"]').text
                except NoSuchElementException:
                    rating = -1  #You need to set a "not found value. It's important."

                #Printing for debugging
                if verbose:
                    print("Job Title: {}".format(job_title))
                    print("Salary Estimate: {}".format(salary_estimate))
                    print("Job Description: {}".format(job_description[:500]))
                    print("Rating: {}".format(rating))
                    print("Company Name: {}".format(company_name))
                    print("Location: {}".format(location))

                #Going to the Company tab...
                #clicking on this:
                #<div class="tab" data-tab-type="overview"><span>Company</span></div>
                try:
                    driver.find_element_by_xpath('.//div[@class="tab" and @data-tab-type="overview"]').click()
                    try:
                        #<div class="infoEntity">
                        #    <label>Headquarters</label>
                        #    <span class="value">San Francisco, CA</span>
                        #</div>
                        headquarters = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Headquarters"]//following-sibling::*').text
                    except NoSuchElementException:
                        headquarters = -1
                    try:
                        size = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Size"]//following-sibling::*').text
                    except NoSuchElementException:
                        size = -1
                    try:
                        founded = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Founded"]//following-sibling::*').text
                    except (NoSuchElementException, StaleElementReferenceException):
                        founded = -1
                    try:
                        type_of_ownership = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Type"]//following-sibling::*').text
                    except NoSuchElementException:
                        type_of_ownership = -1
                    try:
                        industry = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Industry"]//following-sibling::*').text
                    except NoSuchElementException:
                        industry = -1
                    try:
                        sector = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Sector"]//following-sibling::*').text
                    except NoSuchElementException:
                        sector = -1
                    try:
                        revenue = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Revenue"]//following-sibling::*').text
                    except NoSuchElementException:
                        revenue = -1
                    try:
                        competitors = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Competitors"]//following-sibling::*').text
                    except NoSuchElementException:
                        competitors = -1
                except (NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException):  #Rarely, some job postings do not have the "Company" tab.
                    if NoSuchElementException:
                        time.sleep(1)
                        headquarters = -1
                        size = -1
                        founded = -1
                        type_of_ownership = -1
                        industry = -1
                        sector = -1
                        revenue = -1
                        competitors = -1
                    else:
                        driver.find_element_by_class_name("selected").click()
                        driver.find_element_by_class_name("SVG_Inline modal_closeIcon").click()
                        element = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.CLASS_NAME, "SVG_Inline-svg modal_closeIcon-svg")))
                        element.click()
                        pass

                if verbose:
                    print("Headquarters: {}".format(headquarters))
                    print("Size: {}".format(size))
                    print("Founded: {}".format(founded))
                    print("Type of Ownership: {}".format(type_of_ownership))
                    print("Industry: {}".format(industry))
                    print("Sector: {}".format(sector))
                    print("Revenue: {}".format(revenue))
                    print("Competitors: {}".format(competitors))
                    print("####################################################")

                jobs.append({"Job Title" : job_title,
                             "Salary Estimate" : salary_estimate,
                             "Job Description" : job_description,
                             "Rating" : rating,
                             "Company Name" : company_name,
                             "Location" : location,
                             "Headquarters" : headquarters,
                             "Size" : size,
                             "Founded" : founded,
                             "Type of ownership" : type_of_ownership,
                             "Industry" : industry,
                             "Sector" : sector,
                             "Revenue" : revenue,
                             "Competitors" : competitors})
                #You might
                #time.sleep(0.5)
        except (ElementClickInterceptedException, StaleElementReferenceException):
            alertObj = driver.switch_to.alert
            alertObj.accept()
            alertObj.dismiss()
            driver.find_element_by_class_name("selected").click()
            driver.find_element_by_class_name("SVG_Inline modal_closeIcon").click()
            element = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.CLASS_NAME, "SVG_Inline-svg modal_closeIcon-svg")))
            element.click()
            pass
            #add job to jobs

        #Clicking on the "next page" button
        # try:
        #     driver.find_element_by_xpath('.//li[@class="page"]//a').click()
        # except NoSuchElementException:
        #     print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
        #     break
        # time.sleep(5)
        try:
            driver.find_element_by_xpath('.//li[@class="next"]//a').click()
        except (ElementClickInterceptedException):
            #print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
            driver.find_element_by_class_name("selected").click()
            driver.find_element_by_class_name("SVG_Inline modal_closeIcon").click()
            element = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.CLASS_NAME, "SVG_Inline-svg modal_closeIcon-svg")))
            element.click()
            element.text
            pass
            #break

    return pd.DataFrame(jobs)  #This line converts the dictionary object into a pandas DataFrame.

df = gs.get_jobs(keyword, num_jobs, False, place)
Trying to get rid of this (screenshot of the element I need to close so the loop can continue).
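For what it's worth, By.CLASS_NAME (and the legacy find_element_by_class_name) only accepts a single class name, so compound values such as "SVG_Inline modal_closeIcon" will never match; a CSS selector can target both classes at once. A minimal sketch of a close helper under that assumption (the class names are taken from the attempts above and may differ on the live site):

from selenium.common.exceptions import TimeoutException

def close_popover_if_present(driver, timeout=3):
    # Try to dismiss the popover; quietly do nothing if it is not there this time.
    try:
        close_btn = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, ".SVG_Inline.modal_closeIcon, .modal_closeIcon")
            )
        )
        close_btn.click()
    except TimeoutException:
        pass

Calling this right before each click that previously raised ElementClickInterceptedException (the job buttons, the Company tab, the next-page link) would avoid repeating the close logic in every except branch.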