Scraping with Beautifulsoup-Python - python

I want to scrape the name of the hotel in the tripadvisor in each review page of the hotel.
I wrote a code in python which is very simple and I think that it isn't false.
But every time it stops at a different point(page for example the first time stopped in page 150 second time in the page 330).
I am 100% that my code are correct. Is there any possibility that tripadvisor block me every time?
I update the code and i use selenium too but the problem is still remain
The updated code is the following:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import os
import urllib.request
import time
import re
file2 = open(os.path.expanduser(r"~/Desktop/TripAdviser Reviews2.csv"), "wb")
file2.write(b"hotel,Address,HelpCount,HotelCount,Reviewer" + b"\n")
Checker ="REVIEWS"
# example option: add 'incognito' command line arg to options
option = webdriver.ChromeOptions()
option.add_argument("--incognito")
# create new instance of chrome in incognito mode
browser = webdriver.Chrome(executable_path='/Users/thimios/AppData/Local/Google/chromedriver.exe', chrome_options=option)
#print(browser)
# go to website of interest
for i in range(10,50,10):
Websites=["https://www.tripadvisor.ca/Hotel_Review-g190479-d3587956-Reviews-or"+str(i)+"-The_Thief-Oslo_Eastern_Norway.html#REVIEWS"]
print(Websites)
for theurl in Websites:
thepage=browser.get(theurl)
thepage1 = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage1, "html.parser")
# wait up to 10 seconds for page to load
timeout = 5
try:
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, '//*[#id="HEADING"]')))
#print(WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, '//*[#id="HEADING"]'))))
except TimeoutException:
print("Timed out waiting for page to load")
browser.quit()
# Extract the helpful votes, hotel reviews
helpcountarray = ""
hotelreviewsarray = ""
for profile in soup.findAll(attrs={"class": "memberBadging g10n"}):
image = profile.text.replace("\n", "|||||").strip()
#print(image)
if image.find("helpful vote") > 0:
counter = re.findall('\d+', image.split("helpful vote", 1)[0].strip()[-4:])
if len(helpcountarray) == 0:
helpcountarray = [counter]
else:
helpcountarray.append(counter)
elif image.find("helpful vote") < 0:
if len(helpcountarray) == 0:
helpcountarray = ["0"]
else:
helpcountarray.append("0")
print(helpcountarray)
#print(len(helpcountarray))
if image.find("hotel reviews") > 0:
counter = re.findall('\d+', image.split("hotel reviews", 1)[0].strip()[-4:])
if len(hotelreviewsarray) == 0:
hotelreviewsarray = counter
else:
hotelreviewsarray.append(counter)
elif image.find("hotel reviews") < 0:
if len(hotelreviewsarray) == 0:
hotelreviewsarray = ['0']
else:
hotelreviewsarray.append("0")
print(hotelreviewsarray)
#print(len(hotelreviewsarray))
hotel_element = browser.find_elements_by_xpath('//*[#id="HEADING"]')
Address_element = browser.find_elements_by_xpath('//*[#id="HEADING_GROUP"]/div/div[3]/address/div/div[1]')
for i in range(0,10):
print(i)
for x in hotel_element:
hotel = x.text
print(hotel)
#print(type(hotel))
for y in Address_element:
Address = y.text.replace(',', '').replace('\n', '').strip()
print(Address)
#print(type(Address))
HelpCount = helpcountarray[i]
HelpCount = " ".join(str(w) for w in HelpCount)
print(HelpCount)
#print(type(HelpCount))
HotelCount = hotelreviewsarray[i]
HotelCount = " ".join(str(w) for w in HotelCount)
print(HotelCount)
#print(type(HotelCount))
Reviewer = soup.findAll(attrs={"class": "username mo"})[i].text.replace(',', ' ').replace('”', '').replace('“', '').replace('"', '').strip()
print(Reviewer)
Record2 = hotel + "," + Address +"," + HelpCount +"," + HotelCount+"," +Reviewer
if Checker == "REVIEWS":
file2.write(bytes(Record2, encoding="ascii", errors='ignore') + b"\n")
file2.close()
I read somewhere that I should add a header. Something like
headers={'user-agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
in order for the web site to allow me to scrape it. Is that true?
Thanks for your help

Yes. there is such a possibility.
Websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages.
The default User-Agent typically refers to automated processes implemented using a python software, so you will want to change it to browser like User-Agent.
Even though, I do not believe you were blocked by TripAdvisor.

Try to slow down the downloading by
import time
...
time.sleep(1)

No, try REAL life slowing it down, using Backoff so the target website doesn't think you're a bot...
import time
for term in ["web scraping", "web crawling", "scrape this site"]:
t0 = time.time()
r = requests.get("http://example.com/search", params=dict(
query=term
))
response_delay = time.time() - t0
time.sleep(10*response_delay) # wait 10x longer than it took them to respond
source:
https://blog.hartleybrody.com/web-scraping-cheat-sheet/#delays-and-backing-off

Related

How to run 'implicity_wait()' in a 'for loop' with respect to Web Scraping using Python?

Actually, I want to scrape the 'title' and 'product description' for all the products and from all the pages, and then save it into the '.csv' file.
URL:- hhttps://www.nykaa.com/makeup/body-art/c/3024?page_no=1&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
This is what, I have tried.
from msilib.schema import Error
from os import sep
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv
title_list = []
para_list = []
expiry_list = []
country_list = []
importer_list = []
address_list = []
myDict = {'body-art': 3024}
browser = webdriver.Chrome(
r'C:\Users\paart\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe')
browser.maximize_window()
browser.implicitly_wait(20)
for item_name in myDict:
page_num = 1
while True:
try:
page = f"https://www.nykaa.com/makeup/{item_name}/c/{myDict[item_name]}?page_no={page_num}&sort=popularity&ptype=lst&id={myDict[item_name]}&root=nav_2&dir=desc&order=popularity&eq=desktop"
print(page)
requests.get(page)
soup = BeautifulSoup(requests.get(page).content, 'html.parser')
urls = [item.get("href")
for item in soup.find_all("a", class_="css-qlopj4")]
# print(urls)
if len(urls) == 0:
break
for i in range(0, 2): #Since, it's a huge amount of data, that's why I have taken 2 products on one page, otherwise it will be in the range(0,30). It will cover all the products from an individual pages.
try:
url = urls[i]
browser.get("https://www.nykaa.com" + url)
title_data = browser.find_elements(
By.CLASS_NAME, 'css-1gc4x7i').text
print(title_data)
for t in title_data:
title_list.append(t)
browser.execute_script("document.body.style.zoom='50%'")
browser.execute_script("document.body.style.zoom='100%'")
# Creates "load more" button object.
browser.implicitly_wait(20)
loadMore = browser.find_element(
By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")
loadMore.click()
browser.implicitly_wait(20)
desc_data = browser.find_elements(By.ID, 'content-details')
for desc in desc_data:
para_details = browser.find_element(By.XPATH,
'//*[#id="content-details"]/p[1]').text
para_list.append(para_details)
expiry = browser.find_element(By.XPATH,
'//*[#id="content-details"]/p[2]').text
expiry_list.append(expiry)
country = browser.find_element(By.XPATH,
'//*[#id="content-details"]/p[3]').text
country_list.append(country)
importer = browser.find_element(By.XPATH,
'//*[#id="content-details"]/p[4]').text
importer_list.append(importer)
address = browser.find_element(By.XPATH,
'//*[#id="content-details"]/p[5]').text
address_list.append(address)
except:
break
except:
break
page_num += 1
title_list = [i.split('.css', 1)[0] for i in title_list]
print(*title_list, sep="\n")
print(*para_list, sep="\n")
print(*expiry_list, sep="\n")
print(*country_list, sep="\n")
print(*importer_list, sep="\n")
print(*address_list, "\n")
data_new = {"Title": title_list, "Para": para_list, "Expiry": expiry_list,
"Country": country_list, "Importer": importer_list, "Address": address_list}
df = pd.DataFrame(data_new)
df.to_csv("nykaa_makeup_bodyArt_new.csv")
# print(df)
The Output, I am receiving is as:
DevTools listening on ws://127.0.0.1:30887/devtools/browser/a222842a-7ce3-4070-a684-7e8bb8772279
https://www.nykaa.com/makeup/body-art/c/3024?page_no=1&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=2&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=3&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=4&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=5&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
PS E:\Web Scraping - Nykaa>
I think, due to the implicity_wait() function, it's not able to fetch the product's title & description. After my code runs, the '.csv' file is created, but it's a blank file. Maybe, I am wrong. Please help me regarding this. Do I need change to add/change some parts of the code?
Thanks 🙏🏻
There is no need to set browser.implicitly_wait multiple times.
browser.implicitly_wait is setting the timeout, how much time the driver will try to pool the DOM in order to locate an element on the page before it races exception.
browser.implicitly_wait is normally set per driver session.
This is definetely not a pause command like time.sleep.
So, in case you need to put a pause in your code you should use time.sleep while this is not recommended.
Also, it's much preferably to use Expected Conditions explicit waits rather than browser.implicitly_wait since browser.implicitly_wait waits for element presence i.e. it will release the run when element is just appeared while it may not be completely rendered.
In order to wait for element completely rendered and containing it text you should use something like
wait.until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")))
Where "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]" is XPath of element you wishe to get the text from.

Problem with Instagram Scraping using Selenium when trying to append the urls to a list of urls

Guys i maybe have a tricky problem over here.
I was trying to made a bot that will download all the photos/videos urls of an instagram account, append them to a list and in the end save them to a file. But while i was seeing if it was working, i find out that the list of urls, it was containing 51 urls all the time, and every time i was appending new urls while the program was working, those urls on the list was changing with the new 51 urls and the last urls was removed from the list, instead of add them up to the existing urls to the list and continue appending the new ones. Why is happening such a thing? I need your knowledge guys :)
The code of the bot is below:
#Here is the run.py from where I'm running the program
import os
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
import autoit
from selenium.webdriver.common.keys import Keys
import requests
import coockies
import PopUpsClose
import login
import link
import url_extraxction
def main():
#Makes an mobile emulator to start Instagram like a smartphone
mobile_emulation = {
"deviceMetrics": { "width": 360, "height": 640, "pixelRatio": 3.0 },
"userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19" }
chrome_options = Options()
chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
browser = webdriver.Chrome(chrome_options = chrome_options)
browser.get('https://www.instagram.com/accounts/login/')
coockies.close_coockies(browser)
login.Insta_login(browser)
PopUpsClose.pop_up(browser)
######################################
#Here it takes the url from the file
url = link.page_link(browser)
browser.get(url)
sleep(5)
#Scrolling down to the page and getting the URLS
url_extraxction.extract(browser, url)
main()
Here is the login function
from time import sleep
def Insta_login(browser):
login_file = open(r'C:\Users\bilakos\Desktop\PYTHON_PROJECTS\InstaAutoPhotoUpload\login.txt', 'r')
username = login_file.readline()
while username != '':
password = login_file.readline()
username_ = username.rstrip("\n")
password = password.rstrip("\n")
username = login_file.readline()
sleep(2)
browser.find_element_by_xpath("""//*[#id="loginForm"]/div[1]/div[3]/div/label/input""").send_keys(username_)
browser.find_element_by_xpath("""//*[#id="loginForm"]/div[1]/div[4]/div/label/input""").send_keys(password)
sleep(2)
browser.find_element_by_xpath("""/html/body/div[1]/section/main/div[1]/div/div/div/form/div[1]/div[6]/button/div""").click()
sleep(10)
login_file.close()
Here is the coockies function
def close_coockies(browser):
coockies_accept = browser.find_element_by_xpath("""/html/body/div[2]/div/div/div/div[2]/button[1]""")
coockies_accept.click()
Here is the PopUpsClose function
from time import sleep
def pop_up(browser):
#Εδώ βρίσκει που είναι σημείο για να κλείσει το 1ο Pop Up
not_now_button = browser.find_element_by_xpath("""/html/body/div[1]/section/main/div/div/div/button""")
not_now_button.click()
sleep(10)
#Εδώ βρίσκει που είναι σημείο για να κλείσει το 2ο Pop Up
not_now_button2 = browser.find_element_by_xpath("""/html/body/div[4]/div/div/div/div[3]/button[2]""")
not_now_button2.click()
sleep(2)
And last is the url_extraction function in where i have the problem
from time import sleep
import requests
import os
def extract(browser, url):
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 OPR/73.0.3856.329"}
requests.get(url, headers = header)
#SCROLL DOWN
print("This process maybe it will take like 5 minutes.\n", "Don't close the program......")
last_height = 0
proceed = ''
while True:
browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')
sleep(1)
#GET THE URLS
elements = browser.find_elements_by_xpath('//a[#href]')
links = []
for elem in elements:
urls = elem.get_attribute('href')
if urls not in links and 'p' in urls.split('/'):
links.append(urls)
print(links)
sleep(2)
new_height = browser.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
if False:
proceed = False
else:
proceed = True
sleep(10)
#Create a folder with the name of the profile
if proceed == True:
name = browser.find_element_by_class_name("_7UhW9.fKFbl.yUEEX.KV-D4.fDxYl")
text = name.text
print("Wait to create a Folder to pass the extracted links.\nPlease don't close the program.")
print('' * 2)
sleep(5)
path = "C:\\Users\\bilakos\\Desktop\\PYTHON_PROJECTS\\InstaAutoPhotoUpload\\" + text
sleep(2)
try:
os.mkdir(path)
link_extraction = open('C:\\Users\\bilakos\\Desktop\\PYTHON_PROJECTS\\InstaAutoPhotoUpload\\' + text
+ '\\extracted_links.txt', 'w')
sleep(2)
print("The extracted_links.txt file is created.")
print('' * 2)
for i in links:
link_extraction.write(i + '\n')
link_extraction.close()
sleep(2)
print('The links transferred succesfully to the file.')
except FileExistsError:
print('The file already exist.')
link_extraction = open('C:\\Users\\bilakos\\Desktop\\PYTHON_PROJECTS\\InstaAutoPhotoUpload\\' + text
+ '\\extracted_links.txt', 'w')
sleep(2)
print("The extracted_links.txt file is created.")
print('' * 2)
for i in links:
link_extraction.write(i + '\n')
link_extraction.close()
sleep(2)
print('The links transferred successfully to the file.')
Inside the url_extraction function i have a #GET THE URLS and after that is where the problem occurs.
in your while loop you are redefining the list everytime you scroll. so in effect you are only saving the last scroll to file.
def extract(browser, url):
...
while True:
# scroll down
...
links = [] # <--- (1) ---
for elem in elements:
urls = elem.get_attribute('href')
if urls not in links and 'p' in urls.split('/'):
links.append(urls) # <--- (2) ---
print(links)
...
# check if at end and if yes then break out of loop
at (1) you are defining a new list. at (2) you are appending to the list. but in the next iteration of the while loop you are again defining a new list at (1) and the previous items are lost.
to keep the results you must define the list outside of the while loop.
def extract(browser, url):
...
links = [] # <--- (1) ---
while True:
# scroll down
...
for elem in elements:
urls = elem.get_attribute('href')
if urls not in links and 'p' in urls.split('/'):
links.append(urls) # <--- (2) ---
print(links)
...
# check if at end and if yes then break out of loop

How to stop selenium scraper from redirecting to another internal weblink of the scraped website?

Was wondering if anyone knows of a way for instructing a selenium script to avoid visiting/redirecting to an internal page that wasn't part of the code. Essentially, my code opens up this page:
https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20
keeps clicking on show more button until there's none (at end of page) - which by then - it should have collected the links of all the products listed on the page it scrolled through till the end, then visit each one respectively.
What happens instead, it successfully clicks on show more till the end of the page, but then it visits this weird promotion page of the same website instead of following each of the gathered links respectively and then scraping further data points located off each of those newly opened ones.
In a nutshell, would incredibly appreciate it if someone can explain how to avoid this automated redirection on its own! And this is the code in case someone can gratefully nudge me in the right direction :)
from selenium.webdriver import Chrome
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException
import json
import selenium.common.exceptions as exception
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
webdriver = '/Users/karimnabil/projects/selenium_js/chromedriver-1'
driver = Chrome(webdriver)
driver.implicitly_wait(5)
url = 'https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20'
driver.get(url)
links_list = []
coins_list = []
all_names = []
all_cryptos = []
all_links = []
all_twitter = []
all_locations = []
all_categories = []
all_categories2 = []
wait = WebDriverWait(driver, 2)
sign_in = driver.find_element_by_xpath("//li[#class='nav-item nav-guest']/a")
sign_in.click()
time.sleep(2)
user_name = wait.until(EC.presence_of_element_located((By.XPATH, "//input[#name='login']")))
user_name.send_keys("karimnsaber95#gmail.com")
password = wait.until(EC.presence_of_element_located((By.XPATH, "//input[#name='password']")))
password.send_keys("PleomaxCW#2")
signIn_Leave = driver.find_element_by_xpath("//div[#class='form-group text-center']/button")
signIn_Leave.click()
time.sleep(3)
while True:
try:
loadMoreButton = driver.find_element_by_xpath("//button[#class='btn btn-outline-primary']")
time.sleep(2)
loadMoreButton.click()
time.sleep(2)
except exception.StaleElementReferenceException:
print('stale element')
break
print('no more elements to show')
try:
company_links = driver.find_elements_by_xpath("//div[#class='companies-list items-infinity']/div[position() > 3]/div[#class='media-body']/div[#class='title']/a")
for link in company_links:
links_list.append(link.get_attribute('href'))
except:
pass
try:
with open("links_list.json", "w") as f:
json.dump(links_list, f)
with open("links_list.json", "r") as f:
links_list = json.load(f)
except:
pass
try:
for link in links_list:
driver.get(link)
name = driver.find_element_by_xpath("//div[#class='title']/h1").text
try:
show_more_coins = driver.find_element_by_xpath("//a[#data-original-title='Show more']")
show_more_coins.click()
time.sleep(1)
except:
pass
try:
categories = driver.find_elements_by_xpath("//div[contains(#class, 'categories-list')]/a")
categories_list = []
for category in categories:
categories_list.append(category.text)
except:
pass
try:
top_page_categories = driver.find_elements_by_xpath("//ol[#class='breadcrumb']/li/a")
top_page_categories_list = []
for category in top_page_categories:
top_page_categories_list.append(category.text)
except:
pass
coins_links = driver.find_elements_by_xpath("//div[contains(#class, 'company-coins')]/a")
all_coins = []
for coin in coins_links:
all_coins.append(coin.get_attribute('href'))
try:
location = driver.find_element_by_xpath("//div[#class='addresses mt-3']/div/div/div/div/a").text
except:
pass
try:
twitter = driver.find_element_by_xpath("//div[#class='links mt-2']/a[2]").get_attribute('href')
except:
pass
try:
print('-----------')
print('Company name is: {}'.format(name))
print('Potential Categories are: {}'.format(categories_list))
print('Potential top page categories are: {}'.format(top_page_categories_list))
print('Supporting Crypto is:{}'.format(all_coins))
print('Registered location is: {}'.format(location))
print('Company twitter profile is: {}'.format(twitter))
time.sleep(1)
except:
pass
all_names.append(name)
all_categories.append(categories_list)
all_categories2.append(top_page_categories_list)
all_cryptos.append(all_coins)
all_twitter.append(twitter)
all_locations.append(location)
except:
pass
df = pd.DataFrame(list(zip(all_names, all_categories, all_categories2, all_cryptos, all_twitter, all_locations)), columns=['Company name', 'Categories1', 'Categories2', 'Supporting Crypto', 'Twitter Handle', 'Registered Location'])
CryptoWerk_Data = df.to_csv('CryptoWerk4.csv', index=False)
Redirect calls happen for two reasons, in your case either by executing some javascript code when clicking the last time on the load more button or by receiving an HTTP 3xx code, which is the least likely in your case.
So you need to identify when this javascript code is executed and send an ESC_KEY before it loads and then executing the rest of your script.
You could also scrape the links and append them to your list before clicking the load more button and each time it is clicked, make an if statement the verify the link of the page you're in, if it is that of the promotion page then execute the rest of your code, else click load more.
while page_is_same:
scrape_elements_add_to_list()
click_load_more()
verify_current_page_link()
if current_link_is_same != link_of_scraped_page:
page_is_same = False
# rest of the code here

how to save data from multiple pages using webdriver into a single csv

so i'm trying to save data from googlescholar using selenium (webdriver) and so far i can print the data that i want, but i when i saved it into a csv it only saves the first page
from selenium import webdriver
from selenium.webdriver.common.by import By
# Import statements for explicit wait
from selenium.webdriver.support.ui import WebDriverWait as W
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
from csv import writer
exec_path = r"C:\Users\gvste\Desktop\proyecto\chromedriver.exe"
URL = r"https://scholar.google.com/citations?view_op=view_org&hl=en&authuser=2&org=8337597745079551909"
button_locators = ['//*[#id="gsc_authors_bottom_pag"]/div/button[2]', '//*[#id="gsc_authors_bottom_pag"]/div/button[2]','//*[#id="gsc_authors_bottom_pag"]/div/button[2]']
wait_time = 3
driver = webdriver.Chrome(executable_path=exec_path)
driver.get(URL)
wait = W(driver, wait_time)
#driver.maximize_window()
for j in range(len(button_locators)):
button_link = wait.until(EC.element_to_be_clickable((By.XPATH, button_locators[j])))
address = driver.find_elements_by_class_name("gsc_1usr")
#for post in address:
#print(post.text)
time.sleep(4)
with open('post.csv','a') as s:
for i in range(len(address)):
addresst = address
#if addresst == 'NONE':
# addresst = str(address)
#else:
addresst = address[i].text.replace('\n',',')
s.write(addresst+ '\n')
button_link.click()
time.sleep(4)
#driver.quit()
You only get one first page data because your program stops after it clicks next page button. You have to put all that in a for loop.
Notice i wrote in range(7), because I know there are 7 pages to open, in reality we should never do that. Imagine if we have thousands of pages. We should add some logic to check if the "next page button" exists or something and loop until it doesn't
exec_path = r"C:\Users\gvste\Desktop\proyecto\chromedriver.exe"
URL = r"https://scholar.google.com/citations?view_op=view_org&hl=en&authuser=2&org=8337597745079551909"
button_locators = "/html/body/div/div[8]/div[2]/div/div[12]/div/button[2]"
wait_time = 3
driver = webdriver.Chrome(executable_path=exec_path)
driver.get(URL)
wait = W(driver, wait_time)
time.sleep(4)
# 7 pages. In reality, we should get this number programmatically
for page in range(7):
# read data from new page
address = driver.find_elements_by_class_name("gsc_1usr")
# write to file
with open('post.csv','a') as s:
for i in range(len(address)):
addresst = address[i].text.replace('\n',',')
s.write(addresst+ '\n')
# find and click next page button
button_link = wait.until(EC.element_to_be_clickable((By.XPATH, button_locators)))
button_link.click()
time.sleep(4)
also in the future you should look to change all these time.sleeps to wait.until. Because sometimes your page loads quicker, and the program could do it's job faster. Or even worse, your network might get a lag and that would screw up your script.

Selenium Python webscraper really slow

I'm a newbie getting into web scrapers. I've made something that works, but it takes hours and hours to get everything I need. I read something about using parallel processes to process the URLs but I have no clue how to go about it and incorporate it in what I already have. Help is much appreciated!
Here is my, still extremely messy, code. I'm still learning :)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException
import time
import random
import pprint
import itertools
import csv
import pandas as pd
start_url = "https://www.nationalevacaturebank.nl/vacature/zoeken?query=&location=&distance=city&limit=100&sort=relevance&filters%5BcareerLevel%5D%5B%5D=Starter&filters%5BeducationLevel%5D%5B%5D=MBO"
driver = webdriver.Firefox()
driver.set_page_load_timeout(20)
driver.get(start_url)
driver.find_element_by_xpath('//*[#id="form_save"]').click() #accepts cookies
wait = WebDriverWait(driver, random.randint(1500,3200)/1000.0)
j = random.randint(1500,3200)/1000.0
time.sleep(j)
num_jobs = int(driver.find_element_by_xpath('/html/body/div[3]/div/main/div[2]/div[3]/div/header/h2/span').text)
num_pages = int(num_jobs/102)
urls = []
list_of_links = []
for i in range(num_pages+1):
try:
elements = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[#id="search-results-container"]//article/job/a')))
for i in elements:
list_of_links.append(i.get_attribute('href'))
j = random.randint(1500,3200)/1000.0
time.sleep(j)
if 'page=3' not in driver.current_url:
driver.find_element_by_xpath('//html/body/div[3]/div/main/div[2]/div[3]/div/paginator/div/nav[1]/ul/li[6]/a').click()
else:
driver.find_element_by_xpath('//html/body/div[3]/div/main/div[2]/div[3]/div/paginator/div/nav[1]/ul/li[5]/a').click()
url = driver.current_url
if url not in urls:
print(url)
urls.append(url)
else:
break
except:
continue
set_list_of_links = list(set(list_of_links))
print(len(set_list_of_links), "results")
driver.close()
def grouper(n, iterable):
it = iter(iterable)
while True:
chunk = tuple(itertools.islice(it, n))
if not chunk:
return
yield chunk
def remove_empty_lists(l):
keep_going = True
prev_l = l
while keep_going:
new_l = remover(prev_l)
#are they identical objects?
if new_l == prev_l:
keep_going = False
#set prev to new
prev_l = new_l
#return the result
return new_l
def remover(l):
newlist = []
for i in l:
if isinstance(i, list) and len(i) != 0:
newlist.append(remover(i))
if not isinstance(i, list):
newlist.append(i)
return newlist
vacatures = []
chunks = grouper(100, set_list_of_links)
chunk_count = 0
for chunk in chunks:
chunk_count +=1
print(chunk_count)
j = random.randint(1500,3200)/1000.0
time.sleep(j)
for url in chunk:
driver = webdriver.Firefox()
driver.set_page_load_timeout(20)
try:
driver.get(url)
driver.find_element_by_xpath('//*[#id="form_save"]').click() #accepts cookies
vacature = []
vacature.append(url)
j = random.randint(1500,3200)/1000.0
time.sleep(j)
elements = driver.find_elements_by_tag_name('dl')
p_elements = driver.find_elements_by_tag_name('p')
li_elements = driver.find_elements_by_tag_name('li')
for i in elements:
if "Salaris:" not in i.text:
vacature.append(i.text)
running_text = list()
for p in p_elements:
running_text.append(p.text)
text= [''.join(running_text)]
remove_ls = ['vacatures', 'carrièretips', 'help', 'inloggen', 'inschrijven', 'Bezoek website', 'YouTube',
'Over Nationale Vacaturebank', 'Werken bij de Persgroep', 'Persberichten', 'Autotrack', 'Tweakers',
'Tweakers Elect', 'ITBanen', 'Contact', 'Carrière Mentors', 'Veelgestelde vragen',
'Vacatures, stages en bijbanen', 'Bruto Netto Calculator', 'Salariswijzer', 'Direct vacature plaatsen',
'Kandidaten zoeken', 'Bekijk de webshop', 'Intermediair', 'Volg ons op Facebook']
for li in li_elements:
if li.text not in remove_ls:
text.append(li.text)
text = ''. join(text)
vacature.append(text)
vacatures.append(vacature)
driver.close()
except TimeoutException as ex:
isrunning = 0
print("Exception has been thrown. " + str(ex))
driver.close()
except NoSuchElementException:
continue
Python Selenium webdriver is not thread-safe. This means your browser can not correctly consume asynchronous calls from multiple threads. Try to scrape websites with requests and bs4 + lxml. It's much faster than Selenium. This answer can be helpful.
You're using Firefox which is slower than Chrome in almost all real-life applications.
Xpath is the slowest selector, match by id or class. If that is not possible then by CSS.
Use headless mode and don't load images unless you need to.
You can use Scrapy and this is much faster and more flexible than anything. See link for more information.

Categories

Resources