Web scraping Google Maps with Selenium uses too much data - python

I am scraping travel times from Google Maps. The code below scrapes travel times between 1 million random points in Tehran and a fixed destination, and it works perfectly fine. I also use multiprocessing to fetch travel times simultaneously. The results are fully reproducible, so feel free to run the code in a terminal (but not in an interactive session like Spyder, as the multiprocessing won't work there). On Google Maps, the value I am scraping is the displayed travel time (in this case, 22 min). Here is the code:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from multiprocessing import Process, Pipe, Pool, Value
import time
from multiprocessing.pool import ThreadPool
import threading
import gc

threadLocal = threading.local()

class Driver:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        self.driver.quit()  # clean up driver when we are cleaned up
        print('The driver has been "quitted".')

    @classmethod
    def create_driver(cls):
        the_driver = getattr(threadLocal, 'the_driver', None)
        if the_driver is None:
            print('Creating new driver.')
            the_driver = cls()
            threadLocal.the_driver = the_driver
        driver = the_driver.driver
        the_driver = None
        return driver
success = Value('i', 0)
error = Value('i', 0)

def f(x):
    global success
    global error
    with success.get_lock():
        success.value += 1
        print("Number of errors", success.value)
    with error.get_lock():
        error.value += 1
        print("counter.value:", error.value)

def scraper(url):
    """
    This now scrapes a single URL.
    """
    global success
    global error
    try:
        driver = Driver.create_driver()
        driver.get(url)
        time.sleep(1)
        trip_times = driver.find_element(By.XPATH, "//div[contains(@aria-labelledby,'section-directions-trip-title')]//span[@jstcache='198']")
        print("got data from: ", url)
        print(trip_times.text)
        with success.get_lock():
            success.value += 1
            print("Number of successful scrapes: ", success.value)
    except Exception as e:
        # print(f"Error: {e}")
        with error.get_lock():
            error.value += 1
            print("Number of errors", error.value)
import random

min_x = 35.617487
max_x = 35.783375
min_y = 51.132557
max_y = 51.492329

urls = []
for i in range(1000000):
    x = random.uniform(min_x, max_x)
    y = random.uniform(min_y, max_y)
    url = f'https://www.google.com/maps/dir/{x},{y}/35.8069533,51.4261312/@35.700769,51.5571612,21z'
    urls.append(url)

number_of_processes = min(2, len(urls))
start_time = time.time()
with ThreadPool(processes=number_of_processes) as pool:
    result_array = pool.map(scraper, urls)
    # Must ensure drivers are quit before threads are destroyed:
    del threadLocal
    # This should ensure that the __del__ method is run on class Driver:
    gc.collect()
    pool.close()
    pool.join()
print(result_array)
print("total time: ", round((time.time() - start_time) / 60, 1), "number of urls: ", len(urls))
But after having it run for only 24 hours, it has already used around 80 GB of data! Is there a way to make this more efficient in terms of data usage?
I suspect this excessive data usage is because Selenium has to load each URL completely every time before it can access the HTML and get the target node. Can I change anything in my code to prevent that and still get the travel time?
Please note that using the Google Maps API is not an option, because the limit is too small for my application and the service is not provided in my country.

You can use a Page Load Strategy.
Selenium WebDriver has three page load strategies:
normal - waits for all resources to download.
eager - DOM access is ready, but other resources like images may still be loading.
none - does not block WebDriver at all.
options.page_load_strategy = "none"  # ["normal", "eager", "none"]
It might help you (obviously it doesn't perform miracles, but it's better than nothing).
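For example, the option can be set where the ChromeOptions are built in the Driver class from the question. With "eager" or "none" the target node may not be present yet when find_element runs, so an explicit wait is a reasonable addition (a minimal sketch; the 10-second timeout is an arbitrary choice):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.page_load_strategy = "eager"  # or "none"; "normal" is the default
driver = webdriver.Chrome(options=options)

url = "https://www.google.com/maps/dir/35.7,51.3/35.8069533,51.4261312/"  # any of the URLs from the question
driver.get(url)
# The page may still be loading, so wait for the travel-time node explicitly
# instead of relying on time.sleep(1):
trip_times = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((
        By.XPATH,
        "//div[contains(@aria-labelledby,'section-directions-trip-title')]//span[@jstcache='198']"
    ))
)
print(trip_times.text)
driver.quit()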

Related

User Defined Function Returning Empty Dataframe before Running For Loop

I know questions very similar to mine have been asked many times, but I have just about reviewed them all and cannot solve my own code, so I'm hoping someone has the answer.
I'm trying to loop through csv downloads and append them together inside a user-defined function that calls a for loop. I've added print lines so that I can see where the function fails. The function prints 1, 2, and 3 and then returns a dataframe df that is empty.
Why is the for loop getting skipped and the df returning empty? When run outside of the user-defined function, it works perfectly.
Thanks in advance!
# LoadPackages
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import pandas as pd
# ================================================== download spp tie flows
# set directories
directory = r"E:/Data/sophie/line vision/sources/spp public/downloaded/"
driverPath = r"/Users/sophi/Downloads/chromedriver_win32/chromedriver.exe"
# define urls
page_url = "https://marketplace.spp.org/pages/historical-tie-flow"
prefix_download_url = ("https://marketplace.spp.org/file-browser-api/download/" +
"historical-tie-flow?path=%2F")
xpath = "//*[#id='main-content']/div/div/div[2]/div/div[3]/div/ul/li/a/span[#class='fname name' and contains(text(), '2021')]"
driver = webdriver.Chrome(ChromeDriverManager().install())
def download_and_append(page_url, prefix_download_url, xpath):
    driver.get(page_url)
    print(1)
    # create empty dataframe to append to
    df = pd.DataFrame()
    print(2)
    # retrieve data from page
    elements = driver.find_elements(By.XPATH, xpath)
    print(3)
    for element in elements:
        index = element.text.split()[0]
        print(index)
        data = pd.read_csv(prefix_download_url + index)
        print(4)
        # clean dataframe and concatenate to df
        df = pd.concat([df, data])
        print(5)
    return df

hourly_tie_flows_2021 = download_and_append(page_url, prefix_download_url, xpath)
hourly_tie_flows_2021

# ========================================================== export data
hourly_tie_flows_2021.to_csv(directory + "/spp_tie_flows_by_ba_2021.csv")
Short answer (add a sleep to let the JavaScript load completely), which took 205.64 seconds to complete:
from time import sleep
sleep(2)
elements = driver.find_elements(By.XPATH, xpath)
A longer answer (only use Selenium to get the URLs, then use requests and concurrent.futures to download the files concurrently), which took 35.08 seconds to complete:
import os
from concurrent.futures import ProcessPoolExecutor
from io import StringIO
from time import sleep

import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager


class Scrape:
    def __init__(self, **kwargs):
        self.year = kwargs.get("year")
        self.urls = self.get_urls()
        self.output_file_path = r"E:/Data/sophie/line vision/sources/spp public/downloaded/"
        self.driver_path = r"/Users/sophi/Downloads/chromedriver_win32/chromedriver.exe"

    def get_urls(self) -> list:
        root_url = "https://marketplace.spp.org"
        page_url = f"{root_url}/pages/historical-tie-flow"
        download_url = f"{root_url}/file-browser-api/download/historical-tie-flow?path="
        driver = webdriver.Chrome(ChromeDriverManager().install())
        driver.get(page_url)
        sleep(2)
        elements = driver.find_elements(By.XPATH, f"//*[contains(text(),'{self.year}')]")
        return [f"{download_url}{x.text}" for x in elements]

    def process_urls(self) -> None:
        with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
            (pd
             .concat(executor.map(self.download_data, self.urls))
             .sort_values("GMTTIME")
             .reset_index(drop=True)
             ).to_csv(f"{self.output_file_path}/spp_tie_flows_by_ba_{self.year}.csv")

    @staticmethod
    def download_data(url: str) -> pd.DataFrame:
        with requests.Session() as request:
            response = request.get(url)
            if response.status_code != 200:
                print(response.raise_for_status())
            return pd.read_csv(StringIO(response.text), sep=",")


if __name__ == "__main__":
    Scrape(year=2021).process_urls()

MultiThreading with Selenium in Python

I want to fetch data from both classes at the same time, but when I run the script both classes open in the same tab. After adding driver.execute_script(f"window.open {link}, 'new window')") I get an error for the Keeps class; I can only print from the Cardano class, not the Keeps class. I want to open 2 tabs at the same time. Is there any way to run both classes at the same time in different tabs in a stable way? Please help, I am a beginner with Selenium.
# Modules
from time import sleep
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from threading import *
import pandas as pd
# Code
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.minimize_window()
wait = WebDriverWait(driver, 30)
df = pd.read_excel('chains.xlsx', sheet_name='first')
chains_links = list(df['links'])
class Cardano(Thread):
    def run(self):
        cardano = 1
        for link in chains_links:
            if 'cardanoscan' in link:
                driver.get(link)
                sleep(0.5)
                try:
                    status = wait.until(EC.visibility_of_element_located(
                        (By.XPATH, "/html/body/div[2]/main/div/div/div[2]/div/div/div[1]/div[1]/div[1]/div[1]/div[1]/div[2]/button"))).text
                    saturation = wait.until(EC.visibility_of_element_located(
                        (By.XPATH, "/html/body/div[2]/main/div/div/div[2]/div/div/div[3]/div[1]/div/div/div/div/div[1]"))).text
                except:
                    status = None
                    saturation = None
                if status == "Active" and saturation != "0" and saturation != None:
                    print(
                        f"Cardano {cardano}: is {status} and Saturation is {saturation}")
                else:
                    print(f"Something wrong with Cardano {cardano}")
                cardano = cardano + 1

class Keeps(Thread):
    def run(self):
        keeps = 1
        for link in chains_links:
            if "allthekeeps" in link:
                # this is the line that throws the error described above
                driver.execute_script(f"window.open {link}, 'new window')")
                sleep(0.5)
                try:
                    fault = wait.until(EC.visibility_of_element_located(
                        (By.XPATH, "/html/body/div/div[2]/div/div/div[1]/div[2]/div[4]/div[2]/div"))).text
                except:
                    fault = None
                if fault != None and fault == "0":
                    print(f"Keeps {keeps}: is active with {fault} faults:")
                else:
                    print(
                        f"Something wrong with Keeps {keeps}: {fault} faults found")
                keeps = keeps + 1

t1 = Cardano()
t2 = Keeps()
t1.start()
t2.start()
t1.join()
t2.join()
driver.close()
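A pattern that avoids sharing one driver and one WebDriverWait between the two threads is to give each thread its own WebDriver instance, similar to the thread-local Driver class in the first question above. A minimal sketch, assuming the per-chain checks are factored into plain functions: check_cardano and check_keeps here are hypothetical placeholders for the bodies of the two run methods, and chains_links is the list read from chains.xlsx in the question.

from threading import Thread

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait


def check_cardano(driver, wait, link):
    # placeholder: the scraping logic from Cardano.run goes here
    print("cardano:", link)


def check_keeps(driver, wait, link):
    # placeholder: the scraping logic from Keeps.run goes here
    print("keeps:", link)


def worker(links, check_one):
    # each thread builds and owns its own browser, so the two loops
    # no longer fight over a single tab
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    wait = WebDriverWait(driver, 30)
    try:
        for link in links:
            driver.get(link)
            check_one(driver, wait, link)
    finally:
        driver.quit()


cardano_links = [l for l in chains_links if 'cardanoscan' in l]
keeps_links = [l for l in chains_links if 'allthekeeps' in l]

t1 = Thread(target=worker, args=(cardano_links, check_cardano))
t2 = Thread(target=worker, args=(keeps_links, check_keeps))
t1.start()
t2.start()
t1.join()
t2.join()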

selenium with threading, crashing and memory to be 99%

I want to check which Twitter pages exist; around 3,000 Twitter URLs need to be checked. The code below works as expected without threading. To save time, I added multithreading, but now it constantly crashes. I have checked Task Manager; memory usage is at 100%. Can anyone help me? I appreciate any help in advance.
import time
import threading
from threading import Thread

import xlsxwriter
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException

# check
def check_link(ceo_url):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(executable_path=driver_path, options=options)
    driver.get(ceo_url)
    try:
        # find an element which does not exist on an empty profile
        WebDriverWait(driver, 10).until(EC.visibility_of_element_located(
            (By.XPATH, './/div[@data-testid="primaryColumn"]//section[@aria-labelledby="accessible-list-0"]')))
        # if the element above exists, write the URL in column 1 of the Excel sheet
        worksheet.write(i + 1, 0, driver.current_url)
        i += 1
    except TimeoutException:
        # if not, write it in column 2
        worksheet.write(j + 1, 1, driver.current_url)
        j += 1
    driver.quit()
def open_file():
    my_file = open('information.txt', 'r')
    names = my_file.read().lower()
    data_into_list = names.replace(' ', '').split('\n')
    my_file.close()
    return data_into_list

def print_urls(data_into_list):
    CEO_names = data_into_list
    ceo_urls = []
    for CEO_name in CEO_names:
        url = 'https://twitter.com/{}'.format(CEO_name)
        ceo_urls.append(url)
    return ceo_urls

def main():
    i = 0
    j = 0
    threads = []
    workbook = xlsxwriter.Workbook('Twitter.xlsx')
    worksheet = workbook.add_worksheet('Sheet1')
    # read names from the txt file
    data_into_list = open_file()
    ceo_urls = print_urls(data_into_list)
    # add threads
    for i, ceo_url in enumerate(ceo_urls):
        threads.append(Thread(target=check_link, args=(ceo_url,)))
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

if __name__ == '__main__':
    main()
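One way to keep memory under control here is to cap the number of worker threads and reuse one browser per worker, instead of launching a new headless Chrome for every one of the 3,000 URLs, along the lines of the ThreadPool/threadLocal pattern in the first question. A minimal sketch, assuming a worker cap of 4 (an arbitrary number) and moving the Excel writes into the main thread so xlsxwriter is not touched from several threads at once; open_file and print_urls are the helpers from the question:

import threading
from concurrent.futures import ThreadPoolExecutor

import xlsxwriter
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

thread_local = threading.local()

def get_driver():
    # one browser per worker thread, reused for every URL that thread handles
    driver = getattr(thread_local, "driver", None)
    if driver is None:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        driver = webdriver.Chrome(options=options)
        thread_local.driver = driver
    return driver

def check_link(ceo_url):
    driver = get_driver()
    driver.get(ceo_url)
    try:
        WebDriverWait(driver, 10).until(EC.visibility_of_element_located(
            (By.XPATH, './/div[@data-testid="primaryColumn"]//section[@aria-labelledby="accessible-list-0"]')))
        return ceo_url, True
    except TimeoutException:
        return ceo_url, False

def main():
    ceo_urls = print_urls(open_file())  # helpers from the question above
    with ThreadPoolExecutor(max_workers=4) as executor:  # 4 is an arbitrary cap
        results = list(executor.map(check_link, ceo_urls))
    # write the results in the main thread only
    workbook = xlsxwriter.Workbook('Twitter.xlsx')
    worksheet = workbook.add_worksheet('Sheet1')
    found_row, missing_row = 1, 1
    for url, exists in results:
        if exists:
            worksheet.write(found_row, 0, url)
            found_row += 1
        else:
            worksheet.write(missing_row, 1, url)
            missing_row += 1
    workbook.close()

if __name__ == '__main__':
    main()

The per-thread drivers are left open until the process exits; for explicit cleanup, the Driver class with __del__ from the first question can be reused.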

How to periodically fetch records from a website using selenium?

I have a small script that fetches company data from a website. This website gets regularly updated with new company information. How can I update my csv with new records on a periodic basis? Also, as you can see in the code, I have used an explicit range for the pages; what other solutions are possible?
The following is the code -
from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from time import sleep
import csv
#navigate to the ystory companies page
#start collecting data from ystory
START_URL = 'https://yourstory.com/companies/search?page=1&hitsPerPage=30'
#when the collection populates 30 elements then click on next page
class CompDeetz():
    def __init__(self):
        self.browser = Firefox()
        self.browser.get(START_URL)
        sleep(20)
        self.browser.find_element_by_xpath('/html/body/div[12]/div/div/button').click()
        sleep(5)
        self.browser.find_element_by_xpath('/html/body/div[1]/div[4]').click()
        self.database = []

    def write_row(self, record):
        with open('test.csv', 'a') as t:
            writer = csv.writer(t)
            writer.writerows(record)

    def get_everything(self):
        all_list = [(a.text) for a in self.browser.find_elements_by_xpath('//tr[@class="hit"]')]
        all_records = []
        for company in all_list:
            record = company.split('\n')
            all_records.append(record)
        self.write_row(all_records)

    def next_page(self):
        self.browser.find_element_by_xpath('//ul[@class="ais-Pagination-list"]/li[7]/a').click()
        sleep(20)

def main():
    t = CompDeetz()
    t.get_everything()
    for i in range(33):
        t.next_page()
        t.get_everything()

if __name__ == "__main__":
    main()
Instead of having two different methods, get_everything and next_page, and calling them multiple times, you can have one method get_everything and call it once.
# requires these imports in addition to the ones in the question:
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.common.exceptions import TimeoutException

def get_everything(self):
    all_records = []
    nextPage = True
    while nextPage:
        all_list = [(a.text) for a in self.browser.find_elements_by_xpath('//tr[@class="hit"]')]
        for company in all_list:
            record = company.split('\n')
            all_records.append(record)
        try:
            nextPagelink = WebDriverWait(self.browser, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//a[@aria-label='Next page']")))
            self.browser.execute_script("arguments[0].scrollIntoView();", nextPagelink)
            self.browser.execute_script("arguments[0].click();", nextPagelink)
            sleep(5)  # wait for the next page to load
        # On the last page the next-page link is not available, so the wait times out
        except TimeoutException:
            nextPage = False
    self.write_row(all_records)
Note: take care of the pop-up that appears on the page. I hope you already have a mechanism to handle it.
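If the pop-up does not always appear, the two clicks in __init__ can be wrapped in a wait plus a try/except so a missing pop-up does not break the run. A small sketch (the 10-second timeout is an arbitrary choice, and the XPaths are the ones from the question):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def dismiss_popup(browser, xpath, timeout=10):
    # click the pop-up button if it shows up within `timeout` seconds, otherwise carry on
    try:
        WebDriverWait(browser, timeout).until(
            EC.element_to_be_clickable((By.XPATH, xpath))).click()
    except TimeoutException:
        pass

# in CompDeetz.__init__, instead of the fixed sleep(20)/sleep(5) before each click:
# dismiss_popup(self.browser, '/html/body/div[12]/div/div/button')
# dismiss_popup(self.browser, '/html/body/div[1]/div[4]')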

how to save data from multiple pages using webdriver into a single csv

So I'm trying to save data from Google Scholar using Selenium (webdriver). So far I can print the data that I want, but when I save it into a csv it only saves the first page.
from selenium import webdriver
from selenium.webdriver.common.by import By
# Import statements for explicit wait
from selenium.webdriver.support.ui import WebDriverWait as W
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
from csv import writer
exec_path = r"C:\Users\gvste\Desktop\proyecto\chromedriver.exe"
URL = r"https://scholar.google.com/citations?view_op=view_org&hl=en&authuser=2&org=8337597745079551909"
button_locators = ['//*[@id="gsc_authors_bottom_pag"]/div/button[2]', '//*[@id="gsc_authors_bottom_pag"]/div/button[2]', '//*[@id="gsc_authors_bottom_pag"]/div/button[2]']
wait_time = 3

driver = webdriver.Chrome(executable_path=exec_path)
driver.get(URL)
wait = W(driver, wait_time)
#driver.maximize_window()

for j in range(len(button_locators)):
    button_link = wait.until(EC.element_to_be_clickable((By.XPATH, button_locators[j])))
    address = driver.find_elements_by_class_name("gsc_1usr")

    #for post in address:
    #    print(post.text)

    time.sleep(4)

    with open('post.csv', 'a') as s:
        for i in range(len(address)):
            addresst = address
            #if addresst == 'NONE':
            #    addresst = str(address)
            #else:
            addresst = address[i].text.replace('\n', ',')
            s.write(addresst + '\n')

    button_link.click()
    time.sleep(4)

#driver.quit()
You only get the first page's data because your program stops after it clicks the next page button. You have to put all of that in a loop.
Notice I wrote range(7) because I know there are 7 pages to open; in reality we should never do that. Imagine if we had thousands of pages. We should add some logic to check whether the "next page" button exists (or is still enabled) and loop until it doesn't.
exec_path = r"C:\Users\gvste\Desktop\proyecto\chromedriver.exe"
URL = r"https://scholar.google.com/citations?view_op=view_org&hl=en&authuser=2&org=8337597745079551909"
button_locators = "/html/body/div/div[8]/div[2]/div/div[12]/div/button[2]"
wait_time = 3
driver = webdriver.Chrome(executable_path=exec_path)
driver.get(URL)
wait = W(driver, wait_time)
time.sleep(4)
# 7 pages. In reality, we should get this number programmatically
for page in range(7):
# read data from new page
address = driver.find_elements_by_class_name("gsc_1usr")
# write to file
with open('post.csv','a') as s:
for i in range(len(address)):
addresst = address[i].text.replace('\n',',')
s.write(addresst+ '\n')
# find and click next page button
button_link = wait.until(EC.element_to_be_clickable((By.XPATH, button_locators)))
button_link.click()
time.sleep(4)
Also, in the future you should look to change all these time.sleep calls to wait.until, because sometimes the page loads quicker and the program could do its job faster; or worse, your network might lag and a fixed sleep would break your script.
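Putting both suggestions together, the loop can keep clicking until the next-page button reports itself disabled and rely on waits instead of fixed sleeps. A sketch against the page structure from the question; treating the button's disabled attribute as the end-of-pages signal is an assumption about how Google Scholar marks the last page:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as W
from selenium.webdriver.support import expected_conditions as EC

exec_path = r"C:\Users\gvste\Desktop\proyecto\chromedriver.exe"  # as in the question
URL = r"https://scholar.google.com/citations?view_op=view_org&hl=en&authuser=2&org=8337597745079551909"
next_button_xpath = '//*[@id="gsc_authors_bottom_pag"]/div/button[2]'

driver = webdriver.Chrome(executable_path=exec_path)
driver.get(URL)
wait = W(driver, 10)

with open('post.csv', 'a') as s:
    while True:
        # wait for the author blocks instead of a fixed time.sleep(4)
        blocks = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "gsc_1usr")))
        for block in blocks:
            s.write(block.text.replace('\n', ',') + '\n')
        next_button = driver.find_element(By.XPATH, next_button_xpath)
        if next_button.get_attribute("disabled"):
            break  # assumed: the button is disabled on the last page
        next_button.click()
        # wait until the old results go stale before reading the next page
        wait.until(EC.staleness_of(blocks[0]))
driver.quit()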
