Selenium with threading crashes, memory at 99% - Python

I want to check which pages exist on Twitter; about 3000 Twitter URLs need to be checked. The code below works as expected without threading. To save time, I added multiple threads, but now it constantly crashes. I checked Task Manager and memory usage is at 100%. Can anyone help me? Any help is appreciated in advance.
import time
import threading
from threading import Thread

import xlsxwriter
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

driver_path = 'chromedriver'  # path to the chromedriver executable

# check whether the profile page exists
def check_link(ceo_url):
    global i, j  # counters and worksheet are shared by all threads
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(executable_path=driver_path, options=options)
    driver.get(ceo_url)
    try:
        # find an element which does not exist on an empty profile page
        WebDriverWait(driver, 10).until(EC.visibility_of_element_located(
            (By.XPATH, './/div[@data-testid="primaryColumn"]//section[@aria-labelledby="accessible-list-0"]')))
        # if the element is present, write the URL in column 1
        worksheet.write(i + 1, 0, driver.current_url)
        i += 1
    except TimeoutException:
        # if not, write it in column 2
        worksheet.write(j + 1, 1, driver.current_url)
        j += 1
    driver.quit()

def open_file():
    my_file = open('information.txt', 'r')
    names = my_file.read().lower()
    data_into_list = names.replace(' ', '').split('\n')
    my_file.close()
    return data_into_list

def print_urls(data_into_list):
    CEO_names = data_into_list
    ceo_urls = []
    for CEO_name in CEO_names:
        url = 'https://twitter.com/{}'.format(CEO_name)
        ceo_urls.append(url)
    return ceo_urls

def main():
    global i, j, worksheet
    i = 0
    j = 0
    threads = []
    workbook = xlsxwriter.Workbook('Twitter.xlsx')
    worksheet = workbook.add_worksheet('Sheet1')
    # read names from the txt file
    data_into_list = open_file()
    ceo_urls = print_urls(data_into_list)
    # add one thread per URL
    for ceo_url in ceo_urls:
        threads.append(Thread(target=check_link, args=(ceo_url,)))
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

if __name__ == '__main__':
    main()
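A likely cause of the crash: the loop above starts one Chrome instance per URL, so roughly 3000 headless browsers are launched at once. Below is a minimal sketch of a bounded alternative, assuming chromedriver is on the PATH and that the (url, exists) results are written to the workbook from the main thread afterwards; names such as check_profile, run_checks and max_workers are illustrative, not from the original post.

from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def check_profile(ceo_url):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)  # assumes chromedriver is on PATH
    try:
        driver.get(ceo_url)
        WebDriverWait(driver, 10).until(EC.visibility_of_element_located(
            (By.XPATH, './/div[@data-testid="primaryColumn"]'
                       '//section[@aria-labelledby="accessible-list-0"]')))
        return ceo_url, True   # profile page exists
    except TimeoutException:
        return ceo_url, False  # element never appeared
    finally:
        driver.quit()

def run_checks(ceo_urls, max_workers=4):
    # only max_workers Chrome instances are alive at any one time
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(check_profile, ceo_urls))

The main thread can then loop over the returned pairs and call worksheet.write itself, which keeps xlsxwriter out of the worker threads.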

Related

User Defined Function Returning Empty Dataframe before Running For Loop

I know questions very similar to mine have been asked many times, but I have reviewed just about all of them and still cannot fix my own code, so I'm hoping someone has the answer.
I'm trying to loop through csv downloads and append them together inside a user-defined function that runs a for loop. I've added print lines so that I can see where the function fails. The function prints 1, 2, 3 and then returns the dataframe df, which is empty.
Why is the for loop getting skipped and df returned empty? When run outside of the user-defined function, it works perfectly.
Thanks in advance!
# LoadPackages
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import pandas as pd

# ================================================== download spp tie flows
# set directories
directory = r"E:/Data/sophie/line vision/sources/spp public/downloaded/"
driverPath = r"/Users/sophi/Downloads/chromedriver_win32/chromedriver.exe"

# define urls
page_url = "https://marketplace.spp.org/pages/historical-tie-flow"
prefix_download_url = ("https://marketplace.spp.org/file-browser-api/download/" +
                       "historical-tie-flow?path=%2F")
xpath = "//*[@id='main-content']/div/div/div[2]/div/div[3]/div/ul/li/a/span[@class='fname name' and contains(text(), '2021')]"

driver = webdriver.Chrome(ChromeDriverManager().install())

def download_and_append(page_url, prefix_download_url, xpath):
    driver.get(page_url)
    print(1)
    # create empty dataframe to append to
    df = pd.DataFrame()
    print(2)
    # retrieve data from page
    elements = driver.find_elements(By.XPATH, xpath)
    print(3)
    for element in elements:
        index = element.text.split()[0]
        print(index)
        data = pd.read_csv(prefix_download_url + index)
        print(4)
        # clean dataframe and concatenate to df
        df = pd.concat([df, data])
        print(5)
    return df

hourly_tie_flows_2021 = download_and_append(page_url, prefix_download_url, xpath)
hourly_tie_flows_2021

# ========================================================== export data
hourly_tie_flows_2021.to_csv(directory + "/spp_tie_flows_by_ba_2021.csv")
Short answer (add a sleep so the JavaScript can finish loading), which took 205.64 seconds to complete:
from time import sleep
sleep(2)
elements = driver.find_elements(By.XPATH, xpath)
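If a fixed sleep feels brittle, an explicit wait on the same XPath is another option (a small sketch in the same setup, not part of the timed answer above):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the file links instead of sleeping a fixed 2 seconds
elements = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.XPATH, xpath))
)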
A longer answer (only use Selenium to get the URLs, then use requests and concurrent.futures to download the files concurrently), which took 35.08 seconds to complete:
import os
from concurrent.futures import ProcessPoolExecutor
from io import StringIO
from time import sleep

import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager


class Scrape:
    def __init__(self, **kwargs):
        self.year = kwargs.get("year")
        self.urls = self.get_urls()
        self.output_file_path = r"E:/Data/sophie/line vision/sources/spp public/downloaded/"
        self.driver_path = r"/Users/sophi/Downloads/chromedriver_win32/chromedriver.exe"

    def get_urls(self) -> list:
        root_url = "https://marketplace.spp.org"
        page_url = f"{root_url}/pages/historical-tie-flow"
        download_url = f"{root_url}/file-browser-api/download/historical-tie-flow?path="
        driver = webdriver.Chrome(ChromeDriverManager().install())
        driver.get(page_url)
        sleep(2)
        elements = driver.find_elements(By.XPATH, f"//*[contains(text(),'{self.year}')]")
        return [f"{download_url}{x.text}" for x in elements]

    def process_urls(self) -> None:
        with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
            (pd
             .concat(executor.map(self.download_data, self.urls))
             .sort_values("GMTTIME")
             .reset_index(drop=True)
             ).to_csv(f"{self.output_file_path}/spp_tie_flows_by_ba_{self.year}.csv")

    @staticmethod
    def download_data(url: str) -> pd.DataFrame:
        with requests.Session() as request:
            response = request.get(url)
            if response.status_code != 200:
                print(response.raise_for_status())
            return pd.read_csv(StringIO(response.text), sep=",")


if __name__ == "__main__":
    Scrape(year=2021).process_urls()

Web scraping Google Maps with Selenium uses too much data

I am scraping travel times from Google Maps. The code below scrapes travel times between 1 million random points in Tehran and works perfectly fine. I also use multiprocessing to get travel times concurrently. The results are fully reproducible, so feel free to run the code in a terminal (but not in an interactive session like Spyder, as the multiprocessing won't work there). This is what the data I'm scraping looks like on Google Maps (in this case 22 min is the travel time):
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from multiprocessing import Process, Pipe, Pool, Value
import time
from multiprocessing.pool import ThreadPool
import threading
import gc

threadLocal = threading.local()

class Driver:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        self.driver.quit()  # clean up driver when we are cleaned up
        print('The driver has been "quitted".')

    @classmethod
    def create_driver(cls):
        the_driver = getattr(threadLocal, 'the_driver', None)
        if the_driver is None:
            print('Creating new driver.')
            the_driver = cls()
            threadLocal.the_driver = the_driver
        driver = the_driver.driver
        the_driver = None
        return driver

success = Value('i', 0)
error = Value('i', 0)

def f(x):
    global success
    global error
    with success.get_lock():
        success.value += 1
        print("Number of errors", success.value)
    with error.get_lock():
        error.value += 1
        print("counter.value:", error.value)

def scraper(url):
    """
    This now scrapes a single URL.
    """
    global success
    global error
    try:
        driver = Driver.create_driver()
        driver.get(url)
        time.sleep(1)
        trip_times = driver.find_element(By.XPATH, "//div[contains(@aria-labelledby,'section-directions-trip-title')]//span[@jstcache='198']")
        print("got data from: ", url)
        print(trip_times.text)
        with success.get_lock():
            success.value += 1
            print("Number of successful scrapes: ", success.value)
    except Exception as e:
        # print(f"Error: {e}")
        with error.get_lock():
            error.value += 1
            print("Number of errors", error.value)

import random

min_x = 35.617487
max_x = 35.783375
min_y = 51.132557
max_y = 51.492329

urls = []
for i in range(1000000):
    x = random.uniform(min_x, max_x)
    y = random.uniform(min_y, max_y)
    url = f'https://www.google.com/maps/dir/{x},{y}/35.8069533,51.4261312/@35.700769,51.5571612,21z'
    urls.append(url)

number_of_processes = min(2, len(urls))
start_time = time.time()
with ThreadPool(processes=number_of_processes) as pool:
    # result_array = pool.map(scraper, urls)
    result_array = pool.map(scraper, urls)
    # Must ensure drivers are quitted before threads are destroyed:
    del threadLocal
    # This should ensure that the __del__ method is run on class Driver:
    gc.collect()
    pool.close()
    pool.join()
print(result_array)
print("total time: ", round((time.time() - start_time) / 60, 1), "number of urls: ", len(urls))
But after having it run for only 24 hours, it has already used around 80 GB of data! Is there a way to make this more efficient in terms of data usage?
I suspect this excessive data usage is because Selenium has to load each URL completely every time before it can access the HTML and get the target node. Can I change anything in my code to prevent that and still get the travel time?
Please note that using the Google Maps API is not an option, because the limit is too small for my application and the service is not provided in my country.
You can use a page load strategy.
Selenium WebDriver has 3 page load strategies:
normal - waits for all resources to download.
eager - DOM access is ready, but other resources like images may still be loading.
none - does not block WebDriver at all.
options.page_load_strategy = "none"  # ["normal", "eager", "none"]
It might help you (obviously it doesn't perform miracles, but it's better than nothing).
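For example, the strategy is set on the options before the driver is created; a minimal sketch assuming Selenium 4 and the headless Chrome setup from the question:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.page_load_strategy = "eager"  # or "none"; the default is "normal"
driver = webdriver.Chrome(options=options)

# with "eager", get() returns once the DOM is ready, before images finish loading
driver.get("https://www.google.com/maps")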

MultiThreading with Selenium in Python

I want to fetch data from both classes at the same time, but when I run the script both classes open in one tab. After adding driver.execute_script(f"window.open {link}, 'new window')") I get an error for the Keeps class. I want to open 2 tabs at the same time; when I try to open a new tab it throws an error, and I can only print the Cardano class, not the Keeps class. Is there any way to run both classes at the same time in different tabs in a stable way? Please help, I am a beginner with Selenium.
# Modules
from time import sleep
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from threading import *
import pandas as pd

# Code
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.minimize_window()
wait = WebDriverWait(driver, 30)

df = pd.read_excel('chains.xlsx', sheet_name='first')
chains_links = list(df['links'])

class Cardano(Thread):
    def run(self):
        cardano = 1
        for link in chains_links:
            if 'cardanoscan' in link:
                driver.get(link)
                sleep(0.5)
                try:
                    status = wait.until(EC.visibility_of_element_located(
                        (By.XPATH, "/html/body/div[2]/main/div/div/div[2]/div/div/div[1]/div[1]/div[1]/div[1]/div[1]/div[2]/button"))).text
                    saturation = wait.until(EC.visibility_of_element_located(
                        (By.XPATH, "/html/body/div[2]/main/div/div/div[2]/div/div/div[3]/div[1]/div/div/div/div/div[1]"))).text
                except:
                    status = None
                    saturation = None
                if status == "Active" and saturation != "0" and saturation != None:
                    print(
                        f"Cardano {cardano}: is {status} and Saturation is {saturation}")
                else:
                    print(f"Something Wrong with Cardano {cardano}")
                cardano = cardano + 1

class Keeps(Thread):
    def run(self):
        keeps = 1
        for link in chains_links:
            if "allthekeeps" in link:
                driver.execute_script(f"window.open {link}, 'new window')")
                sleep(0.5)
                try:
                    fault = wait.until(EC.visibility_of_element_located(
                        (By.XPATH, "/html/body/div/div[2]/div/div/div[1]/div[2]/div[4]/div[2]/div"))).text
                except:
                    fault = None
                if fault != None and fault == "0":
                    print(f"Keeps {keeps}: is active with {fault} faults:")
                else:
                    print(
                        f"Something wrong with Keeps {keeps}: {fault} Faults found")
                keeps = keeps + 1

t1 = Cardano()
t2 = Keeps()
t1.start()
t2.start()
t1.join()
t2.join()
driver.close()
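A note that may matter here: a single WebDriver instance is not thread-safe, so having both threads drive the same browser (or forcing a second tab with window.open) tends to be unstable. A minimal sketch of one common workaround is to give each thread its own driver and wait, reusing the imports and chains_links from above (illustrative only, not the original code):

class Cardano(Thread):
    def run(self):
        # this thread creates and owns its own browser
        driver = webdriver.Chrome(ChromeDriverManager().install())
        wait = WebDriverWait(driver, 30)
        try:
            for link in chains_links:
                if 'cardanoscan' in link:
                    driver.get(link)
                    # same scraping logic as above, using this thread's driver and wait
        finally:
            driver.quit()

class Keeps(Thread):
    def run(self):
        driver = webdriver.Chrome(ChromeDriverManager().install())
        wait = WebDriverWait(driver, 30)
        try:
            for link in chains_links:
                if "allthekeeps" in link:
                    driver.get(link)  # a plain get() in this thread's own window, no window.open needed
                    # same scraping logic as above
        finally:
            driver.quit()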

Scraped data is not saving to the csv file; it keeps returning a blank csv file

Can anyone see what's wrong with this code?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait
import csv

def races(main_url):
    driver = webdriver.Chrome()
    driver.get(main_url)
    driver.implicitly_wait(2)
    races = driver.find_elements_by_class_name('time-location')
    races = [race.text[:5] for race in races]
    races = [race.replace(':', '') for race in races]
    driver.close()
    return races

def scrape(url):
    driver = webdriver.Chrome()
    driver.get(url)
    driver.implicitly_wait(2)
    driver.find_elements_by_class_name('racecard-ajax-link')[1].click()
    WebDriverWait(driver, 5).until(expected_conditions.presence_of_element_located((By.XPATH, '//*[@id="tab-racecard-sectional-times"]/div/div[1]/div[1]/div[2]/div/button')))
    for horse in driver.find_elements_by_class_name('card-item'):
        horseName = horse.find_element_by_class_name('form-link').text
        times = horse.find_elements_by_class_name('sectionals-time')
        times = [time.text for time in times]
        print('{}: {}'.format(horseName, times))
        print()
    driver.close()
At this next point below I am trying to save the data, but the file is blank when opened. Shouldn't df = open('jan1.csv', 'w+') store the scraped data in the csv file? I'm obviously missing something but can't see what.
def main():
    df = open('jan1.csv', 'w+')
    df.close()
    date = '1-January-2018'
    main_url = 'http://www.attheraces.com/racecard/Southwell/' + date
    for race in races(main_url):
        url = main_url + '/' + race
        print(url)
        scrape(url)

if __name__ == '__main__':
    main()
Your code seems broken in several places, and even after fixing it I get timeout errors.
Try these steps:
Add pandas for easy data handling:
import pandas as pd

def scrape(url):
    driver = webdriver.Chrome()
    driver.get(url)
    driver.implicitly_wait(2)
    driver.find_elements_by_class_name('racecard-ajax-link')[1].click()
    WebDriverWait(driver, 5).until(expected_conditions.presence_of_element_located((By.XPATH, '//*[@id="tab-racecard-sectional-times"]/div/div[1]/div[1]/div[2]/div/button')))
    # add empty list to save scraped data
    data = []
    for horse in driver.find_elements_by_class_name('card-item'):
        horseName = horse.find_element_by_class_name('form-link').text
        times = horse.find_elements_by_class_name('sectionals-time')
        times = [time.text for time in times]
        print('{}: {}'.format(horseName, times))
        data.append([horseName, times])
        print()
    driver.close()
    # return your data!
    return data
Then change this in your main function:
def main():
    date = '1-January-2018'
    main_url = 'http://www.attheraces.com/racecard/Southwell/' + date
    tmp = []
    for race in races(main_url):
        url = main_url + '/' + race
        print(url)
        tmp.append(scrape(url))
    df = pd.DataFrame(tmp)
    df.to_csv("jan1.csv")
Or if you want to stick to csv only (no pandas):
with open("jan1.csv", "w+") as file:
    file.write(your_data_var_here)
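If you go the plain-csv route, the csv module will take care of the row formatting; a small sketch, assuming rows is the list of [horseName, times] pairs that scrape() returns above:

import csv

def save_rows(rows, path="jan1.csv"):
    # rows: a list of [horseName, times] pairs, e.g. the result of scrape(url)
    with open(path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["horse", "times"])  # header
        writer.writerows(rows)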

Selenium Python webscraper really slow

I'm a newbie getting into web scrapers. I've made something that works, but it takes hours and hours to get everything I need. I read something about using parallel processes to process the URLs, but I have no clue how to go about it or how to incorporate it into what I already have. Help is much appreciated!
Here is my, still extremely messy, code. I'm still learning :)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException
import time
import random
import pprint
import itertools
import csv
import pandas as pd

start_url = "https://www.nationalevacaturebank.nl/vacature/zoeken?query=&location=&distance=city&limit=100&sort=relevance&filters%5BcareerLevel%5D%5B%5D=Starter&filters%5BeducationLevel%5D%5B%5D=MBO"

driver = webdriver.Firefox()
driver.set_page_load_timeout(20)
driver.get(start_url)
driver.find_element_by_xpath('//*[@id="form_save"]').click()  # accepts cookies

wait = WebDriverWait(driver, random.randint(1500, 3200) / 1000.0)
j = random.randint(1500, 3200) / 1000.0
time.sleep(j)

num_jobs = int(driver.find_element_by_xpath('/html/body/div[3]/div/main/div[2]/div[3]/div/header/h2/span').text)
num_pages = int(num_jobs / 102)

urls = []
list_of_links = []

for i in range(num_pages + 1):
    try:
        elements = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="search-results-container"]//article/job/a')))
        for i in elements:
            list_of_links.append(i.get_attribute('href'))
        j = random.randint(1500, 3200) / 1000.0
        time.sleep(j)
        if 'page=3' not in driver.current_url:
            driver.find_element_by_xpath('//html/body/div[3]/div/main/div[2]/div[3]/div/paginator/div/nav[1]/ul/li[6]/a').click()
        else:
            driver.find_element_by_xpath('//html/body/div[3]/div/main/div[2]/div[3]/div/paginator/div/nav[1]/ul/li[5]/a').click()
        url = driver.current_url
        if url not in urls:
            print(url)
            urls.append(url)
        else:
            break
    except:
        continue

set_list_of_links = list(set(list_of_links))
print(len(set_list_of_links), "results")
driver.close()

def grouper(n, iterable):
    it = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(it, n))
        if not chunk:
            return
        yield chunk

def remove_empty_lists(l):
    keep_going = True
    prev_l = l
    while keep_going:
        new_l = remover(prev_l)
        # are they identical objects?
        if new_l == prev_l:
            keep_going = False
        # set prev to new
        prev_l = new_l
    # return the result
    return new_l

def remover(l):
    newlist = []
    for i in l:
        if isinstance(i, list) and len(i) != 0:
            newlist.append(remover(i))
        if not isinstance(i, list):
            newlist.append(i)
    return newlist

vacatures = []
chunks = grouper(100, set_list_of_links)
chunk_count = 0

for chunk in chunks:
    chunk_count += 1
    print(chunk_count)
    j = random.randint(1500, 3200) / 1000.0
    time.sleep(j)
    for url in chunk:
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)
        try:
            driver.get(url)
            driver.find_element_by_xpath('//*[@id="form_save"]').click()  # accepts cookies
            vacature = []
            vacature.append(url)
            j = random.randint(1500, 3200) / 1000.0
            time.sleep(j)
            elements = driver.find_elements_by_tag_name('dl')
            p_elements = driver.find_elements_by_tag_name('p')
            li_elements = driver.find_elements_by_tag_name('li')
            for i in elements:
                if "Salaris:" not in i.text:
                    vacature.append(i.text)
            running_text = list()
            for p in p_elements:
                running_text.append(p.text)
            text = [''.join(running_text)]
            remove_ls = ['vacatures', 'carrièretips', 'help', 'inloggen', 'inschrijven', 'Bezoek website', 'YouTube',
                         'Over Nationale Vacaturebank', 'Werken bij de Persgroep', 'Persberichten', 'Autotrack', 'Tweakers',
                         'Tweakers Elect', 'ITBanen', 'Contact', 'Carrière Mentors', 'Veelgestelde vragen',
                         'Vacatures, stages en bijbanen', 'Bruto Netto Calculator', 'Salariswijzer', 'Direct vacature plaatsen',
                         'Kandidaten zoeken', 'Bekijk de webshop', 'Intermediair', 'Volg ons op Facebook']
            for li in li_elements:
                if li.text not in remove_ls:
                    text.append(li.text)
            text = ''.join(text)
            vacature.append(text)
            vacatures.append(vacature)
            driver.close()
        except TimeoutException as ex:
            isrunning = 0
            print("Exception has been thrown. " + str(ex))
            driver.close()
        except NoSuchElementException:
            continue
Python Selenium webdriver is not thread-safe. This means your browser cannot correctly consume asynchronous calls from multiple threads. Try scraping websites with requests and bs4 + lxml; it's much faster than Selenium. This answer can be helpful.
You're using Firefox, which is slower than Chrome in almost all real-life applications.
XPath is the slowest selector; match by id or class, and if that is not possible then by CSS.
Use headless mode and don't load images unless you need to.
You can also use Scrapy, which is much faster and more flexible than anything else. See the link for more information.
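For the headless/no-images suggestion, one common way to set that up with Chrome looks roughly like this (a sketch, not tested against this site; the prefs value 2 blocks image loading):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless")
# block image downloads to reduce page weight and speed up loads
options.add_experimental_option(
    "prefs", {"profile.managed_default_content_settings.images": 2}
)
driver = webdriver.Chrome(options=options)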
