I'm trying to build a multithreaded Selenium scraper. Say I want to fetch 100,000 websites and print their page sources, using 20 ChromeDriver instances. So far I have the following code:
from queue import Queue
from threading import Thread
from selenium import webdriver

selenium_data_queue = Queue()
worker_queue = Queue()

# Start 20 ChromeDriver instances
worker_ids = list(range(20))
selenium_workers = {i: webdriver.Chrome() for i in worker_ids}
for worker_id in worker_ids:
    worker_queue.put(worker_id)

def selenium_task(worker, data):
    # Open the website
    worker.get(data)
    # Print the page source
    print(worker.page_source)

def selenium_queue_listener(data_queue, worker_queue):
    while True:
        url = data_queue.get()
        worker_id = worker_queue.get()
        worker = selenium_workers[worker_id]
        # Assign the current worker and url to the selenium task
        selenium_task(worker, url)
        # Put the worker back into the worker queue once it has completed its task
        worker_queue.put(worker_id)
        data_queue.task_done()

if __name__ == '__main__':
    selenium_processes = [Thread(target=selenium_queue_listener,
                                 args=(selenium_data_queue, worker_queue))
                          for _ in worker_ids]
    for p in selenium_processes:
        p.daemon = True
        p.start()
    # Add urls to the data queue (random urls generated just for testing)
    for i in range(100000):
        selenium_data_queue.put(f'http://www.website.com/{i}')
    # Wait for all queued tasks to be marked done
    selenium_data_queue.join()
    # Tear down the web workers
    for b in selenium_workers.values():
        b.quit()
My question is: if any ChromeDriver shuts down abruptly (e.g. with a non-recoverable exception like InvalidSessionIdException), is there a way to remove it from the worker queue and insert a new ChromeDriver in its place, so that I still have 20 usable instances? If so, is there a good practice for accomplishing it?
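One way to sketch this, under the assumption that the listener thread owns worker replacement: catch WebDriverException around the task, quit the dead driver on a best-effort basis, and register a fresh Chrome instance under the same worker id before returning the id to the queue. This is a minimal sketch of the idea, not a drop-in guaranteed fix:

from selenium.common.exceptions import WebDriverException

def selenium_queue_listener(data_queue, worker_queue):
    while True:
        url = data_queue.get()
        worker_id = worker_queue.get()
        worker = selenium_workers[worker_id]
        try:
            selenium_task(worker, url)
        except WebDriverException:
            # The driver is likely dead: quit it (best effort) and
            # replace it with a fresh instance under the same id.
            try:
                worker.quit()
            except WebDriverException:
                pass
            selenium_workers[worker_id] = webdriver.Chrome()
        finally:
            # The id goes back into the worker queue either way, so the
            # pool stays at 20 usable instances.
            worker_queue.put(worker_id)
            data_queue.task_done()

Note that the url whose task failed is simply dropped here; if you need retries, put it back on the data queue before replacing the driver. Since each worker id is held by exactly one listener thread at a time, swapping the entry in selenium_workers is safe in this design.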
I am using ThreadPoolExecutor to make a lot of requests to websites quickly, but sometimes, maybe 1 in 5 times, ThreadPoolExecutor finishes running all of the thread functions and then just freezes instead of moving on to the rest of my code. I need this to be reliable for a project I'm working on.
from concurrent.futures import ThreadPoolExecutor
import ballotpedialinks as bl

data = [[link, 0], [link, 1], [link, 2] ... [link, 500]]

def threadFunction(data):
    page = data[0]
    counter = data[1]
    a = bl.checkLink(page)
    print(a[0])
    if a[0] == '':
        links = bl.generateNewLinks(page, state)
        for link in links:
            a = bl.checkLink(link)
            if a[0] != '':
                print(f'{a[0]} is a fixed link')
                break

def quickRun(threads):
    with ThreadPoolExecutor(threads) as pool:
        pool.map(threadFunction, data[0:-1])

quickRun(32)
print('scraper complete')
This is basically what I'm doing, except the thread function is sending requests to websites. The executor finishes all the tasks I give it, but sometimes it just freezes once it's done. Is there anything I can do to keep the executor from freezing?
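One thing worth trying, offered as a hedged sketch rather than a definitive fix: if the freeze comes from a single request that never returns, pool.map gives no visibility into which task is stuck, and the executor's implicit shutdown then waits forever. The version below surfaces hung tasks with a batch timeout and avoids blocking on them (threadFunction and data are the names from the snippet above; the 60-second budget is an assumption):

from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError

def quickRun(threads):
    pool = ThreadPoolExecutor(threads)
    futures = {pool.submit(threadFunction, d): d for d in data[0:-1]}
    try:
        # Give the whole batch an assumed 60-second budget.
        for future in as_completed(futures, timeout=60):
            future.result()
    except TimeoutError:
        # Some task is hung; report which inputs never finished.
        stuck = [d for f, d in futures.items() if not f.done()]
        print(f'{len(stuck)} tasks never finished, e.g. {stuck[:3]}')
    finally:
        # Don't block on hung threads (cancel_futures needs Python 3.9+).
        pool.shutdown(wait=False, cancel_futures=True)

The more durable fix is to make sure whatever bl.checkLink uses for HTTP is called with an explicit timeout, so no worker thread can hang forever in the first place.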
I'm trying to launch a function (my_function) and stop its execution once a time limit is reached.
So I tried the multiprocessing library, and everything works well. Here is the code, where my_function() has been changed to just build a dummy message.
from multiprocessing import Queue, Process
from multiprocessing.queues import Empty
import time

timeout = 1
# timeout = 3

def my_function(something):
    time.sleep(2)
    return f'my message: {something}'

def wrapper(something, queue):
    message = "too late..."
    try:
        message = my_function(something)
        return message
    finally:
        queue.put(message)

try:
    queue = Queue()
    params = ("hello", queue)
    child_process = Process(target=wrapper, args=params)
    child_process.start()
    output = queue.get(timeout=timeout)
    print(f"ok: {output}")
except Empty:
    timeout_message = f"Timeout {timeout}s reached"
    print(timeout_message)
finally:
    if 'child_process' in locals():
        child_process.kill()
You can test and verify that, depending on whether timeout=1 or timeout=3, the timeout is triggered or not.
My main problem is that the real my_function() is a torch model inference, for which I would like to limit the number of threads (to 4, let's say).
This is easy if my_function runs in the main process, but in my example I tried a lot of tricks to limit it in the child process without any success (threadpoolctl.threadpool_limits(4), torch.set_num_threads(4), os.environ["OMP_NUM_THREADS"] = "4", os.environ["MKL_NUM_THREADS"] = "4").
I'm completely open to other solutions that can monitor the execution time of a function while limiting the number of threads it uses.
Thanks and regards.
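A hedged sketch of one thing to try, reusing my_function and the queue from the snippet above: the limit has to take effect inside the process that actually runs the inference, so call torch.set_num_threads(4) at the top of wrapper(), and set the OMP/MKL environment variables before torch is first imported (they are read when torch initializes its thread pools, so setting them afterwards has no effect):

import os

# Assumption: these are read at torch import time, so they must be set
# before the first `import torch` in this process tree.
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["MKL_NUM_THREADS"] = "4"

import torch

def wrapper(something, queue):
    message = "too late..."
    try:
        # Applied inside the child process, before the inference runs.
        torch.set_num_threads(4)
        message = my_function(something)
        return message
    finally:
        queue.put(message)

With the default fork start method on Linux, the child inherits an already-initialized torch, which is why setting the environment variables inside the child was not enough; torch.set_num_threads() still applies at call time.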
You can limit the number of simultaneous processes with Pool (https://docs.python.org/3/library/multiprocessing.html#module-multiprocessing.pool).
You can also set the maximum number of tasks executed per child. Check it out.
Here is a sample from superfastpython.com by Jason Brownlee:
# SuperFastPython.com
# example of limiting the number of tasks per child in the process pool
from time import sleep
from multiprocessing.pool import Pool
from multiprocessing import current_process

# task executed in a worker process
def task(value):
    # get the current process
    process = current_process()
    # report a message
    print(f'Worker is {process.name} with {value}', flush=True)
    # block for a moment
    sleep(1)

# protect the entry point
if __name__ == '__main__':
    # create and configure the process pool
    with Pool(2, maxtasksperchild=3) as pool:
        # issue tasks to the process pool
        for i in range(10):
            pool.apply_async(task, args=(i,))
        # close the process pool
        pool.close()
        # wait for all tasks to complete
        pool.join()
I found this simple example demonstrating how to use threading to open multiple Chrome sessions with Selenium in parallel.
from selenium import webdriver
import threading
import time

def test_logic():
    driver = webdriver.Chrome()
    url = 'https://www.google.de'
    driver.get(url)
    # Implement your test logic
    time.sleep(2)
    driver.quit()

N = 5  # Number of browsers to spawn
thread_list = list()

# Start test
for i in range(N):
    t = threading.Thread(name='Test {}'.format(i), target=test_logic)
    t.start()
    time.sleep(1)
    print(t.name + ' started!')
    thread_list.append(t)

# Wait for all threads to complete
for thread in thread_list:
    thread.join()

print('Test completed!')
I tested it and it works. However, if I modify the test_logic function to take a parameter, e.g. j:
def test_logic(j):
    driver = webdriver.Chrome()
    url = 'https://www.google.de'
    driver.get(url)
    # Implement your test logic
    time.sleep(j)
    driver.quit()
and the corresponding part of threading to:
t = threading.Thread(name='Test {}'.format(i), target=test_logic(i))
the code stops running in parallel and just runs sequentially.
I don't know what I have overlooked, so I would be very grateful for any advice. Many thanks!
target=test_logic(i) invokes the function test_logic immediately and hands its return value to the thread, so all the browser work happens sequentially in the main thread before each Thread object is even created.
You may want to do:
t = threading.Thread(name='Test {}'.format(i), target=test_logic, args=[i])
where target is the name of the function, and args is the arguments list for the function.
If your function takes two args, like def test_logic(a, b), then args should contain two values.
More info in Python Thread Documentation
You have to pass the arguments to the function as below:
t = threading.Thread(name='Test {}'.format(i), target=test_logic, args=(i,))
I have an asyncio-based crawler that occasionally offloads crawling that requires the browser to a ThreadPoolExecutor, as follows:
import concurrent.futures

from selenium import webdriver

def browserfetch(url):
    browser = webdriver.Chrome()
    browser.get(url)
    # Some explicit wait stuff that can take up to 20 seconds.
    return browser.page_source

async def fetch(url, loop):
    with concurrent.futures.ThreadPoolExecutor() as pool:
        result = await loop.run_in_executor(pool, browserfetch, url)
        return result
My issue is that I believe this respawns the headless browser each time I call fetch, which incurs browser startup time on every call to webdriver.Chrome. Is there a way to refactor browserfetch or fetch so that the same driver can be reused across multiple fetch calls?
What have I tried?
I've considered more explicit use of threads/pools to start the Chrome instance in a separate thread/process, communicating within the fetch call via queues, pipes, etc. (all run in executors to keep the calls from blocking). I'm not sure how to make that work, though.
I believe that starting browsers in separate processes and communicating with them via queues is a good approach (and more scalable). The pseudo-code might look like this:
# worker.py
def entrypoint(in_queue, out_queue):  # runs in a separate process
    crawler = Crawler()
    browser = Browser()
    while not stop:
        command = in_queue.get()
        result = crawler.process(command, browser)
        out_queue.put(result)

# main.py
import worker

in_queue, out_queue = Process(worker.entrypoint)
while not stop:
    in_queue.put(new_task)
    result = out_queue.get()
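A minimal runnable sketch of that pseudo-code, under these assumptions: one long-lived Chrome per worker process, a None sentinel to stop it, and a plain page_source fetch standing in for the Crawler class:

from multiprocessing import Process, Queue
from selenium import webdriver

def entrypoint(in_queue, out_queue):
    browser = webdriver.Chrome()      # one long-lived browser per worker
    try:
        while True:
            url = in_queue.get()
            if url is None:           # sentinel: shut down
                break
            browser.get(url)
            out_queue.put((url, browser.page_source))
    finally:
        browser.quit()

if __name__ == '__main__':
    in_queue, out_queue = Queue(), Queue()
    worker = Process(target=entrypoint, args=(in_queue, out_queue))
    worker.start()
    for url in ['https://example.com', 'https://example.org']:
        in_queue.put(url)
    for _ in range(2):
        url, source = out_queue.get()
        print(url, len(source))
    in_queue.put(None)                # stop the worker
    worker.join()

From the asyncio side, fetch() can keep using run_in_executor to call in_queue.put and out_queue.get so the event loop never blocks (tag tasks with an id if several fetches are in flight at once); the browser startup cost is then paid only once per worker.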
Hi, I'm trying to open multiple instances of Chrome in parallel in Python, using webdriver and multiprocessing.
After the processes run, the instances open smoothly, but they are never stored in my instance array, so I can't access them afterwards. Please help me; here is my code:
from selenium import webdriver
from multiprocessing import Process
import time

num = 3
process = [None] * num
instance = [None] * num

def get():
    for i in range(num):
        try:
            instance[i].get("https://www.youtube.com")
        except:
            print("Can't connect to the driver")
            time.sleep(1)
            get()

def create_instance(i):
    instance[i] = webdriver.Chrome()

if __name__ == '__main__':
    for i in range(num):
        process[i] = Process(target=create_instance, args=[i])
        process[i].start()
    for i in range(num):
        process[i].join()
    get()
When multiprocessing tries to pickle the webdriver object, it hits strange errors, so instead of passing the object we can pass the class and build the object inside the new process.
BUT in that situation you can no longer access the driver instances from the parent; instead, you can send commands to the processes, as sketched after the example below.
from selenium import webdriver
from multiprocessing import Process
import time

num = 3
process = [None] * num

def get(id, Driver):
    driver = Driver()
    driver.get(f"https://www.google.com?id={id}")
    time.sleep(10)
    driver.close()

if __name__ == '__main__':
    for i in range(num):
        process[i] = Process(target=get, args=[i, webdriver.Chrome])
        process[i].start()
    for i in range(num):
        process[i].join()
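A hedged sketch of that command idea, assuming a shared multiprocessing Queue and a None sentinel per worker: each process owns its own driver for its whole lifetime and keeps servicing urls until told to stop.

from multiprocessing import Process, Queue
from selenium import webdriver

def worker(command_queue):
    driver = webdriver.Chrome()       # each process owns its own driver
    try:
        while True:
            url = command_queue.get()
            if url is None:           # sentinel: time to shut down
                break
            driver.get(url)
            print(driver.title)
    finally:
        driver.quit()

if __name__ == '__main__':
    num = 3
    commands = Queue()
    processes = [Process(target=worker, args=(commands,)) for _ in range(num)]
    for p in processes:
        p.start()
    for i in range(9):
        commands.put(f"https://www.google.com?id={i}")
    for _ in processes:
        commands.put(None)            # one sentinel per worker
    for p in processes:
        p.join()

This keeps the drivers out of the parent entirely, which sidesteps the pickling problem while still letting the parent direct all the browsers.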