How to close selenium drivers while multiprocessing in python on command? - python

I wrote a python script that uses multiprocessing to open up four different windows and search a common query among them; I'm trying to find a way to close the drivers on command instead of calling driver.close() at the end of the function which would automatically close the drivers as soon as they have successfully spawned.
I would like to browse the windows for a bit and close them when I am finished.
Here's what my code looks like:
def main(hosts, inp):
driver = webdriver.Chrome(executable_path='./chromedriver')
driver.get(hosts)
if 'ebay' in driver.current_url:
print 'opened ebay'
search_box = driver.find_element_by_id('gh-ac')
search_box.clear()
search_box.send_keys(inp)
search_box.send_keys(Keys.RETURN)
elif 'google' in driver.current_url:
print 'opened google'
search_box = driver.find_element_by_id('gbqfq')
search_box.clear()
search_box.send_keys(inp)
search_box.send_keys(Keys.RETURN)
elif 'etsy' in driver.current_url:
print 'opened etsy'
search_box = driver.find_element_by_id('search-query')
search_box.clear()
search_box.send_keys(inp)
search_box.send_keys(Keys.RETURN)
elif 'amazon' in driver.current_url:
print 'opened amazon'
search_box = driver.find_element_by_id('twotabsearchtextbox')
search_box.clear()
search_box.send_keys(inp)
search_box.send_keys(Keys.RETURN)
else:
print "--NONE--"
# driver.close()
if __name__ == '__main__':
    # One pool worker per target site so all four searches launch in parallel.
    hosts = ["http://ebay.com", "https://www.google.com/shopping?hl=en", "http://etsy.com", "http://amazon.com"]
    inp = raw_input("What do you want to search?: ")
    pool = Pool(len(hosts))
    # Bind the query once; map() then supplies each host URL.
    pool.map(partial(main, inp=inp), hosts)
    pool.close()
    pool.join()
I would like to do something like this:
# Block until the user types 'done', then close this worker's browser window.
done = raw_input("Type 'done' when finished:" )
if done == 'done':  # '==' comparison; the original 'if done = ...' is a SyntaxError
    driver.close()
but just injecting that into main gives me an EOF error

Related

monitor chrome tabs - python selenium

I'm writing a script where I need to monitor the number of open tabs. The script reads a table, and each entry in the table opens a new tab in the browser. To avoid having thousands of tabs open at once, I need the script to monitor the tabs so that only 3 are open at a time; when I close a tab, the next one should open automatically.
def base_dados_processos(self):
    # Load the list of process numbers from the spreadsheet; on any failure
    # show an alert (pyautogui) and shut the browser down.
    try:
        df = pd.read_excel('TABLE.xlsx')
        self.num_proc = df['COLUM']  # column holding the process identifiers
    except Exception:
        pg.alert(text='error', title='ERROR', button='OK')
        self.chrome.quit()
def loop_pesquisa(self):
    # For each process number, drive the browser via pyautogui keystrokes.
    # NOTE(review): the loop variable PROCESSOS is unused, and
    # pg.write(str(test)) references a name 'test' not defined in this
    # snippet — confirm against the full class.
    for PROCESSOS in zip(self.num_proc):
        num_current_tabs = len(self.chrome.window_handles)
        if num_current_tabs < 3:  # only act while fewer than 3 tabs are open
            pg.hotkey('ctrl', '1')  # jump to the first browser tab
            time.sleep(1)
            self.chrome.get('https://stackoverflow.com/')
            pg.write(str(test))
            pg.press('enter')
            time.sleep(3)
            pg.press('tab', presses=26)  # tab through the page to the target control
            pg.press('enter')
            time.sleep(1)
            pg.press('enter')
To do this, just add a loop that monitors the guides, it can be as follows:
# Poll until at most 3 tabs remain open; proceeds as soon as the user closes one.
while len(self.chrome.window_handles) > 3:
    time.sleep(0.5)
now with your code:
def base_dados_processos(self):
    # Load the list of process numbers from the spreadsheet; on any failure
    # show an alert (pyautogui) and shut the browser down.
    try:
        df = pd.read_excel('TABLE.xlsx')
        self.num_proc = df['COLUM']  # column holding the process identifiers
    except Exception:
        pg.alert(text='error', title='ERROR', button='OK')
        self.chrome.quit()
def loop_pesquisa(self):
    for PROCESSOS in zip(self.num_proc):
        # Throttle: wait until at most 3 tabs are open before starting the
        # next search (the fix suggested in the answer above).
        while len(self.chrome.window_handles) > 3:
            time.sleep(0.5)
        pg.hotkey('ctrl', '1')  # jump to the first browser tab
        time.sleep(1)
        self.chrome.get('https://stackoverflow.com/')
        pg.write(str(test))  # NOTE(review): 'test' is not defined in this snippet
        pg.press('enter')
        time.sleep(3)
        pg.press('tab', presses=26)  # tab through the page to the target control
        pg.press('enter')
        time.sleep(1)
        pg.press('enter')

Python process starts but does not actually run the target function

I'm using multiprocessing to test proxy server usability. The target function of my process takes a proxy server address and a queue as arguments and opens an instance of webdriver with the given proxy. The function tests the proxy by going to a specific url and trying to retrieve an html element. If the test is successful the function will add the webdriver instance to the queue. The function is shown below.
def test_proxy(address, queue):
    """Probe *address* by loading a marketplace page through it.

    On success the live webdriver instance is put on *queue* for the parent
    process; on any failure the driver is quit and nothing is queued.
    """
    print(f"testing proxy {address}")
    chrome_options_1 = webdriver.ChromeOptions()
    chrome_options_1.add_argument('--proxy-server=%s' % address)
    chrome_options_1.add_argument("headless")
    # Raw string: '\d' in a normal literal is a deprecated invalid escape.
    driver = webdriver.Chrome(r'.\driver\chromedriver.exe', options=chrome_options_1)
    driver.set_page_load_timeout(10)
    url = "https://www.facebook.com/marketplace/nyc/search/?query=honda"
    try:
        driver.get(url)
        # '@class' — the scraped source had '#class', which is not valid XPath.
        driver.find_element_by_xpath("//*[@class='kbiprv82']/a").get_attribute("href")
        print(f"Successfully connected to proxy server at {address}")
        queue.put(driver)
        return
    except Exception:  # narrowed from a bare except so KeyboardInterrupt propagates
        print("Connection failed")
        driver.quit()
In my main process I have a list of proxy addresses to test. A process is created to test each proxy in the list until a test is successful and a driver instance is put in the queue. If an item is found in the queue all the processes are terminated and the proxy list is cleared. The loop in my main process limits the number of child processes to 10. The main process code is in a class and shown below.
def find_proxy(self):
    """Scrape free-proxy-list.net and return the first proxy that a child
    process proves working (as a live webdriver instance).

    self.proxies holds entries of the form [address] or [address, Process].
    """
    self.proxies = []
    self.proxy_queue = multiprocessing.Queue()
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("headless")
    driver = webdriver.Chrome(r'.\driver\chromedriver.exe', options=chrome_options)
    driver.get("https://free-proxy-list.net/")
    # Filter to HTTPS-capable proxies.  '@id' — the scraped source had '#id',
    # which is not valid XPath syntax.
    Select(driver.find_element_by_xpath("//*[@id='proxylisttable']/tfoot/tr/th[7]/select")).select_by_visible_text("yes")
    for country in ["US", "MX", "CA", "CL", "CO", "BR", "PE"]:
        try:
            Select(driver.find_element_by_xpath("//*[@id='proxylisttable']/tfoot/tr/th[3]/select")).select_by_visible_text(country)
            # The table has 8 cells per row; cell 0 is the IP, cell 1 the port.
            i = 0
            entries = driver.find_elements_by_xpath("//table[@id='proxylisttable']/tbody/tr/td")
            for entry in entries:
                if i == 7:
                    # Last cell of the row: store the assembled "ip:port".
                    i = 0
                    self.proxies.append([proxy_address])
                else:
                    if i == 0:
                        proxy_address = entry.text + ':'
                    if i == 1:
                        proxy_address = proxy_address + entry.text
                    # NOTE(review): indentation reconstructed — incrementing
                    # only on non-row-end cells keeps the column count aligned.
                    i = i + 1
        except Exception:
            pass  # country not present in the filter; skip it
    driver.quit()
    # Test up to 10 proxies at a time until one succeeds.
    while len(self.proxies) > 0:
        i = 0
        for proxy in self.proxies[:10]:
            if not self.proxy_queue.empty():
                # A worker succeeded: terminate all outstanding testers and
                # hand the live driver back to the caller.
                driver = self.proxy_queue.get()
                for proxy_1 in self.proxies:
                    try:
                        proxy_1[1].terminate()
                    except Exception:
                        pass  # entry had no Process yet, or already dead
                self.proxies.clear()
                return driver
            elif len(proxy) < 2:
                # Not yet tested: attach and start a tester process.
                proxy.insert(1, multiprocessing.Process(target=test_proxy, args=(proxy[0], self.proxy_queue,)))
                print(f"proxy thread {proxy[0]} created")
                proxy[1].start()
                print(f"proxy thread {proxy[0]} started")
            elif not proxy[1].is_alive():
                # Tester finished without queueing a driver: drop the proxy.
                print(f"proxy thread {proxy[0]} dead")
                del self.proxies[i]
                print("proxy deleted")
                break
            i = i + 1
The issue is that the processes seem to start just fine but none of the code in the test_proxy function is actually run, not even the first print statement.

Selenium Threads: how to run multi-threaded browser with proxy ( python)

I'm writing a script to access a website using proxies with multiple threads but now I'm stuck in multiple threads, when I run the script below, it opens 5 browsers but all 5 use 1 proxy, I want 5 browsers to use different proxies, can someone help me complete it? thank you
Here is my script :
from selenium import webdriver
from selenium import webdriver
import time , random
import threading
def e():
    # NOTE(review): every thread runs this whole function, so each of the 5
    # threads re-reads sock2.txt from the top and they all use the proxies in
    # the same order — this is exactly the bug the question describes.
    a = open("sock2.txt", "r")
    for line in a.readlines():
        b = line
        prox = b.split(":")
        IP = prox[0]
        PORT = int(prox[1].strip("\n"))
        print(IP)
        print(PORT)
        # Route all Firefox traffic through the SOCKS proxy just parsed.
        profile = webdriver.FirefoxProfile()
        profile.set_preference("network.proxy.type", 1)
        profile.set_preference("network.proxy.socks", IP)
        profile.set_preference("network.proxy.socks_port", PORT)
        try:
            driver = webdriver.Firefox(firefox_profile=profile)
            driver.get("http://www.whatsmyip.org/")
        except:
            print("Proxy Connection Error")
            driver.quit()
        else:
            time.sleep(random.randint(40, 70))
            driver.quit()
# Spawn five identical workers (all sharing the same file — see note above).
for i in range(5):
    t = threading.Thread(target=e)
    t.start()
(Wish everyone has a happy and lucky new year)
Dominik Lašo captured it correctly - each threads processes the file from the beginning. Here's probably how it should look like:
from selenium import webdriver
from selenium import webdriver
import time , random
import threading
def e(ip, port):
    """Open Firefox through the SOCKS proxy ip:port and visit whatsmyip.org."""
    profile = webdriver.FirefoxProfile()
    profile.set_preference("network.proxy.type", 1)
    # Use the parameters — the original referenced the undefined globals IP
    # and PORT, which raises NameError instead of using this thread's proxy.
    profile.set_preference("network.proxy.socks", ip)
    profile.set_preference("network.proxy.socks_port", port)
    driver = None
    try:
        driver = webdriver.Firefox(firefox_profile=profile)
        driver.get("http://www.whatsmyip.org/")
    except Exception:
        print("Proxy Connection Error")
        # Guard: if Firefox itself failed to start there is no driver to quit.
        if driver is not None:
            driver.quit()
    else:
        time.sleep(random.randint(40, 70))
        driver.quit()
my_threads = []
# Read the proxy list once in the main thread and hand each worker its own
# (ip, port), so threads no longer re-read the file themselves.
with open("sock2.txt", "r") as fd:
    for line in fd.readlines():
        line = line.strip()
        if not line:  # skip blank lines
            continue
        prox = line.split(":")
        ip = prox[0]
        port = int(prox[1])
        print('-> {}:{}'.format(ip, port))
        t = threading.Thread(target=e, args=(ip, port,))
        t.start()
        my_threads.append(t)
# Wait for every worker to finish.
for t in my_threads:
    t.join()
(I personally think the problem is that when you start the program, each new thread goes through the text file from the beginning, because you aren't removing the lines that have already been used.)
I have come across the same problem when I was doing the same thing as you are now. I know you would rather have help with your own code, but I am in a hurry to test this and want to help you ;) — so here is code that works for me. There is even a task killer for Chrome (you just have to adapt it for Firefox).
If I were you, I would read the file before starting the threads, because it looks like you are opening the same file from the first line every time a thread starts.
# URLs you want to visit (the original used a '// ...' placeholder, which is
# a SyntaxError in Python — '#' starts comments).
links = []

def funk(xxx, website):
    """Open *website* in Chrome through the proxy *xxx* and do the site work."""
    link = website
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--proxy-server=%s' % str(xxx))
    chromedriver = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'chromedriver')
    chrome = webdriver.Chrome(chromedriver, chrome_options=chrome_options)
    try:
        pass  # site-specific actions go here (was the '// Do stuff' placeholder)
    except Exception:
        print('exception')
        chrome.close()
for link in links:
    f = open('proxies.txt')
    line = f.readline()
    x = 1
    xx = 0
    # Was never initialized in the original, so the first comparison below
    # raised NameError.
    number_of_used_proxies = 0
    while line:
        if number_of_used_proxies < 10:
            print(line)
            line = f.readline()
            try:
                # Launch funk(line, link) after a 40-second delay.
                threading.Timer(40, funk, [line, link]).start()
            except Exception as e:
                print(e)
            time.sleep(1)
            x += 1
            number_of_used_proxies += 1
        else:
            # Ten proxies in flight: wait, kill the Chrome instances, reset.
            time.sleep(100)
            for x in range(1, 10):
                try:
                    xzxzx = 'os.system("taskkill /f /im chrome.exe")'  # Windows variant kept for reference
                    os.system("killall 'Google Chrome'")
                except Exception:
                    print("NoMore")
            time.sleep(10)
            number_of_used_proxies = 0
    f.close()
Hope it helps :)
vantuong: Here's how you can solve the problem with ThreadPoolExecutor.
Reference: https://docs.python.org/3/library/concurrent.futures.html
from selenium import webdriver
import time, random
#import threading
import concurrent.futures
MAX_WORKERS = 5

def get_proxys(data_file):
    """Read *data_file* (one "ip:port" per line) and return (ip, port) tuples.

    Blank lines are skipped; the port is converted to int.
    """
    proxys = []
    with open(data_file, "r") as fd:
        for raw in fd.readlines():
            entry = raw.strip()
            if not entry:
                continue
            parts = entry.split(":")
            proxys.append((parts[0], int(parts[1])))
    return proxys
def e(ip, port):
    """Open Firefox through the SOCKS proxy ip:port and visit whatsmyip.org."""
    profile = webdriver.FirefoxProfile()
    profile.set_preference("network.proxy.type", 1)
    # Use the parameters — the original referenced the undefined globals IP
    # and PORT, which raises NameError instead of using this task's proxy.
    profile.set_preference("network.proxy.socks", ip)
    profile.set_preference("network.proxy.socks_port", port)
    driver = None
    try:
        driver = webdriver.Firefox(firefox_profile=profile)
        driver.get("http://www.whatsmyip.org/")
    except Exception:
        print("Proxy Connection Error")
        # Guard: if Firefox itself failed to start there is no driver to quit.
        if driver is not None:
            driver.quit()
    else:
        time.sleep(random.randint(40, 70))
        driver.quit()
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    proxys = get_proxys('sock2.txt')
    # One task per proxy; map each future back to its proxy for reporting.
    tasks = {executor.submit(e, proxy[0], proxy[1]): proxy for proxy in proxys}
    for task in concurrent.futures.as_completed(tasks):
        proxy = tasks[task]
        try:
            data = task.result()  # re-raises any exception from the worker
        except Exception as exc:
            print('{} generated an exception: {}'.format(proxy, exc))
        else:
            print('{} completed successfully'.format(proxy))
Fun exercise: Try playing around with different values of MAX_WORKERS.

Selenium Web-driver automation not working properly

So I found this script online, which is meant to brute-force web-forms online etc. using Selenium, and I thought it would be a good idea to take it, modify it a bit and experiment with it. This time, I tried creating a bot that:
Signs up to Twitter
Goes to twitter.com
Posts something.
Logs out.
Loop 1, 2, 3, 4 again
However, when I run the script, it just pops up a browser window and does nothing. Then the Terminal ends the Python script like it did its work correctly and finished with no problems...
Code (note that the script might look weird for what I want, but that's because I found the script as a web-form brute-forcer online, and decided to modify it to my needs):
#!/bin/python
from mainLib import *
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import simplejson as json
import sys
import optparse
profile = webdriver.FirefoxProfile()
# Spoof a desktop Chrome user agent for every page the bot loads.
profile.set_preference("general.useragent.override", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36")
# Placeholder; main() rebinds this module-level global to a real Firefox driver.
driver = "reserved"
def userExists(username):
try:
driver.get("https://twitter.com/"+username)
assert (("??" or "?Twitter / ?") not in driver.title)
except AssertionError:
print '\033[1;31mUser\033[1;m',
print '\033[1;35m#%s\033[1;m' %username,
print '\033[1;31mdoes not exist.\033[1;m',
print '\033[1;33mTrying with the next Username...\033[1;m'
print ' '
return 1
except:
'uknown error'
def login(user, password, delay):
try:
print '\033[1;33mCreating account with mail: \033[1;m' + '\033[1;35m' + password + '\033[1;m' '\033[1;33m ...\033[1;m'
sleep(2)
if driver.current_url == 'https://twitter.com':
print '\033[1;33mPlease retry using a different IP Address (Proxy/VPN).\033[1;m'
driver.get("https://twitter.com/signup")
if driver.title == "Login on Twitter":
driver.get("https://twitter.com/signup")
sleep(3)
elem = driver.find_element_by_id('full-name')
elem.clear()
elem.send_keys('john')
elem = driver.find_element_by_id('email')
elem.clear()
elem.send_keys(password)
elem = driver.find_element_by_id('password')
elem.clear()
elem.send_keys("twitter1")
sleep(3)
elem.send_keys(Keys.RETURN)
sleep(delay + 3)
if driver.title == "Twitter / Error":
print ' \033[1;31mFailed!\033[1;m'
driver.get("https://twitter.com/signup")
sleep(3)
if driver.title == "Login to Twitter":
print ' \033[1;31mFailed!\033[1;m'
driver.get("https://twitter.com/signup")
sleep(3)
# if "This email is already registered." in driver.page_source:
# print ' \033[1;31mFailed!\033[1;m'
if driver.current_url == 'https://twitter.com/account/access':
print ' \033[1;31mFailed!\033[1;m'
print ("")
print '\033[1;33mPlease retry using a different IP Address (Proxy/VPN).\033[1;m'
driver.close()
sys.exit("")
assert (("Enter your phone") not in driver.title)
except AssertionError:
print ' \033[1;32mSuccess!\033[1;m'
# print '\033[1;35mEmail: \033[1;m' + password
# print '\033[1;35mPassword: \033[1;m' + "twitter1"
# print("")
try:
f = open('CreatedAccounts.txt','a')
except:
f = open('CreatedAccounts.txt','w')
f.write(password+'\n')
f.close()
driver.get("https://twitter.com")
elem = driver.find_element_by_id('tweet-box-home-timeline')
elem.clear()
elem.send_keys('It worked!')
elem = driver.find_element_by_xpath('//*[#id="timeline"]/div[2]/div/form/div[2]/div[2]/button')
elem.send_keys(Keys.RETURN)
time.sleep(5)
driver.get("https://twitter.com/logout")
sleep(5)
elem = driver.find_element_by_css_selector("button.js-submit").click()
sleep(5)
driver.get("https://twitter.com/signup")
# driver.delete_all_cookies()
# return 1
# else:
# print '\033[1;33mPlease check your Internet Connection.\033[1;m'
def dictionaryAttack(usernames, passwords, delay):
    """For each username in the list, open the signup page and try every password.

    *usernames* may be a list (from --file) or a single string (from
    --username).  NOTE(review): only the list form does anything — the string
    path from main() silently falls through; confirm whether that is intended.
    """
    # isinstance() replaces the fragile str(type(...)) == "<type 'list'>"
    # comparison, which only matches CPython 2's repr of the list type.
    if isinstance(usernames, list):
        for username in usernames:
            # if (userExists(username) == 1):
            #     continue
            driver.get("https://twitter.com/signup")
            sleep(delay)
            print("Creating Accounts...")
            print("")
            for password in passwords:
                if (login(username, password, delay) == 1):
                    cj.clear()  # NOTE(review): 'cj' is not defined in this file — confirm
                    break
def main():
    # Parse CLI options: a single username or a file of usernames, a password
    # dictionary file, and an optional delay between attempts.
    parser = optparse.OptionParser()
    parser.add_option('-f', '--file', action="store", dest="userfile", help="File containing valid usernames (one per line)", default=False)
    parser.add_option('-d', '--dictionary', action="store", dest="dictionary", help="Text file containing passwords", default=False)
    parser.add_option('-u', '--username', action="store", dest="username", help="A valid username", default=False)
    # NOTE(review): default=True looks odd for a numeric delay — int(True)
    # yields a 1-second delay when -t is omitted; confirm intent.
    parser.add_option('-t', '--time', action="store", dest="delay", help="Delay (in seconds) - use this option based on your Network Connection speed.", default=True)
    options, args = parser.parse_args()
    global driver  # rebinds the module-level "reserved" placeholder below
    if (options.delay is None):
        delay = 4
    else:
        delay = int(options.delay)
    print '\033[1;33mUsing\033[1;m',
    print '\033[1;35m%d second(s)\033[1;m' %delay,
    print '\033[1;33mof delay between login attempts.\033[1;m'
    print ' '
    # Exactly one of --username / --file must be supplied.
    if ( (options.userfile == False) and (options.username == False) ) :
        print 'You have to set an username or a userfile'
        exit()
    if ( (options.userfile != False) and (options.username != False) ) :
        print 'You can\'t set both options at once.. choose between username or userfile'
        exit()
    if (options.dictionary == False):
        print 'You have to set a valid path for the passwords dictionary.'
        exit()
    try:
        # Load the password dictionary, one entry per line.
        f = open(options.dictionary,'r')
        passwords = []
        while True:
            line = f.readline()
            if not line:
                break
            passwords.append(line.strip('\n'))
        f.close()
    except:
        print 'Check the path to the dictionary and try again.'
        exit()
    if (options.userfile != False):
        try:
            # Load the username list, one entry per line.
            f = open(options.userfile,'r')
            usernames = []
            while True:
                line = f.readline()
                if not line:
                    break
                usernames.append(line.strip('\n'))
            f.close()
        except:
            print 'Check the path to the users file and try again.'
            exit()
        driver = webdriver.Firefox(profile)
        driver.implicitly_wait(30)
        dictionaryAttack(usernames,passwords,delay)
    else:
        driver = webdriver.Firefox(profile)
        driver.implicitly_wait(30)
        dictionaryAttack(options.username,passwords,delay)
    driver.close()

if __name__ == '__main__':
    main()
See the link below. That could be the problem.
https://github.com/SeleniumHQ/selenium/issues/2257

QWaitCondition error when multiprocessing with python ghost.py

I'm using multiprocessing and ghost.py to crawl some data from the internet, but there are some errors:
2015-03-31T23:22:30 QT: QWaitCondition: Destroyed while threads are still waiting
This is some of my code:
# Fragment of the per-process download worker (the Process target below);
# 'l' is the shared Lock and 'website' the URL handed to this worker.
l.acquire()  # serialize: only one process drives Ghost at a time
global ghost
try:
    ghost = Ghost(wait_timeout=60)
    ghost.open(website) #download page
    ghost.wait_for_selector('#pagenum') #wait JS
    html = []
    #print u"\t\t the first page"
    html.append(ghost.content)
    pageSum = findPageSum(ghost.content)
    for i in xrange(pageSum-1): #crawl all pages
        #print u"\t\tthe"+ str(i+2) +"page"
        ghost.set_field_value('#pagenum', str(i+2))
        ghost.click('#page-go')
        # Rows appear 20 per page; wait until the first row number of the
        # next page shows up before grabbing the content.
        ghost.wait_for_text("<td>"+str(20*(i+1)+1)+"</td>")
        html.append(ghost.content)
    for i in html:
        souped(i)  # parse/store each page's HTML
    print website, "\t\t OK!"
except :
    # NOTE(review): bare except silently swallows every error; combined with
    # Ghost/Qt teardown this is likely related to the QWaitCondition message.
    pass
l.release()
Other code:
global _use_line
# Fill a queue with every website to crawl.
q = Queue.Queue(0)
for i in xrange(len(websitelist)):
    q.put((websitelist[i]))
lock = Lock()
# Spawn up to _use_line worker processes at a time; each decrement reserves
# a slot.  NOTE(review): Queue.Queue is a *threading* queue used here with
# multiprocessing.Process — it only works because get() happens in the
# parent; confirm nothing in the workers touches q.
while (not q.empty()):
    if (_use_line > 0):
        for i in range(_use_line):
            dl = q.get()
            _use_line -= 1
            print "_use_line: ", _use_line
            p = Process(target=download, args=(lock,dl))
            p.start()
    else:
        time.sleep(1)  # all slots busy; poll until one frees up
ghost.py uses PyQt and PySide, and I think this issue is because of some local variable's error, but I don't know how to find it.

Categories

Resources