Python process starts but does not actually run the target function - python

I'm using multiprocessing to test proxy server usability. The target function of my process takes a proxy server address and a queue as arguments and opens an instance of webdriver with the given proxy. The function tests the proxy by going to a specific url and trying to retrieve an html element. If the test is successful the function will add the webdriver instance to the queue. The function is shown below.
def test_proxy(address, queue):
    """Probe one proxy by loading a Facebook Marketplace search through it.

    A headless Chrome is started with ``--proxy-server`` set to *address*.
    If the page loads within 10 seconds and the expected link element is
    found, the still-open driver is put on *queue*; otherwise the driver is
    shut down.

    Args:
        address: Proxy address passed to Chrome's ``--proxy-server`` flag.
        queue: Queue that receives the working driver on success.
    """
    print(f"testing proxy {address}")
    chrome_options_1 = webdriver.ChromeOptions()
    chrome_options_1.add_argument('--proxy-server=%s' % address)
    chrome_options_1.add_argument("headless")
    # Raw string: '\d' and '\c' are invalid escape sequences in a plain literal.
    driver = webdriver.Chrome(r'.\driver\chromedriver.exe', options=chrome_options_1)
    driver.set_page_load_timeout(10)
    url = "https://www.facebook.com/marketplace/nyc/search/?query=honda"
    try:
        driver.get(url)
        # XPath attribute tests use '@'; "[#class=...]" is not valid XPath.
        driver.find_element_by_xpath("//*[@class='kbiprv82']/a").get_attribute("href")
    except Exception:
        # Timeout, missing element, or proxy failure all count as "unusable".
        print("Connection failed")
        driver.quit()
        return
    print(f"Successfully connected to proxy server at {address}")
    # NOTE(review): if `queue` is a multiprocessing.Queue the driver must be
    # pickled to cross the process boundary -- confirm that works, or send
    # `address` back and let the parent build its own driver.
    queue.put(driver)
In my main process I have a list of proxy addresses to test. A process is created to test each proxy in the list until a test is successful and a driver instance is put in the queue. If an item is found in the queue all the processes are terminated and the proxy list is cleared. The loop in my main process limits the number of child processes to 10. The main process code is in a class and shown below.
def find_proxy(self):
    """Scrape free-proxy-list.net and return a driver behind a working proxy.

    The proxy table is filtered (column 7 set to "yes", then one country at
    a time) and every row is stored in ``self.proxies`` as ``[address]``.
    A ``test_proxy`` process is then started for entries among the first
    ten of the list; entries whose process has died are dropped so later
    ones move into that window.

    Returns:
        The first driver found on ``self.proxy_queue``, or ``None`` when
        every candidate was dropped without one being seen.
    """
    self.proxies = []
    self.proxy_queue = multiprocessing.Queue()
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("headless")
    # Raw string: '\d' and '\c' are invalid escape sequences in a plain literal.
    driver = webdriver.Chrome(r'.\driver\chromedriver.exe', options=chrome_options)
    try:
        driver.get("https://free-proxy-list.net/")
        # XPath attribute tests use '@'; "[#id=...]" is not valid XPath.
        Select(driver.find_element_by_xpath("//*[@id='proxylisttable']/tfoot/tr/th[7]/select")).select_by_visible_text("yes")
        for country in ["US", "MX", "CA", "CL", "CO", "BR", "PE"]:
            try:
                Select(driver.find_element_by_xpath("//*[@id='proxylisttable']/tfoot/tr/th[3]/select")).select_by_visible_text(country)
                # The counter assumes eight cells per row: cell 0 and cell 1
                # are joined as "<cell0>:<cell1>", and the row is stored when
                # its last cell (index 7) is reached.
                i = 0
                entries = driver.find_elements_by_xpath("//table[@id='proxylisttable']/tbody/tr/td")
                for entry in entries:
                    if i == 7:
                        i = 0
                        self.proxies.append([proxy_address])
                    else:
                        if i == 0:
                            proxy_address = entry.text + ':'
                        if i == 1:
                            proxy_address = proxy_address + entry.text
                        i = i + 1
            except Exception as exc:
                # Best effort: a country that cannot be read is skipped.
                print(f"could not read proxies for {country}: {exc}")
    finally:
        # Always release the scraping browser, even if the page layout changed.
        driver.quit()
    while self.proxies:
        for i, proxy in enumerate(self.proxies[:10]):
            if not self.proxy_queue.empty():
                driver = self.proxy_queue.get()
                for proxy_1 in self.proxies:
                    # Entries that never got a process have length 1.
                    if len(proxy_1) > 1:
                        proxy_1[1].terminate()
                self.proxies.clear()
                return driver
            elif len(proxy) < 2:
                proxy.insert(1, multiprocessing.Process(target=test_proxy, args=(proxy[0], self.proxy_queue,)))
                print(f"proxy thread {proxy[0]} created")
                proxy[1].start()
                print(f"proxy thread {proxy[0]} started")
            elif not proxy[1].is_alive():
                print(f"proxy thread {proxy[0]} dead")
                # `i` indexes the slice, which starts at 0, so it is also the
                # index in self.proxies; restart the scan after mutating.
                del self.proxies[i]
                print("proxy deleted")
                break
The issue is that the processes seem to start just fine but none of the code in the test_proxy function is actually run, not even the first print statement.

Related

Checker python Selenium

I decided to create a checker for Instagram accounts.
Please tell me how you can change the ip when you restart the browser. I have a Tor profile. That is, the ip automatically changes every 10 minutes. How can I make the ip change 1 time per minute. Is this even possible?
Maybe there is some kind of set_preference setting or how in general you can change the ip when restarting the Firefox browser with the Tor settings.
import time
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
# Input entries, one per line; check_email splits each entry on ':'.
# The handle is closed as soon as the contents are read.
with open('Good2.txt', encoding='utf-8-sig') as _source:
    file = _source.read().split('\n')
# Output file for accounts found valid; check_email writes to and closes it.
goods = open('good_acc.txt', 'a+')
def settings_browser():
    """Build the Firefox profile used by the checker.

    Starts from the Tor Browser default profile on disk, points Firefox at
    the SOCKS proxy on 127.0.0.1:9050 and turns image loading off.

    Returns:
        The configured ``FirefoxProfile``.
    """
    profile_dir = r'C:\Users\ASUS\Desktop\Scrape\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default'
    tor_profile = FirefoxProfile(profile_dir)
    overrides = (
        ('network.proxy.type', 1),
        ('network.proxy.socks', '127.0.0.1'),
        ('network.proxy.socks_port', 9050),
        ("network.proxy.socks_remote_dns", False),
        # not downloading images gives a large speed increase
        ("permissions.default.image", 2),
    )
    for pref_name, pref_value in overrides:
        tor_profile.set_preference(pref_name, pref_value)
    tor_profile.update_preferences()
    return tor_profile
def check_email():
""" Accepts the cookie pop-up. Checks whether each email is valid; a valid email is saved to the file 'good_acc.txt'. """
# One Firefox session is created up front and replaced only in the
# rate-limit / feedback_required branches below.
driver = webdriver.Firefox(firefox_profile=settings_browser(), executable_path=r'C:\Users\ASUS\Desktop\Scrape\geckodriver.exe')
for login in file:
driver.get("https://www.instagram.com/accounts/password/reset/")
# Cookie banner: click the button whose text is 'Принять все' if it is shown.
body = driver.find_elements_by_class_name('pbNvD.FrS-d.gD9tr')
for bd in body:
if bd.find_element_by_class_name('aOOlW.bIiDR').text == 'Принять все':
bd.find_element_by_class_name('aOOlW.bIiDR').click()
time.sleep(7)
authorization = driver.find_elements_by_class_name("AHCwU")
# NOTE(review): `pops` is collected before the form below is submitted --
# confirm these elements are the ones that later hold the result message.
pops = driver.find_elements_by_class_name("_-rjm")
# NOTE(review): an entry without ':' (e.g. an empty string left by
# split('\n')) makes the [1] lookup raise IndexError.
username = login.split(":")[0]
password = login.split(":")[1]
for data in authorization:
# email / login field
data_login = data.find_element_by_name('cppEmailOrUsername')
data_login.click()
data_login.send_keys(username)
time.sleep(1)
# submit button
clock_button = data.find_element_by_class_name('sqdOP.L3NKy.y3zKF')
clock_button.click()
time.sleep(2)
for pop in pops:
if 'Мы отправили ссылку для восстановления' in pop.find_element_by_class_name('tA2fc').text:
# the email is registered
goods.write(username + ' : ' + password + '\n')
print('Валидный аккаунт ' + username + ' : ' + password)
elif 'Подождите несколько минут, прежде чем пытаться снова.' in pop.find_element_by_class_name('tA2fc').text:
# Rate-limited: throw the browser away and start a fresh one.
driver.quit()
driver = webdriver.Firefox(firefox_profile=settings_browser(), executable_path=r'C:\Users\ASUS\Desktop\Scrape\geckodriver.exe')
print('Успешная перезагрузка драйвера из-за "Подождите несколько минут, прежде чем пытаться снова."')
elif 'feedback_required' in pop.find_element_by_class_name('tA2fc').text:
# Same recovery for the 'feedback_required' response.
driver.quit()
driver = webdriver.Firefox(firefox_profile=settings_browser(), executable_path=r'C:\Users\ASUS\Desktop\Scrape\geckodriver.exe')
print('Успешная перезагрузка драйвера из-за "feedback_required"')
else:
# the email is not registered
print(f"Не валидный аккаунт " + username)
# The results file is closed here; no driver.quit() follows in this function.
goods.close()
def main():
# Entry point: run the account check once.
check_email()
# Run only when executed as a script, not when imported.
if __name__ == '__main__':
main()
If you set ControlPort: 9051 (and Password: ...) in the Tor config file (on Linux, /etc/tor/torrc), then you can use even a standard socket to send a signal to Tor to change the IP.
import socket
# Ask the local Tor control port (9051) for a new identity.
with socket.socket() as s:
    s.connect(('127.0.0.1', 9051))
    # sendall() keeps writing until every byte is sent; send() may stop short.
    s.sendall('AUTHENTICATE "your_passord"\r\nSIGNAL NEWNYM\r\n'.encode())
    # Read the controller's reply before the socket is closed on block exit.
    s.recv(1024)
It takes a few seconds to get a new IP from the Tor network.
After those few seconds the proxy should use the new IP.
You may also use module stem for this (it also needs settings in torrc)
from stem import Signal
from stem.control import Controller
# Connect to the Tor control port, authenticate, then send NEWNYM.
with Controller.from_port(port=9051) as tor_control:
    tor_control.authenticate(password='your_password')
    tor_control.signal(Signal.NEWNYM)
More: Python: How to use Tor Network with requests to change IP?
EDIT:
import socket
def main():
    """Request a new Tor identity, wait for it, then run the account check."""
    # send signal to `tor` to change IP
    with socket.socket() as s:
        s.connect(('127.0.0.1', 9051))
        # sendall() keeps writing until every byte is sent; send() may stop short.
        s.sendall('AUTHENTICATE "your_passord"\r\nSIGNAL NEWNYM\r\n'.encode())
        # Read the controller's reply before the socket is closed on block exit.
        s.recv(1024)
    # wait few seconds for new IP
    time.sleep(3)
    check_email()

Selenium Threads: how to run multi-threaded browser with proxy ( python)

I'm writing a script that accesses a website through proxies using multiple threads, but I'm stuck on the threading part. When I run the script below it opens 5 browsers, but all 5 use the same proxy. I want the 5 browsers to use different proxies. Can someone help me complete it? Thank you.
Here is my script :
from selenium import webdriver
from selenium import webdriver
import time , random
import threading
def e():
    """Open whatsmyip.org through each SOCKS proxy listed in ``sock2.txt``.

    Every call reads the file from its first line, so several threads
    running this function all walk the same proxies in the same order.
    For each ``IP:PORT`` line a Firefox instance is started, kept open for
    40-70 seconds after a successful load, and then closed.
    """
    # Read everything up front so the file handle is released immediately.
    with open("sock2.txt", "r") as a:
        lines = a.readlines()
    for line in lines:
        prox = line.split(":")
        IP = prox[0]
        PORT = int(prox[1].strip("\n"))
        print(IP)
        print(PORT)
        profile = webdriver.FirefoxProfile()
        profile.set_preference("network.proxy.type", 1)
        profile.set_preference("network.proxy.socks", IP)
        profile.set_preference("network.proxy.socks_port", PORT)
        # Stays None if Firefox itself fails to start, so cleanup can tell.
        driver = None
        try:
            driver = webdriver.Firefox(firefox_profile=profile)
            driver.get("http://www.whatsmyip.org/")
        except Exception:
            print("Proxy Connection Error")
        else:
            time.sleep(random.randint(40, 70))
        finally:
            if driver is not None:
                driver.quit()
# Start five worker threads, each running e() independently.
thread_count = 5
for i in range(thread_count):
    t = threading.Thread(target=e)
    t.start()
(Wish everyone has a happy and lucky new year)
Dominik Lašo captured it correctly - each thread processes the file from the beginning. Here's how it should probably look:
from selenium import webdriver
from selenium import webdriver
import time , random
import threading
def e(ip, port):
    """Open whatsmyip.org in Firefox through one SOCKS proxy.

    After a successful load the browser is kept open for 40-70 seconds and
    then closed; on failure an error is printed and the browser is closed.

    Args:
        ip: SOCKS proxy host.
        port: SOCKS proxy port as an integer.
    """
    profile = webdriver.FirefoxProfile()
    profile.set_preference("network.proxy.type", 1)
    # Use the parameters; the upper-case names IP/PORT are not defined here.
    profile.set_preference("network.proxy.socks", ip)
    profile.set_preference("network.proxy.socks_port", port)
    # Stays None if Firefox itself fails to start, so cleanup can tell.
    driver = None
    try:
        driver = webdriver.Firefox(firefox_profile=profile)
        driver.get("http://www.whatsmyip.org/")
    except Exception:
        print("Proxy Connection Error")
    else:
        time.sleep(random.randint(40, 70))
    finally:
        if driver is not None:
            driver.quit()
# One thread per non-blank "IP:PORT" line of sock2.txt.
my_threads = []
with open("sock2.txt", "r") as fd:
    for line in fd.readlines():
        line = line.strip()
        if line:
            prox = line.split(":")
            ip, port = prox[0], int(prox[1])
            print('-> {}:{}'.format(ip, port))
            t = threading.Thread(target=e, args=(ip, port))
            t.start()
            my_threads.append(t)
# Block until every worker has finished.
for t in my_threads:
    t.join()
(I personally think the problem is that every new thread the program starts goes through the text file from the beginning, because you aren't removing the lines that have already been used.)
I came across the same problem when I was doing the same thing you are doing now. I know you would rather have help with your own code, but I am in a hurry and still want to help you ;) , so here is code that works for me ... There is even a task killer for Chrome (you just have to adapt it to Firefox).
If I were you, I would start the threads after opening the file, because it looks like you are opening the same file and reading from the first line every time a thread starts.
links = []  # Links you want to go to


def funk(xxx, website):
    """Open Chrome through the proxy *xxx* and work on *website*.

    Args:
        xxx: Proxy address passed to Chrome's ``--proxy-server`` flag.
        website: The link this worker should handle.
    """
    link = website
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--proxy-server=%s' % str(xxx))
    # The chromedriver binary is expected next to this script.
    chromedriver = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'chromedriver')
    chrome = webdriver.Chrome(chromedriver, chrome_options=chrome_options)
    try:
        pass  # Do stuff with `chrome` and `link`
    except Exception:
        print('exception')
    finally:
        # Close the window whether or not the work succeeded.
        chrome.close()
# For every link, walk proxies.txt and schedule funk() runs in batches,
# killing Chrome between batches.
for link in links:
f = open('proxies.txt')
line = f.readline()
x = 1
xx = 0
# NOTE(review): number_of_used_proxies is read below but no assignment to it
# appears before this loop in the lines shown -- confirm it is initialised.
while line:
if number_of_used_proxies < 10:
print(line)
# NOTE(review): the line just printed is replaced here before it is passed
# to funk, so the first line read is never scheduled; the value passed also
# keeps readline()'s trailing newline.
line = f.readline()
try:
# threading.Timer runs funk(line, link) after a 40 second delay.
threading.Timer(40, funk, [line, link]).start()
except Exception as e:
print(e)
time.sleep(1)
x += 1
number_of_used_proxies += 1
else:
# Batch limit reached: wait, then kill Chrome before resetting the counter.
time.sleep(100)
for x in range(1, 10):
try:
# This string is only stored, never executed; the command actually run is
# the killall call on the next line.
xzxzx = 'os.system("taskkill /f /im chrome.exe")'
os.system("killall 'Google Chrome'")
except:
print("NoMore")
time.sleep(10)
number_of_used_proxies = 0
f.close()
Hope it helps :)
vantuong: Here's how you can solve the problem with ThreadPoolExecutor.
Reference: https://docs.python.org/3/library/concurrent.futures.html
from selenium import webdriver
import time, random
#import threading
import concurrent.futures
MAX_WORKERS = 5
def get_proxys(data_file):
    """Read ``IP:PORT`` proxy entries from a text file.

    Blank lines are skipped. Fields after the port are ignored.

    Args:
        data_file: Path of the file to read, one proxy per line.

    Returns:
        A list of ``(ip, port)`` tuples with ``port`` as an int.

    Raises:
        ValueError: If a non-blank line has no port or a non-numeric port;
            the message names the file and line number.
    """
    proxys = []
    with open(data_file, "r") as fd:
        # Iterate the file directly instead of loading it with readlines().
        for line_no, line in enumerate(fd, start=1):
            line = line.strip()
            if not line:
                continue
            prox = line.split(":")
            malformed = len(prox) < 2
            if not malformed:
                try:
                    port = int(prox[1])
                except ValueError:
                    malformed = True
            if malformed:
                # Name the offending line rather than failing without context.
                raise ValueError(
                    "{}: line {}: expected IP:PORT, got {!r}".format(
                        data_file, line_no, line
                    )
                )
            proxys.append((prox[0], port))
    return proxys
def e(ip, port):
    """Open whatsmyip.org in Firefox through one SOCKS proxy.

    After a successful load the browser is kept open for 40-70 seconds and
    then closed; on failure an error is printed and the browser is closed.

    Args:
        ip: SOCKS proxy host.
        port: SOCKS proxy port as an integer.
    """
    profile = webdriver.FirefoxProfile()
    profile.set_preference("network.proxy.type", 1)
    # Use the parameters; the upper-case names IP/PORT are not defined here.
    profile.set_preference("network.proxy.socks", ip)
    profile.set_preference("network.proxy.socks_port", port)
    # Stays None if Firefox itself fails to start, so cleanup can tell.
    driver = None
    try:
        driver = webdriver.Firefox(firefox_profile=profile)
        driver.get("http://www.whatsmyip.org/")
    except Exception:
        print("Proxy Connection Error")
    else:
        time.sleep(random.randint(40, 70))
    finally:
        if driver is not None:
            driver.quit()
# Run e() for every proxy with at most MAX_WORKERS threads at a time and
# report each proxy as its task finishes.
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    proxys = get_proxys('sock2.txt')
    # Map each future back to the (ip, port) pair it was started for.
    tasks = {}
    for proxy in proxys:
        tasks[executor.submit(e, proxy[0], proxy[1])] = proxy
    for task in concurrent.futures.as_completed(tasks):
        proxy = tasks[task]
        try:
            # result() re-raises whatever the worker raised.
            data = task.result()
        except Exception as exc:
            print('{} generated an exception: {}'.format(proxy, exc))
        else:
            print('{} completed successfully'.format(proxy))
Fun exercise: Try playing around with different values of MAX_WORKERS.

Chrome user:pass login not working?

I am attempting to use chrome with a user pass proxy this is my code:
def getProxy(string_only=True):
    """Pick a random proxy from ``proxies.txt``.

    Args:
        string_only: When true, return the proxy's ``proxy_string``;
            otherwise return the result of its ``get_dict()``.

    Returns:
        The proxy in the requested form, or ``None`` when an ``OSError``,
        ``IOError`` or ``IndexError`` is raised along the way.
    """
    try:
        chosen = ProxyManager("proxies.txt").random_proxy()
        return chosen.proxy_string if string_only else chosen.get_dict()
    except (OSError, IOError, IndexError):
        # couldn't load the file / file is empty
        return None
chrome_options = webdriver.ChromeOptions()
proxy = getProxy()
# getProxy() may return None, so the string is only split once we know it
# exists. Entries are expected to look like "user:pass@host:port".
userpass = None
if proxy:
    if "@" in proxy:
        userpass, hostport = proxy.split("@", 1)
    else:
        # No credentials part: the whole entry is host:port.
        hostport = proxy
    chrome_options.add_argument("--proxy-server=" + hostport)
driver1 = webdriver.Chrome(chrome_options=chrome_options)
wait = WebDriverWait(driver1, 1000000)
if manual == "No":
    # NOTE(review): credentials embedded in the page URL are addressed to the
    # site, not to the proxy -- confirm this really authenticates the proxy.
    if userpass:
        driver1.get("http://%s@www.google.com" % userpass)
    else:
        driver1.get("http://www.google.com")
Help is appreciated

How to close selenium drivers while multiprocessing in python on command?

I wrote a python script that uses multiprocessing to open up four different windows and search a common query among them; I'm trying to find a way to close the drivers on command instead of calling driver.close() at the end of the function which would automatically close the drivers as soon as they have successfully spawned.
I would like to browse the windows for a bit and close them when I am finished.
Here's what my code looks like:
def main(hosts, inp):
    """Open one site in Chrome and submit *inp* in its search box.

    The site is recognised by a keyword in the URL the browser ends up on.
    The browser is left open when the function returns.

    Args:
        hosts: URL of the single site to open.
        inp: Search text to type into the site's search box.
    """
    # (keyword in the current URL, id of that site's search input); checked
    # in this order and the first match wins.
    search_boxes = (
        ('ebay', 'gh-ac'),
        ('google', 'gbqfq'),
        ('etsy', 'search-query'),
        ('amazon', 'twotabsearchtextbox'),
    )
    driver = webdriver.Chrome(executable_path='./chromedriver')
    driver.get(hosts)
    current_url = driver.current_url
    for site, box_id in search_boxes:
        if site in current_url:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('opened ' + site)
            search_box = driver.find_element_by_id(box_id)
            search_box.clear()
            search_box.send_keys(inp)
            search_box.send_keys(Keys.RETURN)
            break
    else:
        print("--NONE--")
    # The driver is deliberately not closed here so the window stays open.
    # driver.close()
if __name__ == '__main__':
    hosts = [
        "http://ebay.com",
        "https://www.google.com/shopping?hl=en",
        "http://etsy.com",
        "http://amazon.com",
    ]
    inp = raw_input("What do you want to search?: ")
    # One worker process per site; each runs main(host, inp=inp).
    p = Pool(len(hosts))
    p.map(partial(main, inp=inp), hosts)
    p.close()
    p.join()
I would like to do something like this:
# Keep the window open until the user confirms they are finished.
done = raw_input("Type 'done' when finished:" )
# Comparison needs '=='; a single '=' inside `if` is a syntax error.
if done == 'done':
    driver.close()
but just injecting that into main gives me an EOF error.

QWaitCondition error when multiprocessing with python ghost.py

I'm using multiprocessing and ghost.py to crawl some data from the internet, but there are some errors:
2015-03-31T23:22:30 QT: QWaitCondition: Destroyed while threads are still waiting
This is some of my code:
l.acquire()
global ghost
try:
ghost = Ghost(wait_timeout=60)
ghost.open(website) #download page
ghost.wait_for_selector('#pagenum') #wait JS
html = []
#print u"\t\t the first page"
html.append(ghost.content)
pageSum = findPageSum(ghost.content)
for i in xrange(pageSum-1): #crawl all pages
#print u"\t\tthe"+ str(i+2) +"page"
ghost.set_field_value('#pagenum', str(i+2))
ghost.click('#page-go')
ghost.wait_for_text("<td>"+str(20*(i+1)+1)+"</td>")
html.append(ghost.content)
for i in html:
souped(i)
print website, "\t\t OK!"
except :
pass
l.release()
Other code:
global _use_line
q = Queue.Queue(0)
for i in xrange(len(websitelist)):
q.put((websitelist[i]))
lock = Lock()
while (not q.empty()):
if (_use_line > 0):
for i in range(_use_line):
dl = q.get()
_use_line -= 1
print "_use_line: ", _use_line
p = Process(target=download, args=(lock,dl))
p.start()
else:
time.sleep(1)
ghost.py uses PyQt and PySide, and I think this issue is caused by an error involving some local variable, but I don't know how to find it.

Categories

Resources