I'm writing a script that accesses a website through proxies, using multiple threads, but I'm stuck on the threading part: when I run the script below, it opens 5 browsers, yet all 5 use the same proxy. I want each of the 5 browsers to use a different proxy. Can someone help me complete it? Thank you.
Here is my script:
from selenium import webdriver
import time, random
import threading

def e():
    a = open("sock2.txt", "r")
    for line in a.readlines():
        b = line
        prox = b.split(":")
        IP = prox[0]
        PORT = int(prox[1].strip("\n"))
        print(IP)
        print(PORT)
        profile = webdriver.FirefoxProfile()
        profile.set_preference("network.proxy.type", 1)
        profile.set_preference("network.proxy.socks", IP)
        profile.set_preference("network.proxy.socks_port", PORT)
        try:
            driver = webdriver.Firefox(firefox_profile=profile)
            driver.get("http://www.whatsmyip.org/")
        except:
            print("Proxy Connection Error")
            driver.quit()
        else:
            time.sleep(random.randint(40, 70))
            driver.quit()

for i in range(5):
    t = threading.Thread(target=e)
    t.start()
(Wishing everyone a happy and lucky new year!)
Dominik Lašo captured it correctly: each thread processes the file from the beginning. Here's roughly how it should look:
from selenium import webdriver
import time, random
import threading

def e(ip, port):
    profile = webdriver.FirefoxProfile()
    profile.set_preference("network.proxy.type", 1)
    profile.set_preference("network.proxy.socks", ip)
    profile.set_preference("network.proxy.socks_port", port)
    try:
        driver = webdriver.Firefox(firefox_profile=profile)
        driver.get("http://www.whatsmyip.org/")
    except:
        print("Proxy Connection Error")
    else:
        time.sleep(random.randint(40, 70))
        driver.quit()
my_threads = []
with open("sock2.txt", "r") as fd:
    for line in fd.readlines():
        line = line.strip()
        if not line:
            continue
        prox = line.split(":")
        ip = prox[0]
        port = int(prox[1])
        print('-> {}:{}'.format(ip, port))
        t = threading.Thread(target=e, args=(ip, port,))
        t.start()
        my_threads.append(t)

for t in my_threads:
    t.join()
(I personally think the problem is that when you start the program, each new thread goes through the text file from the beginning, because you aren't consuming the lines as you go.)
I came across the same problem when I was doing the same thing you're doing now. I know you would rather have help with your own code, but I'm in a hurry and want to help you ;), so here is code that works for me... There is even a task killer for Chrome (you just have to adapt it for Firefox).
If I were you, I would start the threads after opening the file, because it looks like you are reading the same file from the first line every time a thread starts.
import os
import time
import threading
from selenium import webdriver

links = []  # the links you want to visit

def funk(xxx, website):
    link = website
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--proxy-server=%s' % str(xxx))
    chromedriver = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'chromedriver')
    chrome = webdriver.Chrome(chromedriver, chrome_options=chrome_options)
    try:
        pass  # do stuff
    except:
        print('exception')
    chrome.close()

for link in links:
    f = open('proxies.txt')
    line = f.readline()
    x = 1
    number_of_used_proxies = 0
    while line:
        if number_of_used_proxies < 10:
            print(line)
            line = f.readline()
            try:
                threading.Timer(40, funk, [line, link]).start()
            except Exception as e:
                print(e)
            time.sleep(1)
            x += 1
            number_of_used_proxies += 1
        else:
            time.sleep(100)
            for x in range(1, 10):
                try:
                    os.system("taskkill /f /im chrome.exe")  # Windows; on macOS: os.system("killall 'Google Chrome'")
                except:
                    print("NoMore")
            time.sleep(10)
            number_of_used_proxies = 0
    f.close()
Hope it helps :)
vantuong: Here's how you can solve the problem with ThreadPoolExecutor.
Reference: https://docs.python.org/3/library/concurrent.futures.html
from selenium import webdriver
import time, random
#import threading
import concurrent.futures

MAX_WORKERS = 5

def get_proxys(data_file):
    proxys = []
    with open(data_file, "r") as fd:
        for line in fd.readlines():
            line = line.strip()
            if not line:
                continue
            prox = line.split(":")
            ip = prox[0]
            port = int(prox[1])
            proxys.append((ip, port))
    return proxys
def e(ip, port):
    profile = webdriver.FirefoxProfile()
    profile.set_preference("network.proxy.type", 1)
    profile.set_preference("network.proxy.socks", ip)
    profile.set_preference("network.proxy.socks_port", port)
    try:
        driver = webdriver.Firefox(firefox_profile=profile)
        driver.get("http://www.whatsmyip.org/")
    except:
        print("Proxy Connection Error")
    else:
        time.sleep(random.randint(40, 70))
        driver.quit()
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    proxys = get_proxys('sock2.txt')
    tasks = {executor.submit(e, proxy[0], proxy[1]): proxy for proxy in proxys}
    for task in concurrent.futures.as_completed(tasks):
        proxy = tasks[task]
        try:
            data = task.result()
        except Exception as exc:
            print('{} generated an exception: {}'.format(proxy, exc))
        else:
            print('{} completed successfully'.format(proxy))
Fun exercise: Try playing around with different values of MAX_WORKERS.
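For instance, a minimal timing harness (using a sleep-based stand-in task instead of a real browser, so the numbers are quick to collect) could look like this:
import time
import concurrent.futures

def dummy_task(n):
    # stand-in for the real browser job; the sleep simulates I/O wait
    time.sleep(0.5)
    return n

for workers in (1, 2, 5, 10):
    start = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        list(executor.map(dummy_task, range(20)))
    print('max_workers={}: {:.2f}s'.format(workers, time.time() - start))
With an I/O-bound task like this, the elapsed time should drop roughly in proportion to the worker count until some other bottleneck takes over.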
Related
I decided to create a checker for Instagram accounts.
Please tell me how to change the IP when restarting the browser. I have a Tor profile, so the IP automatically changes every 10 minutes. How can I make the IP change once per minute? Is this even possible?
Maybe there is some set_preference setting, or some general way to change the IP when restarting Firefox with the Tor settings.
import time
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

file = open('Good2.txt', encoding='utf-8-sig').read().split('\n')
goods = open('good_acc.txt', 'a+')

def settings_browser():
    """ FireFox browser settings. """
    profile = FirefoxProfile(r'C:\Users\ASUS\Desktop\Scrape\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default')
    profile.set_preference('network.proxy.type', 1)
    profile.set_preference('network.proxy.socks', '127.0.0.1')
    profile.set_preference('network.proxy.socks_port', 9050)
    profile.set_preference("network.proxy.socks_remote_dns", False)
    # get a huge speed increase by not downloading images
    profile.set_preference("permissions.default.image", 2)
    profile.update_preferences()
    return profile

def check_email():
    """ Accepts the cookie pop-up, then checks each email for validity; valid accounts are saved to 'good_acc.txt'. """
    driver = webdriver.Firefox(firefox_profile=settings_browser(), executable_path=r'C:\Users\ASUS\Desktop\Scrape\geckodriver.exe')
    for login in file:
        driver.get("https://www.instagram.com/accounts/password/reset/")
        body = driver.find_elements_by_class_name('pbNvD.FrS-d.gD9tr')
        for bd in body:
            # 'Принять все' is the Russian-locale "Accept all" cookie button
            if bd.find_element_by_class_name('aOOlW.bIiDR').text == 'Принять все':
                bd.find_element_by_class_name('aOOlW.bIiDR').click()
        time.sleep(7)
        authorization = driver.find_elements_by_class_name("AHCwU")
        pops = driver.find_elements_by_class_name("_-rjm")
        username = login.split(":")[0]
        password = login.split(":")[1]
        for data in authorization:
            # email/login field
            data_login = data.find_element_by_name('cppEmailOrUsername')
            data_login.click()
            data_login.send_keys(username)
            time.sleep(1)
            # submit button
            clock_button = data.find_element_by_class_name('sqdOP.L3NKy.y3zKF')
            clock_button.click()
            time.sleep(2)
            for pop in pops:
                # 'Мы отправили ссылку для восстановления' = "We sent a recovery link"
                if 'Мы отправили ссылку для восстановления' in pop.find_element_by_class_name('tA2fc').text:
                    # the email is registered
                    goods.write(username + ' : ' + password + '\n')
                    print('Valid account ' + username + ' : ' + password)
                # 'Подождите несколько минут...' = "Wait a few minutes before trying again."
                elif 'Подождите несколько минут, прежде чем пытаться снова.' in pop.find_element_by_class_name('tA2fc').text:
                    driver.quit()
                    driver = webdriver.Firefox(firefox_profile=settings_browser(), executable_path=r'C:\Users\ASUS\Desktop\Scrape\geckodriver.exe')
                    print('Driver restarted after "wait a few minutes before trying again"')
                elif 'feedback_required' in pop.find_element_by_class_name('tA2fc').text:
                    driver.quit()
                    driver = webdriver.Firefox(firefox_profile=settings_browser(), executable_path=r'C:\Users\ASUS\Desktop\Scrape\geckodriver.exe')
                    print('Driver restarted after "feedback_required"')
                else:
                    # the email is not registered
                    print("Invalid account " + username)
    goods.close()

def main():
    check_email()

if __name__ == '__main__':
    main()
If you set ControlPort 9051 (and a control password) in tor's config file (on Linux, /etc/tor/torrc), then you can use even a standard socket to send tor the signal to change IP.
import socket
s = socket.socket()
s.connect(('127.0.0.1', 9051))
s.send('AUTHENTICATE "your_password"\r\nSIGNAL NEWNYM\r\n'.encode())
It takes a few seconds to get a new IP from the tor network, and after that the proxy should use the new IP.
You may also use the stem module for this (it also needs the settings in torrc):
from stem import Signal
from stem.control import Controller

with Controller.from_port(port=9051) as controller:
    controller.authenticate(password='your_password')
    controller.signal(Signal.NEWNYM)
More: Python: How to use Tor Network with requests to change IP?
EDIT:
import time
import socket

def main():
    # send signal to `tor` to change IP
    s = socket.socket()
    s.connect(('127.0.0.1', 9051))
    s.send('AUTHENTICATE "your_password"\r\nSIGNAL NEWNYM\r\n'.encode())
    # wait a few seconds for the new IP
    time.sleep(3)
    check_email()
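If the goal is one IP change per minute, a minimal sketch with stem (assuming ControlPort 9051 and a control password are set in torrc, as above) is a loop that sends NEWNYM and sleeps:
import time
from stem import Signal
from stem.control import Controller

with Controller.from_port(port=9051) as controller:
    controller.authenticate(password='your_password')
    while True:
        controller.signal(Signal.NEWNYM)  # ask tor to switch to a new circuit/IP
        time.sleep(60)  # tor rate-limits NEWNYM, so don't send it much faster than this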
I'm using multiprocessing to test proxy server usability. The target function of my process takes a proxy server address and a queue as arguments and opens an instance of webdriver with the given proxy. The function tests the proxy by going to a specific url and trying to retrieve an html element. If the test is successful the function will add the webdriver instance to the queue. The function is shown below.
def test_proxy(address, queue):
    print(f"testing proxy {address}")
    chrome_options_1 = webdriver.ChromeOptions()
    chrome_options_1.add_argument('--proxy-server=%s' % address)
    chrome_options_1.add_argument("headless")
    driver = webdriver.Chrome(r'.\driver\chromedriver.exe', options=chrome_options_1)
    driver.set_page_load_timeout(10)
    url = "https://www.facebook.com/marketplace/nyc/search/?query=honda"
    try:
        driver.get(url)
        driver.find_element_by_xpath("//*[@class='kbiprv82']/a").get_attribute("href")
        print(f"Successfully connected to proxy server at {address}")
        queue.put(driver)
        return
    except:
        print("Connection failed")
        driver.quit()
In my main process I have a list of proxy addresses to test. A process is created to test each proxy in the list until a test is successful and a driver instance is put in the queue. If an item is found in the queue all the processes are terminated and the proxy list is cleared. The loop in my main process limits the number of child processes to 10. The main process code is in a class and shown below.
def find_proxy(self):
    self.proxies = []
    self.proxy_queue = multiprocessing.Queue()
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("headless")
    driver = webdriver.Chrome(r'.\driver\chromedriver.exe', options=chrome_options)
    driver.get("https://free-proxy-list.net/")
    Select(driver.find_element_by_xpath("//*[@id='proxylisttable']/tfoot/tr/th[7]/select")).select_by_visible_text("yes")
    for country in ["US", "MX", "CA", "CL", "CO", "BR", "PE"]:
        try:
            Select(driver.find_element_by_xpath("//*[@id='proxylisttable']/tfoot/tr/th[3]/select")).select_by_visible_text(country)
            i = 0
            entries = driver.find_elements_by_xpath("//table[@id='proxylisttable']/tbody/tr/td")
            for entry in entries:
                if i == 7:
                    i = 0
                    self.proxies.append([proxy_address])
                else:
                    if i == 0:
                        proxy_address = entry.text + ':'
                    if i == 1:
                        proxy_address = proxy_address + entry.text
                    i = i + 1
        except:
            pass
    driver.quit()
    while len(self.proxies) > 0:
        i = 0
        for proxy in self.proxies[:10]:
            if self.proxy_queue.empty() == False:
                driver = self.proxy_queue.get()
                for proxy_1 in self.proxies:
                    try:
                        proxy_1[1].terminate()
                    except:
                        pass
                self.proxies.clear()
                return driver
            elif len(proxy) < 2:
                proxy.insert(1, multiprocessing.Process(target=test_proxy, args=(proxy[0], self.proxy_queue,)))
                print(f"proxy thread {proxy[0]} created")
                proxy[1].start()
                print(f"proxy thread {proxy[0]} started")
            elif proxy[1].is_alive() == False:
                print(f"proxy thread {proxy[0]} dead")
                del self.proxies[i]
                print("proxy deleted")
                break
            i = i + 1
The issue is that the processes seem to start just fine, but none of the code in the test_proxy function actually runs, not even the first print statement.
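For reference, a minimal self-contained sketch of the process-plus-queue pattern this code relies on (with a trivial target in place of test_proxy) is below; if this runs but the real code does not, the problem is likely in what the processes are given or asked to return. Note in particular that objects put on a multiprocessing.Queue must be picklable, and a live webdriver instance may not be.
import multiprocessing

def worker(value, queue):
    # trivial stand-in for test_proxy
    print("worker got", value)
    queue.put(value * 2)

if __name__ == '__main__':  # required on Windows, where child processes are spawned
    queue = multiprocessing.Queue()
    procs = [multiprocessing.Process(target=worker, args=(i, queue)) for i in range(3)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    while not queue.empty():
        print(queue.get())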
I am attempting to use Chrome with a user:pass proxy. This is my code:
def getProxy(string_only=True):
    try:
        proxy_manager = ProxyManager("proxies.txt")
        proxy = proxy_manager.random_proxy()
        if string_only:
            return proxy.proxy_string
        return proxy.get_dict()
    except (OSError, IOError, IndexError) as e:  # couldn't load the file / file is empty
        return None

chrome_options = webdriver.ChromeOptions()
proxy = getProxy()
if proxy:
    userpass = proxy[0:proxy.find("@")]
    hostport = proxy[proxy.find("@")+1:]
    chrome_options.add_argument("--proxy-server=" + hostport)
driver1 = webdriver.Chrome(chrome_options=chrome_options)
wait = WebDriverWait(driver1, 1000000)
if manual == "No":
    driver1.get("http://%s@www.google.com" % userpass)
Help is appreciated
I wrote this crawler in Python; it dumps several parameters to a JSON output file based on an input list of domains.
I have this question:
Do I need to close the HTTP connection in each thread? The input data is ca. 5 million items. At the beginning it processes at a rate of ca. 50 iterations per second, but after some time it drops to 1-2 per second and/or hangs (no kernel messages and no errors on stdout). Is this caused by the code, or is it network-related? I suspect software, since when I restart it, it starts again at the high rate (ca. 50 iterations per second).
Any tips on how to improve the code below are also welcome, especially regarding speed and crawling throughput.
Code in question:
from threading import Thread
from Queue import Queue
import urllib2
import pprint
import re
import sys
from tqdm import tqdm
import lxml.html
from geoip import geolite2
import pycountry
from tld import get_tld

resfile = open("out.txt", 'a')
concurrent = 200

def doWork():
    while True:
        url = q.get()
        status = getStatus(url)
        doSomethingWithResult(status)
        q.task_done()

def getStatus(ourl):
    try:
        response = urllib2.urlopen("http://"+ourl)
        peer = response.fp._sock.fp._sock.getpeername()
        ip = peer[0]
        header = response.info()
        html = response.read()
        html_element = lxml.html.fromstring(html)
        generator = html_element.xpath("//meta[@name='generator']/@content")
        try:
            match = geolite2.lookup(ip)
            if match is not None:
                country = match.country
                try:
                    c = pycountry.countries.lookup(country)
                    country = c.name
                except:
                    country = ""
        except:
            country = ""
        try:
            res = get_tld("http://www"+ourl, as_object=True)
            tld = res.suffix
        except:
            tld = ""
        try:
            match = re.search(r'[\w\.-]+@[\w\.-]+', html)
            email = match.group(0)
        except:
            email = ""
        try:
            item = generator[0]
            val = "{ \"Domain\":\"http://"+ourl.rstrip()+"\",\"IP\":\""+ip+"\"," + "\"Server\":" + "\""+str(header.getheader("Server")).replace("None","")+"\",\"PoweredBy\":" + "\""+str(header.getheader("X-Powered-By")).replace("None","")+"\""+",\"MetaGenerator\":\""+item+"\",\"Email\":\""+email+"\",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        except:
            val = "{ \"Domain\":\"http://"+ourl.rstrip()+"\",\"IP\":\""+ip+"\"," + "\"Server\":" + "\""+str(header.getheader("Server")).replace("None","")+"\",\"PoweredBy\":" + "\""+str(header.getheader("X-Powered-By")).replace("None","")+"\""+",\"MetaGenerator\":\"\",\"Email\":\""+email+"\",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        return val
    except Exception as e:
        #print "error"+str(e)
        pass

def doSomethingWithResult(status):
    if status:
        resfile.write(str(status)+"\n")

q = Queue(concurrent * 2)
for i in range(concurrent):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()

try:
    for url in tqdm(open('list.txt')):
        q.put(url.strip())
        status = open("status.txt", 'w')
        status.write(str(url.strip()))
    q.join()
except KeyboardInterrupt:
    sys.exit(1)
Update 1:
Closing the socket and file descriptor makes it work better; it no longer seems to hang after some time. Performance is ca. 50 reqs/sec on a home laptop and ca. 100 reqs/sec on a VPS.
from threading import Thread
from Queue import Queue
import httplib, sys
import urllib2
import pprint
import re
from tqdm import tqdm
import lxml.html
from geoip import geolite2
import pycountry
from tld import get_tld
import json

resfile = open("out.txt", 'a')
concurrent = 200

def doWork():
    while True:
        url = q.get()
        status = getStatus(url)
        doSomethingWithResult(status)
        q.task_done()

def getStatus(ourl):
    try:
        response = urllib2.urlopen("http://"+ourl)
        realsock = response.fp._sock.fp._sock
        peer = realsock.getpeername()
        ip = peer[0]
        header = response.info()
        html = response.read()
        realsock.close()
        response.close()
        html_element = lxml.html.fromstring(html)
        generator = html_element.xpath("//meta[@name='generator']/@content")
        try:
            match = geolite2.lookup(ip)
            if match is not None:
                country = match.country
                try:
                    c = pycountry.countries.lookup(country)
                    country = c.name
                except:
                    country = ""
        except:
            country = ""
        try:
            res = get_tld("http://www"+ourl, as_object=True)
            tld = res.suffix
        except:
            tld = ""
        try:
            match = re.search(r'[\w\.-]+@[\w\.-]+', html)
            email = match.group(0)
        except:
            email = ""
        try:
            item = generator[0]
            val = "{ \"Domain\":"+json.dumps("http://"+ourl.rstrip())+",\"IP\":\""+ip+"\",\"Server\":"+json.dumps(str(header.getheader("Server")).replace("None",""))+",\"PoweredBy\":"+json.dumps(str(header.getheader("X-Powered-By")).replace("None",""))+",\"MetaGenerator\":"+json.dumps(item)+",\"Email\":"+json.dumps(email)+",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        except:
            val = "{ \"Domain\":"+json.dumps("http://"+ourl.rstrip())+",\"IP\":\""+ip+"\"," + "\"Server\":"+json.dumps(str(header.getheader("Server")).replace("None",""))+",\"PoweredBy\":"+json.dumps(str(header.getheader("X-Powered-By")).replace("None",""))+",\"MetaGenerator\":\"\",\"Email\":"+json.dumps(email)+",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        return val
    except Exception as e:
        print "error"+str(e)
        pass

def doSomethingWithResult(status):
    if status:
        resfile.write(str(status)+"\n")

q = Queue(concurrent * 2)
for i in range(concurrent):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()

try:
    for url in tqdm(open('list.txt')):
        q.put(url.strip())
        status = open("status.txt", 'w')
        status.write(str(url.strip()))
    q.join()
except KeyboardInterrupt:
    sys.exit(1)
The handles will be automatically garbage collected, but you are better off closing them yourself, especially as you are doing this in a tight loop.
You also asked for suggestions for improvement. A big one would be to stop using urllib2 and start using requests instead.
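As a sketch of what the fetch step might look like with requests (Python 3 syntax here; the fetch name is just illustrative), with a pooled Session and an explicit timeout so no worker stalls forever:
import requests

session = requests.Session()  # pools and reuses TCP connections

def fetch(ourl):
    try:
        response = session.get("http://" + ourl, timeout=10)
        return response.headers.get("Server", ""), response.text
    except requests.RequestException:
        return None
For strict thread-safety you may prefer one Session per worker thread, but either way the connection pool manages the sockets for you.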
There are many possible reasons why your crawling rate drops.
1.) Take care not to crawl too much data from the same domain. Some web servers are configured to allow just one connection per IP address in parallel.
2.) Try sending randomized browser-like HTTP headers (user-agent, referrer, ...) to get past web server scraping protection, if it is set; see the sketch below.
3.) Use a mature parallel HTTP library, like pycurl (has MultiCurl) or requests (grequests). They are almost certainly faster.
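For point 2.), a minimal sketch of randomized browser-like headers with requests (the user-agent strings below are only examples):
import random
import requests

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) Gecko/20100101 Firefox/115.0",
]

def fetch_with_headers(url):
    # pick a different user-agent per request to look less like a bot
    headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept-Language": "en-US,en;q=0.9",
    }
    return requests.get(url, headers=headers, timeout=10)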
I'm using libmproxy to capture HTTP traffic. I would like to use WebDriver to load a web page while the proxy is running. The proxy code works great, but I am unable to launch the page. I assume the script gets stuck in a loop at m.run(). How can I move on to the WebDriver code while the proxy is running?
import unittest
import sys
from libmproxy import proxy, dump, cmdline
from libmproxy.version import VERSION
from optparse import OptionParser
from selenium import webdriver

class Test(unittest.TestCase):

    def setUp(self):
        parser = OptionParser(
            usage="%prog [options] [filter]",
            version="%%prog %s" % VERSION,
        )
        cmdline.common_options(parser)
        parser.add_option(
            "--keepserving",
            action="store_true", dest="keepserving", default=False,
            help="Continue serving after client playback or file read. We exit by default."
        )
        options, args = parser.parse_args()
        if options.quiet:
            options.verbose = 0
        proxyconfig = proxy.process_proxy_options(parser, options)
        if options.no_server:
            server = proxy.DummyServer(proxyconfig)
        else:
            try:
                server = proxy.ProxyServer(proxyconfig, options.port, options.addr)
            except proxy.ProxyServerError, v:
                print >> sys.stderr, "mitmdump:", v.args[0]
                sys.exit(1)
        try:
            dumpopts = dump.Options(**cmdline.get_common_options(options))
        except cmdline.OptionException, v:
            parser.error(v.message)
        dumpopts.keepserving = options.keepserving
        if args:
            filt = " ".join(args)
        else:
            filt = None
        try:
            PROXY_HOST = "localhost"
            PROXY_PORT = 8080
            m = dump.DumpMaster(server, dumpopts, filt)
            m.run()
            # presumably never reached: m.run() blocks
            fp = webdriver.FirefoxProfile()
            # Direct = 0, Manual = 1, PAC = 2, AUTODETECT = 4, SYSTEM = 5
            fp.set_preference("network.proxy.type", 1)
            fp.set_preference("network.proxy.http", PROXY_HOST)
            fp.set_preference("network.proxy.http_port", PROXY_PORT)
            fp.set_preference("network.proxy.no_proxies_on", "")  # set this value as desired
            driver = webdriver.Firefox(firefox_profile=fp)
            driver.get('http://google.com')
        except dump.DumpError, e:
            print >> sys.stderr, "mitmdump:", e
            sys.exit(1)
        except KeyboardInterrupt:
            pass
self.dm = dump.DumpMaster(server, dumpopts, filt)
# run the MITM proxy in a background thread
thread.start_new_thread(self.dm.run, ())
# and you may need to shut down the proxy in tearDown():
self.dm.shutdown()
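On Python 3, or just to avoid the low-level thread module, the same idea with threading.Thread might look like this (dm standing in for the DumpMaster instance built in setUp):
import threading

proxy_thread = threading.Thread(target=dm.run)  # dm is the DumpMaster created in setUp
proxy_thread.daemon = True  # don't block interpreter exit if shutdown is missed
proxy_thread.start()

# ... drive the browser through the proxy here ...

dm.shutdown()        # stop the proxy's run loop
proxy_thread.join()  # wait for the background thread to finish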