I have a website which I'm querying after solving a CAPTCHA.
After solving the CAPTCHA my query downloads a PDF file. My issue is that I cannot get FireFox to download the file automatically to the current working directory without user interaction.
I also cannot figure out how to determine if the file already exists, which would prompt my code to display either a dialog or a message.
Here's my current code, which does everything correctly until the file download popup.
import os
import logging
import argparse
import requests
from time import sleep
from selenium import webdriver
from selenium.common import exceptions
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
logger = logging.getLogger('tst-log-query')
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)-5.5s - %(message)s', "%Y-%m-%d %H:%M:%S")
file_handler = logging.FileHandler(
'tst-log-query.log', 'w', encoding='utf-8')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
mainurl = "https://cndt-certidao.tst.jus.br/inicio.faces"
ckey = "f1a382ddd51949057324a7fc7c8ccf8a"
def solver(captcha):
with requests.Session() as req:
print("[*] - Please wait while CAPTCHA is solved ")
cdata1 = {
"clientKey": ckey,
"task": {
"type": "ImageToTextTask",
"body": captcha
}
}
cdata2 = {
"clientKey": ckey
}
while True:
try:
r = req.post(
'https://api.anti-captcha.com/createTask', json=cdata1)
cdata2['taskId'] = r.json()['taskId']
break
except KeyError:
logger.debug(r.json()["errorDescription"])
continue
while True:
sleep(5)
logger.info("Slept 5 Seconds!")
fr = req.post(
'https://api.anti-captcha.com/getTaskResult', json=cdata2)
status = fr.json()
logger.debug("Status: {}".format(status["status"]))
if status['status'] == "ready":
cap = status['solution']['text']
print("[*] - CAPTCHA Solved!")
return cap
else:
continue
def main(pat):
# saving to current working directory
options = Options()
options.set_preference('browser.download.folderList', 2)
options.set_preference('browser.download.manager.showWhenStarting', False)
options.set_preference('browser.download.dir', os.getcwd())
options.set_preference(
'browser.helperApps.neverAsk.saveToDisk', 'pdf')
#__________________________#
driver = webdriver.Firefox(options=options)
print(f"Checking (CNPJ/CPF)# {pat}")
while True:
try:
driver.get(mainurl)
waiter = WebDriverWait(driver, 60)
waiter.until(
EC.element_to_be_clickable(
(By.CSS_SELECTOR, "input[value=Regularização]"))
).click()
waiter.until(
EC.presence_of_element_located(
(By.CSS_SELECTOR, "#consultarRegularizacaoForm\:cpfCnpj"))
).send_keys(pat)
cap = waiter.until(
EC.presence_of_element_located(
(By.CSS_SELECTOR, "img[src^=data]"))).get_attribute('src').split(',', 1)[1]
break
except exceptions.TimeoutException:
logger.error('[*] - Unable to found elements, Refreshing Request.')
continue
capso = solver(cap)
if capso:
driver.find_element(By.ID, 'idCaptcha').send_keys(capso)
driver.find_element(
By.ID, 'consultarRegularizacaoForm:btnEmitirCertidao').click()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Download PDF File!')
parser.add_argument(
'pattern', metavar="(CNPJ/CPF) Number", help="(CNPJ/CPF) Number", type=str)
try:
main(parser.parse_args().pattern)
except KeyboardInterrupt:
exit("Good Bye!")
Usage: python script.py 15436940000103
options = Options()
options.headless = True
options.set_preference(
"browser.helperApps.neverAsk.saveToDisk", "application/pdf")
options.set_preference("browser.download.folderList", 2)
options.set_preference("browser.download.dir", os.getcwd())
options.set_preference("pdfjs.disabled", True)
driver = webdriver.Firefox(options=options)
Solved using the previous code.
Related
I'm writing a script to access a website using proxies with multiple threads but now I'm stuck in multiple threads, when I run the script below, it opens 5 browsers but all 5 use 1 proxy, I want 5 browsers to use different proxies, can someone help me complete it? thank you
Here is my script :
from selenium import webdriver
from selenium import webdriver
import time , random
import threading
def e():
a = open("sock2.txt", "r")
for line in a.readlines():
b = line
prox = b.split(":")
IP = prox[0]
PORT = int(prox[1].strip("\n"))
print(IP)
print(PORT)
profile = webdriver.FirefoxProfile()
profile.set_preference("network.proxy.type", 1)
profile.set_preference("network.proxy.socks", IP)
profile.set_preference("network.proxy.socks_port", PORT)
try:
driver = webdriver.Firefox(firefox_profile=profile)
driver.get("http://www.whatsmyip.org/")
except:
print("Proxy Connection Error")
driver.quit()
else:
time.sleep(random.randint(40, 70))
driver.quit()
for i in range(5):
t = threading.Thread(target=e)
t.start()
(Wish everyone has a happy and lucky new year)
Dominik Lašo captured it correctly - each threads processes the file from the beginning. Here's probably how it should look like:
from selenium import webdriver
from selenium import webdriver
import time , random
import threading
def e(ip, port):
profile = webdriver.FirefoxProfile()
profile.set_preference("network.proxy.type", 1)
profile.set_preference("network.proxy.socks", IP)
profile.set_preference("network.proxy.socks_port", PORT)
try:
driver = webdriver.Firefox(firefox_profile=profile)
driver.get("http://www.whatsmyip.org/")
except:
print("Proxy Connection Error")
driver.quit()
else:
time.sleep(random.randint(40, 70))
driver.quit()
my_threads = []
with open("sock2.txt", "r") as fd:
for line in fd.readlines():
line = line.strip()
if not line:
continue
prox = line.split(":")
ip = prox[0]
port = int(prox[1])
print('-> {}:{}'.format(ip, port))
t = threading.Thread(target=e, args=(ip, port,))
t.start()
my_threads.append(t)
for t in my_threads:
t.join()
( I personaly think that a problem is there that when you start a program, it will go to new thread, which will go throught the textfile from beginning, becasue you aint deleting them )
I have cane across the same problem, when I was doing the same thing as you do now. I know you would rather want help with your code, but I am in hurry to test it and want to help you ;) , so here is a code that works for me ... There is even task killer for a chrome ( you just have to edit it to firefox )
If I were you, I would start the thread after opening the file, cuz it looks liek you are opening the same file from 1st line everytime the tread starts
links = [ // Link you want to go to ]
def funk(xxx , website):
link = website
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server=%s' % str(xxx))
chromedriver = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'chromedriver')
chrome = webdriver.Chrome(chromedriver, chrome_options=chrome_options)
try :
// Do stuff
except:
print('exception')
chrome.close()
for link in links:
f = open('proxies.txt')
line = f.readline()
x = 1
xx = 0
while line:
if number_of_used_proxies < 10:
print(line)
line = f.readline()
try:
threading.Timer(40, funk, [line, link]).start()
except Exception as e:
print(e)
time.sleep(1)
x += 1
number_of_used_proxies += 1
else:
time.sleep(100)
for x in range(1, 10):
try:
xzxzx = 'os.system("taskkill /f /im chrome.exe")'
os.system("killall 'Google Chrome'")
except:
print("NoMore")
time.sleep(10)
number_of_used_proxies = 0
f.close()
Hope it helps :)
vantuong: Here's how you can solve the problem with ThreadPoolExecutor.
Reference: https://docs.python.org/3/library/concurrent.futures.html
from selenium import webdriver
import time, random
#import threading
import concurrent.futures
MAX_WORKERS = 5
def get_proxys(data_file):
proxys = []
with open(data_file, "r") as fd:
for line in fd.readlines():
line = line.strip()
if not line:
continue
prox = line.split(":")
ip = prox[0]
port = int(prox[1])
proxys.append((ip, port))
return proxys
def e(ip, port):
profile = webdriver.FirefoxProfile()
profile.set_preference("network.proxy.type", 1)
profile.set_preference("network.proxy.socks", IP)
profile.set_preference("network.proxy.socks_port", PORT)
try:
driver = webdriver.Firefox(firefox_profile=profile)
driver.get("http://www.whatsmyip.org/")
except:
print("Proxy Connection Error")
driver.quit()
else:
time.sleep(random.randint(40, 70))
driver.quit()
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
proxys = get_proxys('sock2.txt')
tasks = {executor.submit(e, proxy[0], proxy[1]): proxy for proxy in proxys}
for task in concurrent.futures.as_completed(tasks):
proxy = tasks[task]
try:
data = task.result()
except Exception as exc:
print('{} generated an exception: {}'.format(proxy, exc))
else:
print('{} completed successfully'.format(proxy))
Fun exercise: Try playing around with different values of MAX_WORKERS.
I am using APScheduler to automate my script every 10 minutes for now. It stops executing after the print("[+] Success! Bot Starting!"), it does not output an error. I suggest my declaration of scheduler.add_job(trendingBot, 'interval', minutes=10, args=[url, browser]) is incorrect and I'm not sure how to fix it.
# grabs all the trending quotes for that day
def getTrendingQuotes(browser):
# wait until trending links appear, not really needed only for example
all_trendingQuotes = WebDriverWait(browser, 10).until(
lambda d: d.find_elements_by_css_selector('#trendingQuotes a')
)
return [link.get_attribute('href') for link in all_trendingQuotes]
def getStockDetails(url, browser):
print(url)
browser.get(url)
quote_wrapper = browser.find_element_by_css_selector('div.quote-wrapper')
quote_name = quote_wrapper.find_element_by_class_name(
"quote-name").find_element_by_tag_name('h2').text
quote_price = quote_wrapper.find_element_by_class_name("quote-price").text
quote_volume = quote_wrapper.find_element_by_class_name(
"quote-volume").text
print("\n")
print("Quote Name: " + quote_name)
print("Quote Price: " + quote_price)
print("Quote Volume: " + quote_volume)
print("\n")
convertToJson(quote_name, quote_price, quote_volume, url)
quotesArr = []
# Convert to a JSON file
def convertToJson(quote_name, quote_price, quote_volume, url):
quoteObject = {
"url": url,
"Name": quote_name,
"Price": quote_price,
"Volume": quote_volume
}
quotesArr.append(quoteObject)
def trendingBot(url, browser):
browser.get(url)
trending = getTrendingQuotes(browser)
for trend in trending:
getStockDetails(trend, browser)
# requests finished, write json to file
with open('trendingQuoteData.json', 'w') as outfile:
json.dump(quotesArr, outfile)
def Main():
scheduler = BlockingScheduler()
chrome_options = Options()
chrome_options.add_argument("--headless")
# applicable to windows os only
chrome_options.add_argument('--disable-gpu')
url = 'https://www.tmxmoney.com/en/index.html'
browser = webdriver.Chrome(
r"C:\Users\austi\OneDrive\Desktop\chromeDriver\chromedriver_win32\chromedriver.exe", chrome_options=chrome_options)
browser.get(url)
os.system('cls')
print("[+] Success! Bot Starting!")
scheduler.add_job(trendingBot, 'interval', minutes=1, args=[url, browser])
scheduler.start()
#trendingBot(url, browser)
browser.quit()
if __name__ == "__main__":
Main()
APScheduler is non-blocking by default, which means that when you do scheduler.add_job(.. and scheduler.start() your application continues to run to browser.quit() and finally to the end of your application (where it exits).
You seem to want a blocking scheduler such as BlockingScheduler.
Change the line:
scheduler = BackgroundScheduler()
to
scheduler = BlockingScheduler()
I am attempting to use chrome with a user pass proxy this is my code:
def getProxy(string_only=True):
try:
proxy_manager = ProxyManager("proxies.txt")
proxy = proxy_manager.random_proxy()
if string_only:
return proxy.proxy_string
return proxy.get_dict()
except (OSError, IOError, IndexError) as e: # couldn't load the file / file is empty
return None
chrome_options = webdriver.ChromeOptions()
proxy = getProxy()
userpass = proxy[0:proxy.find("#")]
hostport = proxy[proxy.find("#")+1:]
if proxy:
chrome_options.add_argument("--proxy-server=" + hostport)
driver1 = webdriver.Chrome(chrome_options=chrome_options)
wait = WebDriverWait(driver1, 1000000)
if manual == "No":
driver1.get("http://%s#www.google.com"%userpass)
Help is appreciated
So I found this script online, which is meant to brute-force web-forms online etc. using Selenium, and I thought it would be a good idea to take it, modify it a bit and experiment with it. This time, I tried creating a bot that:
Signs up to Twitter
Goes to twitter.com
Posts something.
Logs out.
Loop 1, 2, 3, 4 again
However, when I run the script, it just pops up a browser window and does nothing. Then the Terminal ends the Python script like it did its work correctly and finished with no problems...
Code (note that the script might look weird for what I want, but that's because I found the script as a web-form brute-forcer online, and decided to modify it to my needs):
#!/bin/python
from mainLib import *
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import simplejson as json
import sys
import optparse
profile = webdriver.FirefoxProfile()
profile.set_preference("general.useragent.override", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36")
driver = "reserved"
def userExists(username):
try:
driver.get("https://twitter.com/"+username)
assert (("??" or "?Twitter / ?") not in driver.title)
except AssertionError:
print '\033[1;31mUser\033[1;m',
print '\033[1;35m#%s\033[1;m' %username,
print '\033[1;31mdoes not exist.\033[1;m',
print '\033[1;33mTrying with the next Username...\033[1;m'
print ' '
return 1
except:
'uknown error'
def login(user, password, delay):
try:
print '\033[1;33mCreating account with mail: \033[1;m' + '\033[1;35m' + password + '\033[1;m' '\033[1;33m ...\033[1;m'
sleep(2)
if driver.current_url == 'https://twitter.com':
print '\033[1;33mPlease retry using a different IP Address (Proxy/VPN).\033[1;m'
driver.get("https://twitter.com/signup")
if driver.title == "Login on Twitter":
driver.get("https://twitter.com/signup")
sleep(3)
elem = driver.find_element_by_id('full-name')
elem.clear()
elem.send_keys('john')
elem = driver.find_element_by_id('email')
elem.clear()
elem.send_keys(password)
elem = driver.find_element_by_id('password')
elem.clear()
elem.send_keys("twitter1")
sleep(3)
elem.send_keys(Keys.RETURN)
sleep(delay + 3)
if driver.title == "Twitter / Error":
print ' \033[1;31mFailed!\033[1;m'
driver.get("https://twitter.com/signup")
sleep(3)
if driver.title == "Login to Twitter":
print ' \033[1;31mFailed!\033[1;m'
driver.get("https://twitter.com/signup")
sleep(3)
# if "This email is already registered." in driver.page_source:
# print ' \033[1;31mFailed!\033[1;m'
if driver.current_url == 'https://twitter.com/account/access':
print ' \033[1;31mFailed!\033[1;m'
print ("")
print '\033[1;33mPlease retry using a different IP Address (Proxy/VPN).\033[1;m'
driver.close()
sys.exit("")
assert (("Enter your phone") not in driver.title)
except AssertionError:
print ' \033[1;32mSuccess!\033[1;m'
# print '\033[1;35mEmail: \033[1;m' + password
# print '\033[1;35mPassword: \033[1;m' + "twitter1"
# print("")
try:
f = open('CreatedAccounts.txt','a')
except:
f = open('CreatedAccounts.txt','w')
f.write(password+'\n')
f.close()
driver.get("https://twitter.com")
elem = driver.find_element_by_id('tweet-box-home-timeline')
elem.clear()
elem.send_keys('It worked!')
elem = driver.find_element_by_xpath('//*[#id="timeline"]/div[2]/div/form/div[2]/div[2]/button')
elem.send_keys(Keys.RETURN)
time.sleep(5)
driver.get("https://twitter.com/logout")
sleep(5)
elem = driver.find_element_by_css_selector("button.js-submit").click()
sleep(5)
driver.get("https://twitter.com/signup")
# driver.delete_all_cookies()
# return 1
# else:
# print '\033[1;33mPlease check your Internet Connection.\033[1;m'
def dictionaryAttack(usernames,passwords,delay):
if str(type(usernames)) == "<type 'list'>":
for username in usernames:
#if (userExists(username) == 1):
# continue
driver.get("https://twitter.com/signup")
sleep(delay)
print("Creating Accounts...")
print("")
for password in passwords:
if (login(username,password,delay) == 1):
cj.clear()
break
def main():
parser = optparse.OptionParser()
parser.add_option('-f', '--file', action="store", dest="userfile", help="File containing valid usernames (one per line)", default=False)
parser.add_option('-d', '--dictionary', action="store", dest="dictionary", help="Text file containing passwords", default=False)
parser.add_option('-u', '--username', action="store", dest="username", help="A valid username", default=False)
parser.add_option('-t', '--time', action="store", dest="delay", help="Delay (in seconds) - use this option based on your Network Connection speed.", default=True)
options, args = parser.parse_args()
global driver
if (options.delay is None):
delay = 4
else:
delay = int(options.delay)
print '\033[1;33mUsing\033[1;m',
print '\033[1;35m%d second(s)\033[1;m' %delay,
print '\033[1;33mof delay between login attempts.\033[1;m'
print ' '
if ( (options.userfile == False) and (options.username == False) ) :
print 'You have to set an username or a userfile'
exit()
if ( (options.userfile != False) and (options.username != False) ) :
print 'You can\'t set both options at once.. choose between username or userfile'
exit()
if (options.dictionary == False):
print 'You have to set a valid path for the passwords dictionary.'
exit()
try:
f = open(options.dictionary,'r')
passwords = []
while True:
line = f.readline()
if not line:
break
passwords.append(line.strip('\n'))
f.close()
except:
print 'Check the path to the dictionary and try again.'
exit()
if (options.userfile != False):
try:
f = open(options.userfile,'r')
usernames = []
while True:
line = f.readline()
if not line:
break
usernames.append(line.strip('\n'))
f.close()
except:
print 'Check the path to the users file and try again.'
exit()
driver = webdriver.Firefox(profile)
driver.implicitly_wait(30)
dictionaryAttack(usernames,passwords,delay)
else:
driver = webdriver.Firefox(profile)
driver.implicitly_wait(30)
dictionaryAttack(options.username,passwords,delay)
driver.close()
if __name__ == '__main__':
main()
See the link below. That could be the problem.
https://github.com/SeleniumHQ/selenium/issues/2257
import time
import unittest
from selenium import webdriver
#from selenium.webdriver.common.keys import Keys
#from setuptools.py31compat import unittest_main
username = "robertredrain#gmail.com"
password = ""
tomailid = "robertredrain#yahoo.com"
emailsubject = "robertredrain#yahoo.com"
mailbody = "Great! you sent email:-)" + "\n" + "Regards," + "\n" + "Robert"
class send_email(unittest.TestCase):
def setUp(self):
self.driver = webdriver.Firefox()
self.baseUrl = "http://mail.google.com/intl/en/mail/help/about.html"
def test_Login_Email(self):
driver = self.driver
driver.get(self.baseUrl)
driver.maximize_window()
driver.find_element_by_id("gmail-sign-in").click()
driver.find_element_by_id("Email").clear()
driver.find_element_by_id("Email").send_keys(username)
driver.find_element_by_id("next").click()
time.sleep(5)
driver.find_element_by_id("Passwd").clear()
driver.find_element_by_id("Passwd").send_keys(password)
driver.find_element_by_id("signIn").click();
#Verify login
if "Gmail" in driver.title:
print("Logged in sucessfully !!!" + driver.title)
else:
print("Unable to loggin :-( " + driver.title)
time.sleep(5)
def test_Compose_Email(self):
time.sleep(5)
driver = self.driver
driver.find_element_by_xpath("/html/body/div[7]/div[3]/div/div[2]/div[1]/div[1]/div[1]/div[2]/div/div/div[1]/div/div").click()
#time.sleep(5)
driver.find_element_by_class_name("vO").send_keys(tomailid)
driver.find_element_by_class_name("aoT").send_keys(emailsubject)
driver.find_element_by_class_name("Am").clear()
driver.find_element_by_class_name("Am").send_keys(mailbody)
driver.find_element_by_xpath("//div[text()='Send']").click()
def tearDown(self):
self.driver.close();
if __name__ == '__main__':
unittest.main()
I try to do "COMPOSE email", but got the error "raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.NoSuchElementException: Message: Unable to locate element: {"method":"xpath","selector":"/html/body/div[7]/div[3]/div/div[2]/div[1]/div[1]/div[1]/div[2]/div/div/div[1]/div/div"}"
Could someone help? Thanks a lot!
Make it simple and locate the "Compose" button by text:
//div[. = "COMPOSE"]