APScheduler and passing arguments - Python

I am using APScheduler to automate my script every 10 minutes for now. It stops executing after the print("[+] Success! Bot Starting!") and does not output an error. I suspect my declaration of scheduler.add_job(trendingBot, 'interval', minutes=10, args=[url, browser]) is incorrect, and I'm not sure how to fix it.
import json
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from apscheduler.schedulers.blocking import BlockingScheduler

# grabs all the trending quotes for that day
def getTrendingQuotes(browser):
    # wait until trending links appear, not really needed only for example
    all_trendingQuotes = WebDriverWait(browser, 10).until(
        lambda d: d.find_elements_by_css_selector('#trendingQuotes a')
    )
    return [link.get_attribute('href') for link in all_trendingQuotes]

def getStockDetails(url, browser):
    print(url)
    browser.get(url)
    quote_wrapper = browser.find_element_by_css_selector('div.quote-wrapper')
    quote_name = quote_wrapper.find_element_by_class_name(
        "quote-name").find_element_by_tag_name('h2').text
    quote_price = quote_wrapper.find_element_by_class_name("quote-price").text
    quote_volume = quote_wrapper.find_element_by_class_name(
        "quote-volume").text
    print("\n")
    print("Quote Name: " + quote_name)
    print("Quote Price: " + quote_price)
    print("Quote Volume: " + quote_volume)
    print("\n")
    convertToJson(quote_name, quote_price, quote_volume, url)

quotesArr = []

# Convert to a JSON file
def convertToJson(quote_name, quote_price, quote_volume, url):
    quoteObject = {
        "url": url,
        "Name": quote_name,
        "Price": quote_price,
        "Volume": quote_volume
    }
    quotesArr.append(quoteObject)

def trendingBot(url, browser):
    browser.get(url)
    trending = getTrendingQuotes(browser)
    for trend in trending:
        getStockDetails(trend, browser)
    # requests finished, write json to file
    with open('trendingQuoteData.json', 'w') as outfile:
        json.dump(quotesArr, outfile)

def Main():
    scheduler = BlockingScheduler()
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    # applicable to windows os only
    chrome_options.add_argument('--disable-gpu')
    url = 'https://www.tmxmoney.com/en/index.html'
    browser = webdriver.Chrome(
        r"C:\Users\austi\OneDrive\Desktop\chromeDriver\chromedriver_win32\chromedriver.exe",
        chrome_options=chrome_options)
    browser.get(url)
    os.system('cls')
    print("[+] Success! Bot Starting!")
    scheduler.add_job(trendingBot, 'interval', minutes=1, args=[url, browser])
    scheduler.start()
    #trendingBot(url, browser)
    browser.quit()

if __name__ == "__main__":
    Main()

APScheduler's BackgroundScheduler is non-blocking, which means that after scheduler.add_job(...) and scheduler.start() your application just keeps running, reaches browser.quit(), and finally falls off the end of the script (where it exits).
You seem to want a blocking scheduler such as BlockingScheduler.
Change the line:
scheduler = BackgroundScheduler()
to
scheduler = BlockingScheduler()
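For illustration, a minimal sketch of the difference between the two schedulers (assuming APScheduler 3.x; the job body is a stand-in for trendingBot):
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.schedulers.blocking import BlockingScheduler

def job():
    print("job ran")

# BackgroundScheduler: start() returns immediately, so execution falls
# straight through to the next line (in the question's code, browser.quit()).
background = BackgroundScheduler()
background.add_job(job, 'interval', seconds=5)
background.start()

# BlockingScheduler: start() blocks and runs jobs until shutdown, so nothing
# after this line executes while the scheduler is alive.
blocking = BlockingScheduler()
blocking.add_job(job, 'interval', seconds=5)
blocking.start()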

Related

Selenium not finding information (worked before)

I have this script to get the bids on an NFT as they come in. It worked perfectly before, but all of a sudden it stopped being able to find the information I need.
It used to save the information into a file and then post a message to Discord,
but now it can't get that far because it can't retrieve the information.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import asyncio, calendar, discord
import time
from discord.ext import commands

TOKEN = "token here"
url = "https://auction.chimpers.xyz/"
class_names = ["m-bid-amount", "m-bid-bidder", "m-bid-timestamp",
               "m-countdown-pending", "m-name", "img"]
bot = commands.Bot(command_prefix="!")
channel_id = channel_id here

async def message_on_bid(auction_status="New Bid Placed", bid="has received a bid of", bought="from"):
    await bot.wait_until_ready()
    channel = bot.get_channel(channel_id)
    with open("CurrentTopBidder.txt", "r") as f:
        info = [line.rstrip() for line in f.readlines()]
    if len(info[1]) > 10:
        info[1] = info[1][:-(len(info[1]) - 4):] + "..." + info[1][len(info[1]) - 4::]
    myEmbed = discord.Embed(title=auction_status, url="https://auction.chimpers.xyz/",
                            description=f"{info[4]} {bid} Ξ{info[0][:-4]} {bought} {info[1]}",
                            color=0x202020)
    myEmbed.set_footer(text=f"{info[2]}")
    myEmbed.set_thumbnail(url=info[5])
    await channel.send(embed=myEmbed)

async def main():
    start_time = time.time()
    while True:
        if time.time() - start_time > 60:
            await message_on_bid(auction_status="Auction Closed", bid="has been bought for", bought="by")
            print("auction closed")
            with open("CurrentTopBidder.txt", "w") as f:
                f.write("0 ETH\nOwner\nTime\nAuctionStatus\nName\nimage")
            break
        driver_options = Options()
        driver_options.add_argument("--headless")
        driver = webdriver.Chrome("Drivers\Chromedriver.exe")
        driver.get(url)
        results = {}
        try:
            for class_name in class_names:
                if class_name != "img":
                    element = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, class_name)))
                    # Getting inner text of the html tag
                    results[class_name] = element.get_attribute("textContent")
                else:
                    element = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.TAG_NAME, class_name)))
                    results[class_name] = element.get_attribute("src")
        finally:
            driver.quit()
        print(results)
        this_list = []
        f = open("CurrentTopBidder.txt", "r")
        for line in f.readlines():
            this_list.append(line.rstrip())
        if results["m-name"] == "Unnamed":
            results["m-name"] = this_list[4]
        if float(results["m-bid-amount"][:-4:]) > float(this_list[0][:-4:]):
            if results["m-countdown-pending"] == " Auction ended ":
                status = "AuctionEnded"
            else:
                status = "CurrentTopBidder"
            f = open("CurrentTopBidder.txt", "w")
            f.write(str(results["m-bid-amount"]) + "\n" + str(results["m-bid-bidder"] + "\n" + str(results["m-bid-timestamp"]) + "\n" + str(status) + "\n" + str(results["m-name"]) + "\n" + results["img"]))
            f.close()
            await message_on_bid()
        else:
            f.close()
        driver.quit()
        await asyncio.sleep(600)

if __name__ == "__main__":
    bot.loop.create_task(main())
    bot.run(TOKEN)
Any reason why this stopped being able to find the information?
error here
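A hedged debugging sketch: when locators that used to work stop matching, the usual cause is that the site changed its markup or moved the content into an iframe, so the first thing to check is whether the old class names still appear in the rendered page. The driver path and class names below are the ones from the question:
from selenium import webdriver

driver = webdriver.Chrome("Drivers\\Chromedriver.exe")  # path from the question
driver.get("https://auction.chimpers.xyz/")
html = driver.page_source  # the page as rendered, after JavaScript has run
for name in ["m-bid-amount", "m-bid-bidder", "m-bid-timestamp", "m-name"]:
    print(name, "present in page source:", name in html)
driver.quit()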

Selenium save file to current working directory

I have a website which I'm querying after solving a CAPTCHA.
After solving the CAPTCHA my query downloads a PDF file. My issue is that I cannot get Firefox to download the file automatically to the current working directory without user interaction.
I also cannot figure out how to determine if the file already exists, which would prompt my code to display either a dialog or a message.
Here's my current code, which does everything correctly until the file download popup.
import os
import logging
import argparse
import requests
from time import sleep
from selenium import webdriver
from selenium.common import exceptions
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

logger = logging.getLogger('tst-log-query')
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)-5.5s - %(message)s', "%Y-%m-%d %H:%M:%S")
file_handler = logging.FileHandler(
    'tst-log-query.log', 'w', encoding='utf-8')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

mainurl = "https://cndt-certidao.tst.jus.br/inicio.faces"
ckey = "f1a382ddd51949057324a7fc7c8ccf8a"

def solver(captcha):
    with requests.Session() as req:
        print("[*] - Please wait while CAPTCHA is solved ")
        cdata1 = {
            "clientKey": ckey,
            "task": {
                "type": "ImageToTextTask",
                "body": captcha
            }
        }
        cdata2 = {
            "clientKey": ckey
        }
        while True:
            try:
                r = req.post(
                    'https://api.anti-captcha.com/createTask', json=cdata1)
                cdata2['taskId'] = r.json()['taskId']
                break
            except KeyError:
                logger.debug(r.json()["errorDescription"])
                continue
        while True:
            sleep(5)
            logger.info("Slept 5 Seconds!")
            fr = req.post(
                'https://api.anti-captcha.com/getTaskResult', json=cdata2)
            status = fr.json()
            logger.debug("Status: {}".format(status["status"]))
            if status['status'] == "ready":
                cap = status['solution']['text']
                print("[*] - CAPTCHA Solved!")
                return cap
            else:
                continue

def main(pat):
    # saving to current working directory
    options = Options()
    options.set_preference('browser.download.folderList', 2)
    options.set_preference('browser.download.manager.showWhenStarting', False)
    options.set_preference('browser.download.dir', os.getcwd())
    options.set_preference(
        'browser.helperApps.neverAsk.saveToDisk', 'pdf')
    #__________________________#
    driver = webdriver.Firefox(options=options)
    print(f"Checking (CNPJ/CPF)# {pat}")
    while True:
        try:
            driver.get(mainurl)
            waiter = WebDriverWait(driver, 60)
            waiter.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "input[value=Regularização]"))
            ).click()
            waiter.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#consultarRegularizacaoForm\:cpfCnpj"))
            ).send_keys(pat)
            cap = waiter.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "img[src^=data]"))
            ).get_attribute('src').split(',', 1)[1]
            break
        except exceptions.TimeoutException:
            logger.error('[*] - Unable to find elements, Refreshing Request.')
            continue
    capso = solver(cap)
    if capso:
        driver.find_element(By.ID, 'idCaptcha').send_keys(capso)
        driver.find_element(
            By.ID, 'consultarRegularizacaoForm:btnEmitirCertidao').click()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Download PDF File!')
    parser.add_argument(
        'pattern', metavar="(CNPJ/CPF) Number", help="(CNPJ/CPF) Number", type=str)
    try:
        main(parser.parse_args().pattern)
    except KeyboardInterrupt:
        exit("Good Bye!")
Usage: python script.py 15436940000103
options = Options()
options.headless = True
options.set_preference(
    "browser.helperApps.neverAsk.saveToDisk", "application/pdf")
options.set_preference("browser.download.folderList", 2)
options.set_preference("browser.download.dir", os.getcwd())
options.set_preference("pdfjs.disabled", True)
driver = webdriver.Firefox(options=options)
Solved using the previous code.
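The question also asks how to tell whether the file already exists before downloading; a minimal sketch with os.path (the PDF file name here is hypothetical, it depends on what the site serves):
import os

pdf_path = os.path.join(os.getcwd(), "certidao.pdf")  # hypothetical file name
if os.path.exists(pdf_path):
    print("[*] - File already exists, prompt the user instead of downloading.")
else:
    print("[*] - File not found, proceeding with download.")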

Selenium wait a lot while searching for element

I am working on software that finds email addresses in the source code of websites.
But sometimes the source of a website is very long, so the search takes a long time.
How can I set a time limit for this, and have it switch to the next website URL once the time expires?
for query in my_list:
    results.append(search(query, tld="com", num=3, stop=3, pause=2))
for result in results:
    url = list(result)
    print(*url, sep='\n')
    for site in url:
        driver = webdriver.Chrome()
        driver.get(site)
        doc = driver.page_source
        emails = re.findall(r'[\w\.-]+@[\w\.-]+', doc)
        for email in emails:
            print(email)
results = []
start_time = time.time()
for query in my_list:
    results.append(search(query, tld="com", num=3, stop=3, pause=2))
for result in results:
    url = list(result)
    print(*url, sep='\n')
    for site in url:
        driver = webdriver.Chrome()
        driver.get(site)
        doc = driver.page_source
        emails = re.findall(r'[\w\.-]+@[\w\.-]+', doc)
        for email in emails:
            print(email)
        if time.time() - start_time > 10:
            # if 10 seconds pass do something
            start_time = time.time()
        time.sleep(3)
        driver.close()
import time

start_time = time.time()
# your code
while True:
    if time.time() - start_time > 2:
        # if 2 seconds pass do something
        start_time = time.time()
        print("2 seconds passed")
You can wait a bit by adding a delay with the Python time module, as follows:
import time

for site in url:
    driver = webdriver.Chrome()
    driver.get(site)
    time.sleep(8)
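An option the snippets above don't show is Selenium's own page-load timeout: set_page_load_timeout() makes driver.get() raise a TimeoutException once the limit passes, so you can skip to the next URL. A sketch, assuming url is the list of result links from the question:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException

driver = webdriver.Chrome()
driver.set_page_load_timeout(10)  # give each page at most 10 seconds to load
for site in url:
    try:
        driver.get(site)
        doc = driver.page_source
    except TimeoutException:
        print(site, "took too long to load, skipping")
        continue
driver.quit()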

Python process starts but does not actually run the target function

I'm using multiprocessing to test proxy server usability. The target function of my process takes a proxy server address and a queue as arguments and opens an instance of webdriver with the given proxy. The function tests the proxy by going to a specific url and trying to retrieve an html element. If the test is successful the function will add the webdriver instance to the queue. The function is shown below.
def test_proxy(address, queue):
    print(f"testing proxy {address}")
    chrome_options_1 = webdriver.ChromeOptions()
    chrome_options_1.add_argument('--proxy-server=%s' % address)
    chrome_options_1.add_argument("headless")
    driver = webdriver.Chrome('.\driver\chromedriver.exe', options=chrome_options_1)
    driver.set_page_load_timeout(10)
    url = "https://www.facebook.com/marketplace/nyc/search/?query=honda"
    try:
        driver.get(url)
        driver.find_element_by_xpath("//*[@class='kbiprv82']/a").get_attribute("href")
        print(f"Successfully connected to proxy server at {address}")
        queue.put(driver)
        return
    except:
        print("Connection failed")
        driver.quit()
In my main process I have a list of proxy addresses to test. A process is created to test each proxy in the list until a test is successful and a driver instance is put in the queue. If an item is found in the queue all the processes are terminated and the proxy list is cleared. The loop in my main process limits the number of child processes to 10. The main process code is in a class and shown below.
def find_proxy(self):
    self.proxies = []
    self.proxy_queue = multiprocessing.Queue()
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("headless")
    driver = webdriver.Chrome('.\driver\chromedriver.exe', options=chrome_options)
    driver.get("https://free-proxy-list.net/")
    Select(driver.find_element_by_xpath("//*[@id='proxylisttable']/tfoot/tr/th[7]/select")).select_by_visible_text("yes")
    for country in ["US", "MX", "CA", "CL", "CO", "BR", "PE"]:
        try:
            Select(driver.find_element_by_xpath("//*[@id='proxylisttable']/tfoot/tr/th[3]/select")).select_by_visible_text(country)
            i = 0
            entries = driver.find_elements_by_xpath("//table[@id='proxylisttable']/tbody/tr/td")
            for entry in entries:
                if i == 7:
                    i = 0
                    self.proxies.append([proxy_address])
                else:
                    if i == 0:
                        proxy_address = entry.text + ':'
                    if i == 1:
                        proxy_address = proxy_address + entry.text
                    i = i + 1
        except:
            pass
    driver.quit()
    while len(self.proxies) > 0:
        i = 0
        for proxy in self.proxies[:10]:
            if self.proxy_queue.empty() == False:
                driver = self.proxy_queue.get()
                for proxy_1 in self.proxies:
                    try:
                        proxy_1[1].terminate()
                    except:
                        pass
                self.proxies.clear()
                return driver
            elif len(proxy) < 2:
                proxy.insert(1, multiprocessing.Process(target=test_proxy, args=(proxy[0], self.proxy_queue,)))
                print(f"proxy thread {proxy[0]} created")
                proxy[1].start()
                print(f"proxy thread {proxy[0]} started")
            elif proxy[1].is_alive() == False:
                print(f"proxy thread {proxy[0]} dead")
                del self.proxies[i]
                print("proxy deleted")
                break
            i = i + 1
The issue is that the processes seem to start just fine but none of the code in the test_proxy function is actually run, not even the first print statement.
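One thing worth checking (an assumption, since the surrounding script isn't shown): on Windows, multiprocessing uses the spawn start method, which re-imports the main module and pickles the process arguments, so process creation must sit behind an if __name__ == "__main__": guard and the arguments must be picklable; otherwise a child can die before the target function ever runs. A stripped-down sketch of the pattern:
import multiprocessing

def test_proxy(address, queue):
    # stand-in for the real function: just echo the address back
    print(f"testing proxy {address}")
    queue.put(address)

if __name__ == "__main__":
    queue = multiprocessing.Queue()
    p = multiprocessing.Process(target=test_proxy, args=("127.0.0.1:8080", queue))
    p.start()
    print(queue.get())  # blocks until the child has actually run and put a result
    p.join()            # a nonzero p.exitcode here would mean the child crashed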

Selenium Threads: how to run multi-threaded browser with proxy ( python)

I'm writing a script to access a website through proxies with multiple threads, but I'm stuck on the multithreading part: when I run the script below, it opens 5 browsers, but all 5 use the same proxy. I want the 5 browsers to use different proxies. Can someone help me complete it? Thank you.
Here is my script :
from selenium import webdriver
import time, random
import threading

def e():
    a = open("sock2.txt", "r")
    for line in a.readlines():
        b = line
        prox = b.split(":")
        IP = prox[0]
        PORT = int(prox[1].strip("\n"))
        print(IP)
        print(PORT)
        profile = webdriver.FirefoxProfile()
        profile.set_preference("network.proxy.type", 1)
        profile.set_preference("network.proxy.socks", IP)
        profile.set_preference("network.proxy.socks_port", PORT)
        try:
            driver = webdriver.Firefox(firefox_profile=profile)
            driver.get("http://www.whatsmyip.org/")
        except:
            print("Proxy Connection Error")
            driver.quit()
        else:
            time.sleep(random.randint(40, 70))
            driver.quit()

for i in range(5):
    t = threading.Thread(target=e)
    t.start()
(Wishing everyone a happy and lucky New Year)
Dominik Lašo captured it correctly - each thread processes the file from the beginning. Here's probably how it should look:
from selenium import webdriver
import time, random
import threading

def e(ip, port):
    profile = webdriver.FirefoxProfile()
    profile.set_preference("network.proxy.type", 1)
    profile.set_preference("network.proxy.socks", ip)
    profile.set_preference("network.proxy.socks_port", port)
    try:
        driver = webdriver.Firefox(firefox_profile=profile)
        driver.get("http://www.whatsmyip.org/")
    except:
        print("Proxy Connection Error")
        driver.quit()
    else:
        time.sleep(random.randint(40, 70))
        driver.quit()

my_threads = []
with open("sock2.txt", "r") as fd:
    for line in fd.readlines():
        line = line.strip()
        if not line:
            continue
        prox = line.split(":")
        ip = prox[0]
        port = int(prox[1])
        print('-> {}:{}'.format(ip, port))
        t = threading.Thread(target=e, args=(ip, port,))
        t.start()
        my_threads.append(t)

for t in my_threads:
    t.join()
(I personally think the problem is that when you start the program, each new thread goes through the text file from the beginning, because you aren't removing the lines you've already used.)
I have come across the same problem when doing the same thing you're doing now. I know you'd rather get help with your own code, but I'm in a hurry to test this and want to help you ;) so here is code that works for me ... There is even a task killer for Chrome (you just have to adapt it to Firefox).
If I were you, I would start the thread after opening the file, because it looks like you are reading the same file from the first line every time a thread starts:
import os
import time
import threading
from selenium import webdriver

links = []  # links you want to go to

def funk(xxx, website):
    link = website
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--proxy-server=%s' % str(xxx))
    chromedriver = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'chromedriver')
    chrome = webdriver.Chrome(chromedriver, chrome_options=chrome_options)
    try:
        pass  # do stuff
    except:
        print('exception')
    chrome.close()

number_of_used_proxies = 0
for link in links:
    f = open('proxies.txt')
    line = f.readline()
    x = 1
    while line:
        if number_of_used_proxies < 10:
            print(line)
            line = f.readline()
            try:
                threading.Timer(40, funk, [line, link]).start()
            except Exception as e:
                print(e)
            time.sleep(1)
            x += 1
            number_of_used_proxies += 1
        else:
            time.sleep(100)
            for x in range(1, 10):
                try:
                    # on Windows: os.system("taskkill /f /im chrome.exe")
                    os.system("killall 'Google Chrome'")
                except:
                    print("NoMore")
            time.sleep(10)
            number_of_used_proxies = 0
    f.close()
Hope it helps :)
vantuong: Here's how you can solve the problem with ThreadPoolExecutor.
Reference: https://docs.python.org/3/library/concurrent.futures.html
from selenium import webdriver
import time, random
#import threading
import concurrent.futures

MAX_WORKERS = 5

def get_proxys(data_file):
    proxys = []
    with open(data_file, "r") as fd:
        for line in fd.readlines():
            line = line.strip()
            if not line:
                continue
            prox = line.split(":")
            ip = prox[0]
            port = int(prox[1])
            proxys.append((ip, port))
    return proxys

def e(ip, port):
    profile = webdriver.FirefoxProfile()
    profile.set_preference("network.proxy.type", 1)
    profile.set_preference("network.proxy.socks", ip)
    profile.set_preference("network.proxy.socks_port", port)
    try:
        driver = webdriver.Firefox(firefox_profile=profile)
        driver.get("http://www.whatsmyip.org/")
    except:
        print("Proxy Connection Error")
        driver.quit()
    else:
        time.sleep(random.randint(40, 70))
        driver.quit()

with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    proxys = get_proxys('sock2.txt')
    tasks = {executor.submit(e, proxy[0], proxy[1]): proxy for proxy in proxys}
    for task in concurrent.futures.as_completed(tasks):
        proxy = tasks[task]
        try:
            data = task.result()
        except Exception as exc:
            print('{} generated an exception: {}'.format(proxy, exc))
        else:
            print('{} completed successfully'.format(proxy))
Fun exercise: Try playing around with different values of MAX_WORKERS.
