Multi-Threading with selenium and Python Background processing - python

I am currently trying to log in to a website automatically and fill in a form.
I'm using selenium in python and am trying to multi thread each username in a different driver.
The issue is that the drivers do open in parallel in the background, yet they do not seem to process the data unless they are brought to the foreground, which delays the process a lot: I have to wait for one to finish before the next one proceeds.
for confidentiality reasons I cannot share the website URL yet here is the code and functions used.
import undetected_chromedriver as uc
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException,ElementClickInterceptedException,TimeoutException
from threading import Thread
import time
import pandas as pd
Chrome options:-
# Chrome options shared by every worker driver (headless background runs).
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('--enable-javascript')
chrome_options.add_argument('--disable-gpu')
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15'
# BUG FIX: Chrome takes the switch as --user-agent=...; the previous
# 'User-Agent=...' form is not a recognized switch and was silently ignored.
chrome_options.add_argument('--user-agent={0}'.format(user_agent))
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', True)
Functions used:-
def login_hbd(driver, username, password):
    """Fill in the login form and submit it with a JavaScript click."""
    # Wait for the form to render before touching any field.
    WebDriverWait(driver, 20).until(
        EC.visibility_of_element_located((By.NAME, "username"))
    )
    driver.find_element(By.NAME, "username").send_keys(username)
    driver.find_element(By.NAME, "password").send_keys(password)
    # JS click avoids ElementClickInterceptedException from overlays.
    submit = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.ID, "submitBtn"))
    )
    driver.execute_script("arguments[0].click();", submit)
def get_ss_hbd(driver, username_list, password_list):
    """Log in as one user, screenshot the target page, and shut the driver down.

    Retries the whole login flow on common transient Selenium errors.
    """
    try:
        while True:
            try:
                login_hbd(driver, str(username_list), password_list)
                WebDriverWait(driver, 1).until(
                    EC.visibility_of_element_located(
                        (By.XPATH, "/html/body/div[1]/div/div/header/div/div[3]/nav/ul/li[7]/a")
                    )
                )
                driver.get("*****")
                driver.save_screenshot(f"{username_list}.png")
                break
            except (NoSuchElementException, ElementClickInterceptedException, TimeoutException):
                # Transient failure: pause briefly so the retry loop does
                # not spin at 100% CPU.
                time.sleep(0.5)
    finally:
        # BUG FIX: close() only closes the window and leaves the chromedriver
        # process running; quit() tears the whole session down, and the
        # finally block guarantees cleanup even on an unexpected exception.
        driver.quit()
Threading code:-
# get the start time
st = time.time()

def _worker(username, password):
    """Create the driver *inside* the thread so browser startup and the
    initial page load overlap across workers instead of serializing in
    the main thread (the original created and .get()'d each driver
    before Thread.start(), which is why runs appeared sequential)."""
    driver = uc.Chrome(options=chrome_options, service_args=['--quiet'])
    driver.get("****")
    get_ss_hbd(driver, username, password)

threads = []
# itertuples avoids positional df.loc[i][0] lookups and works with any index.
for row in df.itertuples(index=False):
    t = Thread(target=_worker, args=(row[0], row[1]))
    t.start()
    threads.append(t)
for t in threads:
    t.join()

et = time.time()
# get the execution time
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

After referencing to this issue https://github.com/ultrafunkamsterdam/undetected-chromedriver/issues/1051
and running the following code to fix the issues with undetected_chromedriver
code:
python -m pip uninstall undetected-chromedriver
python -m pip install git+https://github.com/ultrafunkamsterdam/undetected-chromedriver#fix-multiple-instance
python -m pip install --upgrade selenium

Related

Selenium Scrape not reading all elements

I am trying to scrape data from the following site. I was able to click on load more yet the code doesn't catch most of the elements and I do not really know what to do.
url = 'https://www.carrefouregypt.com/mafegy/en/c/FEGY1701230'
products = []
options = Options()
driver = webdriver.Chrome(options=options)
driver.get(url)
time.sleep(8)
# Click "load more" until the button disappears from the page.
while True:
    try:
        btn = driver.find_element(By.CLASS_NAME, 'css-1n3fqy0')
        btn.click()
        # BUG FIX: implicitly_wait() only configures the element-lookup
        # timeout; it does not pause the script. Sleep so the newly
        # loaded products render before the next lookup.
        time.sleep(2)
    except NoSuchElementException:
        break
driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
time.sleep(8)
The following code will click that button until it cannot locate it, and exit gracefully:
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options as Firefox_Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
import time as t

# Firefox with a fixed window size and a mobile user agent override.
firefox_options = Firefox_Options()
firefox_options.add_argument("--width=1280")
firefox_options.add_argument("--height=720")
# firefox_options.headless = True
firefox_options.set_preference("general.useragent.override", "Mozilla/5.0 (Linux; Android 7.0; SM-A310F Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.91 Mobile Safari/537.36 OPR/42.7.2246.114996")

driverService = Service('chromedriver/geckodriver')
browser = webdriver.Firefox(service=driverService, options=firefox_options)

url = 'https://www.carrefouregypt.com/mafegy/en/c/FEGY1701230'
browser.get(url)
t.sleep(5)

# Keep clicking "Load More"; when the 10 s wait times out the button is
# gone, meaning every product has been loaded.
while True:
    try:
        load_more_button = WebDriverWait(browser, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//button[text()="Load More"]'))
        )
        browser.execute_script('window.scrollBy(0, 100);')
        load_more_button.click()
        print('clicked')
        t.sleep(3)
    except TimeoutException:
        print('all elements loaded in page')
        break
It's using Firefox, on a linux setup (for some reasons Chrome was temperamental on this one). You just have to observe the imports, and the code after defining the browser/driver. Selenium documentation: https://www.selenium.dev/documentation/

How to handle Firefox print dialog box in Selenium

I have the following code:
from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

url = 'https://finance.yahoo.com/'
driver_path = 'geckodriver.exe'

# BUG FIX: the preferences were set on an undefined `profile` *after*
# the browser had already started, so they never took effect. Build the
# profile first and hand it to Firefox.
profile = FirefoxProfile()
profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
profile.set_preference("browser.helperApps.neverAsk.openFile", "application/pdf")
browser = Firefox(executable_path=driver_path, firefox_profile=profile)
browser.get(url)

# Search for the TSLA ticker.
search_field_id = 'yfin-usr-qry'
element_search_field = browser.find_element_by_id(search_field_id)
element_search_field.clear()
element_search_field.send_keys('TSLA')
element_search_field.send_keys(Keys.ENTER)

action_chains = ActionChains(browser)
action_chains.key_down(Keys.CONTROL).send_keys('V').key_up(Keys.CONTROL).perform()

xpath_string = '/html/body/div[1]/div/div/div[1]/div/div[2]/div/div/div[6]/div/div/section/div/ul/li[2]/a/span'
element = browser.find_element_by_xpath(xpath_string)
action_chains.move_to_element(element).click().perform()
browser.execute_script('window.print();')
A print dialog box pops up for Firefox. I was wondering how can i accept it. Is there a way to bypass this dialog box and directly print since this is not a system dialog box but Firefox's.
Edit:
My full updated code as per input from #Prophet
from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import Select
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
import time
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.common.exceptions import NoSuchElementException

ua = UserAgent()
userAgent = ua.random
url = 'https://finance.yahoo.com/'
driver_path = 'geckodriver.exe'

profile = FirefoxProfile('C:\\Users\\\\AppData\\Roaming\\Mozilla\\Firefox\\Profiles\\tp3cz5dm.default-release')
profile.set_preference("general.useragent.override", userAgent)
profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
profile.set_preference("browser.helperApps.neverAsk.openFile", "application/pdf")
# BUG FIX: the profile was built but never handed to Firefox, so none of
# its preferences applied to the session.
browser = Firefox(executable_path=driver_path, firefox_profile=profile)
browser.get(url)

search_field_id = 'yfin-usr-qry'
element_search_field = browser.find_element_by_id(search_field_id)
element_search_field.clear()
element_search_field.send_keys('TSLA')
element_search_field.send_keys(Keys.ENTER)

action_chains = ActionChains(browser)
action_chains.key_down(Keys.CONTROL).send_keys('V').key_up(Keys.CONTROL).perform()

# xpath_string = '/html/body/div[1]/div/div/div[1]/div/div[2]/div/div/div[6]/div/div/section/div/ul/li[2]/a/span'
# element = browser.find_element_by_xpath(xpath_string)
# action_chains.move_to_element(element).click().perform()
browser.execute_script('window.print();')

# switch to the print-preview window
browser.switch_to.window(browser.window_handles[-1])
time.sleep(0.5)
actionButton = browser.execute_script(
    "return document.querySelector('print-preview-app').shadowRoot.querySelector('#sidebar').shadowRoot.querySelector('print-preview-button-strip').shadowRoot.querySelector('.action-button')")
# BUG FIX: the element was stored in `actionButton`, but the original
# clicked an undefined `cancelButton` (NameError).
actionButton.click()
# switch back to main window
# BUG FIX: was driver.window_handles, but this script's variable is `browser`.
browser.switch_to.window(browser.window_handles[0])
When i run this i am getting error:
JavascriptException: TypeError: document.querySelector(...) is null
Both the solutions below are designed NOT to launch the print dialog. These solutions will either print the active webpage to your local printer or to a PDF file without having to deal with the dialog.
UPDATED POST 08-19-2021
I wanted to save the output to PDF vs printing to paper. I was shocked how hard it was to print to a PDF using the geckodriver and selenium. With the 'chromedriver' you can call the function 'execute_cdp_cmd' and pass Page.printToPDF. The geckodriver doesn't have 'execute_cdp_cmd'.
When I looked through Stack Overflow for inspiration, I discovered multiple open questions about printing a PDF using geckodriver with Selenium. After seeing that this was a common problem, I looked through the Selenium issues and the Mozilla bug reports. Again, this was a problem that others had.
Some of the bug reports mentioned that certain switches used in the print process no longer worked.
profile.set_preference("print.print_to_file", True)
profile.set_preference("print.print_to_filename", "/tmp/file.pdf")
I decided to look at the source code for mozilla gecko-dev for a potential solution. After hours of research I found that the switches above were replaced with new ones and that another printer variable had also been replaced. After some testing, I was able to get your webpage to save as PDF.
The code below will print a webpage to a PDF with all the links enabled. I would recommend adding some error handling to the code. One part of the code that I need to improve on the filename part. You should be able to add a function that will rename the file, which would allow you to print as many files as you want in a single session.
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
# BUG FIX: FirefoxProfile lives in firefox.firefox_profile, not firefox.options.
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

firefox_options = Options()
firefox_options.add_argument("--disable-infobars")
firefox_options.add_argument("--disable-extensions")
firefox_options.add_argument("--disable-popup-blocking")

profile_options = FirefoxProfile()
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.5; rv:90.0) Gecko/20100101 Firefox/90.0'
# BUG FIX: the preference key was a pasted code fragment
# ('profile_options = FirefoxProfile()'); the user-agent override key is
# general.useragent.override.
profile_options.set_preference("general.useragent.override", user_agent)
# Route printing silently to Mozilla's built-in "Save to PDF" printer.
profile_options.set_preference("print_printer", "Mozilla Save to PDF")
profile_options.set_preference("print.always_print_silent", True)
profile_options.set_preference("print.show_print_progress", False)
profile_options.set_preference('print.save_as_pdf.links.enabled', True)
profile_options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
# set your own file path
profile_options.set_preference('print.printer_Mozilla_Save_to_PDF.print_to_filename',
                               "tmp/testprint.pdf")

driver = webdriver.Firefox(executable_path='/usr/local/bin/geckodriver', options=firefox_options,
                           firefox_profile=profile_options)
URL = 'https://finance.yahoo.com/'
driver.get(URL)
sleep(10)

search_field_id = 'yfin-usr-qry'
element_search_field = driver.find_element_by_id(search_field_id)
element_search_field.clear()
element_search_field.send_keys('TSLA')
element_search_field.send_keys(Keys.ENTER)
sleep(10)

driver.execute_script("window.print()")
sleep(20)
driver.quit()
ORIGINAL POST 08-18-2021
I decided to look at your issue, because I'm interested in selenium functionality.
I looked through the source code of the geckodriver and found printUtils.js, which provides details on the switches used in the print process, such as these:
firefox_options.set_preference("print.always_print_silent", True)
firefox_options.set_preference("print.show_print_progress", False)
After removing some of your code and adding some, I was able to print to my HP printer with the code below without dealing with a print dialog box:
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
# BUG FIX: FirefoxProfile lives in firefox.firefox_profile, not firefox.options.
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

firefox_options = Options()
firefox_options.add_argument("--disable-infobars")
firefox_options.add_argument("--disable-extensions")
firefox_options.add_argument("--disable-popup-blocking")

profile_options = FirefoxProfile()
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.5; rv:90.0) Gecko/20100101 Firefox/90.0'
# BUG FIX: the preference key was a pasted code fragment; the user-agent
# override key is general.useragent.override.
firefox_options.set_preference('general.useragent.override', user_agent)
# Print silently straight to the default printer, no dialog or progress bar.
firefox_options.set_preference("print.always_print_silent", True)
firefox_options.set_preference("print.show_print_progress", False)
firefox_options.set_preference("pdfjs.disabled", True)

driver = webdriver.Firefox(executable_path='/usr/local/bin/geckodriver', options=firefox_options)
URL = 'https://finance.yahoo.com/'
driver.get(URL)
sleep(10)

search_field_id = 'yfin-usr-qry'
element_search_field = driver.find_element_by_id(search_field_id)
element_search_field.clear()
element_search_field.send_keys('TSLA')
element_search_field.send_keys(Keys.ENTER)
sleep(10)

driver.execute_script("window.print()")
----------------------------------------
My system information
----------------------------------------
Platform: Apple
OS: 10.15.7
Python: 3.9
Selenium: 3.141
Firefox: 90.0.2
Geckodriver: 0.29.0
----------------------------------------
Adding these profile preferences should avoid presenting this pop-up:
profile.set_preference("print.always_print_silent", True)
profile.set_preference("print.show_print_progress", False)
UPD
After involving the printing dialog please try accepting it by this code:
# switch to print preview window
driver.switch_to.window(driver.window_handles[-1])
time.sleep(0.5)
actionButton = driver.execute_script(
    "return document.querySelector('print-preview-app').shadowRoot.querySelector('#sidebar').shadowRoot.querySelector('print-preview-button-strip').shadowRoot.querySelector('.action-button')")
# BUG FIX: the handle is stored in `actionButton`; clicking the
# undefined `cancelButton` raised NameError.
actionButton.click()
# switch back to main window
driver.switch_to.window(driver.window_handles[0])

How to restart Selenium browser after quit?

I'm trying to quit and then restart a new browser session with Selenium when encountering a captcha, and I'm not sure yet why the code below isn't working.
It quits the existing driver, but after recursion browser.get() results in this error: ConnectionRefusedError: [Errno 61] Connection refused
Thanks in advance for any advice. I've included only the most relevant parts of the code below:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains

path_to_chromedriver = '/Users/Myname/Desktop/a/chromedriver 2'
options = webdriver.ChromeOptions()
options.add_argument('start-maximized')
#options.add_argument('disable-infobars')
#options.add_argument('--disable-notifications')
options.add_argument('--disable-extensions')
browser = webdriver.Chrome(chrome_options=options, executable_path=path_to_chromedriver)
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}

def get_page_info(url, browser=None):
    """Fetch `url`; if a denial/captcha page is detected, quit the
    session and retry with a brand-new browser.

    BUG FIX: the original used a webdriver instance as the *default
    argument*. Defaults are evaluated once at definition time, so every
    recursive call reused the same already-quit driver, which is what
    produced ConnectionRefusedError. Create a fresh driver per call.
    Also fixed the recursion target, which called a stale name
    (`winery_info`) instead of this function.
    """
    if browser is None:
        browser = webdriver.Chrome(chrome_options=options, executable_path=path_to_chromedriver)
    browser.get(url)
    try:
        body = browser.find_element_by_tag_name('body')
        if "been denied because we believe" in body.text:
            print("going to new session...")
            browser.quit()
            human(4, 6)  # time delay
            # Recurse with browser=None so a fresh driver is created.
            return get_page_info(url)
    except NoSuchElementException:
        # Page without a <body> (or still loading): nothing to inspect.
        return None
Edit: I normally wouldn't use this tactic to get around a captcha, but in my use case this makes sense.
Try to use the driver.delete_all_cookies() method instead of closing browser and reopening it
Edit: maybe the site blocks your IP address. I suggest using Tor to change your IP automatically; here is an example:
import os
import time

# Restart the Tor client so the session starts with a fresh circuit.
os.system("killall tor")
os.system("tor &")
time.sleep(5)

# init driver: point Firefox at the local Tor SOCKS proxy.
fp = webdriver.FirefoxProfile()
fp.set_preference("network.proxy.type", 1)
fp.set_preference("network.proxy.socks", "127.0.0.1")
fp.set_preference("network.proxy.socks_port", int("9050"))
fp.update_preferences()
browser = webdriver.Firefox(firefox_profile=fp)
browser.get(...)
...
...
if captcha:
    # Bounce Tor to rotate the exit node, then retry the request.
    os.system("killall tor")
    os.system("tor &")
    time.sleep(5)
    browser.get(...)
    # this will change your ip adress
    # You can also configure tor to change ip every 10 seconds by changing torrc file
# this will change your ip adress
# You can also configure tor to change ip every 10 seconds by changing torrc file

Selenium PhantomJS never finishes loading / incomplete loading

I was trying to get the embedded video URL from https://www.fmovies.is . I'm using selenium.PhantomJS(). The exact same code works perfectly if I use selenium.Firefox() driver . It seems as though I'm doing something wrong during the waiting phase.
If someone could point out what I was doing wrong , I would really appreciate it.
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import DesiredCapabilities
desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
desired_capabilities['phantomjs.page.customHeaders.User-Agent'] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5)AppleWebKit 537.36 (KHTML, like Gecko) Chrome"
desired_capabilities['phantomjs.page.customHeaders.Accept'] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
url = "https://fmovies.is/film/kung-fu-panda-2.9kx/q8kkyj"
driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'],desired_capabilities=desired_capabilities)
driver.get(url)
try:
element = WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.ID, "jw")))
finally:
driver.find_element_by_id("player").click()
pageSource = driver.page_source
soup = BeautifulSoup(pageSource,'lxml')
url = soup.find("video",{"class":"jw-video"})
print url
videoURL = ''
if url:
videoURL = url['src']
print videoURL

PhantomJS returning empty web page (python, Selenium)

Trying to screen scrape a web site without having to launch an actual browser instance in a python script (using Selenium). I can do this with Chrome or Firefox - I've tried it and it works - but I want to use PhantomJS so it's headless.
The code looks like this:
import sys
import traceback
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 "
"(KHTML, like Gecko) Chrome/15.0.87"
)
try:
# Choose our browser
browser = webdriver.PhantomJS(desired_capabilities=dcap)
#browser = webdriver.PhantomJS()
#browser = webdriver.Firefox()
#browser = webdriver.Chrome(executable_path="/usr/local/bin/chromedriver")
# Go to the login page
browser.get("https://www.whatever.com")
# For debug, see what we got back
html_source = browser.page_source
with open('out.html', 'w') as f:
f.write(html_source)
# PROCESS THE PAGE (code removed)
except Exception, e:
browser.save_screenshot('screenshot.png')
traceback.print_exc(file=sys.stdout)
finally:
browser.close()
The output is merely:
<html><head></head><body></body></html>
But when I use the Chrome or Firefox options, it works fine. I thought maybe the web site was returning junk based on the user agent, so I tried faking that out. No difference.
What am I missing?
UPDATED: I will try to keep the below snippet updated with until it works. What's below is what I'm currently trying.
import sys
import traceback
import time
import re
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 (KHTML, like Gecko) Chrome/15.0.87")
try:
# Set up our browser
browser = webdriver.PhantomJS(desired_capabilities=dcap, service_args=['--ignore-ssl-errors=true'])
#browser = webdriver.Chrome(executable_path="/usr/local/bin/chromedriver")
# Go to the login page
print "getting web page..."
browser.get("https://www.website.com")
# Need to wait for the page to load
timeout = 10
print "waiting %s seconds..." % timeout
wait = WebDriverWait(browser, timeout)
element = wait.until(EC.element_to_be_clickable((By.ID,'the_id')))
print "done waiting. Response:"
# Rest of code snipped. Fails as "wait" above.
I was facing the same problem and no amount of code to make the driver wait was helping.
The problem is the SSL encryption on the https websites, ignoring them will do the trick.
Call the PhantomJS driver as:
driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true', '--ssl-protocol=TLSv1'])
This solved the problem for me.
You need to wait for the page to load. Usually, it is done by using an Explicit Wait to wait for a key element to be present or visible on a page. For instance:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# ...
browser.get("https://www.whatever.com")
# BUG FIX: the wait was built from an undefined `driver`; the instance
# in this snippet is named `browser`.
wait = WebDriverWait(browser, 10)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.content")))
html_source = browser.page_source
# ...
Though, I'm pretty sure this is related to the redirecting issues in PhantomJS. There is an open ticket in phantomjs bugtracker:
PhantomJS does not follow some redirects
driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true', '--ssl-protocol=TLSv1'])
This worked for me

Categories

Resources