Using python selenium to click and download files - python

I'm currently trying to automate the task of clicking different download links on this website:
https://www.theice.com/clear-us/risk-management#margin-rates
On this page, I first have to click the "Download ICE Risk Model array files" header, which reveals two dropdowns. From there I want to click the "Final" link, which downloads a CSV file, for each month of each available year.
Both dropdowns are driven by hidden dropdown menus above them. I first made those menus visible, which worked, and I can also change the year in them using a Selenium click.
The problem is that I'm not able to click the "Final" link in the CSV section; instead, the script clicks the "Intercontinental Exchange" button in the footer and navigates to a new page.
Is there any way to get this task done?
Also, is it possible to change the download location to the directory where the .py script is located?
This is the Python code so far (I have removed the headless part for now to see what's going on):
from bs4 import BeautifulSoup
import requests
from selenium.webdriver.chrome.options import Options
from shutil import which
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import time
import os
# Start Chrome through the chromedriver binary found on PATH.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--log-level=3")
chrome_path = which("chromedriver")
driver = webdriver.Chrome(executable_path=chrome_path, options=chrome_options)
driver.set_window_size(1366, 768)
driver.get("https://www.theice.com/clear-us/risk-management#margin-rates")

# Click the collapsible section header, then give the page time to react.
main_button = driver.find_element_by_xpath('//h4[@class="collapsible-section-header"]')
main_button.click()
time.sleep(5)

# Force the year/month <select> elements to be displayed so they can be clicked.
driver.execute_script("document.getElementById('icus-ice-form-year').style.display = 'block';")
driver.execute_script("document.getElementById('icus-ice-form-month').style.display = 'block';")
time.sleep(1)

dropdown_1 = Select(driver.find_element_by_xpath('//select[@id="icus-ice-form-year"]'))
# NOTE(review): this looks up the *year* <select> a second time; the month
# <select> made visible above is 'icus-ice-form-month' -- confirm the intent.
dropdown_2 = Select(driver.find_element_by_xpath('//select[@id="icus-ice-form-year"]'))
main_table_div = driver.find_element_by_xpath('//div[@id="icus-ice-riskarraytable"]')
main_table = main_table_div.find_element_by_xpath('//table[@class="table table-data"]')

# Try every option of the first dropdown against every option of the second
# and click the link found for each table cell.
for opt in dropdown_1.options:
    opt.click()
    for opt2 in dropdown_2.options:
        opt2.click()
        time.sleep(3)
        download_links_1 = main_table.find_elements_by_xpath('//td[@class="table-partitioned"]')
        for dow in download_links_1:
            try:
                # NOTE(review): an XPath starting with '//' is evaluated from
                # the document root even when called on an element, so this
                # returns the first <a> of the whole page rather than the
                # link inside `dow`; './/a' would restrict it to `dow`.
                temp_dow = dow.find_element_by_xpath('//a')
                temp_dow.click()
                time.sleep(4)
            except:
                # NOTE(review): every error is silently discarded here.
                pass

This should switch the downloads to the current working directory and download all the early CSVs. To download the final CSVs instead, change '//*[@id="icus-ice-riskarraytable"]/table/tbody/tr[{0}]/td[2]/a' to '//*[@id="icus-ice-riskarraytable"]/table/tbody/tr[{0}]/td[3]/a'.
from selenium import webdriver

# Chrome preferences: save downloads into the current working directory
# without showing a "Save as" prompt.
options = Options()
currentDirectory = os.getcwd()
prefs = {
    "download.default_directory": currentDirectory,
    "download.prompt_for_download": False,
}
options.add_experimental_option("prefs", prefs)

# The preferences only take effect for a browser that is started with them.
driver = webdriver.Chrome(options=options)
# The implicit wait is a session-wide lookup timeout, not a pause, so it is
# set once instead of being repeated before every step.
driver.implicitly_wait(5)

driver.get("https://www.theice.com/clear-us/risk-management#margin-rates")
main_button = driver.find_element_by_xpath('//h4[@class="collapsible-section-header"]')
main_button.click()

# Force the hidden year/month <select> elements to be displayed.
driver.execute_script("document.getElementById('icus-ice-form-year').style.display = 'block';")
driver.execute_script("document.getElementById('icus-ice-form-month').style.display = 'block';")

drop1length = len(driver.find_elements_by_xpath('//select[@id="icus-ice-form-year"]/option'))
# NOTE(review): range(1, n - 1) skips option index 0 and the last option in
# both dropdowns, and rows n-1 and n of the table -- confirm this is intended.
for i in range(1, drop1length - 1):
    # Look the <select> up again on every pass instead of reusing a reference.
    drop1 = Select(driver.find_element_by_xpath('//select[@id="icus-ice-form-year"]'))
    drop1.select_by_index(i)
    drop2length = len(driver.find_elements_by_xpath('//select[@id="icus-ice-form-month"]/option'))
    for j in range(1, drop2length - 1):
        drop2 = Select(driver.find_element_by_xpath('//select[@id="icus-ice-form-month"]'))
        drop2.select_by_index(j)
        download_links_length = len(driver.find_elements_by_xpath('//*[@id="icus-ice-riskarraytable"]/table/tbody/tr/td[2]/a'))
        for dow in range(1, download_links_length - 1):
            try:
                element = driver.find_element_by_xpath('//*[@id="icus-ice-riskarraytable"]/table/tbody/tr[{0}]/td[2]/a'.format(str(dow)))
                # Click through JavaScript rather than a native click.
                driver.execute_script("arguments[0].click();", element)
                # Return to the first window in case the click opened another.
                driver.switch_to.window(driver.window_handles[0])
            except Exception as e:
                print(e)
Import these:
import os
from selenium.webdriver.chrome.options import Options

Related

Selenium Webdriver Python is not looping

I'm trying to use Selenium to click on multiple links, one at a time, to download multiple CSV files. The problem is that Selenium downloads a few of the CSV files, but in the middle of the loop it stops working: the browser crashes, claiming that there is no internet connection, and the driver closes. I have already put chromedriver.exe in the same folder and set the path, but it is still not working.
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
import os
import re
from selenium import webdriver  # used below but absent from the import list

# Empty the download directory before starting.
new_directory = r"Z:\BI_Database_teste"
for document in os.listdir(new_directory):
    os.remove(os.path.join(new_directory, document))

url = 'myPersonalURL'
chrome_options = Options()
chrome_options.add_argument('--headless')
browser = webdriver.Chrome('chromedriver.exe', options=chrome_options)
# Send the DevTools command that allows downloads and sets their target path.
params = {'behavior': 'allow', 'downloadPath': new_directory}
browser.execute_cdp_cmd('Page.setDownloadBehavior', params)

# Log in with e-mail and password.
browser.get(url)
time.sleep(2)
input_email = browser.find_element(By.ID, 'email')
input_email.send_keys('myEmail')
input_password = browser.find_element(By.ID, 'password')
input_password.send_keys('myPassword')
input_password.submit()

# Answer the question shown after the login form; the answer depends on
# which question text is displayed.
input_question = browser.find_element(By.XPATH, '/html/body/div[2]/div[2]/form/table/tbody/tr[3]/td/table/tbody/tr[2]/td[2]').text
answer_field = browser.find_element(By.XPATH, '/html/body/div[2]/div[2]/form/table/tbody/tr[3]/td/table/tbody/tr[3]/td[2]/input')
if input_question == 'question':
    answer_field.send_keys('answer')
elif input_question == 'question':
    answer_field.send_keys('answer')
else:
    answer_field.send_keys('answer')
time.sleep(2)
answer_field.submit()
time.sleep(4)

# Click every "Export (CSV)" link, pausing two seconds before each click.
links = browser.find_elements(By.LINK_TEXT, 'Export (CSV)')
links_len = len(links)
print(str(links_len) + ' BI Databases to Download')
list_count = 0
for link in links:
    time.sleep(2)
    link.click()
    list_count = list_count + 1
print(str(list_count) + ' BI Databases downloaded')
browser.quit()

# Strip all digits from the names of the downloaded "PBI" files.
for file in os.listdir(new_directory):
    if file.startswith("PBI"):
        try:
            os.rename(os.path.join(new_directory, file), os.path.join(new_directory, re.sub('[0-9]', '', file)))
        except OSError:
            # Best effort: a file that cannot be renamed keeps its name.
            pass
print('BI Databases Download Successfully!')
Could someone help me to find out why the webdriver stops working in the middle of the loop?
If you think the driver isn't being loaded correctly, you can download the webdriver at runtime from your code. This might help:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# `options` has to exist before it can be handed to the driver.
options = Options()
# ChromeDriverManager().install() supplies the driver path passed to Chrome.
chrome = webdriver.Chrome(ChromeDriverManager().install(), options=options)
without much further context, the question cannot be answered or reproduced.

GoogleCaptcha roadblock in website scraper

I am currently working on a scraper for aniworld.to.
My goal is it to enter the anime name and get all of the Episodes downloaded.
I have everything working except one thing...
The website has a Watch button. That button redirects you to https://aniworld.to/redirect/SOMETHING, and that page has a captcha, which means the link is not in the HTML...
Is there a way to bypass this and get the link in Python? Or a way to display the captcha so I can solve it?
After all, the captcha only appears very rarely.
The only thing I need from that page is the redirect link. It looks like this:
https://vidoza.net/embed-something.html
My very very wip code is here if it helps: https://github.com/wolfswolke/aniworld_scraper
Mitchdu showed me how to do it.
If anyone else needs help here is my code: https://github.com/wolfswolke/aniworld_scraper/blob/main/src/logic/captcha.py
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from threading import Thread
import os
def open_captcha_window(full_url):
    """Open *full_url* in a small Chrome window and return the URL it ends up on.

    The window stays open until the browser's current URL differs from
    *full_url*; that new URL is returned and the driver is then shut down
    on a separate thread.

    Args:
        full_url: Address of the page to open.

    Returns:
        The browser's current URL once it is no longer *full_url*.
    """
    # Build the extension path with os.path.join so the separator is not
    # hard-coded to a backslash.
    path_to_ublock = os.path.join(os.getcwd(), 'extensions', 'ublock')
    options = webdriver.ChromeOptions()
    options.add_argument("app=" + full_url)
    options.add_argument("window-size=423,705")
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    # The extension is optional: load it only when its folder is present.
    if os.path.exists(path_to_ublock):
        options.add_argument('load-extension=' + path_to_ublock)
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get(full_url)
    # Timeout of 100 with a poll interval of 0.3 (seconds, per Selenium docs).
    wait = WebDriverWait(driver, 100, 0.3)
    wait.until(lambda redirect: redirect.current_url != full_url)
    new_page = driver.current_url
    # Shut the browser down in the background so the caller gets the URL
    # without waiting for it.
    Thread(target=threaded_driver_close, args=(driver,)).start()
    return new_page
def threaded_driver_close(driver):
    """Shut down *driver*; meant to be used as a thread target.

    ``quit()`` is used rather than ``close()``: per the Selenium docs,
    ``close()`` only closes the current window, while ``quit()`` also ends
    the driver session.

    Args:
        driver: The WebDriver instance to shut down.
    """
    driver.quit()

ChromeWebdriver sees website differently than I do (Python)

I'm trying to make a script that logs into my online grade book to look for any changes (new grades, etc). This is my code so far.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
def main():
    """Open the Librus portal, expand the top menu and click the login entry."""
    chrome_opts = Options()
    # chrome_opts.add_argument("--headless")
    chrome_service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=chrome_service, options=chrome_opts)
    driver.maximize_window()

    # Load the target page.
    driver.get('https://portal.librus.pl/rodzina')

    # Click the button that drops down the menu containing the login link.
    driver.find_element(
        By.CLASS_NAME, 'btn.btn-third.btn-synergia-top.btn-navbar.dropdown-toggle'
    ).click()

    # Click the login link inside that menu.
    driver.find_element(
        By.CLASS_NAME, 'zmdi.zmdi-account.dropdown-item__icon'
    ).click()

    # Keep the window open for ten seconds before shutting the browser down.
    time.sleep(10)
    driver.quit()


if __name__ == '__main__':
    main()
And there is a problem: I cannot seem to find a way to make the webdriver see what I see. What I mean is that I see the webpage like this, while the webdriver sees the same webpage like this; the source code is different as well. I've tried using undetected ChromeDriver with no success. This is my code using UC.
import undetected_chromedriver as uc
import time
from selenium.webdriver.common.by import By
def main():
    """Repeat the Librus login clicks with undetected_chromedriver, then print."""
    driver = uc.Chrome()
    driver.maximize_window()

    # Load the target page.
    driver.get('https://portal.librus.pl/rodzina/home')

    # Click the button that drops down the menu containing the login link.
    driver.find_element(
        By.CLASS_NAME, 'btn.btn-third.btn-synergia-top.btn-navbar.dropdown-toggle'
    ).click()

    # Click the login link inside that menu.
    driver.find_element(
        By.CLASS_NAME, 'zmdi.zmdi-account.dropdown-item__icon'
    ).click()

    # Wait five seconds, then call window.print() in the page.
    time.sleep(5)
    driver.execute_script("window.print();")


if __name__ == '__main__':
    main()
Has anyone had a similar problem and managed to solve it?

Why am I unable to find html element with Python and Selenium?

I am having a weird issue with Python and Selenium. I am accessing the URL https://www.biggerpockets.com/users/JarridJ1. When you click "more", it shows further content. As far as I can tell, it is a React-based website. When I open the page in a browser and do a View Source, I can see the required data in a React element: <div data-react-class="Profile/Header/Header" data-react-props="{&quot. I tried to automate Firefox via Selenium, but I could not get the data that way either.
Check the screenshot:
Below is the code I tried:
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
def parse(u):
    """Load *u* in the module-level ``driver``, save a screenshot, print the source.

    Args:
        u: URL of the page to load.
    """
    print('Processing... {}'.format(u))
    driver.get(u)
    # Pause for two seconds after loading before reading the page.
    sleep(2)
    page_markup = driver.page_source
    driver.save_screenshot('bp.png')
    print(page_markup)
if __name__ == '__main__':
    options = Options()
    for flag in (
        "--headless",            # Runs Chrome in headless mode.
        '--no-sandbox',          # Bypass OS security model
        '--disable-gpu',         # applicable to windows os only
        'start-maximized',
        'disable-infobars',
        "--disable-extensions",
    ):
        options.add_argument(flag)
    # NOTE(review): the Chrome options built above are never handed to a
    # driver; a plain Firefox instance is started instead.
    driver = webdriver.Firefox()
    parse('https://www.biggerpockets.com/users/JarridJ1')
This is a tricky one but I found a way to get to the element you have highlighted. Still not sure why driver.page_source is not return what you are looking for.
def parse(u):
    """Load *u* in the module-level ``driver`` and print each element's innerHTML.

    This takes the place of printing ``driver.page_source`` and saving a
    screenshot.

    Args:
        u: URL of the page to load.
    """
    print('Processing... {}'.format(u))
    driver.get(u)
    sleep(2)
    # "//*" matches every element in the document.
    for node in driver.find_elements_by_xpath("//*"):
        print(node.get_attribute('innerHTML'))
Below is my standalone example:
from selenium import webdriver
import time
# Raw string: the backslashes of the Windows path are not escape sequences.
driver = webdriver.Chrome(r"C:\Path\To\chromedriver.exe")
driver.get("https://www.biggerpockets.com/users/JarridJ1")
time.sleep(5)

# Read the React props straight from the attribute of the header <div>.
header = driver.find_element_by_xpath("//div[@data-react-class='Profile/Header/Header']")
props = header.get_attribute("data-react-props")
print(props)

# Print the innerHTML of every element in the document.
for node in driver.find_elements_by_xpath("//*"):
    print(node.get_attribute('innerHTML'))

Python script goes to website but doesn't click button intended to

As a test, I am trying to create a script that goes to my website and clicks on the learn more button, but am having trouble actually automatically clicking the button.
I've tried everything that I've found on stack overflow but nothing has worked.
from selenium import webdriver
import webbrowser
import time
url = 'https://www.mwstan.com'
# NOTE(review): webbrowser.open_new_tab() comes from the standard-library
# `webbrowser` module, not from Selenium; per its documentation it returns a
# bool, so `driver` is presumably not a WebDriver and the find_element_by_id
# call below would fail -- confirm.
driver = webbrowser.open_new_tab(url)
element = driver.find_element_by_id('learnmore')
element.click()
You are going to need to install a binary for whatever driver you are going to use
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--headless")
# Chrome's --window-size switch takes "width,height" (comma-separated).
chrome_options.add_argument("--window-size=1920,1080")
# The chromedriver binary is expected in the current working directory.
chrome_driver = os.getcwd() + "/chromedriver"


def get_url_example(url):
    """Open *url* in headless Chrome and click the element with id "learnmore".

    Args:
        url: Address of the page to open.
    """
    driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=chrome_driver)
    driver.get(url)
    button = driver.find_element_by_id("learnmore")
    button.click()
    # you can access the page source here using driver.page_source


if __name__ == '__main__':
    # Call the function defined above; the previous name was never defined.
    get_url_example("https://www.mwstan.com")
This code works for me and hits your button.
This uses the Chrome webdriver, but you can use another webdriver. Just make sure you move the driver and reference its path correctly, as in this line:
chrome_driver = os.getcwd() + "/chromedriver"

Categories

Resources