How to bypass the alert window while downloading a zipfile?

How to bypass the alert window while downloading a zipfile? - python

If I open the link: https://dibbs2.bsm.dla.mil/Downloads/RFQ/Archive/ca210731.zip
This link shows the window and I need to press the OK button and it downloads the file.
The alert is not from the browser, it is from the page itself.
But when I tried this script:
from io import BytesIO
from zipfile import ZipFile
import requests
def get_zip(file_url):
    """Download the zip archive at ``file_url`` and extract it into the current directory.

    Args:
        file_url: URL expected to return the raw bytes of a zip archive.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        zipfile.BadZipFile: if the response body is not a zip archive
            (for example an HTML page served in place of the file).
    """
    response = requests.get(file_url)
    # Surface HTTP errors directly instead of letting an error page reach ZipFile.
    response.raise_for_status()
    # The whole body is held in memory; the context manager closes the archive.
    with ZipFile(BytesIO(response.content)) as archive:
        archive.extractall("")
file_link ='https://dibbs2.bsm.dla.mil/Downloads/RFQ/Archive/ca210731.zip'
get_zip(file_link)
This throws the error:
zipfile.BadZipFile: File is not a zip file
And when I tried:
import requests
# Fetch the URL and write whatever body the server returns to a local file.
url = r'https://dibbs2.bsm.dla.mil/Downloads/RFQ/Archive/ca210731.zip'
output = r'downloadedfile.zip'
r = requests.get(url)
# Binary mode: the body is written byte-for-byte, with no text decoding.
with open(output, 'wb') as f:
    f.write(r.content)
This downloads the content of the page showing the OK button.
Any idea how to solve this so that the link downloads the zip file?

I assume an answer using Selenium is acceptable. Here's what you can do with Selenium:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
# Build a Firefox profile with download-related preferences, open the archive
# URL, and click the page's agreement button.
profile = webdriver.FirefoxProfile()
profile.set_preference("browser.download.folderList",1)
# Values for browser.download.folderList:
# 0 for desktop
# 1 for default download folder
# 2 for specific folder
# You can specify directory by using profile.set_preference("browser.download.dir","<>")
# Going by the preference name, this lists the MIME type(s) to save to disk without asking.
profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")
profile.set_preference("browser.helperApps.alwaysAsk.force", False);
# The browser.download.manager.* preferences below are optional:
# if you don't have a download manager you can remove them.
profile.set_preference("browser.download.manager.showWhenStarting",False)
profile.set_preference("browser.download.manager.useWindow", False);
profile.set_preference("browser.download.manager.focusWhenStarting", False);
profile.set_preference("browser.download.manager.alertOnEXEOpen", False);
profile.set_preference("browser.download.manager.showAlertOnComplete", False);
# "<>" is a placeholder: replace it with the path to the driver executable
# (presumably geckodriver, since this is Firefox).
driver=webdriver.Firefox(firefox_profile=profile,executable_path="<>")
driver.get("https://dibbs2.bsm.dla.mil/Downloads/RFQ/Archive/ca210731.zip")
# Click the element with id "butAgree" (the page's own OK/agree button).
driver.find_element_by_id("butAgree").click()
Here we set some profile preferences to disable the pop-up and the download dialog.
This works fine with the latest version of Firefox and version 3.141.0 of Selenium.

Related

Headless mode disable python web file downloading

I aim to download web files while in headless mode. My program downloads perfectly when NOT in headless mode, but once I add the constraint not to show MS Edge opening, the downloading is disregarded.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
driver = webdriver.Edge()
driver.get("URL")
id_box = driver.find_element(By.ID,"...")
pw_box = driver.find_element(By.ID,"...")
id_box.send_keys("...")
pw_box.send_keys("...")
log_in = driver.find_element(By.ID,"...")
log_in.click()
time.sleep(0.1) # If not included, get error: "Unable to locate element"
drop_period = Select(driver.find_element(By.ID,"..."))
drop_period.select_by_index(1)
drop_consul = Select(driver.find_element(By.ID,"..."))
drop_consul.select_by_visible_text("...")
drop_client = Select(driver.find_element(By.ID,"..."))
drop_client.select_by_index(1)
# The following files do not download when headless mode is included:
driver.find_element(By.XPATH, "...").click()
driver.find_element(By.XPATH, "...").click()

In that case, you might try downloading the file using the direct link (to the file) and python requests.
You'll need to get the URL by parsing the element's href attribute:
Downloading and saving a file from url should work as following then:
import requests as req
# Download a remote file and save it under a local name.
remote_url = 'http://www.example.com/file.txt'
local_file_name = 'my_file.txt'
data = req.get(remote_url)
# Fail loudly on an HTTP error instead of saving an error page as the file.
data.raise_for_status()
# Save file data to local copy (binary mode keeps the bytes unchanged)
with open(local_file_name, 'wb') as out_file:
    out_file.write(data.content)
resource

There are different headless modes for Chrome. If you want to download files, use one of the special ones.
For Chrome 109 and above, use:
options.add_argument("--headless=new")
For Chrome 108 and below, use:
options.add_argument("--headless=chrome")
Reference: https://github.com/chromium/chromium/commit/e9c516118e2e1923757ecb13e6d9fff36775d1f4

Downloading files in headless mode works for me on MicrosoftEdge version 110.0.1587.41 using following options:
MicrosoftEdge: [{
"browserName": "MicrosoftEdge",
"ms:edgeOptions": {
args: ['--headless=new'],
prefs: {
"download.prompt_for_download": false,
"plugins.always_open_pdf_externally": true,
'download.default_directory': "dlFolder"
}
},
}]
Nothing worked until I added the option '--headless=new'
N.B: Tested on a Mac environment using webdriverIO

Download excel file in current folder instead of "downloads"

I am trying to download an excel file using Selenium in Python from a website
I need the file to be downloaded in the current folder instead of "download"
but it is not working, it is downloading it in the downloads folder
mime_types = [
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
]
options = Options()
options.set_preference("browser.download.folderList",2)
options.set_preference("browser.download.manager.showWhenStarting", False)
options.set_preference("browser.download.dir","./")
options.set_preference("browser.helperApps.neverAsk.saveToDisk", ",".join(mime_types))
s = Service(GeckoDriverManager().install())
driver = webdriver.Firefox(service=s, options=options)
driver.get("https://chartink.com/screener/close-below-bb-205")
WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.XPATH, "/html/body/div[2]/div[2]/div[2]/div/div/div/div[2]/div/div[2]/div[6]/div[1]/div/div[1]/div/button[3]"))).click()
please let me know what is wrong with the code?

To download a file using Selenium in Python within a specified directory you need to tweak the following about:config entries:
Solution
So a working solution is to create a FirefoxProfile and a new directory to download the files into, as follows:
# Directory the browser should download into; created if it does not exist yet.
newpath = 'C:\\home\\vivvin\\shKLSE'
# exist_ok=True avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(newpath, exist_ok=True)
profile = webdriver.FirefoxProfile()
profile.set_preference("browser.download.dir", newpath)
# 2 selects a specific folder, i.e. the browser.download.dir set above.
profile.set_preference("browser.download.folderList", 2)
Reference
You can find a couple of detailed discussion in:
Downloading file through Selenium Webdriver in python
Python: Unable to download with selenium in webpage

Save PDF by using Selenium and IE Browser

Saving a PDF with the Chrome browser does not cause any issues (I'm using these options):
options.add_experimental_option('prefs',{
'credentials_enable_service': False,
'plugins':{
'always_open_pdf_externally': True
},
'profile': {
'password_manager_enabled': False,
},
'download': {
'prompt_for_download': False,
'directory_upgrade': True,
'default_directory': ''
}
})
BUT .... How to save PDF by using webdriver.Ie() Internet Explorer Driver with Python + Selenium?
P.S. AFAIK Internet Explorer cannot be run in headless mode, but if someone knows a way to do it, that would be amazing!

You can't use Selenium to deal with the download prompt in IE because that's an OS-level prompt. Selenium WebDriver has no capability to automate OS-level prompt window. You need to use some 3rd party tools to help you to download file in IE using Selenium.
Here I use Wget to bypass the download prompt and download file in IE. You can refer to this article about how to use Wget.
About using headless mode in IE in Selenium, you can also use a 3rd party tool called headless_ie_selenium. You can download this tool and use headless_ie_selenium.exe instead of IEDriverServer.exe to automate IE.
The sample code to download a pdf file is like below, please note to change the paths in the code to your owns:
from selenium import webdriver
import time
import os
import subprocess

# Open the page in IE via headless_ie_selenium, read the download link's href,
# and hand that URL to Wget so the OS-level download prompt is never shown.
url = "https://file-examples.com/index.php/sample-documents-download/sample-pdf-download/"
driver = webdriver.Ie('D:\\headless-selenium-for-win-v1-4\\headless_ie_selenium.exe')
driver.get(url)
# Fixed pause before looking up the link (presumably to let the page finish loading).
time.sleep(3)
link = driver.find_elements_by_class_name("download-button")[0]
hrefurl = link.get_attribute("href")
# The URL comes from the web page, so pass it as a separate argument rather than
# concatenating it into a shell command line: no shell ever parses it.
# check=False keeps the original behaviour of ignoring Wget's exit status.
subprocess.run(
    ['C:\\Wget\\wget.exe', '-P', 'D:\\Download', '--no-check-certificate', hrefurl],
    check=False,
)
print("*******************")

Is there a way to access '/Click/xxxx' files using web scraping?

I am currently trying to download a few pdf files from http://annualreports.com/Company/abercrombie-fitch and I am having a problem downloading the 2019 Annual Report. I am currently using
# Fetch the report and write the response body to `name` in binary mode.
response = urllib2.urlopen("http://annualreports.com" + link)
# The with-statement closes the file even if the read or the write fails.
with open(name, 'wb') as out_file:
    out_file.write(response.read())
where link is '/Click/20415' but this is returning a text file rather than a pdf. Is there a specific way to fix this?

Another solution, using requests module.
import requests
# Stream the response to disk in 8 KiB chunks instead of loading it all into memory.
url = 'http://annualreports.com/Click/20415'
with requests.get(url, stream=True) as r:
    # Don't save an HTTP error page as if it were the document.
    r.raise_for_status()
    # r.url is the URL the response finally came from, so its last path
    # segment is used as the local file name.
    filename = r.url.split('/')[-1]
    with open(filename, 'wb') as f_out:
        for chunk in r.iter_content(chunk_size=8192):
            if chunk:
                # One dot per chunk as a simple progress indicator.
                print('.', end='')
                f_out.write(chunk)
This saves NYSE_ANF_2019.pdf file to your disk.
EDIT: Screenshot from PDF in Firefox:

If you use Selenium you could try this:
from selenium import webdriver
download_dir = "C:\\Temp\\Dowmload" # for linux/*nix, download_dir="/usr/Public"
options = webdriver.ChromeOptions()
profile = {"plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}], # Disable Chrome's PDF Viewer
"download.default_directory": download_dir , "download.extensions_to_open": "applications/pdf"}
options.add_experimental_option("prefs", profile)
driver = webdriver.Chrome('//Server/Apps/chrome_driver/chromedriver.exe', chrome_options=options) # Optional argument, if not specified will search path.
driver.get('http://annualreports.com' + link)
If you only want to download the PDF and do not need to do anything else on the site, I think it is better to use the method that @superstew describes. See:
https://stackabuse.com/download-files-with-python/

Selenium Python Download popup pdf with specific filename

I need to download a set of individual pdf files from a webpage. It is publicly available by government (ministry of education in Turkey) so totally legal.
However my selenium browser only displays the pdf file, how can I download it and name as I wish.
(This code is also from web)
# Import your newly installed selenium package
from selenium import webdriver
from bs4 import BeautifulSoup
# Now create an 'instance' of your driver
# This path should be to wherever you downloaded the driver
driver = webdriver.Chrome(executable_path="/Users/ugur/Downloads/chromedriver")
# A new Chrome (or other browser) window should open up
download_dir = "/Users/ugur/Downloads/" # for linux/*nix, download_dir="/usr/Public"
options = webdriver.ChromeOptions()
profile = {"plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}], # Disable Chrome's PDF Viewer
"download.default_directory": download_dir , "download.extensions_to_open": "applications/pdf"}
options.add_experimental_option("prefs", profile)
# Now just tell it wherever you want it to go
driver.get("https://odsgm.meb.gov.tr/kurslar/KazanimTestleri.aspx?sinifid=5&ders=29")
driver.find_element_by_id("ContentPlaceHolder1_dtYillikPlanlar_lnkIndir_2").click()
driver.get("https://odsgm.meb.gov.tr/kurslar/PDFFile.aspx?name=kazanimtestleri.pdf")
Thanks in advance
Extra information:
I had a python 2 code doing this perfectly. But somehow it creates empty files and I couldn't convert it to python 3. Maybe this helps (no offense but I never liked selenium)
import urllib
import urllib2
from bs4 import BeautifulSoup
import os
sinifId=5
maxOrd = 1
fileNames=[]
directory = '/Users/ugur/Downloads/Hasan'
print 'List of current files in directory '+ directory+'\n---------------------------------\n\n'
for current_file in os.listdir(directory):
if (current_file.find('pdf')>-1 and current_file.find(' ')>-1):
print current_file
order = int(current_file.split(' ',1)[0])
if order>maxOrd: maxOrd=order
fileNames.append(current_file.split(' ',2)[1])
print '\n\nStarting download \n---------------------------------\n'
ctA=int(maxOrd+1)
for ders in [29]:
urlSinif='http://odsgm.meb.gov.tr/kurslar/KazanimTestleri.aspx?sinifid='+str(sinifId)+'&ders='+str(ders)
page = urllib2.urlopen(urlSinif)
soup = BeautifulSoup(page,"lxml")
st = soup.prettify()
count=st.count('ctl00')-1
dersAdi = soup.find('a', href='/kurslar/CevapAnahtarlari.aspx?sinifid='+str(sinifId)+'&ders='+str(ders)).getText().strip()
for testNo in range(count):
if(str(sinifId)+str(ders)+str(testNo+1) in fileNames):
print str(ctA)+' '+str(sinifId)+str(ders)+str(testNo+1)+' '+dersAdi+str(testNo+1)+'.pdf'+' skipped'
else:
annex=""
if(testNo%2==1): annex="2"
eiha_url = u'http://odsgm.meb.gov.tr/kurslar/KazanimTestleri.aspx?sinifid='+str(sinifId)+'&ders='+str(ders)
data = ('__EVENTTARGET','ctl00$ContentPlaceHolder1$dtYillikPlanlar$ctl'+format(testNo, '02')+'$lnkIndir'+annex), ('__EVENTARGUMENT', '39')
print 'ctl00$ContentPlaceHolder1$dtYillikPlanlar$ctl'+format(testNo, '02')+'$lnkIndir'+annex
new_data = urllib.urlencode(data)
response = urllib2.urlopen(eiha_url, new_data)
urllib.urlretrieve (str(response.url), directory+'/{0:0>3}'.format(ctA)+' '+str(sinifId)+str(ders)+str(testNo+1)+' '+dersAdi+str(testNo+1)+'.pdf')
print str(ctA)+' '+str(sinifId)+str(ders)+str(testNo+1)+' '+dersAdi+str(testNo+1)+'.pdf'+' downloaded'
ctA=ctA+1

Add your options before launching Chrome and then specify the chrome_options parameter.
download_dir = "/Users/ugur/Downloads/"
options = webdriver.ChromeOptions()
profile = {"plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
"download.default_directory": download_dir,
"download.extensions_to_open": "applications/pdf"}
options.add_experimental_option("prefs", profile)
driver = webdriver.Chrome(
executable_path="/Users/ugur/Downloads/chromedriver",
chrome_options=options
)
To answer your second question:
May I ask how to specify the filename as well?
I found this: Selenium give file name when downloading
What I do is:
# Poll the download directory until its most recently created entry is a PDF.
# NOTE: there is no timeout; the loop runs until a .pdf is the newest entry.
file_name = ''
while not file_name.lower().endswith('.pdf'):
    time.sleep(.25)
    candidates = [os.path.join(download_dir, f) for f in os.listdir(download_dir)]
    try:
        file_name = max(candidates, key=os.path.getctime)
    except (ValueError, FileNotFoundError):
        # ValueError: the directory is still empty.
        # FileNotFoundError: an entry disappeared between listdir() and
        # getctime() (e.g. a temporary download file being renamed).
        # Either way, try again on the next poll.
        continue

Here is the code sample I used to download pdf with a specific file name. First you need to configure chrome webdriver with required options. Then after clicking the button (to open pdf popup window), call a function to wait for download to finish and rename the downloaded file.
import os
import time
import shutil
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
# function to wait for download to finish and then rename the latest downloaded file
def wait_for_download_and_rename(newFilename):
    """Wait for Chrome's downloads to finish, then move the first listed file.

    Uses the module-level ``driver`` (a Chrome WebDriver) and ``download_dir``.
    The file is moved to ``os.path.join(download_dir, newFilename)``.

    Args:
        newFilename: file name to give the downloaded file inside ``download_dir``.

    Note:
        If the file has not appeared on disk once the wait below expires,
        ``shutil.move`` is still attempted and raises its own error.
    """

    def chrome_downloads(drv):
        """Return the paths of completed downloads (empty while none are complete)."""
        if "chrome://downloads" not in drv.current_url:  # 'chrome downloads' is not the current tab
            drv.execute_script("window.open('');")  # open a new tab
            # Use the driver handed in by WebDriverWait rather than the global one.
            drv.switch_to.window(drv.window_handles[1])  # switch to the new tab
            drv.get("chrome://downloads/")  # navigate to chrome downloads
        return drv.execute_script("""
            return document.querySelector('downloads-manager')
            .shadowRoot.querySelector('#downloadsList')
            .items.filter(e => e.state === 'COMPLETE')
            .map(e => e.filePath || e.file_path || e.fileUrl || e.file_url);
            """)

    # Poll every second for up to 120 seconds; an empty list is falsy, so the
    # wait continues until at least one download is reported as complete.
    dld_file_paths = WebDriverWait(driver, 120, 1).until(chrome_downloads)
    # Close the current tab (chrome downloads)
    if "chrome://downloads" in driver.current_url:
        driver.close()
    # Switch back to original tab
    driver.switch_to.window(driver.window_handles[0])
    # First entry of the list (presumably the latest download — confirm ordering)
    dlFilename = dld_file_paths[0]
    # Wait until the downloaded file appears in the download directory
    time_to_wait = 20  # adjust timeout as per your needs
    time_counter = 0
    while not os.path.isfile(dlFilename):
        time.sleep(1)
        time_counter += 1
        if time_counter > time_to_wait:
            break
    # Rename (move) the downloaded file
    shutil.move(dlFilename, os.path.join(download_dir, newFilename))
# specify custom download directory
download_dir = r'c:\Downloads\pdf_reports'
# for configuring chrome pdf viewer for downloading pdf popup reports
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option('prefs', {
"download.default_directory": download_dir, # Set own Download path
"download.prompt_for_download": False, # Do not ask for download at runtime
"download.directory_upgrade": True, # Also needed to suppress download prompt
"plugins.plugins_disabled": ["Chrome PDF Viewer"], # Disable this plugin
"plugins.always_open_pdf_externally": True, # Enable this plugin
})
# get webdriver with options for configuring chrome pdf viewer
driver = webdriver.Chrome(options = chrome_options)
# open desired webpage
driver.get('https://mywebsite.com/mywebpage')
# click the button to open pdf popup
driver.find_element_by_id('someid').click()
# call the function to wait for download to finish and rename the downloaded file
wait_for_download_and_rename('My file.pdf')
# close the browser windows
driver.quit()
Set timeout (120) to the wait time as per your needs.

Non-selenium solution, You can do something like:
import requests
# Fetch the PDF directly and write the response body to save.pdf.
pdf_resp = requests.get("https://odsgm.meb.gov.tr/kurslar/PDFFile.aspx?name=kazanimtestleri.pdf")
# Stop on an HTTP error rather than saving an error page as a PDF.
pdf_resp.raise_for_status()
with open("save.pdf", "wb") as f:
    f.write(pdf_resp.content)
Although you might want to check the content type before to make sure it's a pdf

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to bypass the alert window while downloading a zipfile? - python

Related

Headless mode disable python web file downloading

Download excel file in current folder instead of "downloads"

Save PDF by using Selenium and IE Browser

Is there a way to access '/Click/xxxx' files using web scraping?

Selenium Python Download popup pdf with specific filename

Categories

Resources