PDF Download with Selenium for Chrome in Python

PDF Download with Selenium for Chrome in Python - python

I wrote a script for mass-downloading PDF files in a loop. The Firefox-based Selenium WebDriver got stuck after the first download, so I decided to try Chrome.
With Chrome the loop now works, but I cannot download the PDF files: Chrome just shows them with pdf_viewer.js in the browser window.
I tried different options such as "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}], "download.extensions_to_open": "applications/pdf" and "plugins.always_open_pdf_externally": True. Nothing works.
#!/usr/bin/python
# Mass-download PDF files by driving Chrome through Selenium.

# Directory the Chrome profile is told to save downloads into.
downPath = "/home/user/xxx/"
# Values substituted into the URL template below.
Y = ["2017", "2018", "2019"]


def main():
    """Open the login page, then load every PDF URL for each entry in ``Y``.

    For each value in ``Y`` the numbers 0..52 are zero-padded to two digits,
    formatted into the file URL, and loaded with ``driver.get``.
    """
    options = webdriver.ChromeOptions()
    profile = {
        "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
        "download.default_directory": downPath,
        "download.extensions_to_open": "applications/pdf",
        "plugins.always_open_pdf_externally": True,
        "download.prompt_for_download": False,
        "safebrowsing.enabled": True,
    }
    options.add_experimental_option("prefs", profile)
    browser = webdriver.Chrome(options=options)
    # NOTE: `driver` is created WITHOUT `options`, so none of the download
    # preferences above apply to the session that actually loads the URLs.
    # Only `browser` (never used again) received them.
    driver = webdriver.Chrome()
    driver.get('login-url')
    # LOGIN-Stuff ...
    time.sleep(4)
    for y in Y:
        print(y)
        for x in range(53):
            url = "https://j.world/files/{0}/filename_{0}-{1:02d}.pdf".format(y, x)
            print(url)
            driver.get(url)
            # Fixed pause after each navigation before requesting the next URL.
            time.sleep(4)
    driver.quit()


if __name__ == "__main__":
    # Imported here, before main() runs, so the names are module globals
    # by the time main() looks them up.
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    import time
    import sys
    import os
    main()
I use Python 3.7.4 with python-selenium 3.141.0-1 from the Arch-Linux Repository.

you can download file with requests lib or with curl/wget/etc instead of
driver.get(url)

I overlooked that the code still contained fragments of my attempts with Firefox.
browser = webdriver.Chrome(options=options)
driver = webdriver.Chrome()
are obviously wrong
I changed it to driver = webdriver.Chrome(options=options) and everything works.
excuse my carelessness

Related

Printing a PDF with Selenium Chrome Driver in headless mode

I have no problems printing without headless mode, however once I enable headless mode, it just refuses to print a PDF. I'm currently working on an app with a GUI, so I'd rather not have the Selenium webdriver visible to the end user if possible.
For this project I'm using an older version of Selenium, 4.2.0. That coupled with Python 3.9.
import os
from os.path import exists
import json
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver import Chrome, ChromeOptions

# Paths (all resolved relative to the current working directory)
dir_path = os.getcwd()
download_path = os.path.join(dir_path, "letters")
chrome_path = os.path.join(dir_path, "chromium\\app\\Chrome-bin\\chrome.exe")
user_data_path = os.path.join(dir_path, "sessions")
website = "https://www.google.com/"


def main():
    """Load ``website`` in headless Chrome and trigger ``window.print()``.

    The print-preview state is pre-seeded with "Save as PDF" and Chrome is
    started with ``--kiosk-printing``; afterwards the function reports
    whether ``Google.pdf`` exists.
    """
    # Serialized below into the print-preview "sticky settings" preference.
    print_settings = {
        "recentDestinations": [{
            "id": "Save as PDF",
            "origin": "local",
            "account": "",
        }],
        "selectedDestinationId": "Save as PDF",
        "version": 2,
        "isHeaderFooterEnabled": False,
        "isLandscapeEnabled": True
    }
    options = ChromeOptions()
    options.binary_location = chrome_path
    options.add_argument("--start-maximized")
    options.add_argument('--window-size=1920,1080')
    options.add_argument(f"user-data-dir={user_data_path}")
    options.add_argument("--headless")
    options.add_argument('--enable-print-browser')
    options.add_experimental_option("prefs", {
        "printing.print_preview_sticky_settings.appState": json.dumps(print_settings),
        "savefile.default_directory": download_path,  # Change default directory for downloads
        "download.default_directory": download_path,  # Change default directory for downloads
        "download.prompt_for_download": False,  # To auto download the file
        "download.directory_upgrade": True,
        "profile.default_content_setting_values.automatic_downloads": 1,
        "safebrowsing.enabled": True
    })
    options.add_argument("--kiosk-printing")
    driver = Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get(website)
    driver.execute_script("window.print();")
    # NOTE(review): this looks in user_data_path, while the save directory
    # configured above is download_path. Confirm which one is intended.
    if exists(os.path.join(user_data_path, "Google.pdf")):
        print("YAY!")
    else:
        print(":(")


if __name__ == '__main__':
    main()

For anyone else coming across this with a similar issue, I fixed it by using the print method described here: Selenium print PDF in A4 format
Using my example from above, I replaced:
driver.execute_script("window.print();")
with:
import base64

# Ask Chrome (via the DevTools protocol) to render the current page as a PDF.
pdf_data = driver.execute_cdp_cmd("Page.printToPDF", print_settings)
# The PDF is read from the 'data' field as base64 text; decode it before writing.
with open('Google.pdf', 'wb') as file:
    file.write(base64.b64decode(pdf_data['data']))
And that worked just fine for me.

Python/Selenium: save to pdf not saving

I am using selenium/python to save a series of webpages to pdf. The webpages have a table that is rendered with javascript; I am using "find_element_by_xpath" to check that the pdf icon in the js table has appeared before proceeding with the print. Ideally, I do not want to use a hard-coded wait/sleep time, as I have thousands of pages to save.
The code seems to work but no pdf is saved.
The code is as follows:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import json

options = webdriver.ChromeOptions()
# Print-preview state that pre-selects the "Save as PDF" destination.
settings = {
    "recentDestinations": [{
        "id": "Save as PDF",
        "origin": "local",
        "account": "",
    }],
    "selectedDestinationId": "Save as PDF",
    "version": 2
}
prefs = {'printing.print_preview_sticky_settings.appState': json.dumps(settings)}
options.add_experimental_option('prefs', prefs)
options.add_experimental_option('excludeSwitches', ['enable-logging'])
options.add_argument('--kiosk-printing')
CHROMEDRIVER_PATH = 'chromedriver.exe'
# Create the driver once; a second identical call would start another browser
# and leave the first one running.
driver = webdriver.Chrome(options=options, executable_path=CHROMEDRIVER_PATH)
try:
    # XPath attribute tests use '@' (e.g. @class); '#class' is not valid XPath.
    element = driver.find_element_by_xpath("//div[@class='fas fa-file-pdf']")
    WebDriverWait(driver, 10).until(EC.staleness_of(element))
except NoSuchElementException:
    element = None
print(element)
# NOTE(review): the page is only loaded here, after the element lookup above.
driver.get("url")
driver.execute_script('window.print();')
#driver.quit()

This is working
First I get the document link using driver.find_element(). Then I pass that URL to requests.get(). If response.status_code == 200 we can download the document; otherwise we can't. Then I just write the document content using response.content. Document is the document path including file_name.extension. Please ignore the other code; it only prepares the download directory.
import os
import shutil

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

# Scratch directory Chrome downloads into. It must be defined BEFORE the
# clean-up below, which reads it.
temp_down_path = os.getcwd() + '\\temp_files\\'

# Start from an empty scratch directory: wipe its contents if it already
# exists, otherwise create it.
if os.path.exists(temp_down_path):
    for name in os.listdir(temp_down_path):
        entry = temp_down_path + name
        if os.path.isdir(entry):
            shutil.rmtree(entry)
        elif os.path.isfile(entry):
            os.remove(entry)
else:
    os.makedirs(temp_down_path)

options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', {
    "download.default_directory": temp_down_path,  # Change default directory for downloads
    "download.prompt_for_download": False,  # To auto download the file
    "download.directory_upgrade": True,
    "plugins.always_open_pdf_externally": True  # It will not show PDF directly in chrome
})
driver = webdriver.Chrome(options=options, executable_path='chromedriver.exe')

search_url = 'your site url'
driver.get(search_url)
# Take the href of the first <a> element on the page and fetch it directly.
link = driver.find_element(By.TAG_NAME, "a").get_attribute('href')
response = requests.get(link)

# os.path.join inserts the separator that plain string concatenation omitted.
Document = os.path.join(os.getcwd(), 'file_name' + '.pdf')
if response.status_code == 200:
    # The context manager closes the file even if the write fails.
    with open(Document, 'wb') as out_file:
        out_file.write(response.content)
else:
    print('Document download problem')

How to Use Selenium Webdriver to download files via a list of URLs

I wrote code that uses Selenium WebDriver to download files from a list of URLs, but for some reason it doesn't download anything to my assigned directory. The code works perfectly fine when I download the files one by one, but when I use a for loop it doesn't work.
This is an example URL: https://www.regulations.gov/contentStreamer?documentId=WHD-2020-0007-1730&attachmentNumber=1&contentType=pdf
Here is my code:
download_dir = '/Users/datawizard/files/'
# A fresh headless Chrome session is configured and started for every link.
for web in down_link:
    try:
        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        options.add_experimental_option("prefs", {
            "download.default_directory": '/Users/clinton/GRA_2021/scraping_project/pdf/',
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            # "safebrowsing.enabled": True,
            "plugins.always_open_pdf_externally": True
        })
        driver = webdriver.Chrome(chrome_options=options)
        # Register a custom "send_command" endpoint and use it to issue
        # Page.setDownloadBehavior with download_dir as the target path.
        driver.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
        params = {'cmd': 'Page.setDownloadBehavior', 'params': {'behavior': 'allow', 'downloadPath': download_dir}}
        command_result = driver.execute("send_command", params)
        # NOTE(review): the loop variable is `web`; `url` is not defined in
        # this snippet. If it is undefined, the NameError is caught by the
        # bare `except` below and reported as a link problem.
        driver.get(url)
    except:
        print(str(web)+"Link cannot be open")
I am wondering did I do something wrong with the code since it doesn't give me any error when I ran the code above.

You don't need Selenium to download files, you can download files easily using the request library
import requests

for web in down_link:
    # File name = value of the first query parameter in the URL
    # (the documentId in the example link).
    fileName = YOUR_DOWNLOAD_PATH + web.split("=")[1].split("&")[0] + ".pdf"
    # Using the response as a context manager releases the connection.
    with requests.get(web, stream=True) as r:
        # Fail loudly on 4xx/5xx instead of saving an error page as a .pdf.
        r.raise_for_status()
        with open(fileName, 'wb') as f:
            # iter_content() defaults to 1-byte chunks; use a sensible size.
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
Updated Answer based on Selenium
#replace the below value with your urls list
import time

from selenium import webdriver

# Replace the value below with your own list of URLs.
down_link = [
    'https://www.regulations.gov/contentStreamer?documentId=WHD-2020-0007-1730&attachmentNumber=1&contentType=pdf',
    'https://www.regulations.gov/contentStreamer?documentId=WHD-2020-0007-1730&attachmentNumber=1&contentType=pdf']
download_dir = "/Users/datawizard/files/"

options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_experimental_option("prefs", {
    "download.default_directory": download_dir,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "plugins.always_open_pdf_externally": True
})
# One browser session is created up front and reused for every link.
driver = webdriver.Chrome(options=options)
try:
    for web in down_link:
        driver.get(web)
        # Wait for the download to end; a better approach is to check
        # whether the file exists.
        time.sleep(5)
finally:
    # Always shut the browser down, even if a navigation raises.
    driver.quit()
If your files don't have a unique file name - the above code will replace the existing file with the downloaded one.

How to use chrome webdriver in selenium to download files in python?

Based on the posts here and here I am trying to use a chrome webdriver in selenium to be able to download a file. Here is the code so far
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--disable-extensions")
# Each call below passes a Chrome *preference* name as the experimental-option
# name itself. The error quoted with this question reports
# "unrecognized chrome option: download.default_directory" for this setup.
chrome_options.add_experimental_option("profile.default_content_settings.popups", 0)
chrome_options.add_experimental_option("download.prompt_for_download", "false")
chrome_options.add_experimental_option("download.default_directory", "/tmp")
driver = webdriver.Chrome(chrome_options=chrome_options)
But this alone results in the following error:
WebDriverException: Message: unknown error: cannot parse capability: chromeOptions
from unknown error: unrecognized chrome option: download.default_directory
(Driver info: chromedriver=2.24.417424 (c5c5ea873213ee72e3d0929b47482681555340c3),platform=Linux 4.10.0-37-generic x86_64)
So how to fix this? Do I have to use this 'capability' thing? If so, how exactly?

Try this. Executed on windows
(How to control the download of files with Selenium Python bindings in Chrome)
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Download-related Chrome preferences, handed over under the single
# "prefs" experimental option.
download_prefs = {
    "download.default_directory": r"C:\Users\xxx\downloads\Test",
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True,
}

options = Options()
options.add_experimental_option("prefs", download_prefs)

I think that the easiest way to save arbitrary file (i.e. image) using WebDriver is to execute JavaScript which will save file. No configuration required at all!
I use this library FileSaver.js to save a file with desired name with ease.
from selenium import webdriver
import requests

FILE_SAVER_MIN_JS_URL = "https://raw.githubusercontent.com/eligrey/FileSaver.js/master/dist/FileSaver.min.js"
# execute_script() takes the script as text, so use .text (str) rather than
# .content (bytes).
file_saver_min_js = requests.get(FILE_SAVER_MIN_JS_URL).text

chrome_options = webdriver.ChromeOptions()
driver = webdriver.Chrome('/usr/local/bin/chromedriver', options=chrome_options)

# Execute FileSaver.js in page's context
driver.execute_script(file_saver_min_js)

# Now you can use saveAs() function.
# Braces are doubled because this is an f-string; the script sent to the
# browser contains single braces.
download_script = f'''
return fetch('https://cdn.sstatic.net/Sites/stackoverflow/company/img/logos/so/so-logo.svg?v=a010291124bf',
{{
"credentials": "same-origin",
"headers": {{"accept":"image/webp,image/apng,image/*,*/*;q=0.8","accept-language":"en-US,en;q=0.9"}},
"referrerPolicy": "no-referrer-when-downgrade",
"body": null,
"method": "GET",
"mode": "cors"
}}
).then(resp => {{
return resp.blob();
}}).then(blob => {{
saveAs(blob, 'stackoverflow_logo.svg');
}});
'''
driver.execute_script(download_script)
# Done! Your browser has saved an SVG image!

Some tips:
chromium and chromedriver should have same version.
Typically chromium package should have chromedriver inside, you can find it in the install dir. If you are using ubuntu/debian, execute dpkg -L chromium-chromedriver.
Have a correct Chrome preference config.
As Satish said, use options.add_experimental_option("prefs", ...) to configure selenium+chrome. But the preference names may change over time. The best way to get the newest, working prefs is to look them up in the chromium config dir.
For example,
Launch a chromium in Xorg desktop
Change settings in menu
Quit chromium
Find out the real settings in ~/.config/chromium/Default/Preferences
Read it, pick out the exact options you need.
In my case, the code is:
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

options = webdriver.ChromeOptions()
# Assigning `options.gpu = False` only sets an unused Python attribute;
# GPU use is turned off with the command-line switch instead.
options.add_argument("--disable-gpu")
options.headless = True
options.add_experimental_option("prefs", {
    "download.default_directory": "/data/books/chrome/",
    'profile.default_content_setting_values.automatic_downloads': 2,
})
# Convert the options to a capabilities dict so logging prefs can be added.
desired = options.to_capabilities()
desired['loggingPrefs'] = {'performance': 'ALL'}
driver = webdriver.Chrome(desired_capabilities=desired)

For Chrome on macOS, download.default_directory did not work for me; fortunately, savefile.default_directory works.
# Chrome preferences for saving printed pages; `settings` and `json` come
# from the surrounding script (not shown in this fragment).
prefs = {
"printing.print_preview_sticky_settings.appState": json.dumps(settings),
# Directory used here instead of "download.default_directory".
"savefile.default_directory": "/Users/creative/python-apps",
"download.prompt_for_download": False,
"download.directory_upgrade": True,
# NOTE(review): other snippets spell this key "safebrowsing.enabled"; confirm the name.
"download.safebrowsing.enabled": True
}

One of the reasons you can't set "download.default_directory" may be that you have a system variable XDG_DOWNLOAD_DIR in file ~/.config/user-dirs.dirs
You can remove the variable from that file, or you can set it to whatever you like before running your program.
I was looking for a solution for two days...
My SW set:
Ubuntu bionic, 18.04.5 LTS
chromedriver.86.0.4240.22.lin64
Python 3.9
selenium 3.141.0
splinter 0.14.0

From your exception, you are using chromedriver=2.24.417424.
What are the versions of Selenium and Chrome browser that are you using?
I tried the following code with:
Selenium 3.6.0
chromedriver 2.33
Google Chrome 62.0.3202.62 (Official Build) (64-bit)
And it works:
from selenium import webdriver

download_dir = "/pathToDownloadDir"
chrome_options = webdriver.ChromeOptions()
# Download preferences live under the "download." prefix, so the upgrade flag
# is "download.directory_upgrade" (a bare "directory_upgrade" is not that key).
preferences = {
    "download.default_directory": download_dir,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True,
}
chrome_options.add_experimental_option("prefs", preferences)
driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=r'/pathTo/chromedriver')
driver.get("urlFileToDownload")
Make sure you are using a browser that is supported by your chromedriver (from here, should be Chrome v52-54).

Can't you use requests lib?
If so, here's an example:
import os
import re

import requests

urls = ['...']

for url in urls:
    # verify=False ==> for HTTPS requests without SSL certificates.
    # CAUTION: this disables TLS certificate validation; use only for hosts you trust.
    r = requests.get(url, allow_redirects=True, verify=False)
    # The header may be missing; fall back to '' so re.findall() gets a string.
    cd = r.headers.get('content-disposition') or ''
    fa = re.findall('filename=(.+)', cd)
    if not fa:
        # No file name advertised by the server: report the URL and skip it.
        print(f'Error message: {url}')
        continue
    # Servers commonly quote the value (filename="x.pdf"); drop the quotes.
    filename = fa[0].strip('"')
    # The context manager closes the file even if the write fails.
    with open(os.path.join('desired_path', filename), 'wb') as f:
        f.write(r.content)

Downloading a file at a specified location through python and selenium using Chrome driver

I am trying to automatically download some links through selenium's click functionality and I am using a chrome webdriver and python as the programming language. How can I select the download directory through the python program so that it does not get downloaded in the default Downloads directory. I found a solution for firefox but there the download dialog keeps popping up every time it clicks on the link which does not happen in Chrome.

I found the accepted solution didn't work, however this slight change did:
from selenium import webdriver

# The download directory is a Chrome preference, so it is supplied through
# the "prefs" experimental option.
prefs = dict([('download.default_directory', '/path/to/dir')])
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option('prefs', prefs)
driver = webdriver.Chrome(chrome_options=chrome_options)

Update 2018:
It's not a valid Chrome command-line switch (see the source code); use hoju's answer below to set it through the preferences instead.
Original:
You can create a profile for chrome and define the download location for the tests. Here is an example:
from selenium import webdriver
options = webdriver.ChromeOptions()
# "download.default_directory" is a preference name, passed here as a
# command-line argument; the accompanying 2018 update notes that it is not a
# valid Chrome switch.
options.add_argument("download.default_directory=C:/Downloads")
driver = webdriver.Chrome(chrome_options=options)

the exact problem I also have faced while trying to do exactly same what you want to :)
For chrome:
from selenium import webdriver

# Location of chromedriver.exe on your machine -- replace with the real path.
CHROMEDRIVER_PATH = r"C:\path\to\chromedriver.exe"

options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
prefs = {
    "profile.default_content_settings.popups": 0,
    # IMPORTANT: keep the trailing backslash. A raw string cannot end in a
    # single backslash, so the path is written with escaped backslashes.
    "download.default_directory": "C:\\Users\\user_dir\\Desktop\\",
    "download.directory_upgrade": True,
}
options.add_experimental_option("prefs", prefs)
browser = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)
For Firefox:
follow this blog for the answer:
https://srirajeshsahoo.wordpress.com/2018/07/26/how-to-bypass-pop-up-during-download-in-firefox-using-selenium-and-python/
The blog says all about the pop-up and downloads dir and how to do

Using prefs solved my problem
import os

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Download into the directory that contains this script.
path = os.path.dirname(os.path.abspath(__file__))
prefs = {"download.default_directory": path}
options = Options()
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome('../harveston/chromedriver.exe', options=options)

This worked for me on Chrome v81.0.4044.138
import os

from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
preferences = {
    "profile.default_content_settings.popups": 0,
    # Current working directory, with a trailing path separator.
    "download.default_directory": os.getcwd() + os.path.sep,
    # Download preferences live under the "download." prefix.
    "download.directory_upgrade": True,
}
chrome_options.add_experimental_option('prefs', preferences)
browser = webdriver.Chrome(executable_path="/usr/bin/chromedriver", options=chrome_options)

I see that many people have the same problem, just add the backslash at the end
# Download directory written with a trailing backslash.
prefs = dict([('download.default_directory', 'C:\\Users\\SAJComputer\\PycharmProjects\\robot-test\\')])
op = webdriver.ChromeOptions()
op.add_experimental_option('prefs', prefs)
driver = webdriver.Chrome(options=op, executable_path=driver_path)

Update 2022:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
# Raw string: "\Y" and "\F" are invalid escape sequences in a normal literal.
prefs = {"download.default_directory": r"C:\YourDirectory\Folder"}
options.add_experimental_option("prefs", prefs)
# webdriver_manager downloads a matching chromedriver and returns its path.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

To provide the download directory and Chrome's driver executable path, use the following code.
from selenium import webdriver

options = webdriver.ChromeOptions()
# download.default_directory is a profile preference, not a command-line
# switch, so it is supplied through the "prefs" experimental option.
options.add_experimental_option("prefs", {"download.default_directory": r"C:\Your_Directory"})
driver = webdriver.Chrome(options=options, executable_path='C:/chromedriver')
change the path in your code accordingly.

If you are using linux distribution
Use this code
# Preferences that disable the download prompt and safe-browsing checks.
# NOTE(review): `options` and `webdriver` are not defined in this fragment.
prefs = {'download.prompt_for_download': False,
'download.directory_upgrade': True,
'safebrowsing.enabled': False,
'safebrowsing.disable_download_protection': True}
options.add_argument('--headless')
options.add_experimental_option('prefs', prefs)
# NOTE(review): 'chromedriver.exe' and the C:\ download path below are
# Windows-style although the surrounding text addresses Linux -- adjust both.
driver = webdriver.Chrome('chromedriver.exe', chrome_options=options)
# Register a custom "send_command" endpoint on the command executor.
driver.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
# NOTE(review): the purpose of overriding browserName is not evident from this snippet.
driver.desired_capabilities['browserName'] = 'ur mum'
# Issue Page.setDownloadBehavior with behavior 'allow' and the target path.
params = {'cmd': 'Page.setDownloadBehavior', 'params': {'behavior': 'allow', 'downloadPath': r'C:\chickenbutt'}}
driver.execute("send_command", params)

Below code snippet holds good for Windows/linux/MacOs distro:
import os
import time
from pathlib import Path

from selenium import webdriver

# <cwd>/downloads/ built with the platform's own separator.
downloadDir = os.path.join(os.getcwd(), "downloads") + os.sep
# Make sure path exists.
Path(downloadDir).mkdir(parents=True, exist_ok=True)
# Set Preferences.
preferences = {
    "download.default_directory": downloadDir,
    "download.prompt_for_download": False,
    # Download preferences live under the "download." prefix.
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True,
}
chromeOptions = webdriver.ChromeOptions()
# --window-size takes "width,height" (comma-separated).
chromeOptions.add_argument("--window-size=1480,560")
chromeOptions.add_experimental_option("prefs", preferences)
# DRIVER_PATH and url are supplied by the surrounding script.
driver = webdriver.Chrome(DRIVER_PATH, options=chromeOptions)
try:
    driver.get(url)
    # Fixed pause to give the download time to finish.
    time.sleep(10)
finally:
    # quit() ends the whole session and the chromedriver process;
    # close() would only close the current window.
    driver.quit()

This is non code level solution with no chrome profiling/options settings.
If you are using script only on your local machine then use this solution
Click on Menu -> Setting -> Show advanced settings... -> Downloads
Now uncheck
Ask where to save each file before downloading
Hope it will help you :)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

PDF Download with Selenium for Chrome in Python - python

you can download file with requests lib or with curl/wget/etc instead of driver.get(url)

I overlooked that the code still contained fragments of my attempts with Firefox: browser = webdriver.Chrome(options=options) and driver = webdriver.Chrome() are obviously wrong. I changed it to driver = webdriver.Chrome(options=options) and everything works. Excuse my carelessness.

Related

Printing a PDF with Selenium Chrome Driver in headless mode

Python/Selenium: save to pdf not saving

How to Use Selenium Webdriver to download files via a list of URLs

How to use chrome webdriver in selenium to download files in python?

Downloading a file at a specified location through python and selenium using Chrome driver

Categories

Resources