Printing a PDF with Selenium Chrome Driver in headless mode - python

I have no problems printing without headless mode, however once I enable headless mode, it just refuses to print a PDF. I'm currently working on an app with a GUI, so I'd rather not have the Selenium webdriver visible to the end user if possible.
For this project I'm using an older version of Selenium, 4.2.0. That coupled with Python 3.9.
import os
from os.path import exists
import json
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver import Chrome, ChromeOptions
# Paths
dir_path = os.getcwd()
download_path = os.path.join(dir_path, "letters")
chrome_path = os.path.join(dir_path, "chromium\\app\\Chrome-bin\\chrome.exe")
user_data_path = os.path.join(dir_path, "sessions")
website = "https://www.google.com/"
def main():
print_settings = {
"recentDestinations": [{
"id": "Save as PDF",
"origin": "local",
"account": "",
}],
"selectedDestinationId": "Save as PDF",
"version": 2,
"isHeaderFooterEnabled": False,
"isLandscapeEnabled": True
}
options = ChromeOptions()
options.binary_location = chrome_path
options.add_argument("--start-maximized")
options.add_argument('--window-size=1920,1080')
options.add_argument(f"user-data-dir={user_data_path}")
options.add_argument("--headless")
options.add_argument('--enable-print-browser')
options.add_experimental_option("prefs", {
"printing.print_preview_sticky_settings.appState": json.dumps(print_settings),
"savefile.default_directory": download_path, # Change default directory for downloads
"download.default_directory": download_path, # Change default directory for downloads
"download.prompt_for_download": False, # To auto download the file
"download.directory_upgrade": True,
"profile.default_content_setting_values.automatic_downloads": 1,
"safebrowsing.enabled": True
})
options.add_argument("--kiosk-printing")
driver = Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get(website)
driver.execute_script("window.print();")
if exists(os.path.join(user_data_path, "Google.pdf")):
print("YAY!")
else:
print(":(")
if __name__ == '__main__':
main()

For anyone else coming across this with a similar issue, I fixed it by using the print method described here: Selenium print PDF in A4 format
Using my example from above, I replaced:
driver.execute_script("window.print();")
with:
import base64  # needed to decode the PDF payload returned by the DevTools call

# Page.printToPDF takes its own DevTools parameter names rather than the keys
# of the print-preview "appState" dict, so the two layout options from that
# dict are spelled out here in DevTools form (landscape on, header/footer off).
pdf_data = driver.execute_cdp_cmd("Page.printToPDF", {
    "landscape": True,
    "displayHeaderFooter": False,
})
# The response carries the document as a base64 string under "data".
with open('Google.pdf', 'wb') as file:
    file.write(base64.b64decode(pdf_data['data']))
And that worked just fine for me.

Related

Python/Selenium: save to pdf not saving

I am using selenium/python to save a series of webpages to pdf. The webpages have a table that is rendered with javascript; I am using "find_element_by_xpath" to identify that the pdf icon in the js table appeared before proceeding with the print. Optimally, I did not want to implement a set a hard wait/sleep time as I have thousands of pages to save.
The code seems to work but no pdf is saved.
The code is as follows:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import json
options = webdriver.ChromeOptions()
settings = {
"recentDestinations": [{
"id": "Save as PDF",
"origin": "local",
"account": "",
}],
"selectedDestinationId": "Save as PDF",
"version": 2
}
prefs = {'printing.print_preview_sticky_settings.appState': json.dumps(settings)}
options.add_experimental_option('prefs', prefs)
options.add_experimental_option('excludeSwitches', ['enable-logging'])
options.add_argument('--kiosk-printing')
CHROMEDRIVER_PATH = 'chromedriver.exe'
driver = webdriver.Chrome(options=options, executable_path=CHROMEDRIVER_PATH)
driver = webdriver.Chrome(options=options, executable_path=CHROMEDRIVER_PATH)
try:
    # XPath attribute tests are written with "@"; "#class" is not valid XPath
    # and would be rejected before any element lookup happens.
    element = driver.find_element_by_xpath("//div[@class='fas fa-file-pdf']")
    WebDriverWait(driver, 10).until(EC.staleness_of(element))
except NoSuchElementException:
    element = None
print(element)
driver.get("url")
driver.execute_script('window.print();')
#driver.quit()
This is working
First, I get the document link using driver.find_element(). Then I pass that URL to requests.get(). If response.status_code == 200, the document can be downloaded; otherwise it cannot. Then I write the document content using response.content. Document is the document path, including file_name.extension. Please ignore the other code; it is only there to build the document path.
import os
import shutil

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

# Scratch directory Chrome downloads into. It must be defined before the
# clean-up below, which reads it.
temp_down_path = os.path.join(os.getcwd(), 'temp_files')

if os.path.exists(temp_down_path):
    # Remove whatever is already in the directory (sub-directories and files).
    for entry in os.listdir(temp_down_path):
        entry_path = os.path.join(temp_down_path, entry)
        if os.path.isdir(entry_path):
            shutil.rmtree(entry_path)
        elif os.path.isfile(entry_path):
            os.remove(entry_path)
else:
    os.makedirs(temp_down_path)

options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', {
    "download.default_directory": temp_down_path,  # Change default directory for downloads
    "download.prompt_for_download": False,  # To auto download the file
    "download.directory_upgrade": True,
    "plugins.always_open_pdf_externally": True  # It will not show PDF directly in chrome
})
driver = webdriver.Chrome(options=options, executable_path='chromedriver.exe')
search_url = 'your site url'
driver.get(search_url)

# Take the href of the first <a> element on the page and fetch it directly.
link = driver.find_element(By.TAG_NAME, "a").get_attribute('href')
response = requests.get(link)

# Join with the working directory so the file lands inside it; plain string
# concatenation would glue the file name onto the directory name.
Document = os.path.join(os.getcwd(), 'file_name' + '.pdf')
if response.status_code == 200:
    # The context manager closes the file even if the write fails.
    with open(Document, 'wb') as pdf_file:
        pdf_file.write(response.content)
else:
    print('Document download problem')

Page size specification does not work when saving HTML file as PDF in Chrome using python

I wanted to open an HTML file on Chrome and save it as a PDF,
I wrote the following code using python, but even I set the "pageSize" in appState in the code to A3, A5, letter size or other size,
the page size of the output PDF file will always be A4.
If I change the values of the other properties "isLandscapeEnabled" or "isHeaderFooterEnabled" in appState, it works well.
So it seems that only the "pageSize" property does not work.
And, I found when Control Panel-> Region-> Format (F) is set to Japan, the page size of the output PDF file will always be A4,
and if I set it to the United States, it will always be letter size.
So, I think the "pageSize" is always be set by the default value.
I'm a python beginner and I don't really understand it yet.
Is there something wrong with my code?
Please give me any advice.
Python 3.9.6
selenium 3.141.0
Chrome 94.0.4606.71
ChromeDriver 94.0.4606.61
windows10
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import time
# Build the ChromeOptions used for printing: the print-preview settings in
# appState are JSON-encoded into the
# 'printing.print_preview_sticky_settings.appState' pref, a download directory
# pref is set, and the --kiosk-printing switch is added.
# Returns the configured ChromeOptions object.
def PrintSetUp():
chopt=webdriver.ChromeOptions()
# Print-preview state handed to Chrome; "Save as PDF" is both the only recent
# destination and the selected one.
appState = {
"recentDestinations": [
{
"id": "Save as PDF",
"origin": "local",
"account":""
}
],
"selectedDestinationId": "Save as PDF",
"version": 2,
"isLandscapeEnabled": False,
"pageSize": 'A3',
"marginsType": 0,
"scalingType": 3 ,
"scaling": "100" ,
"isHeaderFooterEnabled": False, #Header and footer
"isCssBackgroundEnabled": True, #Background graphics
}
# NOTE(review): "~/Downloads" is passed as a literal string; presumably "~" is
# not expanded by Chrome - confirm.
prefs = {'printing.print_preview_sticky_settings.appState':
json.dumps(appState),
"download.default_directory": "~/Downloads"
}
chopt.add_experimental_option('prefs', prefs)
chopt.add_argument('--kiosk-printing')
return chopt
# Open BlogURL in Chrome using the options from PrintSetUp(), call
# window.print() in the page, sleep for 10 seconds and then quit the browser.
def main_WebToPDF(BlogURL):
chopt = PrintSetUp()
driver_path = "C:/Work/pythonTest/Ver94/chromedriver_win32/chromedriver.exe" #Path to the webdriver
driver = webdriver.Chrome(executable_path=driver_path, options=chopt)
driver.implicitly_wait(10) # seconds; implicit wait
driver.get(BlogURL) #Load the blog URL
# NOTE(review): the expected-condition class is passed without being called
# with a locator - confirm this really waits as the trailing comment says.
WebDriverWait(driver, 15).until(EC.presence_of_all_elements_located) # Wait until all elements on the page have loaded (times out after 15 seconds)
driver.execute_script('return window.print()') #Print as PDF
time.sleep(10) #Wait 10 seconds for the file download
driver.quit() #Close Screen
if __name__ == '__main__':
BlogURLList=["file://C:/Work/pythonTest/1_pdf.html",
"file://C:/Work/pythonTest/2_pdf.html",
"file://C:/Work/pythonTest/3_pdf.html"]
for BlogURL in BlogURLList:
main_WebToPDF(BlogURL)

PDF Download with Selenium for Chrome in Python

I wrote a script for a mass download of PDF files in a loop. The Firefox-based Selenium WebDriver got stuck after the first download, so I decided to try Chrome.
With Chrome the loop works now, but I cannot download the PDF files. Chrome just shows them with pdf_viewer.js in the browser window.
I tried different options like "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}], "download.extensions_to_open": "applications/pdf" and "plugins.always_open_pdf_externally": True. Nothing works.
#!/usr/bin/python
downPath = "/home/user/xxx/"
Y = ["2017", "2018", "2019"]
def main():
options = webdriver.ChromeOptions()
profile = {
"plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
"download.default_directory": downPath ,
"download.extensions_to_open": "applications/pdf",
"plugins.always_open_pdf_externally": True,
"download.prompt_for_download": False,
"safebrowsing.enabled": True
}
options.add_experimental_option("prefs", profile)
browser = webdriver.Chrome(options=options)
driver = webdriver.Chrome()
driver.get('login-url')
#LOGIN-Stuff ...
time.sleep(4)
for y in Y:
print(y)
for x in range(53):
url = "https://j.world/files/{0}/filename_{0}-{1:02d}.pdf".format(y, x)
print(url)
driver.get(url)
time.sleep(4)
driver.quit()
if __name__ == "__main__":
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import sys
import os
main()
I use Python 3.7.4 with python-selenium 3.141.0-1 from the Arch-Linux Repository.
you can download file with requests lib or with curl/wget/etc instead of
driver.get(url)
I overlooked that the code still contained fragments of my attempts with Firefox.
browser = webdriver.Chrome(options=options)
driver = webdriver.Chrome()
are obviously wrong
I wrote driver = webdriver.Chrome(options=options) instead, and everything works.
excuse my carelessness

how to save opened page as pdf in Selenium (Python)

Have tried all the solutions I could find on the Internet to be able to print a page that is open in Selenium in Python. However, while the print pop-up shows up, after a second or two it goes away, with no PDF saved.
Here is the code being tried. Based on the code here - https://stackoverflow.com/a/43752129/3973491
Coding on a Mac with Mojave 10.14.5.
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException
import time
import json
options = Options()
appState = {
"recentDestinations": [
{
"id": "Save as PDF",
"origin": "local"
}
],
"selectedDestinationId": "Save as PDF",
"version": 2
}
profile = {'printing.print_preview_sticky_settings.appState': json.dumps(appState)}
# profile = {'printing.print_preview_sticky_settings.appState':json.dumps(appState),'savefile.default_directory':downloadPath}
options.add_experimental_option('prefs', profile)
options.add_argument('--kiosk-printing')
CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'
driver = webdriver.Chrome(options=options, executable_path=CHROMEDRIVER_PATH)
driver.implicitly_wait(5)
driver.get(url)
driver.execute_script('window.print();')
$chromedriver --v
ChromeDriver 75.0.3770.90 (a6dcaf7e3ec6f70a194cc25e8149475c6590e025-refs/branch-heads/3770@{#1003})
Any hints or solutions as to what can be done to print the open html page to a PDF. Have spent hours trying to make this work. Thank you!
Update on 2019-07-11:
My question has been identified as a duplicate, but a) the other question seems to be using javascript code, and b) the answer does not solve the problem being raised in this question - it may be to do with more recent software versions. Chrome version being used is Version 75.0.3770.100 (Official Build) (64-bit), and chromedriver is ChromeDriver 75.0.3770.90. On Mac OS Mojave. Script is running on Python 3.7.3.
Update on 2019-07-11:
Changed the code to
from selenium import webdriver
import json
chrome_options = webdriver.ChromeOptions()
settings = {
"appState": {
"recentDestinations": [{
"id": "Save as PDF",
"origin": "local",
"account": "",
}],
"selectedDestinationId": "Save as PDF",
"version": 2
}
}
prefs = {'printing.print_preview_sticky_settings': json.dumps(settings)}
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--kiosk-printing')
CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'
driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=CHROMEDRIVER_PATH)
driver.get("https://google.com")
driver.execute_script('window.print();')
driver.quit()
And now, nothing happens. Chrome launches, loads url, print dialog appears but then nothing seems to happen - nothing in the default printer queue, and no pdf either - I even searched for the PDF files by looking up "Recent Files" on Mac.
The answer here, worked when I did not have any other printer setup in my OS. But when I had another default printer, this did not work.
I don't understand how, but making small change this way seems to work.
from selenium import webdriver
import json
chrome_options = webdriver.ChromeOptions()
settings = {
"recentDestinations": [{
"id": "Save as PDF",
"origin": "local",
"account": "",
}],
"selectedDestinationId": "Save as PDF",
"version": 2
}
prefs = {'printing.print_preview_sticky_settings.appState': json.dumps(settings)}
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--kiosk-printing')
CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'
driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=CHROMEDRIVER_PATH)
driver.get("https://google.com")
driver.execute_script('window.print();')
driver.quit()
You can use the following code to print PDFs in A5 size with background css enabled:
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import json
import time
chrome_options = webdriver.ChromeOptions()
settings = {
"recentDestinations": [{
"id": "Save as PDF",
"origin": "local",
"account": ""
}],
"selectedDestinationId": "Save as PDF",
"version": 2,
"isHeaderFooterEnabled": False,
"mediaSize": {
"height_microns": 210000,
"name": "ISO_A5",
"width_microns": 148000,
"custom_display_name": "A5"
},
"customMargins": {},
"marginsType": 2,
"scaling": 175,
"scalingType": 3,
"scalingTypePdf": 3,
"isCssBackgroundEnabled": True
}
mobile_emulation = { "deviceName": "Nexus 5" }
chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
chrome_options.add_argument('--enable-print-browser')
#chrome_options.add_argument('--headless')
prefs = {
'printing.print_preview_sticky_settings.appState': json.dumps(settings),
'savefile.default_directory': '<path>'
}
chrome_options.add_argument('--kiosk-printing')
chrome_options.add_experimental_option('prefs', prefs)
for dirpath, dirnames, filenames in os.walk('<source path>'):
for fileName in filenames:
print(fileName)
driver = webdriver.Chrome("./chromedriver", options=chrome_options)
driver.get(f'file://{os.path.join(dirpath, fileName)}')
time.sleep(7)
driver.execute_script('window.print();')
driver.close()
The solution is not very good, but you can take a screenshot and convert to pdf by Pillow...
from io import BytesIO

from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome(executable_path='path to your driver')
try:
    driver.get('your url here')
    # Screenshot of the <body> element, returned as PNG bytes.
    png_bytes = driver.find_element(By.TAG_NAME, 'body').screenshot_as_png
finally:
    # Always shut the browser down, even if loading or the screenshot fails.
    driver.quit()

# A PNG screenshot may carry an alpha channel, which some Pillow releases
# refuse to write as PDF; flatten to RGB before saving.
img = Image.open(BytesIO(png_bytes)).convert('RGB')
img.save('filename.pdf', "PDF", quality=100)
Here is the solution I use with Windows :
First download the ChromeDriver here : http://chromedriver.chromium.org/downloads and install Selenium
Then run this code (based on the accepted answer, slightly modified to work on Windows):
import json
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
settings = {"recentDestinations": [{"id": "Save as PDF", "origin": "local", "account": ""}], "selectedDestinationId": "Save as PDF", "version": 2}
prefs = {'printing.print_preview_sticky_settings.appState': json.dumps(settings)}
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--kiosk-printing')
browser = webdriver.Chrome(r"chromedriver.exe", options=chrome_options)
browser.get("https://google.com/")
browser.execute_script('window.print();')
browser.close()
I would suggest Downloading the page source html which can be done like so
in vb.net:
Dim Html As String = webdriver.PageSource
Not sure how it is done in python but I'm sure it's very similar
Once you have done that then you can select the parts of the page you want to save using an html parser or by parsing it manually with string parsing code. Once you have the html for the part you want to save stored in a string then use an html to pdf converter library or program. There are lots of these for programming languages like C# and vb.net. I don't know about any for python but I'm sure some exist. Just do some research. (some are free and some are expensive)

How to use chrome webdriver in selenium to download files in python?

Based on the posts here and here I am trying to use a chrome webdriver in selenium to be able to download a file. Here is the code so far
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--disable-extensions")
chrome_options.add_experimental_option("profile.default_content_settings.popups", 0)
chrome_options.add_experimental_option("download.prompt_for_download", "false")
chrome_options.add_experimental_option("download.default_directory", "/tmp")
driver = webdriver.Chrome(chrome_options=chrome_options)
But this alone results in the following error:
WebDriverException: Message: unknown error: cannot parse capability: chromeOptions
from unknown error: unrecognized chrome option: download.default_directory
(Driver info: chromedriver=2.24.417424 (c5c5ea873213ee72e3d0929b47482681555340c3),platform=Linux 4.10.0-37-generic x86_64)
So how to fix this? Do I have to use this 'capability' thing? If so, how exactly?
Try this. Executed on windows
(How to control the download of files with Selenium Python bindings in Chrome)
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_experimental_option("prefs", {
"download.default_directory": r"C:\Users\xxx\downloads\Test",
"download.prompt_for_download": False,
"download.directory_upgrade": True,
"safebrowsing.enabled": True
})
I think that the easiest way to save arbitrary file (i.e. image) using WebDriver is to execute JavaScript which will save file. No configuration required at all!
I use this library FileSaver.js to save a file with desired name with ease.
from selenium import webdriver
import requests
FILE_SAVER_MIN_JS_URL = "https://raw.githubusercontent.com/eligrey/FileSaver.js/master/dist/FileSaver.min.js"
# execute_script() needs the script as text; .content would hand it raw bytes,
# so take the decoded body instead.
file_saver_min_js = requests.get(FILE_SAVER_MIN_JS_URL).text
chrome_options = webdriver.ChromeOptions()
driver = webdriver.Chrome('/usr/local/bin/chromedriver', options=chrome_options)
# Execute FileSaver.js in page's context
driver.execute_script(file_saver_min_js)
# Now you can use saveAs() function
# The braces are doubled because this is an f-string; the browser receives
# single braces.
download_script = f'''
return fetch('https://cdn.sstatic.net/Sites/stackoverflow/company/img/logos/so/so-logo.svg?v=a010291124bf',
{{
"credentials": "same-origin",
"headers": {{"accept":"image/webp,image/apng,image/*,*/*;q=0.8","accept-language":"en-US,en;q=0.9"}},
"referrerPolicy": "no-referrer-when-downgrade",
"body": null,
"method": "GET",
"mode": "cors"
}}
).then(resp => {{
return resp.blob();
}}).then(blob => {{
saveAs(blob, 'stackoverflow_logo.svg');
}});
'''
driver.execute_script(download_script)
# Done! Your browser has saved an SVG image!
Some tips:
chromium and chromedriver should have same version.
Typically chromium package should have chromedriver inside, you can find it in the install dir. If you are using ubuntu/debian, execute dpkg -L chromium-chromedriver.
Have a correct Chrome preference config.
As Satish said, use options.add_experimental_option("prefs", ...) to configure selenium+chrome. But the available settings may change over time. The best way to get the newest working prefs is to check them in the chromium config dir.
For example,
Launch a chromium in Xorg desktop
Change settings in menu
Quit chromium
Find out the real settings in ~/.config/chromium/Default/Preferences
Read it, pick out the exact options you need.
In my case, the code is:
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
options = webdriver.ChromeOptions()
options.gpu = False
options.headless = True
options.add_experimental_option("prefs", {
"download.default_directory" : "/data/books/chrome/",
'profile.default_content_setting_values.automatic_downloads': 2,
})
desired = options.to_capabilities()
desired['loggingPrefs'] = { 'performance': 'ALL'}
driver = webdriver.Chrome(desired_capabilities=desired)
For Chrome on macOS, download.default_directory did not work for me; fortunately, savefile.default_directory works.
prefs = {
"printing.print_preview_sticky_settings.appState": json.dumps(settings),
"savefile.default_directory": "/Users/creative/python-apps",
"download.prompt_for_download": False,
"download.directory_upgrade": True,
"download.safebrowsing.enabled": True
}
One of the reasons you can't set "download.default_directory" may be that you have a system variable XDG_DOWNLOAD_DIR in file ~/.config/user-dirs.dirs
You can remove the variable from that file, or you can set it to whatever you like before running your program.
I was looking for a solution for two days...
My SW set:
Ubuntu bionic, 18.04.5 LTS
chromedriver.86.0.4240.22.lin64
Python 3.9
selenium 3.141.0
splinter 0.14.0
From your exception, you are using chromedriver=2.24.417424.
What versions of Selenium and the Chrome browser are you using?
I tried the following code with:
Selenium 3.6.0
chromedriver 2.33
Google Chrome 62.0.3202.62 (Official Build) (64-bit)
And it works:
from selenium import webdriver
download_dir = "/pathToDownloadDir"
chrome_options = webdriver.ChromeOptions()
preferences = {"download.default_directory": download_dir ,
"directory_upgrade": True,
"safebrowsing.enabled": True }
chrome_options.add_experimental_option("prefs", preferences)
driver = webdriver.Chrome(chrome_options=chrome_options,executable_path=r'/pathTo/chromedriver')
driver.get("urlFileToDownload");
Make sure you are using a browser that is supported by your chromedriver (from here, should be Chrome v52-54).
Can't you use requests lib?
If so, here's an example:
import os
import re

import requests

urls = [ '...' ]
for url in urls:
    # verify = False ==> for HTTPS requests without SSL certificates
    r = requests.get( url, allow_redirects = True, verify = False )
    # The header may be absent; default to '' so the regex search below
    # reports "no filename" instead of raising on None.
    cd = r.headers.get( 'content-disposition', '' )
    fa = re.findall( 'filename=(.+)', cd )
    if not fa:
        print( f'Error message: {url}' )
        continue
    # The value comes from the server: drop surrounding quotes and any
    # directory part so the file is always written inside 'desired_path'.
    filename = os.path.basename( fa[ 0 ].strip().strip( '"' ) )
    # The context manager closes the file even if the write fails.
    with open( os.path.join( 'desired_path', filename ), 'wb' ) as f:
        f.write( r.content )

Categories

Resources