how to save opened page as pdf in Selenium (Python) - python

Have tried all the solutions I could find on the Internet to be able to print a page that is open in Selenium in Python. However, while the print pop-up shows up, after a second or two it goes away, with no PDF saved.
Here is the code being tried. Based on the code here - https://stackoverflow.com/a/43752129/3973491
Coding on a Mac with Mojave 10.14.5.
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException
import time
import json
options = Options()
appState = {
"recentDestinations": [
{
"id": "Save as PDF",
"origin": "local"
}
],
"selectedDestinationId": "Save as PDF",
"version": 2
}
profile = {'printing.print_preview_sticky_settings.appState': json.dumps(appState)}
# profile = {'printing.print_preview_sticky_settings.appState':json.dumps(appState),'savefile.default_directory':downloadPath}
options.add_experimental_option('prefs', profile)
options.add_argument('--kiosk-printing')
CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'
driver = webdriver.Chrome(options=options, executable_path=CHROMEDRIVER_PATH)
driver.implicitly_wait(5)
driver.get(url)
driver.execute_script('window.print();')
$chromedriver --v
ChromeDriver 75.0.3770.90 (a6dcaf7e3ec6f70a194cc25e8149475c6590e025-refs/branch-heads/3770#{#1003})
Any hints or solutions as to what can be done to print the open html page to a PDF. Have spent hours trying to make this work. Thank you!
Update on 2019-07-11:
My question has been identified as a duplicate, but a) the other question seems to be using javascript code, and b) the answer does not solve the problem being raised in this question - it may be to do with more recent software versions. Chrome version being used is Version 75.0.3770.100 (Official Build) (64-bit), and chromedriver is ChromeDriver 75.0.3770.90. On Mac OS Mojave. Script is running on Python 3.7.3.
Update on 2019-07-11:
Changed the code to
from selenium import webdriver
import json
chrome_options = webdriver.ChromeOptions()
settings = {
"appState": {
"recentDestinations": [{
"id": "Save as PDF",
"origin": "local",
"account": "",
}],
"selectedDestinationId": "Save as PDF",
"version": 2
}
}
prefs = {'printing.print_preview_sticky_settings': json.dumps(settings)}
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--kiosk-printing')
CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'
driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=CHROMEDRIVER_PATH)
driver.get("https://google.com")
driver.execute_script('window.print();')
driver.quit()
And now, nothing happens. Chrome launches, loads url, print dialog appears but then nothing seems to happen - nothing in the default printer queue, and no pdf either - I even searched for the PDF files by looking up "Recent Files" on Mac.

The answer here, worked when I did not have any other printer setup in my OS. But when I had another default printer, this did not work.
I don't understand how, but making small change this way seems to work.
from selenium import webdriver
import json
chrome_options = webdriver.ChromeOptions()
settings = {
"recentDestinations": [{
"id": "Save as PDF",
"origin": "local",
"account": "",
}],
"selectedDestinationId": "Save as PDF",
"version": 2
}
prefs = {'printing.print_preview_sticky_settings.appState': json.dumps(settings)}
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--kiosk-printing')
CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'
driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=CHROMEDRIVER_PATH)
driver.get("https://google.com")
driver.execute_script('window.print();')
driver.quit()

You can use the following code to print PDFs in A5 size with background css enabled:
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import json
import time
chrome_options = webdriver.ChromeOptions()
settings = {
"recentDestinations": [{
"id": "Save as PDF",
"origin": "local",
"account": ""
}],
"selectedDestinationId": "Save as PDF",
"version": 2,
"isHeaderFooterEnabled": False,
"mediaSize": {
"height_microns": 210000,
"name": "ISO_A5",
"width_microns": 148000,
"custom_display_name": "A5"
},
"customMargins": {},
"marginsType": 2,
"scaling": 175,
"scalingType": 3,
"scalingTypePdf": 3,
"isCssBackgroundEnabled": True
}
mobile_emulation = { "deviceName": "Nexus 5" }
chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
chrome_options.add_argument('--enable-print-browser')
#chrome_options.add_argument('--headless')
prefs = {
'printing.print_preview_sticky_settings.appState': json.dumps(settings),
'savefile.default_directory': '<path>'
}
chrome_options.add_argument('--kiosk-printing')
chrome_options.add_experimental_option('prefs', prefs)
for dirpath, dirnames, filenames in os.walk('<source path>'):
for fileName in filenames:
print(fileName)
driver = webdriver.Chrome("./chromedriver", options=chrome_options)
driver.get(f'file://{os.path.join(dirpath, fileName)}')
time.sleep(7)
driver.execute_script('window.print();')
driver.close()

The solution is not very good, but you can take a screenshot and convert to pdf by Pillow...
from selenium import webdriver
from io import BytesIO
from PIL import Image
driver = webdriver.Chrome(executable_path='path to your driver')
driver.get('your url here')
img = Image.open(BytesIO(driver.find_element_by_tag_name('body').screenshot_as_png))
img.save('filename.pdf', "PDF", quality=100)

Here is the solution I use with Windows :
First download the ChromeDriver here : http://chromedriver.chromium.org/downloads and install Selenium
Then run this code (based on the accepted answer, slightly modified to work on Windows):
import json
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
settings = {"recentDestinations": [{"id": "Save as PDF", "origin": "local", "account": ""}], "selectedDestinationId": "Save as PDF", "version": 2}
prefs = {'printing.print_preview_sticky_settings.appState': json.dumps(settings)}
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--kiosk-printing')
browser = webdriver.Chrome(r"chromedriver.exe", options=chrome_options)
browser.get("https://google.com/")
browser.execute_script('window.print();')
browser.close()

I would suggest Downloading the page source html which can be done like so
in vb.net:
Dim Html As String = webdriver.PageSource
Not sure how it is done in python but I'm sure it's very similar
Once you have done that then you can select the parts of the page you want to save using an html parser or by parsing it manually with string parsing code. Once you have the html for the part you want to save stored in a string then use an html to pdf converter library or program. There are lots of these for programming languages like C# and vb.net. I don't know about any for python but I'm sure some exist. Just do some research. (some are free and some are expensive)

Related

Enable backgroundCSS option when Saving PDFs of webpages with Selenium

I'm trying to "Print as PDF" a set of webpage reports repeatedly with a Selenium ChromeDriver script in Python. As you can see below, if we have the "Background graphics" option selected, the header of the report renders perfectly.
However, when I run Selenium, I cannot figure out how to set the parameter for background graphics. The saved PDF will instead look like this:
Here is a minimally reproducible script that I've been attempting to run:
import selenium
from selenium import webdriver
import json
from time import sleep
import os
chrome_options = webdriver.ChromeOptions()
settings = {
"recentDestinations": [{
"id": "Save as PDF",
"origin": "local",
"account": "",
}],
"selectedDestinationId": "Save as PDF",
"version": 2,
#"cssBackground": 2,
}
prefs = {'printing.print_preview_sticky_settings.appState': json.dumps(settings), }
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--kiosk-printing')
CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'
driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=CHROMEDRIVER_PATH)
driver.get("https://dash.gallery/dash-multipage-report/")
sleep(3)
driver.execute_script('window.print();')
sleep(5)
driver.quit()
I've done some digging on Chromium's documentation and I believe that I've found the cssBackground setting here. Additionally, there's a BackgroundGraphicsModeRestriction in this file that might be affecting things? I found what I thought would be a useful lead to setting the value in this StackOverflow post but I can't seem to put the pieces together and check that "Background graphics" box from the start. Any help that you can provide would be incredibly appreciated as I'm pretty stuck here. I've also tried rendering my page's CSS with -webkit-print-color-adjust:exact; but it doesn't seem to capture the entire background.

Printing a PDF with Selenium Chrome Driver in headless mode

I have no problems printing without headless mode, however once I enable headless mode, it just refuses to print a PDF. I'm currently working on an app with a GUI, so I'd rather not have the Selenium webdriver visible to the end user if possible.
For this project I'm using an older version of Selenium, 4.2.0. That coupled with Python 3.9.
import os
from os.path import exists
import json
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver import Chrome, ChromeOptions
# Paths
dir_path = os.getcwd()
download_path = os.path.join(dir_path, "letters")
chrome_path = os.path.join(dir_path, "chromium\\app\\Chrome-bin\\chrome.exe")
user_data_path = os.path.join(dir_path, "sessions")
website = "https://www.google.com/"
def main():
print_settings = {
"recentDestinations": [{
"id": "Save as PDF",
"origin": "local",
"account": "",
}],
"selectedDestinationId": "Save as PDF",
"version": 2,
"isHeaderFooterEnabled": False,
"isLandscapeEnabled": True
}
options = ChromeOptions()
options.binary_location = chrome_path
options.add_argument("--start-maximized")
options.add_argument('--window-size=1920,1080')
options.add_argument(f"user-data-dir={user_data_path}")
options.add_argument("--headless")
options.add_argument('--enable-print-browser')
options.add_experimental_option("prefs", {
"printing.print_preview_sticky_settings.appState": json.dumps(print_settings),
"savefile.default_directory": download_path, # Change default directory for downloads
"download.default_directory": download_path, # Change default directory for downloads
"download.prompt_for_download": False, # To auto download the file
"download.directory_upgrade": True,
"profile.default_content_setting_values.automatic_downloads": 1,
"safebrowsing.enabled": True
})
options.add_argument("--kiosk-printing")
driver = Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get(website)
driver.execute_script("window.print();")
if exists(os.path.join(user_data_path, "Google.pdf")):
print("YAY!")
else:
print(":(")
if __name__ == '__main__':
main()
For anyone else coming across this with a similar issue, I fixed it by using the print method described here: Selenium print PDF in A4 format
Using my example from above, I replaced:
driver.execute_script("window.print();")
with:
pdf_data = driver.execute_cdp_cmd("Page.printToPDF", print_settings)
with open('Google.pdf', 'wb') as file:
file.write(base64.b64decode(pdf_data['data']))
And that worked just fine for me.

Python/Selenium: save to pdf not saving

I am using selenium/python to save a series of webpages to pdf. The webpages have a table that is rendered with javascript; I am using "find_element_by_xpath" to identify that the pdf icon in the js table appeared before proceeding with the print. Optimally, I did not want to implement a set a hard wait/sleep time as I have thousands of pages to save.
The code seems to work but no pdf is saved.
The code is as follows:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import json
options = webdriver.ChromeOptions()
settings = {
"recentDestinations": [{
"id": "Save as PDF",
"origin": "local",
"account": "",
}],
"selectedDestinationId": "Save as PDF",
"version": 2
}
prefs = {'printing.print_preview_sticky_settings.appState': json.dumps(settings)}
options.add_experimental_option('prefs', prefs)
options.add_experimental_option('excludeSwitches', ['enable-logging'])
options.add_argument('--kiosk-printing')
CHROMEDRIVER_PATH = 'chromedriver.exe'
driver = webdriver.Chrome(options=options, executable_path=CHROMEDRIVER_PATH)
driver = webdriver.Chrome(options=options, executable_path=CHROMEDRIVER_PATH)
try:
element = driver.find_element_by_xpath("//div[#class='fas fa-file-pdf']")
WebDriverWait(driver, 10).until(EC.staleness_of(element))
except NoSuchElementException:
element = None
print(element)
driver.get("url")
driver.execute_script('window.print();')
#driver.quit()
This is working
First I get document link using driver.findelement(). Than I pass that url in request(). If we get response.status_code == 200 so we can download document otherwise we can't. Than I just write document content using response.content.strong textDocument is a document path with file_name.extension. Please ignore other code. It is just for making document path.
import os
from selenium import webdriver
if os.path.exists(temp_down_path):
if len(os.listdir(temp_down_path)) != 0:
for i in os.listdir(temp_down_path):
if os.path.isdir(temp_down_path + i):
shutil.rmtree(temp_down_path + i)
elif os.path.isfile(temp_down_path + i):
os.remove(temp_down_path + i)
else:
pass
else:
os.makedirs(temp_down_path)
temp_down_path = os.getcwd() + '\\temp_files\\'
options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', {
"download.default_directory": temp_down_path, # Change default directory for downloads
"download.prompt_for_download": False, # To auto download the file
"download.directory_upgrade": True,
"plugins.always_open_pdf_externally": True # It will not show PDF directly in chrome
})
driver = webdriver.Chrome(options=options, executable_path= 'chromedriver.exe')
search_url = 'your site url'
driver.get(search_url)
link = driver.find_element(By.TAG_NAME, "a").get_attribute('href')
response = requests.get(link)
Document = os.getcwd() + 'file_name' + '.pdf'
if response.status_code == 200:
open(Document, 'wb').write(response.content)
else:
print('Document download problem')

I can't save the PDF on with Selenium

I am trying to download a PDF, however it takes a screenshot of the screen and does not save the document in a normal PDF.
I am using Python 3.8 and Selenium.
The image comes out like this:
My code:
import json
from selenium import webdriver
prefs ={ "recentDestinations": [{
"id": "Save as PDF",
"origin": "location",
"account": "", # <======= Add this
}],
"selectedDestinationId": "Save as PDF",
"version": 2,
"margin": 0,
'size': 'auto'
}
profile = {'printing.print_preview_sticky_settings.appState': json.dumps(prefs),
'savefile.default_directory': 'D:\Documents\PycharmProjects\projects\Manipule_PDF'}
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option('prefs', profile, )
chrome_options.add_argument('--kiosk-printing')
#chrome_options.add_argument("--headless")
driver = webdriver.Chrome(r"D:\Documents\chrome\chromedriver.exe", options=chrome_options)
driver.get(url)
driver.execute_script('window.print();')
Does anyone know how to save the entire PDF??
Thank you very much in advance.

Page size specification does not work when saving HTML file as PDF in Chrome using python

I wanted to open an HTML file on Chrome and save it as a PDF,
I wrote the following code using python, but even I set the "pageSize" in appState in the code to A3, A5, letter size or other size,
the page size of the output PDF file will always be A4.
If I change the values of the other properties "isLandscapeEnabled" or "isHeaderFooterEnabled" in appState, it works well.
So, it seems the "pageSize" property only not works.
And, I found when Control Panel-> Region-> Format (F) is set to Japan, the page size of the output PDF file will always be A4,
and if I set it to the United States, it will always be letter size.
So, I think the "pageSize" is always be set by the default value.
I'm a python beginner and I don't really understand it yet.
Is there something wrong with my code?
Please give me any advice.
Python 3.9.6
selenium 3.141.0
Chrome 94.0.4606.71
ChromeDriver 94.0.4606.61
windows10
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import time
def PrintSetUp():
chopt=webdriver.ChromeOptions()
appState = {
"recentDestinations": [
{
"id": "Save as PDF",
"origin": "local",
"account":""
}
],
"selectedDestinationId": "Save as PDF",
"version": 2,
"isLandscapeEnabled": False,
"pageSize": 'A3',
"marginsType": 0,
"scalingType": 3 ,
"scaling": "100" ,
"isHeaderFooterEnabled": False, #ヘッダーとフッター
"isCssBackgroundEnabled": True, #背景のグラフィック
}
prefs = {'printing.print_preview_sticky_settings.appState':
json.dumps(appState),
"download.default_directory": "~/Downloads"
}
chopt.add_experimental_option('prefs', prefs)
chopt.add_argument('--kiosk-printing')
return chopt
def main_WebToPDF(BlogURL):
chopt = PrintSetUp()
driver_path = "C:/Work/pythonTest/Ver94/chromedriver_win32/chromedriver.exe" #webdriverのパス
driver = webdriver.Chrome(executable_path=driver_path, options=chopt)
driver.implicitly_wait(10) # 秒 暗示的待機
driver.get(BlogURL) #ブログのURL 読み込み
WebDriverWait(driver, 15).until(EC.presence_of_all_elements_located) # ページ上のすべての要素が読み込まれるまで待機(15秒でタイムアウト判定)
driver.execute_script('return window.print()') #Print as PDF
time.sleep(10) #ファイルのダウンロードのために10秒待機
driver.quit() #Close Screen
if __name__ == '__main__':
BlogURLList=["file://C:/Work/pythonTest/1_pdf.html",
"file://C:/Work/pythonTest/2_pdf.html",
"file://C:/Work/pythonTest/3_pdf.html"]
for BlogURL in BlogURLList:
main_WebToPDF(BlogURL)

Categories

Resources