I am using selenium/python to save a series of webpages to pdf. The webpages have a table that is rendered with javascript; I am using "find_element_by_xpath" to identify that the pdf icon in the js table appeared before proceeding with the print. Optimally, I did not want to implement a set a hard wait/sleep time as I have thousands of pages to save.
The code seems to work but no pdf is saved.
The code is as follows:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import json
options = webdriver.ChromeOptions()
settings = {
"recentDestinations": [{
"id": "Save as PDF",
"origin": "local",
"account": "",
}],
"selectedDestinationId": "Save as PDF",
"version": 2
}
prefs = {'printing.print_preview_sticky_settings.appState': json.dumps(settings)}
options.add_experimental_option('prefs', prefs)
options.add_experimental_option('excludeSwitches', ['enable-logging'])
options.add_argument('--kiosk-printing')
CHROMEDRIVER_PATH = 'chromedriver.exe'
driver = webdriver.Chrome(options=options, executable_path=CHROMEDRIVER_PATH)
driver = webdriver.Chrome(options=options, executable_path=CHROMEDRIVER_PATH)
try:
element = driver.find_element_by_xpath("//div[#class='fas fa-file-pdf']")
WebDriverWait(driver, 10).until(EC.staleness_of(element))
except NoSuchElementException:
element = None
print(element)
driver.get("url")
driver.execute_script('window.print();')
#driver.quit()
This is working
First I get document link using driver.findelement(). Than I pass that url in request(). If we get response.status_code == 200 so we can download document otherwise we can't. Than I just write document content using response.content.strong textDocument is a document path with file_name.extension. Please ignore other code. It is just for making document path.
import os
from selenium import webdriver
if os.path.exists(temp_down_path):
if len(os.listdir(temp_down_path)) != 0:
for i in os.listdir(temp_down_path):
if os.path.isdir(temp_down_path + i):
shutil.rmtree(temp_down_path + i)
elif os.path.isfile(temp_down_path + i):
os.remove(temp_down_path + i)
else:
pass
else:
os.makedirs(temp_down_path)
temp_down_path = os.getcwd() + '\\temp_files\\'
options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', {
"download.default_directory": temp_down_path, # Change default directory for downloads
"download.prompt_for_download": False, # To auto download the file
"download.directory_upgrade": True,
"plugins.always_open_pdf_externally": True # It will not show PDF directly in chrome
})
driver = webdriver.Chrome(options=options, executable_path= 'chromedriver.exe')
search_url = 'your site url'
driver.get(search_url)
link = driver.find_element(By.TAG_NAME, "a").get_attribute('href')
response = requests.get(link)
Document = os.getcwd() + 'file_name' + '.pdf'
if response.status_code == 200:
open(Document, 'wb').write(response.content)
else:
print('Document download problem')
Related
I have no problems printing without headless mode, however once I enable headless mode, it just refuses to print a PDF. I'm currently working on an app with a GUI, so I'd rather not have the Selenium webdriver visible to the end user if possible.
For this project I'm using an older version of Selenium, 4.2.0. That coupled with Python 3.9.
import os
from os.path import exists
import json
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver import Chrome, ChromeOptions
# Paths
dir_path = os.getcwd()
download_path = os.path.join(dir_path, "letters")
chrome_path = os.path.join(dir_path, "chromium\\app\\Chrome-bin\\chrome.exe")
user_data_path = os.path.join(dir_path, "sessions")
website = "https://www.google.com/"
def main():
print_settings = {
"recentDestinations": [{
"id": "Save as PDF",
"origin": "local",
"account": "",
}],
"selectedDestinationId": "Save as PDF",
"version": 2,
"isHeaderFooterEnabled": False,
"isLandscapeEnabled": True
}
options = ChromeOptions()
options.binary_location = chrome_path
options.add_argument("--start-maximized")
options.add_argument('--window-size=1920,1080')
options.add_argument(f"user-data-dir={user_data_path}")
options.add_argument("--headless")
options.add_argument('--enable-print-browser')
options.add_experimental_option("prefs", {
"printing.print_preview_sticky_settings.appState": json.dumps(print_settings),
"savefile.default_directory": download_path, # Change default directory for downloads
"download.default_directory": download_path, # Change default directory for downloads
"download.prompt_for_download": False, # To auto download the file
"download.directory_upgrade": True,
"profile.default_content_setting_values.automatic_downloads": 1,
"safebrowsing.enabled": True
})
options.add_argument("--kiosk-printing")
driver = Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get(website)
driver.execute_script("window.print();")
if exists(os.path.join(user_data_path, "Google.pdf")):
print("YAY!")
else:
print(":(")
if __name__ == '__main__':
main()
For anyone else coming across this with a similar issue, I fixed it by using the print method described here: Selenium print PDF in A4 format
Using my example from above, I replaced:
driver.execute_script("window.print();")
with:
pdf_data = driver.execute_cdp_cmd("Page.printToPDF", print_settings)
with open('Google.pdf', 'wb') as file:
file.write(base64.b64decode(pdf_data['data']))
And that worked just fine for me.
I wrote a code that use Selenium Webdriver to download files via a list of URLs but for some reason it didn't download anything to my assignedn directory. The code works perfectly fine when I only download it one by one but when I use a for loop, it doesn't work.
This is an example URL: https://www.regulations.gov/contentStreamer?documentId=WHD-2020-0007-1730&attachmentNumber=1&contentType=pdf
Here is my code:
download_dir = '/Users/datawizard/files/'
for web in down_link:
try:
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_experimental_option("prefs", {
"download.default_directory": '/Users/clinton/GRA_2021/scraping_project/pdf/',
"download.prompt_for_download": False,
"download.directory_upgrade": True,
# "safebrowsing.enabled": True,
"plugins.always_open_pdf_externally": True
})
driver = webdriver.Chrome(chrome_options=options)
driver.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
params = {'cmd': 'Page.setDownloadBehavior', 'params': {'behavior': 'allow', 'downloadPath': download_dir}}
command_result = driver.execute("send_command", params)
driver.get(url)
except:
print(str(web)+"Link cannot be open")
I am wondering did I do something wrong with the code since it doesn't give me any error when I ran the code above.
You don't need Selenium to download files, you can download files easily using the request library
import requests
for web in down_link:
fileName = YOUR_DOWNLOAD_PATH + web.split("=")[1].split("&")[0] + ".pdf" #I created a filename
r = requests.get(web, stream=True)
with open(fileName, 'wb') as f:
for chunk in r.iter_content():
f.write(chunk)
Updated Answer based on Selenium
#replace the below value with your urls list
down_link = [
'https://www.regulations.gov/contentStreamer?documentId=WHD-2020-0007-1730&attachmentNumber=1&contentType=pdf',
'https://www.regulations.gov/contentStreamer?documentId=WHD-2020-0007-1730&attachmentNumber=1&contentType=pdf']
download_dir = "/Users/datawizard/files/"
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_experimental_option("prefs", {
"download.default_directory": download_dir,
"download.prompt_for_download": False,
"download.directory_upgrade": True,
"plugins.always_open_pdf_externally": True
})
driver = webdriver.Chrome(chrome_options=options)
for web in down_link:
driver.get(web)
time.sleep(5) #wait for the download to end, a better handling it's to check if the file exists
driver.quit()
If your files don't have a unique file name - the above code will replace the existing file with the downloaded one.
i'm currently trying to automate the task of clicking different download links from this website:
https://www.theice.com/clear-us/risk-management#margin-rates
in this page, i first have to click the "Download ICE Risk Model array files" header which gives me 2 dropdowns from which i want to first click the "Final" link which downloads a csv file for each month of each available year.
Currently, both the dropdowns change due to hidden dropdown menus above, i have first tried to make them visible which was successful as well as changing year in it using selenium click,
The problem arising is that i'm not able to click the "Final" link in the csv section but it just clicks the "
Intercontinental Exchange"
button in the footer and navigates to a new page.
Is there anyway to get this task done ?
As well as is it possible to change the download location to the current directory where the .py script is ?
This is the python code so far, i currently removed the headless part to see what's going on :
from bs4 import BeautifulSoup
import requests
from selenium.webdriver.chrome.options import Options
from shutil import which
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import time
import os
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--log-level=3")
chrome_path = which("chromedriver")
driver = webdriver.Chrome(executable_path=chrome_path, options=chrome_options)
driver.set_window_size(1366, 768)
driver.get("https://www.theice.com/clear-us/risk-management#margin-rates")
main_button = driver.find_element_by_xpath('//h4[#class="collapsible-section-header"]')
main_button.click()
time.sleep(5)
driver.execute_script("document.getElementById('icus-ice-form-year').style.display = 'block';")
driver.execute_script("document.getElementById('icus-ice-form-month').style.display = 'block';")
time.sleep(1)
dropdown_1 = Select(driver.find_element_by_xpath('//select[#id="icus-ice-form-year"]'))
dropdown_2 = Select(driver.find_element_by_xpath('//select[#id="icus-ice-form-year"]'))
main_table_div = driver.find_element_by_xpath('//div[#id="icus-ice-riskarraytable"]')
main_table = main_table_div.find_element_by_xpath('//table[#class="table table-data"]')
for opt in dropdown_1.options:
opt.click()
for opt2 in dropdown_2.options:
opt2.click()
time.sleep(3)
download_links_1 = main_table.find_elements_by_xpath('//td[#class="table-partitioned"]')
for dow in download_links_1:
try:
temp_dow = dow.find_element_by_xpath('//a')
temp_dow.click()
time.sleep(4)
except:
pass
This should switch the downloads to the current working directory and print all the early csvs. change '//[#id="icus-ice-riskarraytable"]/table/tbody/tr[{0}]/td[2]/a' to '//[#id="icus-ice-riskarraytable"]/table/tbody/tr[{0}]/td[3]/a' for the other final csv.
options = Options()
currentDirectory = os.getcwd()
prefs = {
"download.default_directory": currentDirectory,
"download.prompt_for_download": False
}
#print(currentDirectory)
options.add_experimental_option("prefs", prefs)
driver.get("https://www.theice.com/clear-us/risk-management#margin-rates")
driver.implicitly_wait(5)
main_button = driver.find_element_by_xpath('//h4[#class="collapsible-section-header"]')
main_button.click()
driver.implicitly_wait(5)
driver.execute_script("document.getElementById('icus-ice-form-year').style.display = 'block';")
driver.execute_script("document.getElementById('icus-ice-form-month').style.display = 'block';")
driver.implicitly_wait(5)
drop1length=len(driver.find_elements_by_xpath('//select[#id="icus-ice-form-year"]/option'))
#print(drop1length)
for i in range(1,drop1length-1):
drop1=Select(driver.find_element_by_xpath('//select[#id="icus-ice-form-year"]'))
drop1.select_by_index(i)
drop2length=len(driver.find_elements_by_xpath('//select[#id="icus-ice-form-month"]/option'))
#print(drop2length)
for j in range(1,drop2length-1):
drop2=Select(driver.find_element_by_xpath('//select[#id="icus-ice-form-month"]'))
driver.implicitly_wait(5)
drop2.select_by_index(j)
download_links_length = len(driver.find_elements_by_xpath('//*[#id="icus-ice-riskarraytable"]/table/tbody/tr/td[2]/a'))
#print(download_links_length)
for dow in range(1,download_links_length-1):
try:
element = driver.find_element_by_xpath('//*[#id="icus-ice-riskarraytable"]/table/tbody/tr[{0}]/td[2]/a'.format(str(dow)))
driver.implicitly_wait(5)
driver.execute_script("arguments[0].click();", element)
driver.switch_to.window(driver.window_handles[0])
except Exception as e:
print(e)
Import these
import os
from selenium.webdriver.chrome.options import Options
I write a Script for a massdownload of PDF-files in a Loop. Firefoxbased Selenium-Webdriver stuck after the first Download, so i decide to try Chrome.
With Chrome the Loop is working now. But i can not download the PDF-files. Chrome just show them with pdf_viewer.js in the Browserwindow.
I try different Options like plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}], download.extensions_to_open": "applications/pdf and plugins.always_open_pdf_externally": True. Nothing works.
#!/usr/bin/python
downPath = "/home/user/xxx/"
Y = ["2017", "2018", "2019"]
def main():
options = webdriver.ChromeOptions()
profile = {
"plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
"download.default_directory": downPath ,
"download.extensions_to_open": "applications/pdf",
"plugins.always_open_pdf_externally": True,
"download.prompt_for_download": False,
"safebrowsing.enabled": True
}
options.add_experimental_option("prefs", profile)
browser = webdriver.Chrome(options=options)
driver = webdriver.Chrome()
driver.get('login-url')
#LOGIN-Stuff ...
time.sleep(4)
for y in Y:
print(y)
for x in range(53):
url = "https://j.world/files/{0}/filename_{0}-{1:02d}.pdf".format(y, x)
print(url)
driver.get(url)
time.sleep(4)
driver.quit()
if __name__ == "__main__":
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import sys
import os
main()
I use Python 3.7.4 with python-selenium 3.141.0-1 from the Arch-Linux Repository.
you can download file with requests lib or with curl/wget/etc instead of
driver.get(url)
I have overlooked that in the code were still fragments of my attempts with firefox.
browser = webdriver.Chrome(options=options)
driver = webdriver.Chrome()
are obviously wrong
i wrote driver = webdriver.Chrome(options=options) and all things work.
excuse my carelessness
Have tried all the solutions I could find on the Internet to be able to print a page that is open in Selenium in Python. However, while the print pop-up shows up, after a second or two it goes away, with no PDF saved.
Here is the code being tried. Based on the code here - https://stackoverflow.com/a/43752129/3973491
Coding on a Mac with Mojave 10.14.5.
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException
import time
import json
options = Options()
appState = {
"recentDestinations": [
{
"id": "Save as PDF",
"origin": "local"
}
],
"selectedDestinationId": "Save as PDF",
"version": 2
}
profile = {'printing.print_preview_sticky_settings.appState': json.dumps(appState)}
# profile = {'printing.print_preview_sticky_settings.appState':json.dumps(appState),'savefile.default_directory':downloadPath}
options.add_experimental_option('prefs', profile)
options.add_argument('--kiosk-printing')
CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'
driver = webdriver.Chrome(options=options, executable_path=CHROMEDRIVER_PATH)
driver.implicitly_wait(5)
driver.get(url)
driver.execute_script('window.print();')
$chromedriver --v
ChromeDriver 75.0.3770.90 (a6dcaf7e3ec6f70a194cc25e8149475c6590e025-refs/branch-heads/3770#{#1003})
Any hints or solutions as to what can be done to print the open html page to a PDF. Have spent hours trying to make this work. Thank you!
Update on 2019-07-11:
My question has been identified as a duplicate, but a) the other question seems to be using javascript code, and b) the answer does not solve the problem being raised in this question - it may be to do with more recent software versions. Chrome version being used is Version 75.0.3770.100 (Official Build) (64-bit), and chromedriver is ChromeDriver 75.0.3770.90. On Mac OS Mojave. Script is running on Python 3.7.3.
Update on 2019-07-11:
Changed the code to
from selenium import webdriver
import json
chrome_options = webdriver.ChromeOptions()
settings = {
"appState": {
"recentDestinations": [{
"id": "Save as PDF",
"origin": "local",
"account": "",
}],
"selectedDestinationId": "Save as PDF",
"version": 2
}
}
prefs = {'printing.print_preview_sticky_settings': json.dumps(settings)}
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--kiosk-printing')
CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'
driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=CHROMEDRIVER_PATH)
driver.get("https://google.com")
driver.execute_script('window.print();')
driver.quit()
And now, nothing happens. Chrome launches, loads url, print dialog appears but then nothing seems to happen - nothing in the default printer queue, and no pdf either - I even searched for the PDF files by looking up "Recent Files" on Mac.
The answer here, worked when I did not have any other printer setup in my OS. But when I had another default printer, this did not work.
I don't understand how, but making small change this way seems to work.
from selenium import webdriver
import json
chrome_options = webdriver.ChromeOptions()
settings = {
"recentDestinations": [{
"id": "Save as PDF",
"origin": "local",
"account": "",
}],
"selectedDestinationId": "Save as PDF",
"version": 2
}
prefs = {'printing.print_preview_sticky_settings.appState': json.dumps(settings)}
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--kiosk-printing')
CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'
driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=CHROMEDRIVER_PATH)
driver.get("https://google.com")
driver.execute_script('window.print();')
driver.quit()
You can use the following code to print PDFs in A5 size with background css enabled:
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import json
import time
chrome_options = webdriver.ChromeOptions()
settings = {
"recentDestinations": [{
"id": "Save as PDF",
"origin": "local",
"account": ""
}],
"selectedDestinationId": "Save as PDF",
"version": 2,
"isHeaderFooterEnabled": False,
"mediaSize": {
"height_microns": 210000,
"name": "ISO_A5",
"width_microns": 148000,
"custom_display_name": "A5"
},
"customMargins": {},
"marginsType": 2,
"scaling": 175,
"scalingType": 3,
"scalingTypePdf": 3,
"isCssBackgroundEnabled": True
}
mobile_emulation = { "deviceName": "Nexus 5" }
chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
chrome_options.add_argument('--enable-print-browser')
#chrome_options.add_argument('--headless')
prefs = {
'printing.print_preview_sticky_settings.appState': json.dumps(settings),
'savefile.default_directory': '<path>'
}
chrome_options.add_argument('--kiosk-printing')
chrome_options.add_experimental_option('prefs', prefs)
for dirpath, dirnames, filenames in os.walk('<source path>'):
for fileName in filenames:
print(fileName)
driver = webdriver.Chrome("./chromedriver", options=chrome_options)
driver.get(f'file://{os.path.join(dirpath, fileName)}')
time.sleep(7)
driver.execute_script('window.print();')
driver.close()
The solution is not very good, but you can take a screenshot and convert to pdf by Pillow...
from selenium import webdriver
from io import BytesIO
from PIL import Image
driver = webdriver.Chrome(executable_path='path to your driver')
driver.get('your url here')
img = Image.open(BytesIO(driver.find_element_by_tag_name('body').screenshot_as_png))
img.save('filename.pdf', "PDF", quality=100)
Here is the solution I use with Windows :
First download the ChromeDriver here : http://chromedriver.chromium.org/downloads and install Selenium
Then run this code (based on the accepted answer, slightly modified to work on Windows):
import json
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
settings = {"recentDestinations": [{"id": "Save as PDF", "origin": "local", "account": ""}], "selectedDestinationId": "Save as PDF", "version": 2}
prefs = {'printing.print_preview_sticky_settings.appState': json.dumps(settings)}
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--kiosk-printing')
browser = webdriver.Chrome(r"chromedriver.exe", options=chrome_options)
browser.get("https://google.com/")
browser.execute_script('window.print();')
browser.close()
I would suggest Downloading the page source html which can be done like so
in vb.net:
Dim Html As String = webdriver.PageSource
Not sure how it is done in python but I'm sure it's very similar
Once you have done that then you can select the parts of the page you want to save using an html parser or by parsing it manually with string parsing code. Once you have the html for the part you want to save stored in a string then use an html to pdf converter library or program. There are lots of these for programming languages like C# and vb.net. I don't know about any for python but I'm sure some exist. Just do some research. (some are free and some are expensive)