I've been searching the web for examples of people using the pdfkit (python wrapper) in implementing headers and footers and could not find any examples.
Would anyone be able to show some examples of how to implement the options in wkhtmltopdf using the pdfkit python wrapper?
I'm using it only with headers but I think that it will work the same with footers.
You need to have separate html file for the header.
header.html
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
</head>
<body>
Code of your header goes here.
</body>
</html>
Then you can use it like that in Python
import pdfkit
pdfkit.from_file('path/to/your/file.html', 'out.pdf', {
'--header-html': 'path/to/header.html'
})
The tricky part if you use some backend like Django and want to use templates is that you can't pass the header html as rendered string. You need to have a file.
This is what I do to render PDFs with Django.
import os
import tempfile
import pdfkit
from django.template.loader import render_to_string
def render_pdf(template, context, output, header_template=None):
    """
    Render a Django template to PDF with pdfkit, optionally with a header.

    wkhtmltopdf takes the header as a file path, not as an HTML string, so
    the rendered header is written to a temporary ``.html`` file which is
    removed again once pdfkit has finished (or failed).

    :param template: Django template name for the document body
    :param context: context used to render both the body and the header
    :param output: forwarded to ``pdfkit.from_string`` as the output path
    :param header_template: optional Django template name for the header
    :return: the result of ``pdfkit.from_string``
    """
    html = render_to_string(template, context)
    options = {
        '--load-error-handling': 'skip',
    }
    # Tracked separately from ``options`` so that cleanup never has to look
    # up a key that may not have been set yet.
    header_path = None
    try:
        if header_template:
            # delete=False: the file has to outlive this ``with`` block so
            # that wkhtmltopdf can read it by name; it is removed in
            # ``finally``. The name must end with .html.
            with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as header_html:
                header_path = header_html.name
                options['--header-html'] = header_path
                header_html.write(render_to_string(header_template, context).encode('utf-8'))
        return pdfkit.from_string(html, output, options=options)
    finally:
        # Ensure temporary file is deleted after finishing work
        if header_path is not None:
            os.remove(header_path)
In my example I create a temporary file in which I put the rendered content. The important part is that the temporary file's name needs to end with .html and that the file has to be deleted manually.
Improving on @V Stoykov's answer, as it helped me when using Flask: the render function with a custom header in Flask looks as follows:
import os
import tempfile
import pdfkit
from flask import render_template, make_response
# NOTE(review): this route declares no URL variables, so ``foo`` and ``bar``
# have to be supplied some other way — confirm against the real application.
@app.route('/generate_pdf')
def render_pdf_custom_header(foo, bar):
    """
    Render ``main_pdf.html`` to a PDF with a custom header and footer.

    The header and footer helpers write temporary HTML files and store their
    paths in ``options``; those files are removed here whether or not the
    PDF generation succeeds.

    :param foo: value passed to the ``main_pdf.html`` template
    :param bar: value passed on to ``add_pdf_header``
    :return: the response built by ``build_response`` from the PDF bytes
    """
    main_content = render_template('main_pdf.html', foo=foo)
    options = {
        '--encoding': "utf-8"
    }
    try:
        # Inside the ``try`` so a failure in the footer helper still cleans
        # up the header file that was already created.
        add_pdf_header(options, bar)
        add_pdf_footer(options)
        pdf = pdfkit.from_string(main_content, False, options=options)
    finally:
        # Only remove what was actually registered; a missing key must not
        # raise KeyError and hide the original exception.
        for option_key in ('--header-html', '--footer-html'):
            temp_path = options.get(option_key)
            if temp_path:
                os.remove(temp_path)
    return build_response(pdf)
def add_pdf_header(options, bar):
    """
    Write the rendered ``header.html`` template to a temporary file.

    The file path is stored in ``options['--header-html']``. The file is
    created with ``delete=False``, so it is not removed when it is closed.

    :param options: pdfkit options dict, modified in place
    :param bar: value passed to the ``header.html`` template
    """
    header_file = tempfile.NamedTemporaryFile(suffix='.html', delete=False)
    with header_file:
        # Register the path before rendering, as the original does.
        options['--header-html'] = header_file.name
        rendered_header = render_template('header.html', bar=bar)
        header_file.write(rendered_header.encode('utf-8'))
def add_pdf_footer(options):
    """
    Write the rendered footer template to a temporary file.

    Same behaviour as ``add_pdf_header`` but without passing any variable:
    the file path is stored in ``options['--footer-html']`` and the file is
    created with ``delete=False``, so it is not removed when it is closed.

    :param options: pdfkit options dict, modified in place
    """
    # NOTE(review): the template name 'footer.html' is assumed by analogy
    # with 'header.html' — confirm against the project's templates.
    with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as footer:
        options['--footer-html'] = footer.name
        footer.write(render_template('footer.html').encode('utf-8'))
def build_response(pdf):
    """
    Wrap PDF data in a Flask response served as a file download.

    :param pdf: PDF content passed to ``make_response``
    :return: response with PDF content type and an attachment disposition
    """
    attachment_name = 'pdf-from-html.pdf'
    response = make_response(pdf)
    headers = response.headers
    headers['Content-Type'] = 'application/pdf'
    headers['Content-Disposition'] = 'attachment; filename=' + attachment_name
    return response
Notice that I used the '--header-html' and '--footer-html' notation as it matches the wkhtmltopdf options format.
options = {
'page-size': 'Letter',
'margin-top': '0.9in',
'margin-right': '0.9in',
'margin-bottom': '0.9in',
'margin-left': '0.9in',
'encoding': "UTF-8",
'header-center': 'YOUR HEADER',
'custom-header' : [
('Accept-Encoding', 'gzip')
],
'no-outline':None
}
You can put the header text you need in the value of header-center.
This question and its answers are quite old and not working for me.
wkhtmltopdf version: $ wkhtmltopdf --version
wkhtmltopdf 0.12.6 (with patched qt)
python 3.8
For wkhtmltopdf,
header-html and footer-html can only be a URI, e.g. an HTML URL or a file path; they CANNOT be an HTML string. So the idea is to either keep each HTML file in the cloud or create a local temp file for the footer and the header and reference that.
Request:
content as url or html
header as html url or html string
footer as html url or html string
Example:
import logging
import os
import tempfile
import pdfkit
from flask import Flask, Response, make_response
from flask_restx import Resource, Api, fields
# Request body model for the conversion endpoint. Every field is optional:
# 'url' or 'html' carries the content, 'header_html' / 'footer_html' carry
# optional header and footer markup.
# NOTE(review): `api` and the `example_*_html` values are not defined in the
# visible snippet — they have to be provided elsewhere.
Request = api.model('Request', {
'url': fields.String(
required=False,
description='url',
example='https://www.w3schools.com/html/html5_svg.asp',
),
'html': fields.String(
required=False,
description='content html string',
example=example_content_html
),
'header_html': fields.String(
required=False,
description='pdf header html string',
example=example_header_html
),
'footer_html': fields.String(
required=False,
description='pdf footer html string',
example=example_footer_html
),
})
@api.route("/convert_html_to_pdf", endpoint='html2pdf')
@api.representation('application/octet-stream')
class PdfConverter(Resource):
    """Convert a URL or an HTML string to a PDF with optional header/footer."""

    @staticmethod
    def _write_temp_html(markup, open_files):
        """Write *markup* to a temporary .html file and return its path.

        The open file object is appended to *open_files* before anything is
        written, so the caller can always close (and thereby delete) it.
        """
        handle = tempfile.NamedTemporaryFile(suffix='.html')
        open_files.append(handle)
        handle.write(markup.encode('utf-8'))
        # Flush so wkhtmltopdf sees the content when it opens the path.
        handle.flush()
        return handle.name

    @api.doc(body=Request)
    def post(self):
        """
        Build a PDF from the JSON request body and return it as a download.

        Reads 'url', 'html', 'header_html' and 'footer_html' from the request
        JSON. When header/footer HTML is supplied it is written to a
        temporary file, because wkhtmltopdf takes a URI for these options;
        otherwise the default header/footer URLs are used.
        """
        # The snippet's flask import does not include ``request``.
        from flask import request

        logging.info(request.json)
        url = request.json.get('url')
        html = request.json.get('html')
        header_html = request.json.get('header_html')
        footer_html = request.json.get('footer_html')

        header_uri = 'https://xxxxx/header.html'  # default header
        footer_uri = 'https://xxxxx/footer.html'  # default footer

        temp_files = []
        try:
            if header_html:
                header_uri = self._write_temp_html(header_html, temp_files)
            if footer_html:
                footer_uri = self._write_temp_html(footer_html, temp_files)

            options = {
                'page-size': 'A4',
                'margin-top': '32mm',
                'header-spacing': 6,
                'footer-spacing': 6,
                'header-html': header_uri,
                'footer-html': footer_uri,
                'margin-right': '0',
                'margin-bottom': '16mm',
                'margin-left': '0',
                'encoding': "UTF-8",
                'cookie': [
                    ('cookie-empty-value', '""'),
                    ('cookie-name1', 'cookie-value1'),
                    ('cookie-name2', 'cookie-value2'),
                ],
                'no-outline': None
            }
            logging.info(options)

            # An explicit False output path asks pdfkit to return the PDF
            # instead of writing it to disk.
            if url:
                pdf = pdfkit.from_url(url, False, options=options)
            else:
                pdf = pdfkit.from_string(html, False, options=options)
        finally:
            # Runs even when pdfkit raises, so temp files are never leaked.
            for handle in temp_files:
                handle.close()  # close will delete the temp file

        response = make_response(pdf)
        response.headers['Content-Type'] = 'application/pdf'
        response.headers['Content-Disposition'] = 'attachment;filename=report.pdf'
        return response
Python pdfkit wrapper only supports html files as header and footer. To be able to support string, all you need to do is generate a temporary file and delete it on close. Here is my code sample.
Using tempfile with the delete=True and suffix='.html' arguments will generate a file that is deleted on temp.close()
import tempfile
# The header markup is written to a temporary .html file because the
# 'header-html' option takes a file path rather than an HTML string.
header_markup = """
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
</head>
<body>
Code of your header goes here.
</body>
</html>
"""
temp = tempfile.NamedTemporaryFile(delete=True, suffix='.html')
try:
    # Write through the handle we already hold, encoded as the UTF-8 that
    # the markup declares, and flush so the content is on disk before
    # wkhtmltopdf opens the path.
    temp.write(header_markup.encode('utf-8'))
    temp.flush()
    options = {
        'page-size': 'A4',
        'margin-top': '1in',
        'margin-bottom': '0.75in',
        'margin-right': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'header-html': temp.name,
        'footer-center': "Page [page] of [topage]",
        'footer-font-size': "9",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ],
        'enable-local-file-access': False,
        'no-outline': None
    }
    pdf = pdfkit.from_string(html_string, options=options)
finally:
    # Closing deletes the file (delete=True), also when pdfkit raises.
    temp.close()
Related
I am trying to render (get previewed) a stored Word or PDF file in HTML from project directory.
For example, I have a user who has his resume stored in his account. Now I want to give him a preview of his resume in his account page. It can be doc, docx or pdf. How can I return msword or pdf data in Python/Flask so that it can be rendered in HTML / Jinja2 either in iframe or embeded. Here is my code
from flask import render_template, url_for, flash, redirect, request, abort, jsonify,session, send_file
import os
import mimetypes
#app.route("/account", methods=['GET', 'POST'])
#login_required
def account():
resume = current_user.resume
resumefile = downloadresume()
return render_template('account.html', pagetitle='Account', profile_pic=profile_pic, form=form, resume=resume, resumefile = resumefile)
#app.route("/account/downloadresume", methods=['GET', 'POST'])
def downloadresume():
filename = current_user.resume
mime = mimetypes.MimeTypes().guess_type(filename)[0]
if mime in ["application/vnd.openxmlformats-officedocument.wordprocessingml.document","application/msword"]:
# return filename.data
return send_file(resume.data, as_attachment=False)
Jinja / HTML
<iframe src="{{ resumefile }}"></iframe>
I'm unsure about Word documents, but I know the PDFjs library will be able to load a PDF and render it into a canvas.
An example of its usage can be found here: https://jsfiddle.net/pdfjs/9engc9mw/
// If absolute URL from the remote server is provided, configure the CORS
// header on that server.
var url = 'https://raw.githubusercontent.com/mozilla/pdf.js/ba2edeae/examples/learning/helloworld.pdf';
// Loaded via <script> tag, create shortcut to access PDF.js exports.
var pdfjsLib = window['pdfjs-dist/build/pdf'];
// The workerSrc property shall be specified.
pdfjsLib.GlobalWorkerOptions.workerSrc = '//mozilla.github.io/pdf.js/build/pdf.worker.js';
// Asynchronous download of PDF
var loadingTask = pdfjsLib.getDocument(url);
loadingTask.promise.then(function(pdf) {
console.log('PDF loaded');
// Fetch the first page
var pageNumber = 1;
pdf.getPage(pageNumber).then(function(page) {
console.log('Page loaded');
var scale = 1.5;
var viewport = page.getViewport({scale: scale});
// Prepare canvas using PDF page dimensions
var canvas = document.getElementById('the-canvas');
var context = canvas.getContext('2d');
canvas.height = viewport.height;
canvas.width = viewport.width;
// Render PDF page into canvas context
var renderContext = {
canvasContext: context,
viewport: viewport
};
var renderTask = page.render(renderContext);
renderTask.promise.then(function () {
console.log('Page rendered');
});
});
}, function (reason) {
// PDF loading error
console.error(reason);
});
#the-canvas {
border:1px solid black;
}
<script src="//mozilla.github.io/pdf.js/build/pdf.js"></script>
<h1>PDF.js 'Hello, world!' example</h1>
<canvas id="the-canvas"></canvas>
Since it's done in the frontend, you'll just need to make sure the PDF file is accessible. With Word files, I'm sure you can convert them to PDF, however you may lose some functionality.
There are 2 possible ways to do this. There is no single method that renders both pdf and docx files.
For pdf files:
You can either use the method stated by @MichaelBauer or place a hyperlink which points to the pdf file in a directory. All current browsers have the capability to render pdf files from any offline or online directory.
For docx files
You have to convert the docx file to a pdf file so that rendering becomes easier, and then follow the process of rendering the converted pdf. As for converting a docx to pdf seamlessly, you have Adobe's Document Cloud SDK service.
What's a good way for me to output (AMP compliant) HTML using a template and JSON for the data? I have a nice little python script:
import requests
import json
from bs4 import BeautifulSoup
url = requests.get('https://www.perfectimprints.com/custom-promos/20492/Beach-Balls.html')
source = BeautifulSoup(url.text, 'html.parser')
products = source.find_all('div', class_="product_wrapper")
infos = source.find_all('div', class_="categories_wrapper")
def get_category_information(category):
    """Return the category heading text, stripped, as a one-key dict."""
    heading = category.find('h1', class_="category_head_name")
    return {"category_name": heading.text.strip()}
category_information = [get_category_information(info) for info in infos]
with open("category_info.json", "w") as write_file:
json.dump(category_information, write_file)
def get_product_details(product):
    """
    Extract title, link, SKU and image source from one product wrapper.

    The title is returned as found; link, SKU and image source are stripped
    of surrounding whitespace.
    """
    title = product.find('div', class_="product_name").a.text
    sku_text = product.find('div', class_="product_sku").text
    # The link and the image both live under the same anchor element.
    image_anchor = product.find('div', class_="product_image_wrapper").find("a")
    href = image_anchor["href"]
    image_src = image_anchor.find("img")["src"]
    details = {"title": title}
    details["link"] = href.strip()
    details["sku"] = sku_text.strip()
    details["src"] = image_src.strip()
    return details
all_products = [get_product_details(product) for product in products]
with open("products.json", "w") as write_file:
json.dump({'items': all_products}, write_file)
print("Success")
Which generates the JSON files I need. However, I now need to use those JSON files and input it into my template (gist) everywhere it says {{ JSON DATA HERE }}.
I'm not even sure where to start. I'm most comfortable with JavaScript so I'd like to use that if possible. I figure something involving Node.js.
Here's how you can render HTML with a template engine by itself and use it to return pure HTML:
from jinja2 import Template
me = Template('<h1>Hello {{x}}<h1>')
html = me.render({'x': 'world'})
return html
html is your rendered HTML string. Alternatively, you can render from a file:
from jinja2 import Template
with open('your_template.html') as file_:
template = Template(file_.read())
html = template.render(your_dict)
return html
Since you're going to generate HTML one way or another, using a template engine will save you much time. You can also do {% for item in list %} and such thing will greatly simplify your task.
How can I print a webpage as a PDF with headers and footers so it looks like it has been printed from google chrome.
This is what I have tried so far using PhantomJS but it doesn't have the headers and footers.
from selenium import webdriver
import selenium
def execute(script, args):
driver.execute('executePhantomScript', {'script': script, 'args' : args })
driver = webdriver.PhantomJS('phantomjs')
# hack while the python interface lags
driver.command_executor._commands['executePhantomScript'] = ('POST', '/session/$sessionId/phantom/execute')
driver.get('http://stackoverflow.com')
# set page format
# inside the execution script, webpage is "this"
pageFormat = '''this.paperSize = {format: "A4", orientation: "portrait" };'''
execute(pageFormat, [])
# render current page
render = '''this.render("test.pdf")'''
execute(render, [])
This can be done using Selenium v4.1.3 by following pyppeteer examples.
You will need to be able to send commands to chromium and call Page.printToPDF Devtools API as shown in the following snip:
# ...
result = send_cmd(driver, "Page.printToPDF", params={
'landscape': True
,'margin':{'top':'1cm', 'right':'1cm', 'bottom':'1cm', 'left':'1cm'}
,'format': 'A4'
,'displayHeaderFooter': True
,'headerTemplate': '<span style="font-size:8px; margin-left: 5px">Page 1 of 1</span>'
,'footerTemplate': f'<span style="font-size:8px; margin-left: 5px">Generated on {datetime.now().strftime("%-m/%-d/%Y at %H:%M")}</span>'
,'scale': 1.65
,'pageRanges': '1'
})
with open(out_path_full, 'wb') as file:
file.write(base64.b64decode(result['data']))
# ...
def send_cmd(driver, cmd, params=None):
    """
    Send a DevTools command through the driver's command executor.

    :param driver: object exposing ``command_executor`` and ``session_id``
    :param cmd: command name, e.g. ``"Page.printToPDF"``
    :param params: command parameters; ``None`` is sent as an empty dict
    :return: the ``value`` entry of the response
    :raises Exception: with the response ``value`` when ``status`` is truthy
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if params is None:
        params = {}
    endpoint = f"{driver.command_executor._url}/session/{driver.session_id}/chromium/send_command_and_get_result"
    payload = json.dumps({'cmd': cmd, 'params': params})
    response = driver.command_executor._request('POST', endpoint, payload)
    if response.get('status'):
        raise Exception(response.get('value'))
    return response.get('value')
I've included a full example in my GitHub Repo.
pdfkit may help you out. It will render a PDF from an html page.
I've written a bit of code in an attempt to pull photos from a website. I want it to find photos, then download them for use in a tweet:
import urllib2
from lxml.html import fromstring
import sys
import time
url = "http://www.phillyhistory.org/PhotoArchive/Search.aspx"
response = urllib2.urlopen(url)
html = response.read()
dom = fromstring(html)
sels = dom.xpath('//*[(#id = "large_media")]')
for pic in sels[:1]:
output = open("file01.jpg","w")
output.write(pic.read())
output.close()
#twapi = tweepy.API(auth)
#twapi.update_with_media(imagefilename, status=xxx)
I'm new at this sort of thing, so I'm not really sure why this isn't working. No file is created, and no 'sels' are being created.
Your problem is that the image search (Search.aspx) doesn't just return a HTML page with all the content in it, but instead delivers a JavaScript application that then makes several subsequent requests (see AJAX) to fetch raw information about assets, and then builds a HTML page dynamically that contains all those search results.
You can observe this behavior by looking at the HTTP requests your browser makes when you load the page. Use the Firebug extension for Firefox or the builtin Chrome developer tools and open the Network tab. Look for requests that happen after the initial page load, particularly POST requests.
In this case the interesting requests are the ones to Thumbnails.ashx, Details.ashx and finally MediaStream.ashx. Once you identify those requests, look at what headers and form data your browser sends, and emulate that behavior with plain HTTP requests from Python.
The response from Thumbnails.ashx is actually JSON, so it's much easier to parse than HTML.
In this example I use the requests module because it's much, much better and easier to use than urllib(2). If you don't have it, install it with pip install requests.
Try this:
import requests
import urllib
BASE_URL = 'http://www.phillyhistory.org/PhotoArchive/'
QUERY_URL = BASE_URL + 'Thumbnails.ashx'
DETAILS_URL = BASE_URL + 'Details.ashx'
def get_media_url(asset_id):
    """
    Look up the media stream URL for an asset.

    Posts the asset id to ``DETAILS_URL`` and builds the URL from the first
    media entry of the first asset in the JSON response.

    :param asset_id: asset id as returned by the thumbnails query
    :return: URL of the asset's ``MediaStream.ashx`` resource
    :raises requests.HTTPError: if the details request fails
    """
    response = requests.post(DETAILS_URL, data={'assetId': asset_id})
    # Fail on an HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()
    image_details = response.json()
    media_id = image_details['assets'][0]['medialist'][0]['mediaId']
    # BASE_URL already ends with '/', so no separator is added here.
    return '{}MediaStream.ashx?mediaId={}'.format(BASE_URL, media_id)
def save_image(asset_id):
    """
    Download an asset's image and save it as ``<asset_id>.jpg``.

    :param asset_id: asset id as returned by the thumbnails query
    :return: name of the file that was written
    :raises requests.HTTPError: if the image download fails
    """
    filename = '{}.jpg'.format(asset_id)
    url = get_media_url(asset_id)
    # Download before opening the file so a failed request does not leave
    # an empty .jpg behind.
    response = requests.get(url)
    response.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(response.content)
    return filename
urlqs = {
'maxx': '-8321310.550067',
'maxy': '4912533.794965',
'minx': '-8413034.983992',
'miny': '4805521.955385',
'onlyWithoutLoc': 'false',
'sortOrderM': 'DISTANCE',
'sortOrderP': 'DISTANCE',
'type': 'area',
'updateDays': '0',
'withoutLoc': 'false',
'withoutMedia': 'false'
}
data = {
'start': 0,
'limit': 12,
'noStore': 'false',
'request': 'Images',
'urlqs': urllib.urlencode(urlqs)
}
response = requests.post(QUERY_URL, data=data)
result = response.json()
print '{} images found'.format(result['totalImages'])
for image in result['images']:
asset_id = image['assetId']
print 'Name: {}'.format(image['name'])
print 'Asset ID: {}'.format(asset_id)
filename = save_image(asset_id)
print "Saved image to '{}'.\n".format(filename)
Note: I didn't check what http://www.phillyhistory.org/'s Terms of Service have to say about automated crawling. You need to check yourself and make sure you're not in violation of their ToS with whatever you're doing.
I need to read a url link from text file in python as a variable, and use it in html.
The text file "file.txt" contains only one line, "http://188.xxx.xxx.xx:8878". This line should be saved in the variable "link", and then I should use the contents of this variable in the HTML, so that the link is opened when I click on the button image "go_online.png". I tried to change my code as follows but it doesn't work! Any help please?
#!/usr/bin/python
import cherrypy
import os.path
from auth import AuthController, require, member_of, name_is
class Server(object):
_cp_config = {
'tools.sessions.on': True,
'tools.auth.on': True
}
auth = AuthController()
#cherrypy.expose
#require()
def index(self):
f = open ("file.txt","r")
link = f.read()
print link
f.close()
html = """
<html>
<script language="javascript" type="text/javascript">
var var_link = '{{ link }}';
</script>
<body>
<p>{htmlText}
<p>
</body>
</html>
"""
myText = ''
myText = "Hellow World"
return html.format(htmlText=myText)
index.exposed = True
#configuration
conf = {
'global' : {
'server.socket_host': '0.0.0.0', #0.0.0.0 or specific IP
'server.socket_port': 8085 #server port
},
'/images': { #images served as static files
'tools.staticdir.on': True,
'tools.staticdir.dir': os.path.abspath('/home/ubuntu/webserver/images')
}
}
cherrypy.quickstart(Server(), config=conf)
First off, I'm not sure that the JavaScript part makes any sense, so just leave it out. Also, you're opening a p tag but not closing it. I'm not sure what your templating engine is, but you could just pass in the variables in pure Python. Also, make sure to put quotes around your link. So your code should be something like:
class Server(object):
    """CherryPy application whose index page links to the URL in file.txt."""

    _cp_config = {
        'tools.sessions.on': True,
        'tools.auth.on': True
    }
    auth = AuthController()

    @cherrypy.expose
    @require()
    def index(self):
        """
        Return the index page with a greeting and the "go online" link.

        The link target is read from ``file.txt``; surrounding whitespace
        (such as the trailing newline) is stripped before it is used.
        """
        with open("file.txt", "r") as f:
            link = f.read().strip()
        myText = "Hello World"
        # One %s placeholder per value in the tuple below; the link goes
        # into a quoted href attribute.
        html = """
<html>
<body>
<p>%s</p>
<a href="%s"><img src="/images/go_online.png" alt="Go online"></a>
</body>
</html>
""" % (myText, link)
        return html

    index.exposed = True
(By the way, the %s things are string placeholders that will be populated with the variables in %(firstString, secondString) at the end of the multi-line string.)