How to add watermark to pdf file? - python

How can I add a watermark to the PDF file generated by this code?
import xhtml2pdf
from xhtml2pdf import pisa
def delivery_cancel(request, did):
    """Return the ``delivery_cancel.html`` template rendered as an inline PDF.

    Looks up the ``Delivery`` with primary key ``did`` that belongs to
    ``request.user`` (``get_object_or_404`` handles the missing case), renders
    the template to markup, converts it with xhtml2pdf and writes the bytes
    into an ``application/pdf`` response named ``mypdf.pdf``.
    """
    d_instance = get_object_or_404(Delivery, pk=did, user=request.user)
    user = request.user
    users = user.get_profile()
    contents = render_to_string(
        'delivery_cancel.html',
        {'delivery': d_instance, 'users': users, 'user': user},
    )
    # ``content_type`` is the supported keyword; the ``mimetype`` spelling
    # was removed from HttpResponse in Django 1.7.
    response = HttpResponse(content_type='application/pdf')
    response['Content-Disposition'] = 'inline; filename=mypdf.pdf'
    result = StringIO.StringIO()
    try:
        # The returned pisa document object is not needed here, so it is
        # no longer bound to an unused local.
        pisa.pisaDocument(
            StringIO.StringIO(contents.encode('utf-8')),
            result,
            show_error_as_pdf=True,
            encoding='UTF-8',
        )
        response.write(result.getvalue())
    finally:
        # Release the buffer even if the conversion raises.
        result.close()
    return response
I tried to use reportlab but I failed so I'm asking for another solution.

The input to xhtml2pdf is XHTML, so you probably want to specify your watermark there. The documentation says to use a background-image on #page.
Alternatively, you can create a single-page PDF that just contains the watermark and apply it to your generated file after the fact using something like pdftk's background option.

My approach is longer, but it should solve most of the problems you are likely to face.
With this script you can read a list of email addresses from an xlsx sheet and stamp each address as a watermark on every page of the PDF you supply, producing one watermarked copy per address.
# Importing all required packages
import xlrd
from reportlab.pdfgen import canvas
from reportlab.lib.units import inch, cm
from PyPDF2 import PdfFileWriter, PdfFileReader
from reportlab.lib.colors import HexColor
# Build the watermarked booklet
def final_booklets(file_name, booklet):
    """Overlay page 0 of ``file_name`` onto every page of ``booklet``.

    The stamped pages are written to ``file_name + "_booklet.pdf"``.
    """
    stamp = PdfFileReader(file_name).getPage(0)
    source = PdfFileReader(booklet)
    writer = PdfFileWriter()
    # Stamp each page in turn and queue it for output
    for index in range(source.getNumPages()):
        current = source.getPage(index)
        current.mergePage(stamp)
        writer.addPage(current)
    with open(file_name + "_booklet.pdf", 'wb') as out:
        writer.write(out)
# Create the watermark PDF for one email address
def watermark_pdf(target, booklet):
    """Draw ``target`` as a rotated watermark and stamp ``booklet`` with it.

    The single-page watermark is saved as ``target + ".pdf"`` and then handed
    to ``final_booklets`` together with ``booklet``.
    """
    file_name = target + ".pdf"
    sheet = canvas.Canvas(file_name)
    sheet.saveState()
    # Pale fill colour, 40pt Helvetica
    sheet.setFillColor(HexColor('#dee0ea'))
    sheet.setFont("Helvetica", 40)
    # Move the origin, then tilt the text by 45 degrees
    sheet.translate(15 * cm, 20 * cm)
    sheet.rotate(45)
    sheet.drawRightString(0, 0, target)
    sheet.restoreState()
    sheet.showPage()
    sheet.save()
    final_booklets(file_name, booklet)
# Read the sheet to get everyone's email address
def read_xlsx(fn):
    """Watermark ``book.pdf`` once per value found in the workbook ``fn``.

    Reads the first sheet, skips row 0, and passes the value in column
    index 1 of every remaining row to ``watermark_pdf``.
    """
    booklet = "book.pdf"
    sheet = xlrd.open_workbook(fn).sheet_by_index(0)
    for row in range(1, sheet.nrows):
        watermark_pdf(sheet.cell(row, 1).value, booklet)
# Script entry point: process the addresses listed in Test.xlsx
if __name__ == "__main__":
    read_xlsx("Test.xlsx")
Original Github link: https://github.com/manojitballav/python_watermark/blob/master/master.py

Related

Python - Pdf created in landscape format : How to change in portrait?

I have a program at my company which generates PDFs in landscape format (I can't change the print settings).
I want to get the PDF in portrait format without rotating the text.
I tried PyPDF2, changing the MediaBox settings. It's almost right, but the data is centered; I want it to be at the top of the page.
How can I achieve that?
Thanks for your help !
Here's my Python script :
# -*- coding: utf-8 -*-
import datetime
import os
import sys
import PyPDF2
# Copy the first page of test.pdf into test2.pdf with its media box reset to
# 595.3 x 841.9 points. Both files are managed by ``with`` so they are closed
# even if reading or writing raises.
with open('test.pdf', 'rb') as pdf_in:
    pdf_reader = PyPDF2.PdfFileReader(pdf_in, strict=False)
    pdf_writer = PyPDF2.PdfFileWriter()
    page = pdf_reader.getPage(0)
    page.mediaBox.setLowerLeft((0, 0))
    page.mediaBox.setLowerRight((595.3, 0))
    page.mediaBox.setUpperLeft((0, 841.9))
    page.mediaBox.setUpperRight((595.3, 841.9))
    pdf_writer.addPage(page)
    # Write while the input is still open: the page added above still
    # belongs to the reader backed by pdf_in.
    with open('test2.pdf', 'wb') as pdf_out:
        pdf_writer.write(pdf_out)
Hope it helps
import PyPDF2
# Rotate every page of test.pdf by 90 degrees clockwise and save the result
# as test2.pdf.
with open('test.pdf', 'rb') as pdf_in:
    pdf_reader = PyPDF2.PdfFileReader(pdf_in, strict=False)
    pdf_writer = PyPDF2.PdfFileWriter()
    # add this part in your code
    for pagenum in range(pdf_reader.numPages):
        page = pdf_reader.getPage(pagenum)
        # you can add another condition here if you choose to rotate only certain pages
        # pass the rotation in degrees to rotateClockwise()
        page.rotateClockwise(90)
        # Each page is added exactly once; the original snippet called
        # addPage twice in a row, which duplicated pages in the output.
        pdf_writer.addPage(page)
    # Write while the input is still open, then let ``with`` close both files.
    with open('test2.pdf', 'wb') as pdf_out:
        pdf_writer.write(pdf_out)

Using Python, how to extract text and images from PDF + color strings and numbers from the output txt file

Using Python, I would like to
extract text from a PDF into a txt file (done)
color all numbers and specific strings of the txt file like this example (https://tex.stackexchange.com/questions/521383/how-to-highlight-numbers-only-outside-a-string-in-lstlisting) (not done)
Translate using Google translator all text to EN (not done)
extract images from the PDF file into PNGs/or a new PDF file containing all of the images (not done)
To perform 1. I used the following code which is working
pip install PyPDF2
from PyPDF2 import PdfFileReader, PdfFileWriter
# Dump the text of every page of the PDF into a .txt file, one section per page.
file_path = 'AR_Finland_2021.pdf'
pdf = PdfFileReader(file_path)
# Explicit UTF-8 so non-ASCII text does not depend on the platform's default
# encoding. The ``with`` block closes the file, so no f.close() is needed.
with open('AR_Finland_2021.txt', 'w', encoding='utf-8') as f:
    for page_num in range(pdf.numPages):
        # print('Page: {0}'.format(page_num))
        pageObj = pdf.getPage(page_num)
        try:
            txt = pageObj.extractText()
        except Exception as exc:
            # Still best effort (the page is skipped), but the failure is
            # reported instead of being swallowed by a bare ``except``.
            print('Skipping page {0}: {1}'.format(page_num + 1, exc))
            continue
        print(''.center(100, '-'))
        f.write('Page {0}\n'.format(page_num+1))
        f.write(''.center(100, '-'))
        f.write(txt)
To perform 3 (extract images) I tried the following code but always get an error.
pip install PyMuPDF Pillow
pip install PyMuPDF
pip install python-gettext
import fitz
import io
from PIL import Image
# Walk every page of the PDF, list the images on it, and save each image to
# the working directory as image<page>_<n>.<ext>.
# file path you want to extract images from
file = "AR_Finland_2021.pdf"
# open the file
pdf_file = fitz.open(file)
# iterate over PDF pages
for page_index in range(len(pdf_file)):
# get the page itself
page = pdf_file[page_index]
# NOTE(review): this is the call reported in the error quoted after this
# snippet ('Page' object has no attribute 'getImageList').
image_list = page.getImageList()
# printing number of images found in this page
if image_list:
print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
else:
print("[!] No images found on page", page_index)
# the image list is fetched a second time here rather than reusing image_list
for image_index, img in enumerate(page.getImageList(), start=1):
# get the XREF of the image
xref = img[0]
# extract the image bytes
base_image = pdf_file.extractImage(xref)
image_bytes = base_image["image"]
# get the image extension
image_ext = base_image["ext"]
# load it to PIL
image = Image.open(io.BytesIO(image_bytes))
# save it to local disk
# NOTE(review): the file object passed to save() is never explicitly closed.
image.save(open(f"image{page_index+1}_{image_index}.{image_ext}", "wb"))
Error:
----> 5 image_list = page.getImageList()
AttributeError: 'Page' object has no attribute 'getImageList'
Would someone know how to perform 3 (extract images) and 2 (color numbers and certain strings from the txt file extracted from the PDF)?
You can do:
import fitz
# Pull the raw bytes of every image on every page of the document.
doc = fitz.open("AR_Finland_2021.pdf")
for page in doc:
    for img_tuple in page.get_images():
        # The first entry of each image tuple is what extract_image() takes.
        xref = img_tuple[0]
        img_bytes = doc.extract_image(xref)['image']
        # Do whatever you want with it
See Page.get_images() and Document.extract_image()
To write these images into a new pdf:
# Calling fitz.open() with no arguments creates a new, empty PDF; passing a
# path would try to open an existing file instead.
doc = fitz.open()
# new_page() is the snake_case name, matching get_images / extract_image /
# insert_image used elsewhere in this answer (newPage is the legacy spelling).
page = doc.new_page()
img_location = fitz.Rect(100, 100, 200, 200)
page.insert_image(img_location, stream=img_bytes)
# Nothing reaches disk until the document is saved.
doc.save("/path/to/new/pdf")
See Rect for different ways to construct the rectangle, but you probably want to use img_tuple[1] from earlier. Again look at get_page_images to see the data available to you there.

PyPDF2 corrupts file when watermarking

I have been trying to speed up our date stamping process by adding a stamp as a watermark to PDFs through PyPDF2. I found the code below online as I'm pretty new to coding.
When I run this it seems to work, but the file is corrupted and won't open. Does anyone have any ideas where I am going wrong?
from PyPDF2 import PdfFileWriter, PdfFileReader
# Merge page 0 of the `watermark` PDF onto every page of `input_pdf` and
# write the merged pages out.
#   input_pdf  -- path of the PDF to stamp
#   output_pdf -- accepted but not referenced anywhere in the body
#   watermark  -- path of the PDF whose first page is used as the stamp
def create_watermark(input_pdf, output_pdf, watermark):
# second positional argument is False -- presumably PyPDF2's `strict` flag; confirm
watermark_obj = PdfFileReader(watermark,False,)
watermark_page = watermark_obj.getPage(0)
pdf_reader = PdfFileReader(input_pdf)
pdf_writer = PdfFileWriter()
# Watermark all the pages
for page in range(pdf_reader.getNumPages()):
# `page` is rebound from the page index to the page object
page = pdf_reader.getPage(page)
page.mergePage(watermark_page)
pdf_writer.addPage(page)
# NOTE(review): the result is written to input_pdf, which is opened in
# write mode here; output_pdf is never used.
with open(input_pdf, 'wb') as out:
pdf_writer.write(out)
if __name__ == '__main__':
    # The stamp, the destination and the source document, passed by keyword
    # so the call does not depend on argument order.
    watermark = "C:\\Users\\A***\\OneDrive - ***\\Desktop\\Invoice hold\\WM.pdf"
    output_pdf = "C:\\Users\\A***\\OneDrive - ***\\Desktop\\Invoice hold\\Test\\1 WM.pdf"
    input_pdf = "C:\\Users\\A***\\OneDrive - ***\\Desktop\\Invoice hold\\Test\\1.pdf"
    create_watermark(input_pdf=input_pdf, output_pdf=output_pdf, watermark=watermark)
If you want to save pdf file under the name of output_pdf,
try this :
# Write to output_pdf and let ``with`` close the file once the PDF has been
# written, so the data is flushed and the handle is not leaked.
with open(output_pdf, 'wb') as result:
    pdf_writer.write(result)
your code :
# Quoted from the question: input_pdf is opened in write mode ('wb'), so the
# merged pages are written over the source file.
with open(input_pdf, 'wb') as out:
pdf_writer.write(out)
Your code opens input_pdf in write mode, so it overwrites the input file.
If anything goes wrong while writing, the PDF file will be damaged.
I succeeded in inserting the watermark by applying your code and my proposed method.
I recommend checking if the pdf file is not damaged.

Download a file to heroku process it and send as an attachment

I'm trying to do the following:
Download a pdf file from S3 to my heroku.
Process the pdf.
Email the pdf as an attachment.
Is it possible? If yes, could you please give me a tip how?
I'm running Django and pdf is about 1MB.
This is my processing part:
from PyPDF2 import PdfFileWriter, PdfFileReader
from reportlab.pdfgen import canvas
from reportlab.lib.colors import HexColor
import os, sys
import requests
from io import BytesIO
# Download the source PDF and keep it in memory.
URL = "https://domainname.com/sample.pdf"
response = requests.get(URL)
# Stop here on an HTTP error instead of handing an error page to the PDF reader.
response.raise_for_status()
p = BytesIO(response.content)
# Kept from the original: leaves the stream positioned at its end.
p.seek(0, os.SEEK_END)
def watermark_product(watermark_text, input_file_path, output_file_path):
    """Stamp ``watermark_text`` on every page of a PDF and save the result.

    Args:
        watermark_text: Text drawn as the watermark.
        input_file_path: Whatever ``PdfFileReader`` accepts as the source PDF
            (the call site in this file passes an in-memory ``BytesIO``).
        output_file_path: Path the watermarked PDF is written to.

    The watermark page is rendered once into an in-memory buffer, so no
    fixed-name ``watermark.pdf`` scratch file is created or removed and
    overlapping calls cannot clobber each other's watermark.
    """
    watermark_buffer = BytesIO()
    c = canvas.Canvas(watermark_buffer)
    c.setFont("Helvetica", 24)
    c.setFillGray(0.5, 0.5)
    c.saveState()
    c.translate(500, 100)
    c.rotate(45)
    c.drawCentredString(0, 300, watermark_text)
    c.restoreState()
    c.save()
    # Rewind so the reader starts from the beginning of the rendered page.
    watermark_buffer.seek(0)
    # Parse the watermark once instead of re-reading it for every page.
    watermark_page = PdfFileReader(watermark_buffer).getPage(0)

    input_file = PdfFileReader(input_file_path)
    output_writer = PdfFileWriter()
    for single_page in range(input_file.getNumPages()):
        page = input_file.getPage(single_page)
        page.mergePage(watermark_page)
        output_writer.addPage(page)
    with open(output_file_path, "wb") as outputStream:
        output_writer.write(outputStream)
# Stamp the in-memory download `p` and write the result to w1.pdf.
watermark_product('testtesatd', p, 'w1.pdf')
EDIT:
I've managed to keep the pdf file in memory.

Merge Existing PDF into new ReportLab PDF via flowables

I have a reportlab SimpleDocTemplate that I return as a dynamic PDF. I am generating its content based on some Django model metadata. Here's my template setup:
# Flowables collected here are rendered later by doc.build().
Story = []
# In-memory target that receives the generated PDF.
buff = StringIO()
doc = SimpleDocTemplate(
    buff,
    pagesize=letter,
    rightMargin=72,
    leftMargin=72,
    topMargin=72,
    bottomMargin=18,
)
I can easily add textual metadata from the Entry model into the Story list to be built later:
# Title-case the entry title, wrap it in size-20 font markup and queue it.
title_markup = '<font size=20>%s</font>' % entry.title.title()
Story.append(Paragraph(title_markup, custom_styles["Custom"]))
And then generate the PDF to be returned in the response by calling build on the SimpleDocTemplate:
doc.build(Story, onFirstPage=entry_page_template, onLaterPages=entry_page_template)
pdf = buff.getvalue()
resp = HttpResponse(mimetype='application/x-download')
resp['Content-Disposition'] = 'attachment;filename=logbook.pdf'
resp.write(pdf)
return resp
One metadata field on the model is a file attachment. When those file attachments are PDFs, I'd like to merge them into the Story that I am generating; IE meaning a PDF of reportlab "flowable" type.
I'm attempting to do so using pdfrw, but haven't had any luck. Ideally I'd love to just call:
from pdfrw import PdfReader
# PdfReader is the name imported from pdfrw on the line above; the original
# line misspelled it as pPdfReader, which would raise NameError.
pdf = PdfReader(entry.document.file.path)
Story.append(pdf)
and append the pdf to the existing Story list to be included in the generation of the final document, as noted above.
Anyone have any ideas? I tried something similar using pagexobj to create the pdf, trying to follow this example:
http://code.google.com/p/pdfrw/source/browse/trunk/examples/rl1/subset.py
from pdfrw.buildxobj import pagexobj
from pdfrw.toreportlab import makerl
# NOTE(review): pagexobj is given the whole PdfReader here; the custom-flowable
# answer further down applies it to each element of PdfReader(...).pages instead.
pdf = pagexobj(PdfReader(entry.document.file.path))
But didn't have any luck either. Can someone explain to me the best way to merge an existing PDF file into a reportlab flowable? I'm no good with this stuff and have been banging my head on pdf-generation for days now. :) Any direction greatly appreciated!
I just had a similar task in a project. I used reportlab (open source version) to generate pdf files and pyPDF to facilitate the merge. My requirements were slightly different in that I just needed one page from each attachment, but I'm sure this is probably close enough for you to get the general idea.
from pyPdf import PdfFileReader, PdfFileWriter
def create_merged_pdf(user):
    """Merge a generated cover sheet with page 1 of each review PDF.

    Writes two files under ``settings.MEDIA_ROOT``: the cover sheet produced
    by ``create_cover_sheet`` and the merged document. Only the first page of
    the cover sheet and of each attachment is copied into the merged file.
    """
    basepath = settings.MEDIA_ROOT + "/"
    # Fetch the related reviews once and reuse them for both the cover sheet
    # and the merge loop.
    reviews = user.performancereview_set.all()

    # following block calls the function that uses reportlab to generate a pdf
    coversheet_path = basepath + "%s_%s_cover_%s.pdf" % (user.first_name, user.last_name, datetime.now().strftime("%f"))
    create_cover_sheet(coversheet_path, user, reviews)

    # now use the cover sheet and all of the performance reviews to create a merged pdf
    merged_path = basepath + "%s_%s_merged_%s.pdf" % (user.first_name, user.last_name, datetime.now().strftime("%f"))

    # for merged file result
    output = PdfFileWriter()
    # Source files must stay open until output.write() has run, so they are
    # tracked here and closed in ``finally``.
    open_files = []
    try:
        # open() replaces the Python-2-only file() builtin.
        cover_file = open(coversheet_path, "rb")
        open_files.append(cover_file)
        output.addPage(PdfFileReader(cover_file).getPage(0))

        # iterate through attached files and merge. I only needed the first page, YMMV
        for review in reviews:
            review_file = open(review.pdf_file.file.name, "rb")
            open_files.append(review_file)
            output.addPage(PdfFileReader(review_file).getPage(0))  # only first page of attachment

        # write out the merged file
        with open(merged_path, "wb") as outputStream:
            output.write(outputStream)
    finally:
        for handle in open_files:
            handle.close()
I used the following class to solve my issue. It inserts the PDFs as vector PDF images.
It works great because I needed to have a table of contents. The flowable object allowed the built in TOC functionality to work like a charm.
Is there a matplotlib flowable for ReportLab?
Note: If you have multiple pages in the file, you have to modify the class slightly. The sample class is designed to just read the first page of the PDF.
I know the question is a bit old but I'd like to provide a new solution using the latest PyPDF2.
You now have access to the PdfFileMerger, which can do exactly what you want, append PDFs to an existing file. You can even merge them in different positions and choose a subset or all the pages!
The official docs are here: https://pythonhosted.org/PyPDF2/PdfFileMerger.html
An example from the code in your question:
import tempfile
import PyPDF2
from django.core.files import File
# Sketch of a view body: build the reportlab document into one temporary
# file, append the attachment with PdfFileMerger into a second one, and
# return that as the HTTP response. The `...` lines are placeholders.
# Using a temporary file rather than a buffer in memory is probably better
temp_base = tempfile.TemporaryFile()
temp_final = tempfile.TemporaryFile()
# Create document, add what you want to the story, then build
doc = SimpleDocTemplate(temp_base, pagesize=letter, ...)
...
doc.build(...)
# Now, this is the fancy part. Create merger, add extra pages and save
merger = PyPDF2.PdfFileMerger()
merger.append(temp_base)
# Add any extra document, you can choose a subset of pages and add bookmarks
merger.append(entry.document.file, bookmark='Attachment')
merger.write(temp_final)
# NOTE(review): temp_final is not rewound after merger.write(); confirm the
# response below reads it from the start.
# Write the final file in the HTTP response
django_file = File(temp_final)
resp = HttpResponse(django_file, content_type='application/pdf')
resp['Content-Disposition'] = 'attachment;filename=logbook.pdf'
# Content-Length is only set when a size is reported for the file
if django_file.size is not None:
resp['Content-Length'] = django_file.size
return resp
Use this custom flowable:
class PDF_Flowable(Flowable):
    """Flowable that draws one page from a list of converted PDF pages.

    Args:
        P: Indexable collection of page objects accepted by ``makerl``
            (the usage snippet passes pages converted with ``pagexobj``).
        page_no: Index into ``P`` of the page this flowable draws.
        x: Horizontal offset applied before drawing (default 0).
        y: Vertical offset applied before drawing (default 0).
    """

    def __init__(self, P, page_no, x=0, y=0):
        Flowable.__init__(self)
        self.P = P
        self.page_no = page_no
        # The original draw() used undefined names x and y; they are now
        # optional constructor arguments so existing two-argument callers
        # keep working.
        self.x = x
        self.y = y

    def draw(self):
        """Draw the selected PDF page on the flowable's canvas."""
        canv = self.canv
        # Pair restoreState() with a saveState() so the translation does not
        # leak into whatever is drawn next.
        canv.saveState()
        canv.translate(self.x, self.y)
        canv.doForm(makerl(canv, self.P[self.page_no]))
        canv.restoreState()
and then after opening existing pdf i.e.
# Convert every page of the existing PDF, then queue one PDF_Flowable per
# page, each followed by a page break.
pages = [pagexobj(page) for page in PdfReader(BASE_DIR + "/out3.pdf").pages]
for page_no, _ in enumerate(pages):
    elements.extend([PDF_Flowable(pages, page_no), PageBreak()])
use this code to add this custom flowable in elements[].

Categories

Resources