Editing a pdf page by page - python

I'm trying to make unique edits to individual pages in a pre-existing pdf. However, the edits remain the same.
I've tried using FPDF (wasn't sure of how to edit a pre-existing pdf with this) and then am now trying PYPDF2 with reportlab.
#
from PyPDF2 import PdfFileWriter, PdfFileReader
import io
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
def WriteOnPdf (targetpdf, pageTopicsDict):
packet = io.BytesIO()
# Create a new PDF with Reportlab
can = canvas.Canvas(packet, pagesize=letter)
can.setFont('Helvetica', 13)
can.drawString(5, 730, pageTopicsDict[0])
can.save()
# Move to the beginning of the StringIO buffer
packet.seek(0)
new_pdf = PdfFileReader(packet)
# Read your existing PDF
existing_pdf = PdfFileReader(open(targetpdf, "rb"))
output = PdfFileWriter()
# Add the "watermark" (which is the new pdf) on the existing page
for i in range(existing_pdf.numPages):
print(i, pageTopicsDict[i])
can.drawString(5, 730, pageTopicsDict[i])
page = existing_pdf.getPage(i)
page.mergePage(new_pdf.getPage(0))# index out of range if not set to 0.
output.addPage(page)
# Finally, write "output" to a real file
outputStream = open("destination.pdf", "wb")
output.write(outputStream)
outputStream.close()
dummyDict = {0: "abc", 1: "de, fg", 2: "hijklmn"}
WriteOnPdf ("test.pdf", dummyDict)
Expected: pdf with "abc" on top left hand corner of page 0, "de, fg" on page 1, "hijklmn" on page 2...
Actual: all pages have "abc"

Solved; initialized the packet and relevant variables in the for loop instead of outside.

Related

Text rotated when merging pdf pages using Pypdf2 and Reportlab

I'm trying to merge two pages
one from reportlab that has the text I wish and another one is my source pdf
But when I merge those two pages, my text is rotated 90 degree
Pdf created using Report lab -> Overlay Created using Reportlab
when Merged with Source pdf -> Source Pdf
Code that I have Used :
from PyPDF2 import PdfFileWriter, PdfFileReader
import io
from reportlab.pdfgen import canvas
from reportlab.lib.units import inch
packet = io.BytesIO()
c = canvas.Canvas(packet)
c.drawString(0,0,"Hello World")
c.save()
packet.seek(0)
packet_pdf = PdfFileReader(packet)
input_pdf = PdfFileReader(open("Source.pdf", "rb"))
output = PdfFileWriter()
page = input_pdf.getPage(0)
page.mergePage(packet_pdf.getPage(0))
output.addPage(page)
outputStream = open("destination.pdf", "wb")
output.write(outputStream)
outputStream.close()
reference : Add text to Existing PDF using Python
Refered to this and created an own solution -> Python PyPDF2 merge rotated pages
from PyPDF2 import PdfFileWriter, PdfFileReader
import io
from reportlab.pdfgen.canvas import Canvas
page_to_merge = 0 #Refers to the First page of PDF
xcoor = 250 #To be changed according to your pdf
ycoor = 650 #To be changed according to your pdf
input_pdf = PdfFileReader(open("Source.pdf", "rb"))
page_count = input_pdf.getNumPages()
inputpdf_page_to_be_merged = input_pdf.getPage(page_to_merge)
packet = io.BytesIO()
c = Canvas(packet,pagesize=(inputpdf_page_to_be_merged.mediaBox.getWidth(),inputpdf_page_to_be_merged.mediaBox.getHeight()))
c.drawString(xcoor,ycoor,"Hello World")
c.save()
packet.seek(0)
overlay_pdf = PdfFileReader(packet)
overlay = overlay_pdf.getPage(0)
output = PdfFileWriter()
for PAGE in range(page_count):
if PAGE == page_to_merge:
inputpdf_page_to_be_merged.mergeRotatedTranslatedPage(overlay,
inputpdf_page_to_be_merged.get('/Rotate') or 0,
overlay.mediaBox.getWidth()/2, overlay.mediaBox.getWidth()/2)
output.addPage(inputpdf_page_to_be_merged)
else:
Page_in_pdf = input_pdf.getPage(PAGE)
output.addPage(Page_in_pdf)
outputStream = open("destination.pdf", "wb")
output.write(outputStream)
outputStream.close()

Separating large PDF document into smaller documents based on content

I have a large pdf file with very specific formatting, a bunch of reports if you will, all in one big pdf document. I'm using pdfplumber to extract specific text within a bounding box on each page. I've called this variable scene_text. The value of scene_text changes throughout the document, but many pages contain the same value for scene_text. I want to separate the large pdf into multiple smaller pdf files named according to their scene_text value with each pdf file containing all of the pages with matching scene_text. I'm terribly stuck, any help would be appreciated.
import pdfplumber
from PyPDF2 import PdfFileWriter, PdfFileReader
import os
file = 'report.pdf'
with pdfplumber.open(file) as pdf:
for i, page in enumerate(pdf.pages):
# get scene text for current page
bounding_box = (880, 137, 1048, 180)
scene_text = page.within_bbox(bounding_box, relative=True).extract_text()
previous_page_text = pdf.pages[i-1].within_bbox(bounding_box, relative=True).extract_text()
inputpdf = PdfFileReader(open(file, "rb"))
output = PdfFileWriter()
for x, page in enumerate(pdf.pages):
st2 = page.within_bbox(bounding_box, relative=True).extract_text()
if st2 != previous_page_text:
output.addPage(inputpdf.getPage(i))
if st2 == scene_text:
if st2 == pdf.pages[x+1].within_bbox(bounding_box, relative=True).extract_text():
previous_page_text = st2
with open("page_export/" + scene_text + ".pdf", "wb") as output_stream:
output.write(output_stream)

Correcting PDF pages with wrong orientation information with PyPDF2

I'm trying to merge a number of PDF documents in one. However, the documents have different sources, some of them being created in the computer, some of them scanned with different scanners / softwares. I'm scaling them all to A4 size before joining them.
My problem is with some documents that display OK but, when I check the orientation, it looks as if the document is rotated.
For example, for this document here, it displays OK in the browser and Acrobat Reader, but if I get the information using PyPDF2:
from PyPDF2 import PdfReader
reader = PdfReader(path)
for page in reader.pages:
orientation = page.get('/Rotate')
print(f"Document: {path}")
print(f" Orientation: {orientation}")
print(f" mediabox: {page.mediabox}")
print(f" artbox: {page.artbox}")
print(f" bleedbox: {page.bleedbox}")
print(f" cropbox: {page.cropbox}")
print(f" trimbox: {page.trimbox}")
I get:
Orientation: 90
mediaBox: RectangleObject([0, 0, 792, 542])
artBox: RectangleObject([0, 0, 792, 542])
bleedBox: RectangleObject([0, 0, 792, 542])
cropBox: RectangleObject([0, 0, 792, 542])
trimBox: RectangleObject([0, 0, 792, 542])
This is annoying because, in a subsequent step, I'm adding page numbers to the document, and they all get placed wrong because of the orientation.
Notice that the pages display correctly, they only have the wrong orientation data somehow. If I try to set the orientation rotating the page, e.g.
page.rotate(-orientation)
then they display sideways instead.
How can I correct the orientation?
There are two ways to change the orientation of a page. I don't quite understand why you want the /Rotate attribute to be zero; it doesn't tell you what the correct orientation is but rather applies a rotation to the content of that page before the content is displayed to the users.
The /Rotate attribute
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.generic import NameObject, NumberObject
# Add stuff to the PdfWriter
reader = PdfReader("example.pdf")
writer = PdfWriter()
writer.add_page(reader.pages[0])
# Change it in the writer
writer.pages[0][NameObject("/Rotate")] = NumberObject(90)
# Or simpler: writer.rotate(90)
# Write content back
with open("output.pdf", "wb") as fp:
writer.write(fp)
Use a transformation matrix
Using the PyPDF2 docs on transformations:
from PyPDF2 import PdfReader, PdfWriter, Transformation
from PyPDF2.generic import NameObject, NumberObject
# Add stuff to the PdfWriter
reader = PdfReader("example.pdf")
writer = PdfWriter()
writer.add_page(reader.pages[0])
# Change it in the writer
transformation = Transformation().rotate(90)
# you need to add .translate(tx=123, tx=456)
# as the coordinate system typically has it's origin in the bottom-left corner
writer.pages[0].add_transformation(transformation)
# Or simpler: writer.rotate(90)
# Write content back
with open("output.pdf", "wb") as fp:
writer.write(fp)

Batch generating barcodes using ReportLab

Yesterday, I asked a question that was perhaps too broad.
Today, I've acted on my ideas in an effort to implement a solution.
Using ReportLab, pdfquery and PyPDF2, I'm trying to automate the process of generating barcodes on hundreds of pages in a PDF document.
Each page needs to have one barcode. However, if a page has a letter in the top right ('A' through 'E') then it needs to use the same barcode as the previous page. The files with letters on the top right are duplicate forms with similar information.
If there is no letter present, then a unique barcode number (incremented by one is fine) should be used on that page.
My code seems to work, but I'm having two issues:
The barcode moves around ever so slightly (minor issue).
The barcode value will not change (major issue). Only the first barcode number is set on all pages.
I can't seem to tell why the value isn't changing. Does anyone have an a clue?
Code is here:
import pdfquery
import os
from io import BytesIO
from PyPDF2 import PdfFileWriter, PdfFileReader
from reportlab.graphics.barcode import eanbc
from reportlab.graphics.shapes import Drawing
from reportlab.lib.pagesizes import letter
from reportlab.lib.units import mm
from reportlab.pdfgen import canvas
from reportlab.graphics import renderPDF
pdf = pdfquery.PDFQuery("letters-test.pdf")
total_pages = pdf.doc.catalog['Pages'].resolve()['Count']
print("Total pages", total_pages)
barcode_value = 12345670
output = PdfFileWriter()
for i in range(0, total_pages):
pdf.load(i) # Load page i into memory
duplicate_letter = pdf.pq('LTTextLineHorizontal:in_bbox("432,720,612,820")').text()
if duplicate_letter != '':
print("Page " + str(i+1) + " letter " + str(duplicate_letter))
print(barcode_value)
packet = BytesIO()
c = canvas.Canvas(packet, pagesize=letter)
# draw the eanbc8 code
barcode_eanbc8 = eanbc.Ean8BarcodeWidget(str(barcode_value))
bounds = barcode_eanbc8.getBounds()
width = bounds[2] - bounds[0]
height = bounds[3] - bounds[1]
d = Drawing(50, 10)
d.add(barcode_eanbc8)
renderPDF.draw(d, c, 400, 700)
c.save()
packet.seek(0)
new_pdf = PdfFileReader(packet)
# read existing PDF
existing_pdf = PdfFileReader(open("letters-test.pdf", "rb"))
# add the "watermark" (which is the new pdf) on the existing page
page = existing_pdf.getPage(i)
page.mergePage(new_pdf.getPage(0))
output.addPage(page)
else:
# increment barcode value
barcode_value += 1
print("Page " + str(i+1) + " isn't a duplicate.")
print(barcode_value)
packet = BytesIO()
c = canvas.Canvas(packet, pagesize=letter)
# draw the eanbc8 code
barcode_eanbc8 = eanbc.Ean8BarcodeWidget(str(barcode_value))
bounds = barcode_eanbc8.getBounds()
width = bounds[2] - bounds[0]
height = bounds[3] - bounds[1]
d = Drawing(50, 10)
d.add(barcode_eanbc8)
renderPDF.draw(d, c, 420, 710)
c.save()
packet.seek(0)
new_pdf = PdfFileReader(packet)
# read existing PDF
existing_pdf = PdfFileReader(open("letters-test.pdf", "rb"))
# add the "watermark" (which is the new pdf) on the existing page
page = existing_pdf.getPage(i)
page.mergePage(new_pdf.getPage(0))
output.addPage(page)
# Clear page i from memory and re load.
# pdf = pdfquery.PDFQuery("letters-test.pdf")
outputStream = open("newpdf.pdf", "wb")
output.write(outputStream)
outputStream.close()
And here is letters-test.pdf
as Kamil Nicki's answer pointed out, Ean8BarcodeWidget limiting effective digits to 7:
class Ean8BarcodeWidget(Ean13BarcodeWidget):
_digits=7
...
self.value=max(self._digits-len(value),0)*'0'+value[:self._digits]
you may change your encoding scheme or use EAN 13 barcode with Ean13BarcodeWidget, which has 12 digits usable.
The reason why your barcode is not changing is that you provided too long integer into eanbc.Ean8BarcodeWidget.
According to EAN standard EAN-8 barcodes are 8 digits long (7 digits + checkdigit)
Solution:
If you change barcode_value from 12345670 to 1234560 and run your script you will see that barcode value is increased as you want and checkdigit is appended as eighth number.
With that information in hand you should use only 7 digits to encode information in barcode.

PyPDF2 & ReportLab editing a PDF and merging multiple pages

I'm trying to add some text (page numbers) to an existing PDF file.
Using PyPDF2 package iterating through the original file, creating a canvas, then merging the two files. My problem is that once the program is finished, the new pdf file only has the last page from the original pdf, not all the pages.
eg. If the original pdf has 33 pages, the new pdf only has the last page but with the correct numbering.
Maybe the code can do a better job at explainng:
def test(location, reference, destination):
file = open(location, "rb")
read_pdf = PyPDF2.PdfFileReader(file)
for i in range (0, read_pdf.getNumPages()):
page = read_pdf.getPage(i)
pageReference = "%s_%s"%(reference,format(i+1, '03d'))
width = getPageSizeW(page)
height = getPageSizeH(page)
pagesize = (width, height)
packet = io.BytesIO()
can = canvas.Canvas(packet, pagesize = pagesize)
can.setFillColorRGB(1,0,0)
can.drawString(height*3.5, height*2.75, pageReference)
can.save()
packet.seek(0)
new_pdf = PyPDF2.PdfFileReader(packet)
#add new pdf to old pdf
output = PyPDF2.PdfFileWriter()
page.mergePage(new_pdf.getPage(0))
output.addPage(page)
outputStream = open(destination, 'wb')
output.write(outputStream)
print(pageReference)
outputStream.close()
file.close()
def getPageSizeH(p):
h = float(p.mediaBox.getHeight()) * 0.352
return h
def getPageSizeW(p):
w = float(p.mediaBox.getWidth()) * 0.352
return w
Also if anyone has any ideas on how to insert the references on the top right in a better way, it would be appreciated.
I'm not an expert at PyPDF2 but it looks like the only area in your function where you have PyPDF2.PdfFileWriter() is in your for loop, so I suspect you are initiating a new file and adding to it each time in your for loop, which may cause the end result what you see.

Categories

Resources