Cropping PDF files but different dimension for each page

Cropping PDF files but different dimension for each page - python

I'm trying to crop a PDF file but different dimensions for each page. I have managed to use PyPDF2 but not sure how to crop on different dimensions for the second (and final) page. This is what I have for dimensions for first page so far. What do I need to add?
from PyPDF2 import PdfFileWriter, PdfFileReader
print_on = True
output = PdfFileWriter()
input = PdfFileReader(open('/Users/Downloads/lol.pdf', 'rb'))
n = input.getNumPages()
for i in range(n):
page = input.getPage(i)
page.cropBox.upperLeft = (63.389830508474574,643.7972508591065)
page.cropBox.lowerRight = (561.8644067796611,483.2096219931271)
output.addPage(page)
outputStream = open('/Users/Downloads/result.pdf', 'wb')
print('Done')
output.write(outputStream)
outputStream.close()

Related

Converting PDF page to JPG returns blank

I have a function that asks the user for a PDF file and receive the page number the user wish to convert into an image. The function usually works fine however with a few PDFs it does not work, the image that is returned is blank and it has 4 mega bytes. Apparently it has something to do with the size of the file. Is there a way to solve this problem?
from PyPDF2 import PdfFileReader, PdfFileWriter
from tkinter.filedialog import askopenfilename
from pdf2image import convert_from_path
import os
import PIL
PIL.Image.MAX_IMAGE_PIXELS = None
def convert_pdf(page_number):
filename = askopenfilename()
pdf_file_path = filename
file_base_name = pdf_file_path.replace('.pdf', '')
pdf = PdfFileReader(pdf_file_path)
pages = [page_number]
pdfWriter = PdfFileWriter()
for page_num in pages:
pdfWriter.addPage(pdf.getPage(page_num))
with open('{0}_subset.pdf'.format(file_base_name[:-5]), 'wb') as f:
pdfWriter.write(f)
f.close()
n = file_base_name[:-5]
nome = f'{n}_subset.pdf'
pages = convert_from_path(nome, poppler_path=r'C:\Program Files\poppler-0.68.0\bin')
i = 1
name = os.path.basename(nome).split('/')[-1][:-4]
for page in pages:
image_name = "Page_" + str(i) + f"{name}.jpg"
page.save(image_name, "JPEG")
i = i + 1

The solution to this problem was to change the DPI parameter of convert_from_path function. It is important to leave the DPI as it is, since I found that certain images become really small, and therefore unreadable.
try:
pages = convert_from_path(nome, poppler_path=r'C:\Program Files\poppler-0.68.0\bin')
i = 1
except:
PIL.Image.MAX_IMAGE_PIXELS = None
pages = convert_from_path(nome, 25,poppler_path=r'C:\Program Files\poppler-0.68.0\bin')
i = 1

Text rotated when merging pdf pages using Pypdf2 and Reportlab

I'm trying to merge two pages
one from reportlab that has the text I wish and another one is my source pdf
But when I merge those two pages, my text is rotated 90 degree
Pdf created using Report lab -> Overlay Created using Reportlab
when Merged with Source pdf -> Source Pdf
Code that I have Used :
from PyPDF2 import PdfFileWriter, PdfFileReader
import io
from reportlab.pdfgen import canvas
from reportlab.lib.units import inch
packet = io.BytesIO()
c = canvas.Canvas(packet)
c.drawString(0,0,"Hello World")
c.save()
packet.seek(0)
packet_pdf = PdfFileReader(packet)
input_pdf = PdfFileReader(open("Source.pdf", "rb"))
output = PdfFileWriter()
page = input_pdf.getPage(0)
page.mergePage(packet_pdf.getPage(0))
output.addPage(page)
outputStream = open("destination.pdf", "wb")
output.write(outputStream)
outputStream.close()
reference : Add text to Existing PDF using Python

Refered to this and created an own solution -> Python PyPDF2 merge rotated pages
from PyPDF2 import PdfFileWriter, PdfFileReader
import io
from reportlab.pdfgen.canvas import Canvas
page_to_merge = 0 #Refers to the First page of PDF
xcoor = 250 #To be changed according to your pdf
ycoor = 650 #To be changed according to your pdf
input_pdf = PdfFileReader(open("Source.pdf", "rb"))
page_count = input_pdf.getNumPages()
inputpdf_page_to_be_merged = input_pdf.getPage(page_to_merge)
packet = io.BytesIO()
c = Canvas(packet,pagesize=(inputpdf_page_to_be_merged.mediaBox.getWidth(),inputpdf_page_to_be_merged.mediaBox.getHeight()))
c.drawString(xcoor,ycoor,"Hello World")
c.save()
packet.seek(0)
overlay_pdf = PdfFileReader(packet)
overlay = overlay_pdf.getPage(0)
output = PdfFileWriter()
for PAGE in range(page_count):
if PAGE == page_to_merge:
inputpdf_page_to_be_merged.mergeRotatedTranslatedPage(overlay,
inputpdf_page_to_be_merged.get('/Rotate') or 0,
overlay.mediaBox.getWidth()/2, overlay.mediaBox.getWidth()/2)
output.addPage(inputpdf_page_to_be_merged)
else:
Page_in_pdf = input_pdf.getPage(PAGE)
output.addPage(Page_in_pdf)
outputStream = open("destination.pdf", "wb")
output.write(outputStream)
outputStream.close()

crop a pdf with PyPDF2

I've been working on a project in which I extract table data from a pdf with neural network,
I successfuly detect tables and get their coordinate (x,y,width,height) , I've been trying to crop the pdf with pypdf2 to isolate the table but for some reason cropping never matches the desired outcome.
After running inference i get these coordinates
[[5.0948269e+01, 1.5970685e+02, 1.1579385e+03, 2.7092386e+02
9.9353129e-01]]
the 5th number is my neural network precision , we can safely ignore it
trying them in pyplot works , so there's no problem with them:
However using the same coords in pypdf2 is always off
from PyPDF2 import PdfFileWriter, PdfFileReader
with open("mypdf.pdf", "rb") as in_f:
input1 = PdfFileReader(in_f)
output = PdfFileWriter()
numPages = input1.getNumPages()
for i in range(numPages):
page = input1.getPage(i)
page.cropBox.upperLeft = (5.0948269e+01,1.5970685e+02)
page.cropBox.upperLeft = (1.1579385e+03, 2.7092386e+02)
output.addPage(page)
with open("out.pdf", "wb") as out_f:
output.write(out_f)
This is the output I get :
Am i missing something ?
thank you !

Here you go:
from PyPDF2 import PdfFileWriter, PdfFileReader
with open("mypdf.pdf", "rb") as in_f:
input1 = PdfFileReader(in_f)
output = PdfFileWriter()
numPages = input1.getNumPages()
x, y, w, h = (5.0948269e+01, 1.5970685e+02, 1.1579385e+03, 2.7092386e+02)
page_x, page_y = input1.getPage(0).cropBox.getUpperLeft()
upperLeft = [page_x.as_numeric(), page_y.as_numeric()] # convert PyPDF2.FloatObjects into floats
new_upperLeft = (upperLeft[0] + x, upperLeft[1] - y)
new_lowerRight = (new_upperLeft[0] + w, new_upperLeft[1] - h)
for i in range(numPages):
page = input1.getPage(i)
page.cropBox.upperLeft = new_upperLeft
page.cropBox.lowerRight = new_lowerRight
output.addPage(page)
with open("out.pdf", "wb") as out_f:
output.write(out_f)
Note: in PyPDF2 the origin of coordinates placed in the lower left corner of a page. And the Y-axis is directed from the bottom to up. Not like on the screen. So if you want to get a PDF-coordinate of top edge of your crop area you need to subtract y-coordinate of the top edge of the crop area from the height of the page.

PyPDF2 & ReportLab editing a PDF and merging multiple pages

I'm trying to add some text (page numbers) to an existing PDF file.
Using PyPDF2 package iterating through the original file, creating a canvas, then merging the two files. My problem is that once the program is finished, the new pdf file only has the last page from the original pdf, not all the pages.
eg. If the original pdf has 33 pages, the new pdf only has the last page but with the correct numbering.
Maybe the code can do a better job at explainng:
def test(location, reference, destination):
file = open(location, "rb")
read_pdf = PyPDF2.PdfFileReader(file)
for i in range (0, read_pdf.getNumPages()):
page = read_pdf.getPage(i)
pageReference = "%s_%s"%(reference,format(i+1, '03d'))
width = getPageSizeW(page)
height = getPageSizeH(page)
pagesize = (width, height)
packet = io.BytesIO()
can = canvas.Canvas(packet, pagesize = pagesize)
can.setFillColorRGB(1,0,0)
can.drawString(height*3.5, height*2.75, pageReference)
can.save()
packet.seek(0)
new_pdf = PyPDF2.PdfFileReader(packet)
#add new pdf to old pdf
output = PyPDF2.PdfFileWriter()
page.mergePage(new_pdf.getPage(0))
output.addPage(page)
outputStream = open(destination, 'wb')
output.write(outputStream)
print(pageReference)
outputStream.close()
file.close()
def getPageSizeH(p):
h = float(p.mediaBox.getHeight()) * 0.352
return h
def getPageSizeW(p):
w = float(p.mediaBox.getWidth()) * 0.352
return w
Also if anyone has any ideas on how to insert the references on the top right in a better way, it would be appreciated.

I'm not an expert at PyPDF2 but it looks like the only area in your function where you have PyPDF2.PdfFileWriter() is in your for loop, so I suspect you are initiating a new file and adding to it each time in your for loop, which may cause the end result what you see.

How to "write to variable" instead of "to file" in Python

I'm trying to write a function which splits a pdf into separate pages. From this SO answer. I copied a simple function which splits a pdf into separate pages:
def splitPdf(file_):
pdf = PdfFileReader(file_)
pages = []
for i in range(pdf.getNumPages()):
output = PdfFileWriter()
output.addPage(pdf.getPage(i))
with open("document-page%s.pdf" % i, "wb") as outputStream:
output.write(outputStream)
return pages
This however, writes the new PDFs to file, instead of returning a list of the new PDFs as file variables. So I changed the line of output.write(outputStream) to:
pages.append(outputStream)
When trying to write the elements in the pages list however, I get a ValueError: I/O operation on closed file.
Does anybody know how I can add the new files to the list and return them, instead of writing them to file? All tips are welcome!

It is not completely clear what you mean by "list of PDFs as file variables. If you want to create strings instead of files with PDF contents, and return a list of such strings, replace open() with StringIO and call getvalue() to obtain the contents:
import cStringIO
def splitPdf(file_):
pdf = PdfFileReader(file_)
pages = []
for i in range(pdf.getNumPages()):
output = PdfFileWriter()
output.addPage(pdf.getPage(i))
io = cStringIO.StringIO()
output.write(io)
pages.append(io.getvalue())
return pages

You can use the in-memory binary streams in the io module. This will store the pdf files in your memory.
import io
def splitPdf(file_):
pdf = PdfFileReader(file_)
pages = []
for i in range(pdf.getNumPages()):
outputStream = io.BytesIO()
output = PdfFileWriter()
output.addPage(pdf.getPage(i))
output.write(outputStream)
# Move the stream position to the beginning,
# making it easier for other code to read
outputStream.seek(0)
pages.append(outputStream)
return pages
To later write the objects to a file, use shutil.copyfileobj:
import shutil
with open('page0.pdf', 'wb') as out:
shutil.copyfileobj(pages[0], out)

Haven't used PdfFileWriter, but think that this should work.
def splitPdf(file_):
pdf = PdfFileReader(file_)
pages = []
for i in range(pdf.getNumPages()):
output = PdfFileWriter()
output.addPage(pdf.getPage(i))
pages.append(output)
return pages
def writePdf(pages):
i = 1
for p in pages:
with open("document-page%s.pdf" % i, "wb") as outputStream:
p.write(outputStream)
i += 1

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Cropping PDF files but different dimension for each page - python

Related

Converting PDF page to JPG returns blank

Text rotated when merging pdf pages using Pypdf2 and Reportlab

crop a pdf with PyPDF2

PyPDF2 & ReportLab editing a PDF and merging multiple pages

How to "write to variable" instead of "to file" in Python

Categories

Resources