PDF split by file size - python

I need help splitting a PDF file into parts that are each smaller than 10 MB. I have already managed to split the file into individual pages, but I could not work out how to split it by the size of the output files.
Below is the code I used to split into pages, using the PyPDF2 library, with the information I've collected right here in stackoverflow.
Thank you for your help.
from PyPDF2 import PdfFileWriter, PdfFileReader
from tkinter.filedialog import askopenfilename as procArq
# Ask the user which PDF to split; the dialog yields an empty value when it
# is cancelled, so stop with a clear message instead of failing in open().
url = procArq()
if not url:
    raise SystemExit("No PDF file selected.")

# Keep the source open while pages are copied and written; the context
# manager closes the handle afterwards (the original never closed it).
with open(url, "rb") as arquivo_de_entrada:
    arquivo = PdfFileReader(arquivo_de_entrada)
    for i in range(arquivo.numPages):
        # A fresh writer per iteration, so each output file holds one page.
        saida = PdfFileWriter()
        saida.addPage(arquivo.getPage(i))
        with open("document-page%s.pdf" % i, "wb") as arquivo_de_saida:
            saida.write(arquivo_de_saida)

Related

Position two A4 pages from a pdf file on one A3 page using python

I need to extract two pages from a pdf (A4) and put them side by side on an A3 page. I considered using PyPDF2, but I cannot find anything related in the documentation.
from PyPDF2 import PdfFileWriter, PdfFileReader
# Open the source inside a context manager so the handle is closed once the
# output has been written; it stays open until then because the reader may
# still need the stream while pages are being written.
with open("file.pdf", "rb") as inputStream:
    inputpdf = PdfFileReader(inputStream)
    output = PdfFileWriter()
    ...  # placeholder from the question: compose the A3 page here
    with open("out.pdf", "wb") as outputStream:
        output.write(outputStream)
Is it possible to do it with PyPDF2 or any other python package?

PyPDF2 split pdf keeping original pdf format

I am splitting a pdf file by ranges. My code works fine, but the problem is that the new pdf file does not preserve the format of the original pdf.
How can I split a pdf file without losing the original format?
from PyPDF2 import PdfFileReader, PdfFileWriter
# Split range, given as 1-based inclusive page numbers.
pgi = 30  # first page to keep
pgf = 37  # last page to keep
pdf_document = "test.pdf"
pdf = PdfFileReader(pdf_document)
pdf_writer = PdfFileWriter()
# getPage() is called with pgi - 1 for the first page, and range() excludes
# its stop value, so page number pgf is the last one copied.
for page in range(pgi - 1, pgf):
    current_page = pdf.getPage(page)
    pdf_writer.addPage(current_page)
# Write once, after every page in the range has been added.
with open(f'test-{pgi}-{pgf}.pdf', "wb") as out:
    pdf_writer.write(out)
PS: I don't have this problem when I use Adobe or MasterPdfEditor software.
The image below shows the original and new pdf format.
out format pdf

Download a file to heroku process it and send as an attachment

I'm trying to do the following:
Download a pdf file from S3 to my heroku.
Process the pdf.
Email the pdf as an attachment.
Is it possible? If yes, could you please give me a tip how?
I'm running Django and pdf is about 1MB.
This is my processing part:
from PyPDF2 import PdfFileWriter, PdfFileReader
from reportlab.pdfgen import canvas
from reportlab.lib.colors import HexColor
import os, sys
import requests
from io import BytesIO
URL = "https://domainname.com/sample.pdf"
# A timeout keeps the process from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as a PDF.
response = requests.get(URL, timeout=30)
response.raise_for_status()
# Hold the downloaded PDF in memory; nothing is written to disk here.
p = BytesIO(response.content)
# NOTE(review): leaves the cursor at the end of the buffer, as in the
# original; probably unnecessary if the PDF reader seeks by itself - confirm.
p.seek(0, os.SEEK_END)
def watermark_product(watermark_text, input_file_path, output_file_path):
    """Stamp every page of a PDF with a rotated grey text watermark.

    The watermark page is rendered into an in-memory buffer, so no temporary
    ``watermark.pdf`` is created in (or left behind in) the working directory.

    Args:
        watermark_text: Text drawn on each page.
        input_file_path: Source PDF, handed unchanged to ``PdfFileReader``
            (the caller in this file passes a ``BytesIO`` object).
        output_file_path: Path of the watermarked PDF to write.
    """
    # Render the single-page watermark into memory.
    watermark_buffer = BytesIO()
    c = canvas.Canvas(watermark_buffer)
    c.setFont("Helvetica", 24)
    c.setFillGray(0.5, 0.5)
    c.saveState()
    c.translate(500, 100)
    c.rotate(45)
    c.drawCentredString(0, 300, watermark_text)
    c.restoreState()
    c.save()
    watermark_buffer.seek(0)

    # Parse the watermark once; the same overlay page is merged onto every
    # input page instead of re-reading the watermark PDF per iteration.
    watermark_page = PdfFileReader(watermark_buffer).getPage(0)

    input_file = PdfFileReader(input_file_path)
    output_writer = PdfFileWriter()
    total_pages = input_file.getNumPages()
    for single_page in range(total_pages):
        page = input_file.getPage(single_page)
        page.mergePage(watermark_page)
        output_writer.addPage(page)

    with open(output_file_path, "wb") as outputStream:
        output_writer.write(outputStream)
# Watermark the in-memory PDF `p` and write the result to w1.pdf.
watermark_product('testtesatd', p, 'w1.pdf')
EDIT:
I've managed to keep the pdf file in memory.

PyPDF2 - merging pages from two different PDF files is not working

I'm trying to merge pages from two PDF files into a single PDF with a single page. So I tried the code below that uses PyPDF2:
from PyPDF2 import PdfFileReader,PdfFileWriter
import sys
# Paths of the two input PDFs, taken from the command line.
f = sys.argv[1]
k = sys.argv[2]
print f,k
# Both inputs are opened with file() in binary mode and handed to a reader.
file1 = PdfFileReader(file(f, "rb"))
file2 = PdfFileReader(file(k, "rb"))
output = PdfFileWriter()
# Merge the first page of the second file into the first page of the first.
page = file1.getPage(0)
page.mergePage(file2.getPage(0))
output.addPage(page)
# Write the single merged page to join.pdf and close the output explicitly.
outputStream = file("join.pdf", "wb")
output.write(outputStream)
outputStream.close()
It produces a single file and single page with the contents of page 1 from file 1, but I don't find any data from page 1 of file2. Seems like it didn't get merged.
On using your exact same code, I am able to get two PDF as merged PDF in one page with the second one overlapping the first one, I referred this link for detailed information.
And, instead of file() it is better to use open() as per this Python Documentation, so I did that.
Also, I made slight changes in your code but still, the working is same and correct on my machine. I am using Ubuntu 16.04 with python 2.7.
Here is the code:
from PyPDF2 import PdfFileReader,PdfFileWriter
import sys
# Paths of the two input PDFs, taken from the command line.
f = sys.argv[1]
k = sys.argv[2]
# %-formatting prints "f k" the same way under Python 2 and Python 3.
print("%s %s" % (f, k))
# Both inputs stay open until the merged page has been written, then the
# context manager closes them (the original left them open).
with open(f, "rb") as stream1, open(k, "rb") as stream2:
    file1 = PdfFileReader(stream1)
    file2 = PdfFileReader(stream2)
    output = PdfFileWriter()
    # Merge the first page of the second file into the first page of the first.
    page = file1.getPage(0)
    page.mergePage(file2.getPage(0))
    output.addPage(page)
    with open("join.pdf", "wb") as outputStream:
        output.write(outputStream)
I hope this helps.
UPDATE:
Here is the code which is working for me and merging the two pdf's page as single page.
from pyPdf import PdfFileWriter, PdfFileReader
from pdfnup import generateNup
initial_output = PdfFileWriter()
# Inputs stay open until the two-page file is written, then are closed.
with open("landscape1.pdf", "rb") as stream1, open("landscape2.pdf", "rb") as stream2:
    input1 = PdfFileReader(stream1)
    input2 = PdfFileReader(stream2)
    initial_output.addPage(input1.getPage(0))
    initial_output.addPage(input2.getPage(0))
    # creates a new pdf file with required pages as separate pages.
    # Closing it via `with` ensures the data is on disk before generateNup
    # reads final.pdf below.
    with open("final.pdf", "wb") as two_page_file:
        initial_output.write(two_page_file)
# merges newly created pdf file pages as one.
generateNup("final.pdf", 2, "intermediate.pdf")
# overwrites final.pdf with the merged page, rotated clockwise by 90 degrees.
final_output = PdfFileWriter()
with open("intermediate.pdf", "rb") as nup_stream:
    final_output.addPage(PdfFileReader(nup_stream).getPage(0).rotateClockwise(90))
    with open("final.pdf", "wb") as final_file:
        final_output.write(final_file)
I have added a new code and now it is also rotating the final pdf. Output PDF that you need is final.pdf
And here is the Google Drive link to my drive for PDF files. Also, I made slight changes into pdfnup.py for compatibility with my system for Immutableset if you want to use the same file then, you can find it too in the drive link above.
def merge_page(self, output_pdf, *input_pdfs):
    """Concatenate two or more PDF files into ``output_pdf``.

    Args:
        output_pdf: Path of the merged PDF to create.
        *input_pdfs: Paths of the PDFs to append, in the given order.
            At least two are required.

    Raises:
        ValueError: If fewer than two input paths are supplied. It is raised
            before the output file is opened, so a rejected call no longer
            leaves an empty ``output_pdf`` behind.
    """
    a = len(input_pdfs)
    print(a)
    # Validate first: opening the output in "wb" mode truncates/creates it.
    if a < 2:
        raise ValueError("Need Atleast Two Pdf for Merging")
    merge = PyPDF2.PdfFileMerger()
    try:
        for x in input_pdfs:
            # Pass the path itself so the merger owns the handle it opens
            # and releases it in close().
            merge.append(x)
        with open(output_pdf, "wb") as outputStream:
            merge.write(outputStream)
    finally:
        merge.close()
This code works for me in PyCharm. It can merge any number of PDF files into a single PDF file, but at least two files must be given; passing fewer than two raises an error.

EOF marker not found - How to fix in PyPDF and PyPDF2?

I'm attempting to combine a few PDF files into a single PDF file using Python. I've tried both PyPDF and PyPDF2 - on some files, they both throw this same error:
PdfReadError: EOF marker not found
Here's my code (page_files is a list of PDF file paths to combine):
from PyPDF2 import PdfReader, PdfWriter
# PDF file paths to combine, in output order.
page_files = ["example1.pdf", "example2.pdf"]
writer = PdfWriter()
for path in page_files:
    reader = PdfReader(path)
    for page in reader.pages:
        writer.add_page(page)
# Write once, after the pages of every input have been added.
with open("out.pdf", "wb") as fp:
    writer.write(fp)
I've read a few StackOverflow threads on the topic, but none contain a solution that works. If you've successfully combined PDF files using Python, I'd love to hear how.
You were running in an issue of PyPDF2 which was solved with PR #321. The fix was released in PyPDF2==1.27.8 (released on 2022-04-21).
If anyone is still looking for a way to merge a "list" of pdfs:
Note:
Use glob to get the correct file list. <- this will really save your day ^^
Check this out: glob module reference
from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter
import os
import glob
class MergeAllPDF:
    """Merge every PDF matching a glob pattern into a single output file."""

    def __init__(self):
        # Paths queued for the next merge() call.
        self.mergelist = []

    def create(self, filepath, outpath, outfilename):
        """Collect the PDFs matching ``filepath`` and merge them.

        Args:
            filepath: Glob pattern selecting the input PDFs.
            outpath: Prefix placed directly in front of the output file name;
                it is concatenated as-is, so include the trailing separator.
            outfilename: Output file name without the ``.pdf`` extension.
        """
        # Attribute spelling ("outfilname") kept so existing users still work.
        self.outfilname = outfilename
        self.filepath = filepath
        self.outpath = outpath
        # glob makes no ordering promise; sort so the pages are merged in a
        # predictable, name-based sequence.
        matches = sorted(glob.glob(self.filepath))
        self.myrange = len(matches)
        # Queue every match in one step (replaces the pop(0) drain loop).
        self.mergelist.extend(matches)
        # As before, the discovered paths end up in mergelist and pdfs is
        # left empty.
        self.pdfs = []
        self.merge()

    def merge(self):
        """Write the queued PDFs to ``outpath + outfilname + ".pdf"``.

        Prints a hint and does nothing else when no PDFs are queued. The
        queue is emptied afterwards, including when merging fails.
        """
        if not self.mergelist:
            print("mergelist is empty please check your input path")
            return
        self.merger = PdfFileMerger()
        try:
            for pdf in self.mergelist:
                # Pass the path itself so the merger owns the handle it
                # opens and releases it in close().
                self.merger.append(pdf)
            self.merger.write(self.outpath + "%s.pdf" % (self.outfilname))
        finally:
            self.merger.close()
            self.mergelist = []
# Example usage.
# Update these paths for your own machine:
inpath = r"C:\Users\Fabian\Desktop\mergeallpdfs\scan\*.pdf" # glob pattern matching the single-page PDFs to merge
outpath = r"C:\Users\Fabian\Desktop\mergeallpdfs\output\\" # prefix for the merged PDF; merge() concatenates it directly with the file name
b = MergeAllPDF()
# Writes outpath + "mergedpdf" + ".pdf".
b.create(inpath, outpath, "mergedpdf")

Categories

Resources