python write to pdf on multiple pdf files - pypdf merges files - python

Desired: watermark multiple PDF files on each page
Issue: I can't seem to find a way to close a stream and open new stream for file (f), the end result is output of the PDF files but each preposit PDF contains the content of the preceding PDF file - this is not the desired outcome.
Here is my code:
# -*- coding: utf-8 -*-
import os
import re
from PyPDF4 import PdfFileWriter, PdfFileReader
from reportlab.pdfgen import canvas
from datetime import datetime
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from dotenv import load_dotenv
load_dotenv()
def put_watermark(input_files, output_files):
pdfmetrics.registerFont(TTFont('English', 'Arial.ttf'))
now = datetime.now()
hebrew = (" some string ")
dt_string = now.strftime("%d/%m/%Y %H:%M")
dt_string = dt_string + hebrew
genmark = canvas.Canvas("watermark.pdf")
genmark.setFont("Hebrew", 12)
genmark.drawString(350,15, dt_string)
genmark.save()
watermark_instance = PdfFileReader("watermark.pdf")
watermark_page = watermark_instance.getPage(0)
pdf_writer = PdfFileWriter()
for filename in os.listdir(input_files):
f = os.path.join(input_files, filename)
# Generate canvas with timestamp
pdf_reader = PdfFileReader(f)
for page in range(pdf_reader.getNumPages()):
page = pdf_reader.getPage(page)
page.mergePage(watermark_page)
pdf_writer.addPage(page)
output_file = os.path.join(output_files, filename)
with open(output_file, 'wb') as out:
pdf_writer.write(out)
os.remove('watermark.pdf')
if __name__ == '__main__':
put_watermark(input_files = os.environ["PYPDF_INPUT"],
output_files = os.environ["PYPDF_OUTPUT"])

Change to pyPDF2 from pyPDF4 solved this issue.

Related

Converting PDF page to JPG returns blank

I have a function that asks the user for a PDF file and receive the page number the user wish to convert into an image. The function usually works fine however with a few PDFs it does not work, the image that is returned is blank and it has 4 mega bytes. Apparently it has something to do with the size of the file. Is there a way to solve this problem?
from PyPDF2 import PdfFileReader, PdfFileWriter
from tkinter.filedialog import askopenfilename
from pdf2image import convert_from_path
import os
import PIL
PIL.Image.MAX_IMAGE_PIXELS = None
def convert_pdf(page_number):
filename = askopenfilename()
pdf_file_path = filename
file_base_name = pdf_file_path.replace('.pdf', '')
pdf = PdfFileReader(pdf_file_path)
pages = [page_number]
pdfWriter = PdfFileWriter()
for page_num in pages:
pdfWriter.addPage(pdf.getPage(page_num))
with open('{0}_subset.pdf'.format(file_base_name[:-5]), 'wb') as f:
pdfWriter.write(f)
f.close()
n = file_base_name[:-5]
nome = f'{n}_subset.pdf'
pages = convert_from_path(nome, poppler_path=r'C:\Program Files\poppler-0.68.0\bin')
i = 1
name = os.path.basename(nome).split('/')[-1][:-4]
for page in pages:
image_name = "Page_" + str(i) + f"{name}.jpg"
page.save(image_name, "JPEG")
i = i + 1
The solution to this problem was to change the DPI parameter of convert_from_path function. It is important to leave the DPI as it is, since I found that certain images become really small, and therefore unreadable.
try:
pages = convert_from_path(nome, poppler_path=r'C:\Program Files\poppler-0.68.0\bin')
i = 1
except:
PIL.Image.MAX_IMAGE_PIXELS = None
pages = convert_from_path(nome, 25,poppler_path=r'C:\Program Files\poppler-0.68.0\bin')
i = 1

python pypdf2 - make all annotations static including checkboxes and radio buttons in a PDF

I have adapted code from this answer and this answer, and came up with the following solution:
import sys
import pathlib
from io import BytesIO
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import (
BooleanObject, NameObject, NumberObject, IndirectObject
)
def flatten_pdf(pdf_bytes: bytes) -> BytesIO:
"""Flatten a PDF, converting editable fields to non-editable."""
pdf = PdfFileReader(stream=BytesIO(initial_bytes=pdf_bytes))
if '/AcroForm' in pdf.trailer['/Root']:
pdf.trailer['/Root']['/AcroForm'].update({
NameObject('/NeedAppearances'): BooleanObject(True)
})
pdf_writer = PdfFileWriter()
# pylint: disable=protected-access
catalog = pdf_writer._root_object
if '/AcroForm' not in catalog:
pdf_writer._root_object.update({
NameObject('/AcroForm'):
IndirectObject(len(pdf_writer._objects), 0, pdf_writer)
})
pdf_writer._root_object['/AcroForm'][NameObject('/NeedAppearances')] = \
BooleanObject(True)
for page_index in range(0, len(pdf.pages)):
pdf_writer.addPage(pdf.getPage(page_index))
writer_page = pdf_writer.getPage(page_index)
for annotation_index in range(0, len(writer_page['/Annots'])):
writer_annot = writer_page['/Annots'][annotation_index].getObject()
writer_annot.update({NameObject('/Ff'): NumberObject(1)})
output_stream = BytesIO()
pdf_writer.write(output_stream)
return output_stream
if __name__ == "__main__":
pdf_bytes = pathlib.Path(sys.argv[1]).read_bytes()
flatten_output = flatten_pdf(pdf_bytes=pdf_bytes)
with open('output.pdf', 'wb') as f:
f.write(flatten_output.getbuffer())
This works for text-fillable fields and makes them uneditable, but the same is not true for other form types like checkboxes and radio buttons. How can I modify my code to preserve the selections in the input PDF and make the other form types non-editable yet preserve the entered information?
Here is a sample PDF

PyPDF2 creates unreadable PDF

My code merges two PDFs but in trying to set the desktop of the user as the destination of the new PDF the result is an unreadable PDF.
from PyPDF2 import *
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import LETTER
from datetime import datetime
from PyPDF2 import PdfFileReader, PdfFileMerger
import io
from tkinter.filedialog import askopenfilename
import os
from tkinter import *
root=Tk()
root.withdraw()\
#CREATE THE CARB PDF PAGE##################################
currentDay = datetime.now().day
currentMonth = datetime.now().month
currentYear = datetime.now().year
c = canvas.Canvas("CARB.pdf", pagesize=LETTER)
c.drawString(10, 170, "TEST")
c.drawString(45, 160, "TEST")
c.drawString(25, 150, "TEST")
c.drawString(50, 140, "___________ MONTH __________ YEAR")
c.drawString(80, 140, str(currentMonth))
c.drawString(200, 140, str(currentYear))
c.save()
#########################################################
username = os.getenv('username')
packet = io.BytesIO()
filename = askopenfilename()
packet.seek(0)
new_pdf = PdfFileReader(filename)
name = (os.path.basename(filename))
name = name[:-4]
#MERGE THE CREATED PDF WITH THE EXESTING PDF##############
existing_pdf = "CARB.pdf"
f1 = new_pdf
f2 = PdfFileReader(open('carb.pdf', 'rb'))
merger = PdfFileMerger(strict=True)
merger.append(f1)
merger.append(f2)
#merger.write(name + "-CARB.pdf")
output = PdfFileWriter()
outputStream = open("C:\\Users\\" + username + "\\Desktop\\" + name + "-CARB.pdf", "wb")
output.write(outputStream)
outputStream.close()

How to downgrade a pdf version from 1.7 (Acrobat 8.x) to 1.4 (Acrobat 5.x) in Python

I have a script that modifies pdf files so that they comply with some specifications that are required to be uploaded to some other app (grayscale or black and white, 300 dpi, letter sized, etc.). I'm using pdf2image, img2pdf. PIL.Image and fitz.
The problem is that when I'm done modifying the files, the pdf version upgrades from 1.4 to 1.7 and I need it to be specifically 1.4. After reading online, I found out that PyPDF2 automatically converts pdf files to 1.3. I tried that thinking that 1.3 could work, but to my surprise it did not. It HAS to be 1.4. Here is my code if it helps:
import os
from os.path import join
from tempfile import TemporaryDirectory
from pdf2image import convert_from_path
from img2pdf import convert
import PIL.Image as Image
import fitz
from PyPDF2 import PdfFileWriter, PdfFileReader
#Here's where the source pdf is located.
pdf_input = os.path.join("PDF")
#Converting pdf to images
with TemporaryDirectory() as temp_dir:
for file in os.listdir(pdf_input):
pdfName = os.fsdecode(file)
pdf_to_open = os.path.join(pdf_input, pdfName)
images = convert_from_path(
pdf_to_open,
dpi=282, #For some reason, if I put 300dpi I end up with 325 dpi.
output_folder=temp_dir,
grayscale=True,
fmt="png",
thread_count=4
)
#Iterating through images
image_list = list()
for page_number in range(1, len(images) + 1):
path = join(temp_dir, "page_" + str(page_number) + ".png")
image_list.append(path)
images[page_number-1].save(path, "PNG")
#Converting to Black and WHite
image_file = Image.open(path)
image_file = image_file.convert('1')
image_file.save(path)
#Converting images to pdf
if not os.path.exists(pdf_input):
os.mkdir(pdf_input)
pdfPath = os.path.join(pdf_input, pdfName)
with open(pdfPath, "bw") as gray_pdf:
gray_pdf.write(convert(image_list))
#Changing pdf size
src = fitz.open(gray_pdf)
doc = fitz.open()
for ipage in src:
fmt = fitz.paper_rect("Letter")
page = doc.new_page(width=fmt.width, height=fmt.height)
page.show_pdf_page(page.rect, src, ipage.number)
src.close()
doc.save(gray_pdf)
#Downgrading with PyPDF2
infile = PdfFileReader(pdfPath, 'rb')
output = PdfFileWriter()
for i in range(infile.getNumPages()):
p = infile.getPage(i)
output.addPage(p)
with open(pdfPath, 'wb') as f:
output.write(f)
I managed to do it using ghostsrcipt. I have no idea how ghostscript works, but this code worked just fine:
import sys
import ghostscript
pdfPath = "path/pdfName.pdf"
newPdfPath = "path/NEW_pdfName.pdf"
args = [
"downgradePDF",
"-sDEVICE=pdfwrite", "-dCompatibilityLevel=1.4", "-dNOPAUSE", "-dQUIET", "-dBATCH",
"-sOutputFile=" + newPdfPath, pdfPath
]
ghostscript.Ghostscript(*args)

Incorrect output: Extracting text from pdf's,docx's pptx's will not output in their own spearte line

I created a function that will open each file in a directory and extract the text from each file and output it in an excel sheet using Pandas. The indexing for each file type seems to be working just fine.However the extracted text from each file comes out next to each other in a list and not separated and next to their corresponding file.
See bottom of script for current output and the out put I want.
** I believe the problem lies in the loader() function which takes in a path, goes through each directory file checks the file .ext and extracts the text.
Thank you!
import re
#import PyPDF4
import pathlib
from pathlib import Path
import shutil
from datetime import datetime
import time
from configparser import ConfigParser
import glob
import fileinput
import pandas as pd
import os
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import docx2txt
from pptx import Presentation
import more_itertools as mit
p = Path('C:/Users/XXXX/Desktop/test')
txt_files = list(p.rglob('*txt'))
PDF_files = list(p.rglob('*pdf'))
csv_files = list(p.rglob('*csv'))
docx_files = list(p.rglob('*docx'))
pptx_files = list(p.rglob('*pptx'))
#excel_files = list(p.rglob('xls'))
def pdf_to_text(x):
# PDFMiner
rsrcmgr = PDFResourceManager()
sio = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, sio, codec=codec, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Extract text
fp = open(x, 'rb')
for page in PDFPage.get_pages(fp):
interpreter.process_page(page)
fp.close()
# Get text from StringIO
text = sio.getvalue()
# Cleanup
device.close()
sio.close()
return text
#-------------------------------------------------------------------------------
def loader(path):
with open(str(path.resolve()),"r",encoding = "ISO-8859-1") as f:
docx_out,pptx_out,pdf_out = [],[],[]
if path.suffix == ".pdf":
for name1 in PDF_files:
pdf_out.append(pdf_to_text(name1))
return pdf_out
elif path.suffix == ".docx":
for name2 in docx_files:
docx_out.append(docx2txt.process(name2))
return docx_out
elif path.suffix == ".pptx":
for file in pptx_files:
prs = Presentation(file)
for slide in prs.slides:
for shape in slide.shapes:
if not shape.has_text_frame:
continue
for paragraph in shape.text_frame.paragraphs:
for run in paragraph.runs:
pptx_out.append(run.text)
return pptx_out
else:
return f.readlines()
print(pdf_out)
def file_generator():
files = txt_files+PDF_files+csv_files+docx_files+pptx_files
for item in files:
yield {
"path": item,
"name": item.name[0:],
"created": time.ctime(item.stat().st_ctime),
"modified": time.ctime(item.stat().st_mtime),
"content": loader(item)
}
def to_xlsx():
df = pd.DataFrame.from_dict(file_generator())
df.head()
df.to_excel("tester4.xlsx")
if __name__ == "__main__":
to_xlsx()
#------------------------------------------------------------
OUTPUT EXAMPLE
current output:
content
["content_test1","content_test2"] test1.pdf
["content_test1","content_test2"] test2.pdf
What I want:
["content_test1"] test1.pdf
["content_test2"] test2.pdf
The appends called by each filetype_out function look like they are adding the contents of each file to the end of the list pertaining to that filetype. If you want to generate a unique list with the contents of each individual file, I'd recommend creating a separate dict for each filetype, which then includes individual lists for each file processed. Taking the PDFs as an example:
def loader(path):
with open(str(path.resolve()),"r",encoding = "ISO-8859-1") as f:
docx_out,pptx_out,pdf_out = {},{},{}
if path.suffix == ".pdf":
for name1 in PDF_files:
name1_contents = []
name1_contents.append(pdf_to_text(name1))
pdf_out[name1] = name1_contents
return pdf_out
To then print out your results in a similar way as you have been:
for name, contents in pdf_out:
print(contents + ' ' + name)

Categories

Resources