Add metadata with PyPDF2 - python

The following code adds a watermark on every page and also adds metadata (or better should do).
The watermarking works perfectly fine, but there is no metadata in the output document and no error
from PyPDF2 import PdfFileReader as PdfReader, PdfFileWriter as PdfWriter
nf = []
sources = ["example.pdf", "example2.pdf"]
for i in sources:
# new pdf file name
new_file_name = i + " " + row["name"] + ".pdf"
nf.append(new_file_name)
print(f"Registering {i} watermarked version for {row['name']}.")
reader = PdfReader(i)
writer = PdfWriter()
# adding watermark to each page
for page in reader.pages:
# creating watermarked page object
wmpageObj = add_watermark(packet, page)
# adding watermarked page object to pdf writer
writer.addPage(wmpageObj)
# Write Metadate
writer.addMetadata({"/Registered to": row["name"]})
writer.addMetadata({"/ATC": "ACME Inc."})
# writing watermarked pages to new file
with open(new_file_name, "wb") as newFile:
writer.write(newFile)
Adding print(pdfWriter._info) before and after adding the metadata gives me only:
IndirectObject(2, 0)
IndirectObject(2, 0)
Also interesting: I tried Adobe Acrobat Reader DC on Mac and Windows and it's not possible to show the metadata of the output file (the window just won't open), but works fine with the source file, i.e. before adding watermark and metadata.

Related

PDFs written by PyPDF2 showing changes when opened in Acrobat

I'm using Python and PyPDF2 to generate a set of PDFs based on a template with form fields. The PDFs are created and all of the fields are filled correctly, but when I open the PDFs in Adobe Acrobat they show changes made to the file (i.e., the "Save" menu option is enabled, and when I try to close the file Adobe asks if I want to save changes, even if I haven't touched anything).
It's mostly just a slight annoyance, but is there a way to prevent this from happening? From my research it seems like this means (1) there's JavaScript modifying the file (there isn't), or (2) the file is corrupted and Adobe is fixing it.
A simplified version of my code is below. I set /NeedAppearances to True in both the reader and writer because otherwise the values didn't appear in the PDF unless I clicked on the field. I also set the annotations so that the fields are read-only and appear as regular text.
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import BooleanObject, NameObject, IndirectObject, NumberObject
data = {'field1': 'Text1', 'field2': 'Text2'}
with open('template.pdf', 'rb') as read_file:
pdf_reader = PdfFileReader(read_file)
pdf_writer = PdfFileWriter()
# Set /NeedAppearances to make field values visible
try:
if '/AcroForm' in pdf_reader.trailer['/Root']:
pdf_reader.trailer['/Root']['/AcroForm'][NameObject('/NeedAppearances')] = BooleanObject(True)
if '/AcroForm' not in pdf_writer._root_object:
root = pdf_writer._root_object
acroform = {NameObject('/AcroForm'): IndirectObject(len(pdf_writer._objects), 0, pdf_writer)}
root.update(acroform)
root['/AcroForm'][NameObject('/NeedAppearances')] = BooleanObject(True)
except:
print('Warning: Error setting PDF /NeedAppearances value.')
# Add first page to writer
pdf_writer.addPage(pdf_reader.getPage(0))
page = pdf_writer.getPage(0)
# Update form fields
pdf_writer.updatePageFormFieldValues(page, data)
# Make fields read-only
for i in range(len(page['/Annots'])):
annot = page['/Annots'][i].getObject()
annot.update({NameObject('/Ff'): NumberObject(1)})
# Write PDF
with open('result.pdf', 'wb') as write_file:
pdf_writer.write(write_file)

PyPDF2 corrupts file when watermarking

I have been trying to speed up our date stamping process by adding a stamp as a watermark to PDFs through PyPDF2. I found the code below online as I'm pretty new to coding.
When I run this it seems to work, but the file is corrupted and won't open. Does anyone have any ideas where I am going wrong?
from PyPDF2 import PdfFileWriter, PdfFileReader
def create_watermark(input_pdf, output_pdf, watermark):
watermark_obj = PdfFileReader(watermark,False,)
watermark_page = watermark_obj.getPage(0)
pdf_reader = PdfFileReader(input_pdf)
pdf_writer = PdfFileWriter()
# Watermark all the pages
for page in range(pdf_reader.getNumPages()):
page = pdf_reader.getPage(page)
page.mergePage(watermark_page)
pdf_writer.addPage(page)
with open(input_pdf, 'wb') as out:
pdf_writer.write(out)
if __name__ == '__main__':
input_pdf = "C:\\Users\\A***\\OneDrive - ***\\Desktop\\Invoice hold\\Test\\1.pdf"
output_pdf = "C:\\Users\\A***\\OneDrive - ***\\Desktop\\Invoice hold\\Test\\1 WM.pdf"
watermark = "C:\\Users\\A***\\OneDrive - ***\\Desktop\\Invoice hold\\WM.pdf"
create_watermark(input_pdf,output_pdf,watermark)
If you want to save pdf file under the name of output_pdf,
try this :
result = open(output_pdf, 'wb')
pdf_writer.write(result)
your code :
with open(input_pdf, 'wb') as out:
pdf_writer.write(out)
Your code is to overwrite input_pdf.
And if there is a problem while working, the pdf file will be damaged.
I succeeded in inserting the watermark by applying your code and my proposed method.
I recommend checking if the pdf file is not damaged.

PyPDF2 returning blank PDF after copy

def EncryptPDFFiles(password, directory):
pdfFiles = []
success = 0
# Get all PDF files from a directory
for folderName, subFolders, fileNames in os.walk(directory):
for fileName in fileNames:
if (fileName.endswith(".pdf")):
pdfFiles.append(os.path.join(folderName, fileName))
print("%s PDF documents found." % str(len(pdfFiles)))
# Create an encrypted version for each document
for pdf in pdfFiles:
# Copy old PDF into a new PDF object
pdfFile = open(pdf,"rb")
pdfReader = PyPDF2.PdfFileReader(pdfFile)
pdfWriter = PyPDF2.PdfFileWriter()
for pageNum in range(pdfReader.numPages):
pdfWriter.addPage(pdfReader.getPage(pageNum))
pdfFile.close()
# Encrypt the new PDF and save it
saveName = pdf.replace(".pdf",ENCRYPTION_TAG)
pdfWriter.encrypt(password)
newFile = open(saveName, "wb")
pdfWriter.write(newFile)
newFile.close()
print("%s saved to: %s" % (pdf, saveName))
# Verify the the encrypted PDF encrypted properly
encryptedPdfFile = open(saveName,"rb")
encryptedPdfReader = PyPDF2.PdfFileReader(encryptedPdfFile)
canDecrypt = encryptedPdfReader.decrypt(password)
encryptedPdfFile.close()
if (canDecrypt):
print("%s successfully encrypted." % (pdf))
send2trash.send2trash(pdf)
success += 1
print("%s of %s successfully encrypted." % (str(success),str(len(pdfFiles))))
I am following along with Pythons Automate the Boring Stuff section. I've had off and on issues when doing the copy for a PDF document but as of right now everytime I run the program my copied PDF is all blank pages. There are the correct amount of pages of my newly encrypted PDF but they are all blank (no content on the pages). I've had this happen before but was not able to recreate. I've tried throwing in a sleep before closing my files. I'm not sure what the best practice for opening and closing files are in Python. For reference I'm using Python3.
Try moving the pdfFile.close to the very end of your for loop.
for pdf in pdfFiles:
#
# {stuff}
#
if (canDecrypt):
print("%s successfully encrypted." % (pdf))
send2trash.send2trash(pdf)
success += 1
pdfFile.close()
The thought is that the pdfFile needs to be available and open when the pdfWriter finally writes out, otherwise it cannot access the pages to write the new file.
The issue with getting a blank page even after adding a page to your pdf with writer.addPage(your_page_name) is the context manager.
You have to make sure that you're not closing the pdf from which you're reading the page.
For Example:
with open(str(_pdf), "rb") as in_f:
reader = PdfFileReader(in_f)
_page = reader.getPage(0)
writer = PdfFileWriter()
writer.addPage(_page)
with open(_filename, "wb+") as out_f:
writer.write(out_f)
This will NOT WORK since the file handle is being closed by the context manager. The file has to be open So we would have to indent it. Like the following:
with open(str(_pdf), "rb") as in_f:
reader = PdfFileReader(in_f)
_page = reader.getPage(0)
writer = PdfFileWriter()
writer.addPage(_page)
with open(_filename, "wb+") as out_f:
writer.write(out_f)
I know it's not a big deal but this literally made me pull out my hair, indentation wasted my 6 hours. That's why I thought I should write an answer for others

How to extract text from a directory of PDF files efficiently with OCR?

I have a large directory with PDF files (images), how can I extract efficiently the text from all the files inside the directory?. So far I tried to:
import multiprocessing
import textract
def extract_txt(file_path):
text = textract.process(file_path, method='tesseract')
p = multiprocessing.Pool(2)
file_path = ['/Users/user/Desktop/sample.pdf']
list(p.map(extract_txt, file_path))
However, it is not working... it takes a lot of time (I have some documents that have 600 pages). Additionally: a) I do not know how to handle efficiently the directory transformation part. b) I would like to add a page separator, let's say: <start/age = 1> ... page content ... <end/page = 1>, but I have no idea of how to do this.
Thus, how can I apply the extract_txt function to all the elements of a directory that end with .pdf and return the same files in another directory but in a .txt format, and add a page separator with OCR text extraction?.
Also, I was curios about using google docs to make this task, is it possible to programmatically use google docs to solve the aforementioned text extracting problem?.
UPDATE
Regarding the "adding a page separator" issue (<start/age = 1> ... page content ... <end/page = 1>) after reading Roland Smith's answer I tried to:
from PyPDF2 import PdfFileWriter, PdfFileReader
import textract
def extract_text(pdf_file):
inputpdf = PdfFileReader(open(pdf_file, "rb"))
for i in range(inputpdf.numPages):
w = PdfFileWriter()
w.addPage(inputpdf.getPage(i))
outfname = 'page{:03d}.pdf'.format(i)
with open(outfname, 'wb') as outfile: # I presume you need `wb`.
w.write(outfile)
print('\n<begin page pos =' , i, '>\n')
text = textract.process(str(outfname), method='tesseract')
os.remove(outfname) # clean up.
print(str(text, 'utf8'))
print('\n<end page pos =' , i, '>\n')
extract_text('/Users/user/Downloads/ImageOnly.pdf')
However, I still have issues with the print() part, since instead of printing, it would be more useful to save into a file all the output. Thus, I tried to redirect the output to a a file:
sys.stdout=open("test.txt","w")
print('\n<begin page pos =' , i, '>\n')
sys.stdout.close()
text = textract.process(str(outfname), method='tesseract')
os.remove(outfname) # clean up.
sys.stdout=open("test.txt","w")
print(str(text, 'utf8'))
sys.stdout.close()
sys.stdout=open("test.txt","w")
print('\n<end page pos =' , i, '>\n')
sys.stdout.close()
Any idea of how to make the page extraction/separator trick and saving everything into a file?...
In your code, you are extracting the text, but you don't do anything with it.
Try something like this:
def extract_txt(file_path):
text = textract.process(file_path, method='tesseract')
outfn = file_path[:-4] + '.txt' # assuming filenames end with '.pdf'
with open(outfn, 'wb') as output_file:
output_file.write(text)
return file_path
This writes the text to file that has the same name but a .txt extension.
It also returns the path of the original file to let the parent know that this file is done.
So I would change the mapping code to:
p = multiprocessing.Pool()
file_path = ['/Users/user/Desktop/sample.pdf']
for fn in p.imap_unordered(extract_txt, file_path):
print('completed file:', fn)
You don't need to give an argument when creating a Pool. By default it will create as many workers as there are cpu-cores.
Using imap_unordered creates an iterator that starts yielding values as soon as they are available.
Because the worker function returned the filename, you can print it to let the user know that this file is done.
Edit 1:
The additional question is if it is possible to mark page boundaries. I think it is.
A method that would surely work is to split the PDF file into pages before the OCR. You could use e.g. pdfinfo from the poppler-utils package to find out the number of pages in a document. And then you could use e.g. pdfseparate from the same poppler-utils package to convert that one pdf file of N pages into N pdf files of one page. You could then OCR the single page PDF files separately. That would give you the text on each page separately.
Alternatively you could OCR the whole document and then search for page breaks. This will only work if the document has a constant or predictable header or footer on every page. It is probably not as reliable as the abovementioned method.
Edit 2:
If you need a file, write a file:
from PyPDF2 import PdfFileWriter, PdfFileReader
import textract
def extract_text(pdf_file):
inputpdf = PdfFileReader(open(pdf_file, "rb"))
outfname = pdf_file[:-4] + '.txt' # Assuming PDF file name ends with ".pdf"
with open(outfname, 'w') as textfile:
for i in range(inputpdf.numPages):
w = PdfFileWriter()
w.addPage(inputpdf.getPage(i))
outfname = 'page{:03d}.pdf'.format(i)
with open(outfname, 'wb') as outfile: # I presume you need `wb`.
w.write(outfile)
print('page', i)
text = textract.process(outfname, method='tesseract')
# Add header and footer.
text = '\n<begin page pos = {}>\n'.format(i) + text + '\n<end page pos = {}>\n'.format(i)
# Write the OCR-ed text to the output file.
textfile.write(text)
os.remove(outfname) # clean up.
print(text)

PyPDF2 - merging pages from two different PDF files is not working

I'm trying to merge pages from two PDF files into a single PDF with a single page. So I tried the code below that uses PyPDF2:
from PyPDF2 import PdfFileReader,PdfFileWriter
import sys
f = sys.argv[1]
k = sys.argv[2]
print f,k
file1 = PdfFileReader(file(f, "rb"))
file2 = PdfFileReader(file(k, "rb"))
output = PdfFileWriter()
page = file1.getPage(0)
page.mergePage(file2.getPage(0))
output.addPage(page)
outputStream = file("join.pdf", "wb")
output.write(outputStream)
outputStream.close()
It produces a single file and single page with the contents of page 1 from file 1, but I don't find any data from page 1 of file2. Seems like it didn't get merged.
On using your exact same code, I am able to get two PDF as merged PDF in one page with the second one overlapping the first one, I referred this link for detailed information.
And, instead of file() it is better to use open() as per this Python Documentation, so I did that.
Also, I made slight changes in your code but still, the working is same and correct on my machine. I am using Ubuntu 16.04 with python 2.7.
Here is the code:
from PyPDF2 import PdfFileReader,PdfFileWriter
import sys
f = sys.argv[1]
k = sys.argv[2]
print f, k
file1 = PdfFileReader(open(f, "rb"))
file2 = PdfFileReader(open(k, "rb"))
output = PdfFileWriter()
page = file1.getPage(0)
page.mergePage(file2.getPage(0))
output.addPage(page)
with open("join.pdf", "wb") as outputStream:
output.write(outputStream)
I hope this helps.
UPDATE:
Here is the code which is working for me and merging the two pdf's page as single page.
from pyPdf import PdfFileWriter, PdfFileReader
from pdfnup import generateNup
initial_output = PdfFileWriter()
input1 = PdfFileReader(open("landscape1.pdf", "rb"))
input2 = PdfFileReader(open("landscape2.pdf", "rb"))
initial_output.addPage(input1.getPage(0))
initial_output.addPage(input2.getPage(0))
# creates a new pdf file with required pages as separate pages.
initial_output.write(file("final.pdf", "wb"))
# merges newly created pdf file pages as one.
generateNup("final.pdf", 2, "intermediate.pdf")
# overwrite and rotates the final.pdf
final_output = PdfFileWriter()
final_output.addPage(PdfFileReader(open("intermediate.pdf", "rb")).getPage(0).rotateClockwise(90))
final_output.write(open("final.pdf", "wb"))
I have added a new code and now it is also rotating the final pdf. Output PDF that you need is final.pdf
And here is the Google Drive link to my drive for PDF files. Also, I made slight changes into pdfnup.py for compatibility with my system for Immutableset if you want to use the same file then, you can find it too in the drive link above.
def merge_page(self, output_pdf,*input_pdfs):
a=len(input_pdfs)
print (a)
merge = PyPDF2.PdfFileMerger()
outputStream = open(output_pdf, "wb")
if a<2:
raise Exception ("Need Atleast Two Pdf for Merging")
else:
for x in input_pdfs:
merge.append(open(x,"rb"))
merge.write(outputStream)
outputStream.close()
For me this code is working in PyCharm and it can take n no of pdf files for merging into single pdf file but the no should be 2 or more less than that will give error.

Categories

Resources