Search and replace for text within a pdf, in Python - python

This question already has answers here:
How can I replace text in a PDF using Python?
(4 answers)
Closed 14 hours ago.
I am writing mailmerge software as part of a Python web app.
I have a template called letter.pdf which was generated from a MS Word file and includes the text {name} where the resident's name will go. I also have a list of c. 100 residents' names.
What I want to do is to read in letter.pdf do a search for "{name}" and replace it with the resident's name (for each resident) then write the result to another pdf. I then want to gather all these pdfs together into a big pdf (one page per letter) which my web app's users will print out to create their letters.
Are there any Python libraries that will do this? I've looked at pdfrw and pdfminer but I couldn't see where they would be able to do it.
(NB: I also have the MS Word file, so if there was another way of using that, and not going through a pdf, that would also do the job.)

This can be done with PyPDF2 package. The implementation may depend on the original PDF template structure. But if the template is stable enough and isn't changed very often the replacement code shouldn't be generic but rather simple.
I did a small sketch on how you could replace the text inside a PDF file. It replaces all occurrences of PDF tokens to DOC.
import os
import argparse
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import DecodedStreamObject, EncodedStreamObject
def replace_text(content, replacements = dict()):
lines = content.splitlines()
result = ""
in_text = False
for line in lines:
if line == "BT":
in_text = True
elif line == "ET":
in_text = False
elif in_text:
cmd = line[-2:]
if cmd.lower() == 'tj':
replaced_line = line
for k, v in replacements.items():
replaced_line = replaced_line.replace(k, v)
result += replaced_line + "\n"
else:
result += line + "\n"
continue
result += line + "\n"
return result
def process_data(object, replacements):
data = object.getData()
decoded_data = data.decode('utf-8')
replaced_data = replace_text(decoded_data, replacements)
encoded_data = replaced_data.encode('utf-8')
if object.decodedSelf is not None:
object.decodedSelf.setData(encoded_data)
else:
object.setData(encoded_data)
if __name__ == "__main__":
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--input", required=True, help="path to PDF document")
args = vars(ap.parse_args())
in_file = args["input"]
filename_base = in_file.replace(os.path.splitext(in_file)[1], "")
# Provide replacements list that you need here
replacements = { 'PDF': 'DOC'}
pdf = PdfFileReader(in_file)
writer = PdfFileWriter()
for page_number in range(0, pdf.getNumPages()):
page = pdf.getPage(page_number)
contents = page.getContents()
if isinstance(contents, DecodedStreamObject) or isinstance(contents, EncodedStreamObject):
process_data(contents, replacements)
elif len(contents) > 0:
for obj in contents:
if isinstance(obj, DecodedStreamObject) or isinstance(obj, EncodedStreamObject):
streamObj = obj.getObject()
process_data(streamObj, replacements)
writer.addPage(page)
with open(filename_base + ".result.pdf", 'wb') as out_file:
writer.write(out_file)
The results are
UPDATE 2021-03-21:
Updated the code example to handle DecodedStreamObject and EncodedStreamObject which actually contian data stream with text to update.

If #Dmytrio solution do not alter final PDF
Dymitrio's updated code example to handle DecodedStreamObject and EncodedStreamObject which actually contain data stream with text to update could run fine, but with a file different from example, was not able to alter pdf text content.
According to EDIT 3, from How to replace text in a PDF using Python?:
By inserting page[NameObject("/Contents")] = contents.decodedSelf before writer.addPage(page), we force pyPDF2 to update content of the page object.
This way I was able to overcome this problem and replace text from pdf file.
Final code should look like this:
import os
import argparse
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import DecodedStreamObject, EncodedStreamObject, NameObject
def replace_text(content, replacements = dict()):
lines = content.splitlines()
result = ""
in_text = False
for line in lines:
if line == "BT":
in_text = True
elif line == "ET":
in_text = False
elif in_text:
cmd = line[-2:]
if cmd.lower() == 'tj':
replaced_line = line
for k, v in replacements.items():
replaced_line = replaced_line.replace(k, v)
result += replaced_line + "\n"
else:
result += line + "\n"
continue
result += line + "\n"
return result
def process_data(object, replacements):
data = object.getData()
decoded_data = data.decode('utf-8')
replaced_data = replace_text(decoded_data, replacements)
encoded_data = replaced_data.encode('utf-8')
if object.decodedSelf is not None:
object.decodedSelf.setData(encoded_data)
else:
object.setData(encoded_data)
if __name__ == "__main__":
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--input", required=True, help="path to PDF document")
args = vars(ap.parse_args())
in_file = args["input"]
filename_base = in_file.replace(os.path.splitext(in_file)[1], "")
# Provide replacements list that you need here
replacements = { 'PDF': 'DOC'}
pdf = PdfFileReader(in_file)
writer = PdfFileWriter()
for page_number in range(0, pdf.getNumPages()):
page = pdf.getPage(page_number)
contents = page.getContents()
if isinstance(contents, DecodedStreamObject) or isinstance(contents, EncodedStreamObject):
process_data(contents, replacements)
elif len(contents) > 0:
for obj in contents:
if isinstance(obj, DecodedStreamObject) or isinstance(obj, EncodedStreamObject):
streamObj = obj.getObject()
process_data(streamObj, replacements)
# Force content replacement
page[NameObject("/Contents")] = contents.decodedSelf
writer.addPage(page)
with open(filename_base + ".result.pdf", 'wb') as out_file:
writer.write(out_file)
Important: from PyPDF2.generic import NameObject

Decompress the pdf to make parsing easier (solves many of the issues in the previous answer). I use pdftk. (If this step fails, one hack to pre-process the pdf is to open the pdf in OSX Preview, print it, and then choose save as pdf from the print menu. Then retry the command below.)
pdftk original.pdf output uncompressed.pdf uncompress
Parse and replace using PyPDF2.
from PyPDF2 import PdfFileReader, PdfFileWriter
replacements = [
("old string", "new string")
]
pdf = PdfFileReader(open("uncompressed.pdf", "rb"))
writer = PdfFileWriter()
for page in pdf.pages:
contents = page.getContents().getData()
for (a,b) in replacements:
contents = contents.replace(a.encode('utf-8'), b.encode('utf-8'))
page.getContents().setData(contents)
writer.addPage(page)
with open("modified.pdf", "wb") as f:
writer.write(f)
[Optional] Re-compress the pdf.
pdftk modified.pdf output recompressed.pdf compress

Here is a solution using the MS Word source file.
As trying to edit the pdf itself turned out to be too complicated for me because of the encoding errors, I went with the MS Word >> Pdf option.
Prepare MS Word template with {{input_fields}}
Fill in the template with data
Convert the filled in MS Word file to PDF
The DocxTemplate module uses jinja like syntax: {{variable_name}}
In my solution I use an intermediate temp file. I tried to get rid of this step using BytesIO/StringIO to virtualize this step only in memory, but haven't make that work yet.
Here is an easy and working solution to perform the required task:
import os
import comtypes.client
from pathlib import Path
from docxtpl import DocxTemplate
import random
# CFG
in_file_path = "files/template.docx"
temp_file_path = "files/"+str(random.randint(0,50))+".docx"
out_file_path = "files/output.pdf"
# Fill in text
data_to_fill = {'Field_name' : "John Tester",
'Field_ocupation' : "Test tester",
'Field_address' : "Test Address 123",
}
template = DocxTemplate(Path(in_file_path))
template.render(data_to_fill)
template.save(Path(temp_file_path))
# Convert to PDF
wdFormatPDF = 17
in_file = os.path.abspath(Path(temp_file_path))
out_file = os.path.abspath(Path(out_file_path))
word = comtypes.client.CreateObject('Word.Application')
doc = word.Documents.Open(in_file)
doc.SaveAs(out_file, FileFormat=wdFormatPDF)
doc.Close()
word.Quit()
# Get rid of the temp file
os.remove(Path(temp_file_path))

Related

Python - Check Box Status Program

I'm trying to make a program that scans PDFs downloaded from a website with selectable text and highlights specific discrepancies. I can make it work for specific "bad words" and "good words" but I am stuck on how to make it find missing check boxes. They are no longer interactive fields in PDF form:
Here is my code for everything else so far:
import os
import fitz
source_folder = r"C:\Users\Sserb\Desktop\Test Files"
list_files = os.listdir(source_folder)
good_terms = ["trend", "decrease", "increase"]
bad_terms = ["school", "academic", "homework"] # words that should be in every pdf file (not every page)
pdf_files = [x for x in list_files if x.endswith(".pdf")]
highlight_summary = []
good_term_summary = []
for file_name in pdf_files:
# READ IN PDF
full_filename = os.path.join(source_folder, file_name)
doc = fitz.open(full_filename)
good_terms_not_found = good_terms.copy()
list_hl_pages = []
for page_num, page in enumerate(doc, 1):
# SEARCH
for text in bad_terms:
text_instances = page.search_for(text)
# HIGHLIGHT
for inst in text_instances:
highlight = page.addHighlightAnnot(inst)
highlight.update()
if page_num not in list_hl_pages:
list_hl_pages.append(page_num)
# Search for good terms- all must be found
words_found = []
for good_word in good_terms_not_found:
text_instances = page.search_for(good_word)
if text_instances:
words_found.append(good_word)
for word in words_found:
good_terms_not_found.remove(word)
highlight_summary.append([file_name, list_hl_pages.copy()])
if good_terms_not_found:
good_term_summary.append([file_name, good_terms_not_found.copy()])
# OUTPUT
if list_hl_pages:
out_file = file_name.replace(".pdf", "-errors.pdf")
doc.save(os.path.join(source_folder, "output", out_file), garbage=4, deflate=True, clean=True)
else:
doc.close()
#print(highlight_summary)
print(good_term_summary)
output_folder=r"C:\Users\Sserb\Desktop\Test Files\output"
new = os.path.join(output_folder,'outputfile.txt')
file = open(new, 'w')
value = str(good_term_summary) + '\n'
file.write(value)
file.close()
Both "value" and "export value" are always treated as text, but there are at least 8 different kinds of check-boxes in word. see how these are altered by the font used here Check boxes are shown as ☐ when unchecked, or ☑ or ☒ when checked, so search for ☑Client rather than ☐Client etc

Get PDF attachments using Python

I admit that I am new to Python.
We have to process PDF files with attachments or annotated attachments. I am trying to extract attachments from a PDF file using PyPDF2 library.
The only (!) example found on GitHub contains the following code:
import PyPDF2
def getAttachments(reader):
catalog = reader.trailer["/Root"]
# VK
print (catalog)
#
fileNames = catalog['/Names']['/EmbeddedFiles']['/Names']
And the call is:
rootdir = "C:/Users/***.pdf" # My file path
handler = open(rootdir, 'rb')
reader = PyPDF2.PdfFileReader(handler)
dictionary = getAttachments(reader)
I am getting a KeyError: '/EmbeddedFiles'
A print of the catalog indeed does not contain EmbeddedFiles:
{'/Extensions': {'/ADBE': {'/BaseVersion': '/1.7', '/ExtensionLevel': 3}}, '/Metadata': IndirectObject(2, 0), '/Names': IndirectObject(5, 0), '/OpenAction': IndirectObject(6, 0), '/PageLayout': '/OneColumn', '/Pages': IndirectObject(3, 0), '/PieceInfo': IndirectObject(7, 0), '/Type': '/Catalog'}
This particular PDF contains 9 attachments. How can I get them?
Too Long for comments, and I have not tested personally this code, which looks very similar to your outline in the question, however I am adding here for others to test. It is the subject of a Pull Request https://github.com/mstamy2/PyPDF2/pull/440 and here is the full updated sequence as described by Kevin M Loeffler in https://kevinmloeffler.com/2018/07/08/how-to-extract-pdf-file-attachments-using-python-and-pypdf2/
Viewable at https://gist.github.com/kevinl95/29a9e18d474eb6e23372074deff2df38#file-extract_pdf_attachments-py
Download as
https://gist.github.com/kevinl95/29a9e18d474eb6e23372074deff2df38/raw/acdc194058f9fa2c4d2619a4c623d0efeec32555/extract_pdf_attachments.py
It always helps if you can provide an example input of the type you have problems with so that others can adapt the extraction routine to suit.
In response to getting an error
"I’m guessing the script is breaking because the embedded files section of the PDF doesn’t always exist so trying to access it throws an error."
"Something I would try is to put everything after the ‘catalog’ line in the get_attachments method in a try-catch."
Unfortunately there are many pending pull requests not included into PyPDF2 https://github.com/mstamy2/PyPDF2/pulls and others may also be relevant or needed to aid with this and other shortcomings. Thus you need to see if any of those may also help.
For one pending example of a try catch that you might be able to include / and adapt for other use cases see https://github.com/mstamy2/PyPDF2/pull/551/commits/9d52ef517319b538f007669631ba6b778f8ec3a3
Associated keywords for imbedded files apart from /Type/EmbeddedFiles include /Type /Filespec & /Subtype /FileAttachment note the pairs may not always have spaces so perhaps see if those can be interrogated for the attachments
Again on that last point the example searches for /EmbeddedFiles as indexed in the plural whilst any individual entry itself is identified as singular
This can be improved but it was tested to work (using PyMuPDF).
It detects corrupted PDF files, encryption, attachments, annotations and portfolios.
I am yet to compare the output with our internal classification.
Produces a semicolon separated file that can be imported into Excel.
import fitz # = PyMuPDF
import os
outfile = open("C:/Users/me/Downloads/testPDF3.txt", "w", encoding="utf-8")
folder = "C:/Users/me/Downloads"
print ("filepath;","encrypted;","pages;", "embedded;","attachments;","annotations;","portfolio", file = outfile)
enc=pages=count=names=annots=collection=''
for subdir, dirs, files in os.walk(folder):
for file in files:
#print (os.path.join(subdir, file))
filepath = subdir + os.sep + file
if filepath.endswith(".pdf"):
#print (filepath, file = outfile)
try:
doc = fitz.open(filepath)
enc = doc.is_encrypted
#print("Encrypted? ", enc, file = outfile)
pages = doc.page_count
#print("Number of pages: ", pages, file = outfile)
count = doc.embfile_count()
#print("Number of embedded files:", count, file = outfile) # shows number of embedded files
names = doc.embfile_names()
#print("Embedded files:", str(names), file = outfile)
#if count > 0:
# for emb in names:
# print(doc.embfile_info(emb), file = outfile)
annots = doc.has_annots()
#print("Has annots?", annots, file = outfile)
links = doc.has_links()
#print("Has links?", links, file = outfile)
trailer = doc.pdf_trailer()
#print("Trailer: ", trailer, file = outfile)
xreflen = doc.xref_length() # length of objects table
for xref in range(1, xreflen): # skip item 0!
#print("", file = outfile)
#print("object %i (stream: %s)" % (xref, doc.is_stream(xref)), file = outfile)
#print(doc.xref_object(i, compressed=False), file = outfile)
if "Collection" in doc.xref_object(xref, compressed=False):
#print ("Portfolio", file = outfile)
collection ='True'
break
else: collection="False"
#print(doc.xref_object(xref, compressed=False), file = outfile)
except:
#print ("Not a valid PDF", file = outfile)
enc=pages=count=names=annots=collection="Not a valid PDF"
print(filepath,";", enc,";",pages, ";",count, ";",names, ";",annots, ";",collection, file = outfile )
outfile.close()
I was also running into the same problem with several pdfs that I have. I was able to make these changes to the referenced code that got it to work for me:
import PyPDF2
def getAttachments(reader):
"""
Retrieves the file attachments of the PDF as a dictionary of file names
and the file data as a bytestring.
:return: dictionary of filenames and bytestrings
"""
attachments = {}
#First, get those that are pdf attachments
catalog = reader.trailer["/Root"]
if "/EmbeddedFiles" in catalog["/Names"]:
fileNames = catalog['/Names']['/EmbeddedFiles']['/Names']
for f in fileNames:
if isinstance(f, str):
name = f
dataIndex = fileNames.index(f) + 1
fDict = fileNames[dataIndex].getObject()
fData = fDict['/EF']['/F'].getData()
attachments[name] = fData
#Next, go through all pages and all annotations to those pages
#to find any attached files
for pagenum in range(0, reader.getNumPages()):
page_object = reader.getPage(pagenum)
if "/Annots" in page_object:
for annot in page_object['/Annots']:
annotobj = annot.getObject()
if annotobj['/Subtype'] == '/FileAttachment':
fileobj = annotobj["/FS"]
attachments[fileobj["/F"]] = fileobj["/EF"]["/F"].getData()
return attachments
handler = open(filename, 'rb')
reader = PyPDF2.PdfFileReader(handler)
dictionary = getAttachments(reader)
for fName, fData in dictionary.items():
with open(fName, 'wb') as outfile:
outfile.write(fData)
I know it is a late reply, but i only started looking into this yesterday. I have used the PyMuPdf library to extract the embedded files. here is my code:
import os
import fitz
def get_embedded_pdfs(input_pdf_path, output_path=None):
input_path = "/".join(input_pdf_path.split('/')[:-1])
if not output_path :
output_path = input_pdf_path.split(".")[0] + "_embeded_files/"
if output_path not in os.listdir(input_path):
os.mkdir(output_path)
doc = fitz.open(input_pdf_path)
item_name_dict = {}
for each_item in doc.embfile_names():
item_name_dict[each_item] = doc.embfile_info(each_item)["filename"]
for item_name, file_name in item_name_dict.items():
out_pdf = output_path + file_name
## get embeded_file in bytes
fData = doc.embeddedFileGet(item_name)
## save embeded file
with open(out_pdf, 'wb') as outfile:
outfile.write(fData)
disclaimer: I am the author of borb (the library used in this answer)
borb is an open-source, pure Python PDF library. It abstracts away most of the unpleasantness of dealing with PDF (such as having to deal with dictionaries and having to know PDF-syntax and structure).
There is a huge repository of examples, containing a section on dealing with embedded files, which you can find here.
I'll repeat the relevant example here for completeness:
import typing
from borb.pdf.document.document import Document
from borb.pdf.pdf import PDF
def main():
# read the Document
doc: typing.Optional[Document] = None
with open("output.pdf", "rb") as pdf_file_handle:
doc = PDF.loads(pdf_file_handle)
# check whether we have read a Document
assert doc is not None
# retrieve all embedded files and their bytes
for k, v in doc.get_embedded_files().items():
# display the file name, and the size
print("%s, %d bytes" % (k, len(v)))
if __name__ == "__main__":
main()
After the Document has been read, you can simply ask it for a dict mapping the filenames unto the bytes.

Run a python script on all files in a directory

First time posting a question here, hopefully, someone who experienced/tried this please share your insights... I've been working to get this far in the last few days and nights... now I am getting nowhere to loop this script on every file in a directory.
Bascially, these two scripts work perfectly fine it brings a pdf file and changes it to an excel workbook. Now what I need to do is going through all files from a selected directory and do the same job.
I am keep getting stuck at the opening the file stage - is this saying that the data (the pdf page - data[0]) cant be called in? or should i add more stages in to bring the dataset in...?
Do I have to create a list for the dataset so I can call in the data as you would have more than a data to call in.. is this why python can read the data[0] ???
Revised Script
# import
import os
import glob
import pdftotext
import openpyxl
from pathlib import Path
from string import ascii_uppercase
# open a pdf file
def to_excel(pdf_file):
with open(pdf_file,'rb') as f:
data = pdftotext.PDF(f)
# operate data to get titles, values
datas = data[0].split('\r\n')
finalData = list()
for item in datas:
if item != '':
finalData.append(item)
finalDataRefined = list()
for item in finalData:
if item != ' BCA Scheduled Maintenance Questions' and item != ' Do you suspect there is Asbestos at the property?' and item != ' Yes' and item != ' No' and item != '\x0c':
finalDataRefined.append(item.strip())
titles = list()
values = list()
for num, item in enumerate(finalDataRefined):
if num % 2 == 0:
titles.append(item)
else:
values.append(item)
# get an output file name
OPRAST = values[1]
filename = work_dir / f"{OPRAST}.xlxs"
# create an excel workbook
excel_file = openpyxl.Workbook()
excel_sheet = excel_file.active
excel_sheet.append([])
alphaList = list(ascii_uppercase)
for alphabet in alphaList:
excel_sheet.column_dimensions[alphabet].width = 20
excel_sheet.append(titles)
excel_sheet.append(values)
# save the excel workbook
excel_file.save(filename)
excel_file.close
# run a python script every file in a directory
alphaList = list(ascii_uppercase)
work_dir = Path(r"C:\Users\Sunny Kim\Downloads\Do Forms")
for pdf_file in work_dir.glob("*.pdf"):
to_excel(pdf_file)
I basically know what you want to do, but your code's indent is not so readable... especially it's python.
Your goal is to create a excel for each pdf file in you prefix dir? or aggregate all the pdf files together to a single excel file?
The follow coding is for the first goal.
Code logic.
get all the pdf file
loop over all the pdf file, for each:
open pdf file
some operation
export to excel file
You full code maybe like this(just guess):
# ----------------import part-------------------
import os
import glob
import pdftotext
import openpyxl
from string import ascii_uppercase
from pathlib import Path
def to_excel(pdf_file):
with open(pdf_file, 'rb') as f: # this open the pdf file
data = pdftotext.PDF(f)
# ---------------operate the data, get title and value-----------
datas = data[0].split('\r\n')
finalData = list()
for item in datas:
if item != '':
finalData.append(item)
finalDataRefined = list()
for item in finalData:
if item != ' BCA Scheduled Maintenance Questions' and item != ' Do you suspect there is Asbestos at the property?' and item != ' Yes' and item != ' No' and item != '\x0c':
finalDataRefined.append(item.strip())
titles = list()
values = list()
for num, item in enumerate(finalDataRefined):
if num % 2 == 0:
titles.append(item)
else:
values.append(item)
# ------------------get output file name---------------------
OPRAST = values[1]
filename = work_dir / f"{OPRAST}.xlxs"
# ------------------create excel file sheet------------------
excel_file = openpyxl.Workbook()
excel_sheet = excel_file.active
excel_sheet.append([])
alphaList = list(ascii_uppercase)
for alphabet in alphaList:
excel_sheet.column_dimensions[alphabet].width = 20
excel_sheet.append(titles)
excel_sheet.append(values)
# --------------------save----------------
excel_file.save(filename)
excel_file.close
# -------------------main program---------------
alphaList = list(ascii_uppercase)
work_dir = Path(r"C:\Users\Sunny Kim\Downloads\Do Forms")
for pdf_file in work_dir.glob("*.pdf"):
to_excel(pdf_file)

Using Textract, how do you extract tables from a pdf file and output it into a csv file via .py script?

I want to use textract (via aws cli) to extract tables from a pdf file (located in an s3 location) and export it into a csv file. I have tried writing a .py script but am struggling to read from the file.
Any suggestions for writing the .py script is welcome.
This is my current script. I run into the error:
File "extract-table.py", line 63, in get_table_csv_results
bash: File: command not found
blocks=response['Blocks']
KeyError: 'Blocks'
import webbrowser, os
import json
import boto3
import io
from io import BytesIO
import sys
from pprint import pprint
def get_rows_columns_map(table_result, blocks_map):
rows = {}
for relationship in table_result['Relationships']:
if relationship['Type'] == 'CHILD':
for child_id in relationship['Ids']:
cell = blocks_map[child_id]
if cell['BlockType'] == 'CELL':
row_index = cell['RowIndex']
col_index = cell['ColumnIndex']
if row_index not in rows:
# create new row
rows[row_index] = {}
# get the text value
rows[row_index][col_index] = get_text(cell, blocks_map)
return rows
def get_text(result, blocks_map):
text = ''
if 'Relationships' in result:
for relationship in result['Relationships']:
if relationship['Type'] == 'CHILD':
for child_id in relationship['Ids']:
word = blocks_map[child_id]
if word['BlockType'] == 'WORD':
text += word['Text'] + ' '
if word['BlockType'] == 'SELECTION_ELEMENT':
if word['SelectionStatus'] =='SELECTED':
text += 'X '
def get_table_csv_results(file_name):
with open(file_name, 'rb') as file:
img_test = file.read()
bytes_test = bytearray(img_test)
print('Image loaded', file_name)
# process using image bytes
# get the results
client = boto3.client('textract')
#Response
response = client.start_document_text_detection(
DocumentLocation={
'S3Object': {
'Bucket': s3BucketName,
'Name': documentName
}
})
# Get the text blocks
blocks=response['Blocks']
pprint(blocks)
blocks_map = {}
table_blocks = []
for block in blocks:
blocks_map[block['Id']] = block
if block['BlockType'] == "TABLE":
table_blocks.append(block)
if len(table_blocks) <= 0:
return "<b> NO Table FOUND </b>"
csv = ''
for index, table in enumerate(table_blocks):
csv += generate_table_csv(table, blocks_map, index +1)
csv += '\n\n'
return csv
def generate_table_csv(table_result, blocks_map, table_index):
rows = get_rows_columns_map(table_result, blocks_map)
table_id = 'Table_' + str(table_index)
# get cells.
csv = 'Table: {0}\n\n'.format(table_id)
for row_index, cols in rows.items():
for col_index, text in cols.items():
csv += '{}'.format(text) + ","
csv += '\n'
csv += '\n\n\n'
return csv
def main(file_name):
table_csv = get_table_csv_results(file_name)
output_file = 'output.csv'
# replace content
with open(output_file, "wt") as fout:
fout.write(table_csv)
# show the results
print('CSV OUTPUT FILE: ', output_file)
# Document
s3BucketName = "chrisyou.sagemi.com"
documentName = "DETAIL.pdf"
if __name__ == "__main__":
file_name = sys.argv[1]
main(file_name)
There is a much simpler way using the Amazon Textractor Textractor library. pip install amazon-textract-textractor
This will create a csv per table in your pdf document. e.g output_p0_t0.csv
from textractor import Textractor
def extract_tables(s3_file_path, output_directory, s3_output_path):
extractor = Textractor(profile_name="default")
document = extractor.start_document_analysis(s3_file_path, textractor.data.constants.TextractFeatures.TABLES, s3_output_path)
for j, page in enumerate(document.pages):
for i, table in enumerate(document.tables):
with open(output_directory+f'/output_p{j}_t{i}.csv', 'w') as csv_file:
csv_file.write(table.to_csv())
return document
document = extract_tables('s3://<INPUT_FILE.PDF>', './<LOCAL_DIRECTORY_FOR_CSV>', 's3://<TEXTRACT_OUTPUT_DIRECTORY>')
I had to make slight changes to #Thomas answer by importing profile `
extractor = Textractor(profile_name="default") right after importing Textractor as shown below to avoid getting this error -> NameError: name 'textractor' is not defined.
from textractor import Textractor
extractor = Textractor(profile_name="default")
def extract_tables(s3_file_path, output_directory, s3_output_path):
document = extractor.start_document_analysis(s3_file_path, textractor.data.constants.TextractFeatures.TABLES, s3_output_path)
for j, page in enumerate(document.pages):
for i, table in enumerate(document.tables):
with open(output_directory+f'/output_p{j}_t{i}.csv', 'w') as csv_file:
csv_file.write(table.to_csv())
return document
document = extract_tables('s3://<INPUT_FILE.PDF>', './<LOCAL_DIRECTORY_FOR_CSV>', 's3://<TEXTRACT_OUTPUT_DIRECTORY>')
Hope it helps someone out there.

finding on which page a search string is located in a pdf document using python

Which python packages can I use to find out out on which page a specific “search string” is located ?
I looked into several python pdf packages but couldn't figure out which one I should use.
PyPDF does not seem to have this functionality and PDFMiner seems to be an overkill for such simple task.
Any advice ?
More precise:
I have several PDF documents and I would like to extract pages which are between a string “Begin” and a string “End” .
I finally figured out that pyPDF can help. I am posting it in case it can help somebody else.
(1) a function to locate the string
def fnPDF_FindText(xFile, xString):
# xfile : the PDF file in which to look
# xString : the string to look for
import pyPdf, re
PageFound = -1
pdfDoc = pyPdf.PdfFileReader(file(xFile, "rb"))
for i in range(0, pdfDoc.getNumPages()):
content = ""
content += pdfDoc.getPage(i).extractText() + "\n"
content1 = content.encode('ascii', 'ignore').lower()
ResSearch = re.search(xString, content1)
if ResSearch is not None:
PageFound = i
break
return PageFound
(2) a function to extract the pages of interest
def fnPDF_ExtractPages(xFileNameOriginal, xFileNameOutput, xPageStart, xPageEnd):
from pyPdf import PdfFileReader, PdfFileWriter
output = PdfFileWriter()
pdfOne = PdfFileReader(file(xFileNameOriginal, "rb"))
for i in range(xPageStart, xPageEnd):
output.addPage(pdfOne.getPage(i))
outputStream = file(xFileNameOutput, "wb")
output.write(outputStream)
outputStream.close()
I hope this will be helpful to somebody else
I was able to successfully get the output using the code below.
Code:
import PyPDF2
import re
# Open the pdf file
object = PyPDF2.PdfFileReader(r"C:\TEST.pdf")
# Get number of pages
NumPages = object.getNumPages()
# Enter code here
String = "Enter_the_text_to_Search_here"
# Extract text and do the search
for i in range(0, NumPages):
PageObj = object.getPage(i)
Text = PageObj.extractText()
if re.search(String,Text):
print("Pattern Found on Page: " + str(i))
Sample Output:
Pattern Found on Page: 7
Finding on which page a search string is located in a pdf document using python
PyPDF2
# import packages
import PyPDF2
import re
# open the pdf file
object = PyPDF2.PdfFileReader(r"source_file_path")
# get number of pages
NumPages = object.getNumPages()
# define keyterms
String = "P4F-21B"
# extract text and do the search
for i in range(0, NumPages):
PageObj = object.getPage(i)
Text = PageObj.extractText()
ResSearch = re.search(String, Text)
if ResSearch != None:
print(ResSearch)
print("Page Number" + str(i+1))
Output:
<re.Match object; span=(57, 64), match='P4F-21B'>
Page Number1
PyMuPDF
import fitz
import re
# load document
doc = fitz.open(r"C:\Users\shraddha.shetty\Desktop\OCR-pages-deleted.pdf")
# define keyterms
String = "P4F-21B"
# get text, search for string and print count on page.
for page in doc:
text = ''
text += page.get_text()
if len(re.findall(String, text)) > 0:
print(f'count on page {page.number + 1} is: {len(re.findall(String, text))}')
In addition to what #user1043144 mentioned,
To use with python 3.x
Use PyPDF2
import PyPDF2
Use open instead of file
PdfFileReader(open(xFile, 'rb'))
updated answer with PYDF2
import re
import PyPDF2
def pdf_find_text(xfile_pdf, xsearch_string, ignore_case = False):
'''
find page(s) on which a given text is located in a pdf
input: pdf file and the string to search
(string to search can be in a regex like 'references\n')
N.B:
results need to be checked
in case of pdf whose page numbers are not zero indexed ,
the results seems off (by one page)
'''
xlst_res = []
xreader = PyPDF2.PdfFileReader(xfile_pdf)
for xpage_nr, xpage in enumerate(xreader.pages):
xpage_text = xpage.extractText()
xhits = None
if ignore_case == False:
xhits = re.search(xsearch_string, xpage_text.lower())
else:
xhits = re.search(xsearch_string, xpage_text.lower(), re.IGNORECASE)
if xhits:
xlst_res.append(xpage_nr)
return {'num_pages': xreader.numPages, 'page_hits': xlst_res}
def pdf_extract_pages(xpdf_original, xpdf_new , xpage_start, xpage_end):
'''
given a pdf extract a page range and save it in a new pdf file
'''
with open(xpdf_original, 'rb') as xfile_1, open(xpdf_new , 'wb') as xfile_2:
xreader = PyPDF2.PdfFileReader(xfile_1)
xwriter = PyPDF2.PdfFileWriter()
for xpage_nr in range(xpage_start, xpage_end ):
xwriter.addPage(xreader.getPage(xpage_nr))
xwriter.write(xfile_2)

Categories

Resources