Python - Check Box Status Program - python

I'm trying to make a program that scans PDFs downloaded from a website with selectable text and highlights specific discrepancies. I can make it work for specific "bad words" and "good words" but I am stuck on how to make it find missing check boxes. They are no longer interactive fields in PDF form:
Here is my code for everything else so far:
import os
import fitz
# Scan every PDF in source_folder: highlight each occurrence of a "bad" term
# and record which "good" terms never show up anywhere in the file.
source_folder = r"C:\Users\Sserb\Desktop\Test Files"
output_folder = r"C:\Users\Sserb\Desktop\Test Files\output"
# Create the output folder up front so the annotated PDFs and the summary
# file have somewhere to go.
os.makedirs(output_folder, exist_ok=True)
list_files = os.listdir(source_folder)
good_terms = ["trend", "decrease", "increase"]  # words that should be in every pdf file (not every page)
bad_terms = ["school", "academic", "homework"]  # words that get highlighted wherever they occur
pdf_files = [x for x in list_files if x.endswith(".pdf")]
highlight_summary = []  # one [file_name, [1-based pages with highlights]] entry per file
good_term_summary = []  # one [file_name, [good terms never found]] entry per file with gaps
for file_name in pdf_files:
    # READ IN PDF
    full_filename = os.path.join(source_folder, file_name)
    doc = fitz.open(full_filename)
    try:
        good_terms_not_found = good_terms.copy()
        list_hl_pages = []
        for page_num, page in enumerate(doc, 1):
            # SEARCH
            for text in bad_terms:
                text_instances = page.search_for(text)
                # HIGHLIGHT
                for inst in text_instances:
                    # snake_case name, matching the search_for() call above
                    highlight = page.add_highlight_annot(inst)
                    highlight.update()
                    if page_num not in list_hl_pages:
                        list_hl_pages.append(page_num)
            # Search for good terms - all must be found somewhere in the file.
            # Collect first, remove afterwards, so the list is not mutated
            # while it is being iterated.
            words_found = [
                good_word
                for good_word in good_terms_not_found
                if page.search_for(good_word)
            ]
            for word in words_found:
                good_terms_not_found.remove(word)
        highlight_summary.append([file_name, list_hl_pages.copy()])
        if good_terms_not_found:
            good_term_summary.append([file_name, good_terms_not_found.copy()])
        # OUTPUT: only files that received at least one highlight are written.
        if list_hl_pages:
            out_file = os.path.splitext(file_name)[0] + "-errors.pdf"
            doc.save(os.path.join(output_folder, out_file), garbage=4, deflate=True, clean=True)
    finally:
        # Close the document whether or not it was saved.
        doc.close()
#print(highlight_summary)
print(good_term_summary)
new = os.path.join(output_folder, 'outputfile.txt')
value = str(good_term_summary) + '\n'
with open(new, 'w') as summary_file:
    summary_file.write(value)

Both "value" and "export value" are always treated as text, but there are at least 8 different kinds of check boxes in Word; see how these are altered by the font used here. Check boxes are shown as ☐ when unchecked, or ☑ or ☒ when checked, so search for ☑Client rather than ☐Client, etc.

Related

Search specific words in pdf and return only pdf link where words were found (Python)

I am trying to search for multiple words in many PDFs. Links to these PDFs are saved in a dataframe. The goal is for Python to return a text stating "The words are located in pdf link". Here is the code I have so far (FYI, g7 is the name of the dataframe where the links are saved). The issue here is that the code returns the same link multiple times, once for every time the word is found.
The dataframe (named g7) looks like this:
URL
0 https://westafricatradehub.com/wp-content/uploads/2021/07/RFA-WATIH-1295_Senegal-RMNCAH-Activity_English-Version.pdf
1 https://westafricatradehub.com/wp-content/uploads/2021/07/RFA-WATIH-1295_Activit%C3%A9-RMNCAH-S%C3%A9n%C3%A9gal_Version-Fran%C3%A7aise.pdf
2 https://westafricatradehub.com/wp-content/uploads/2021/08/Senegal-Health-RFA-Webinar-QA.pdf
3 https://westafricatradehub.com/wp-content/uploads/2021/02/APS-WATIH-1021_Catalytic-Business-Concepts-Round-2.pdf
4 https://westafricatradehub.com/wp-content/uploads/2021/02/APS-WATIH-1021_Concepts-d%E2%80%99Affaires-Catalytiques-2ieme-Tour.pdf
5 https://westafricatradehub.com/wp-content/uploads/2021/06/APS-WATIH-1247_Research-Development-Round-2.pdf
The code is as follows:
import glob
import pathlib
import PyPDF2
import re
import os
# For every PDF link in the g7 dataframe: download the file, then scan each
# page for the search word.
# NOTE(review): download_file and textract are not defined/imported in this
# snippet - they must come from elsewhere.
for i in range(g7.shape[0]):
    pdf_link = g7.iloc[i, 0]
    download_file(pdf_link, f"pdf_{i}")
    text = textract.process(f"/Users/fze/pdf_{i}.PDF")
    # open the pdf file
    reader = PyPDF2.PdfFileReader(f"/Users/fze/pdf_{i}.PDF")
    all_files = glob.glob('/Users/fze/*.pdf')  # User input: give path to your downloads folder file path
    latest_pdf_path = max(all_files, key=os.path.getctime)
    path = pathlib.PurePath(latest_pdf_path)
    latest_pdf_name = path.name
    print(latest_pdf_name)
    # get number of pages
    NumPages = reader.getNumPages()
    # define keyterms
    search_word = 'organization'
    # extract text and do the search; the page loop has its own index so it
    # no longer reuses the outer loop's i
    for page_index in range(0, NumPages):
        page = reader.getPage(page_index)
        text = page.extractText()
        search_text = text.lower().split()
        for word in search_text:
            # This prints once per matching word, which is why the same link
            # shows up several times.
            if search_word in word:
                print("The word '{}' was found in '{}'".format(search_word, pdf_link))
Thank you !

Python PyMuPDF searchFor method not working

I am using PyMuPDF library in python to search for a specific text in a PDF document and then highlight it.
pdf_document = fitz.open(pdf_path)
# page_num holds the indices of the pages to scan; iterate over it directly.
for page_index in page_num:
    page = pdf_document[page_index]
    for item in search_terms:
        # One entry per hit on this page; empty when the term is not found.
        search_instances = page.searchFor(item)
        for inst in search_instances:
            page.addHighlightAnnot(inst)
The PDF document image is as given below:
I am able to highlight all of the terms in the PDF document except for the search term 'Gross profit'. searchFor() returns an empty list instead of the coordinates. Does it have anything to do with the poor quality of the PDF? If so, it should not work for the other search terms either, yet it works for terms like 'Turnover' and 'Cost of Sales'.
Any ideas please?
It seems that it's a non-text PDF.
You have to use an OCR tool like pytesseract to convert it to a text PDF, then use fitz to highlight it.
Something like this should work:
from pdf2image import convert_from_path, convert_from_bytes
from fpdf import FPDF
# Path to the Tesseract executable (placeholder - adjust to your install).
pytesseract.pytesseract.tesseract_cmd = '/pathto/Tesseract-OCR/tesseract.exe'
def string_stream(s, separators="\n"):
    """Lazily split ``s`` on any character contained in ``separators``.

    Yields each piece between separators (empty pieces included). A trailing
    piece is yielded only when it is non-empty, so a string that ends with a
    separator produces no empty final item and an empty string yields nothing.
    """
    start = 0
    for end, char in enumerate(s):
        if char in separators:
            yield s[start:end]
            start = end + 1
    # Compare against len(s) rather than the last loop index: the index is
    # undefined for empty input and equals `start` when the final piece is a
    # single character, which used to drop that piece.
    if start < len(s):
        yield s[start:]
def multipage_simple(whole_text):
    """Write ``whole_text`` to ``multipage_simple.pdf``, one cell per line.

    The text is split with string_stream() and every piece is placed in its
    own cell on a letter-format page.
    """
    pdf = FPDF(format='letter')  # pdf format
    pdf.add_page()  # create new page
    pdf.set_font("Arial", size=12)  # font and textsize
    # NOTE(review): normalize() is not defined in this snippet - it must be
    # supplied elsewhere (or the call dropped) before this runs.
    for s in string_stream(normalize(whole_text)):
        # ln=1 moves to the start of the next line after each cell; the
        # previous ever-increasing counter went past the 0/1/2 values that
        # cell() defines for ln.
        pdf.cell(200, 10, txt=s, ln=1, align="L")
    pdf.output("multipage_simple.pdf", "F")
def get_text_from_pdf_with_ocr(file_name_to_image_pdf):
    """OCR an image-only PDF and return the recognised text, one string per page.

    Each page is rendered to an image with convert_from_path(), passed through
    pytesseract, and cleaned with REMOVAL_SPECIAL_CHARACTER_PATTERN.
    """
    # NOTE(review): REMOVAL_SPECIAL_CHARACTER_PATTERN, re and pytesseract are
    # not defined/imported in this snippet - they must come from elsewhere.
    res = []
    # Use the function's own parameter; the original read an undefined
    # `file_name` here.
    seq_of_images = convert_from_path(file_name_to_image_pdf)
    for img in seq_of_images:
        text = re.sub(REMOVAL_SPECIAL_CHARACTER_PATTERN, " ", string=pytesseract.image_to_string(img))
        res.append(text)
    return res
then do :
doc = fitz.open("multipage_simple.pdf")
You could get the whole text by doing :
text_ = get_text_from_pdf_with_ocr(pdf_id)
# Concatenate the per-page strings; join needs no extra import and also
# copes with an empty list.
whole_text = "".join(text_)

How to add images with figure captions to a word document

import win32com.client as win32
import os
#creating a word application object
wordApp = win32.gencache.EnsureDispatch('Word.Application') #create a word application object
wordApp.Visible = True # show the word application window
doc = wordApp.Documents.Add() # create a new document
#Formating the document
doc.PageSetup.RightMargin = 10
doc.PageSetup.LeftMargin = 10
doc.PageSetup.Orientation = win32.constants.wdOrientLandscape
# a4 paper size: 595x842
doc.PageSetup.PageWidth = 595
doc.PageSetup.PageHeight = 842
# Inserting Tables
my_dir="C:/Users/David/Documents/EGi/EGi Plots/FW_plots/Boxplots"
filenames = os.listdir(my_dir)
piccount=0
file_count = 0
# First pass: count the JPG files so the table can be sized.
for i in filenames:
    if i[len(i)-3: len(i)].upper() == 'JPG': # check whether the current object is a JPG file
        piccount = piccount + 1
# Single-argument print() behaves the same under Python 2 and Python 3.
print("%s  images will be inserted" % piccount)
total_column = 1
total_row = int(piccount/total_column)+2
rng = doc.Range(0,0)
rng.ParagraphFormat.Alignment = win32.constants.wdAlignParagraphCenter
table = doc.Tables.Add(rng,total_row, total_column)
table.Borders.Enable = False
if total_column > 1:
    table.Columns.DistributeWidth()
#Collecting images in the same directory and inserting them into the document
piccount = 1
for index, filename in enumerate(filenames): # loop through all the files and folders for adding pictures
    picture_path = os.path.join(os.path.abspath(my_dir), filename)
    if not os.path.isfile(picture_path): # check whether the current object is a file or not
        continue
    extension = filename[len(filename)-3: len(filename)].upper()
    if extension != 'JPG': # check whether the current object is a JPG file
        continue
    piccount = piccount + 1
    print("%s %s %s" % (filename, len(filename), extension))
    cell_column = (piccount % total_column + 1) #calculating the position of each image to be put into the correct table cell
    # Floor division keeps the row index an integer on Python 3 as well.
    cell_row = (piccount // total_column + 1)
    #print 'cell_column=%s,cell_row=%s' % (cell_column,cell_row)
    #we are formatting the style of each cell
    cell_range= table.Cell(cell_row, cell_column).Range
    cell_range.ParagraphFormat.LineSpacingRule = win32.constants.wdLineSpaceSingle
    cell_range.ParagraphFormat.SpaceBefore = 0
    cell_range.ParagraphFormat.SpaceAfter = 3
    #this is where we are going to insert the images
    current_pic=cell_range.InlineShapes.AddPicture(picture_path)
    #Currently this puts a lable in a cell after the pic, I want to put a proper ms word figure caption below the image instead.
    table.Cell(cell_row, cell_column).Range.InsertAfter("\n"+"Appendix II Figure "+ str(piccount-1)+": "+filename[:len(filename)-4]+"\n"+"\n"+"\n")
This code gets all the images in a chosen directory and puts them in a table in a word doc, and then puts the file name (stripped of the file extn) in the cell below. I would like a proper figure caption (so that these will update if I insert additional pictures) but everything I've tried has failed.
I just can't get the VB commands right, this:
table.Cell(cell_row, cell_column).Range.InsertAfter(InsertCaption(Label="Figure", Title=": "+filename[:len(filename)-4]))
gives me a list of figure captions at the end of the document, which isn't really what I want. I feel like I am close but I just cant quite get it. Thanks!
In order to use Word's built-in captioning, use current_pic.Range.InsertCaption instead of current_pic.InsertCaption. The InsertCaption method is a member of the Range, not the InlineShape object. For me, this automatically inserts the caption below the picture, in its own paragraph. But if you want to specify "below" explicitly, use the Position argument as well:
current_pic.Range.InsertCaption(Label="Figure", Title=": "+filename[:len(filename)-4], Position=win32.constants.wdCaptionPositionBelow)
Note: FWIW when I test the line of code (in VBA) that you say gives you a list of captions at the end of the document I do see the text in the same cell as the inserted picture.

Search and replace for text within a pdf, in Python

This question already has answers here:
How can I replace text in a PDF using Python?
(4 answers)
Closed 14 hours ago.
I am writing mailmerge software as part of a Python web app.
I have a template called letter.pdf which was generated from a MS Word file and includes the text {name} where the resident's name will go. I also have a list of c. 100 residents' names.
What I want to do is to read in letter.pdf do a search for "{name}" and replace it with the resident's name (for each resident) then write the result to another pdf. I then want to gather all these pdfs together into a big pdf (one page per letter) which my web app's users will print out to create their letters.
Are there any Python libraries that will do this? I've looked at pdfrw and pdfminer but I couldn't see where they would be able to do it.
(NB: I also have the MS Word file, so if there was another way of using that, and not going through a pdf, that would also do the job.)
This can be done with PyPDF2 package. The implementation may depend on the original PDF template structure. But if the template is stable enough and isn't changed very often the replacement code shouldn't be generic but rather simple.
I did a small sketch on how you could replace the text inside a PDF file. It replaces all occurrences of PDF tokens to DOC.
import os
import argparse
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import DecodedStreamObject, EncodedStreamObject
def replace_text(content, replacements=None):
    """Apply ``replacements`` to the text-showing lines of a PDF content stream.

    Only lines between a ``BT`` line and an ``ET`` line whose last two
    characters are ``Tj``/``TJ`` (any case) are modified; every other line is
    copied through unchanged.

    Args:
        content: decoded content stream as a single string.
        replacements: mapping of text to find -> text to insert. ``None``
            (the default) means no replacements.

    Returns:
        The rewritten stream, with every line terminated by ``"\\n"``.
    """
    # None instead of a shared dict() default avoids the mutable-default trap.
    if replacements is None:
        replacements = {}
    out_lines = []
    in_text = False
    for line in content.splitlines():
        if line == "BT":
            in_text = True
        elif line == "ET":
            in_text = False
        elif in_text and line[-2:].lower() == 'tj':
            for k, v in replacements.items():
                line = line.replace(k, v)
        out_lines.append(line + "\n")
    # Join once at the end instead of growing a string inside the loop.
    return "".join(out_lines)
def process_data(object, replacements):
    """Rewrite the text of one PDF stream object in place.

    The stream data is decoded as UTF-8, passed through replace_text(), and
    re-encoded. The result is stored on ``object.decodedSelf`` when that
    attribute is set, otherwise on ``object`` itself.
    """
    original_text = object.getData().decode('utf-8')
    updated_bytes = replace_text(original_text, replacements).encode('utf-8')
    if object.decodedSelf is None:
        object.setData(updated_bytes)
    else:
        object.decodedSelf.setData(updated_bytes)
if __name__ == "__main__":
    # Command-line entry point: read the PDF given with -i/--input, apply the
    # replacements to every page and write "<input without extension>.result.pdf".
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--input", required=True, help="path to PDF document")
    args = vars(ap.parse_args())
    in_file = args["input"]
    # splitext()[0] drops only the trailing extension; str.replace() would
    # also strip the same text wherever else it appears in the path.
    filename_base = os.path.splitext(in_file)[0]
    # Provide replacements list that you need here
    replacements = { 'PDF': 'DOC'}
    pdf = PdfFileReader(in_file)
    writer = PdfFileWriter()
    stream_types = (DecodedStreamObject, EncodedStreamObject)
    for page_number in range(0, pdf.getNumPages()):
        page = pdf.getPage(page_number)
        contents = page.getContents()
        if contents is None:
            # Page without a content stream: nothing to replace, keep it as is.
            pass
        elif isinstance(contents, stream_types):
            process_data(contents, replacements)
        elif len(contents) > 0:
            # The page's contents are a sequence of stream objects.
            for obj in contents:
                if isinstance(obj, stream_types):
                    streamObj = obj.getObject()
                    process_data(streamObj, replacements)
        writer.addPage(page)
    with open(filename_base + ".result.pdf", 'wb') as out_file:
        writer.write(out_file)
The results are
UPDATE 2021-03-21:
Updated the code example to handle DecodedStreamObject and EncodedStreamObject, which actually contain the data stream with the text to update.
If #Dmytrio's solution does not alter the final PDF:
Dymitrio's updated code example, which handles DecodedStreamObject and EncodedStreamObject (the objects that actually contain the data stream with the text to update), ran fine, but with a file different from the example it was not able to alter the PDF text content.
According to EDIT 3, from How to replace text in a PDF using Python?:
By inserting page[NameObject("/Contents")] = contents.decodedSelf before writer.addPage(page), we force pyPDF2 to update content of the page object.
This way I was able to overcome this problem and replace text from pdf file.
Final code should look like this:
import os
import argparse
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import DecodedStreamObject, EncodedStreamObject, NameObject
def replace_text(content, replacements=None):
    """Apply ``replacements`` to the text-showing lines of a PDF content stream.

    Only lines between a ``BT`` line and an ``ET`` line whose last two
    characters are ``Tj``/``TJ`` (any case) are modified; every other line is
    copied through unchanged.

    Args:
        content: decoded content stream as a single string.
        replacements: mapping of text to find -> text to insert. ``None``
            (the default) means no replacements.

    Returns:
        The rewritten stream, with every line terminated by ``"\\n"``.
    """
    # None instead of a shared dict() default avoids the mutable-default trap.
    if replacements is None:
        replacements = {}
    out_lines = []
    in_text = False
    for line in content.splitlines():
        if line == "BT":
            in_text = True
        elif line == "ET":
            in_text = False
        elif in_text and line[-2:].lower() == 'tj':
            for k, v in replacements.items():
                line = line.replace(k, v)
        out_lines.append(line + "\n")
    # Join once at the end instead of growing a string inside the loop.
    return "".join(out_lines)
def process_data(object, replacements):
    """Rewrite the text of one PDF stream object in place.

    The stream data is decoded as UTF-8, passed through replace_text(), and
    re-encoded. The result is stored on ``object.decodedSelf`` when that
    attribute is set, otherwise on ``object`` itself.
    """
    original_text = object.getData().decode('utf-8')
    updated_bytes = replace_text(original_text, replacements).encode('utf-8')
    if object.decodedSelf is None:
        object.setData(updated_bytes)
    else:
        object.decodedSelf.setData(updated_bytes)
if __name__ == "__main__":
    # Command-line entry point: read the PDF given with -i/--input, apply the
    # replacements to every page and write "<input without extension>.result.pdf".
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--input", required=True, help="path to PDF document")
    args = vars(ap.parse_args())
    in_file = args["input"]
    # splitext()[0] drops only the trailing extension; str.replace() would
    # also strip the same text wherever else it appears in the path.
    filename_base = os.path.splitext(in_file)[0]
    # Provide replacements list that you need here
    replacements = { 'PDF': 'DOC'}
    pdf = PdfFileReader(in_file)
    writer = PdfFileWriter()
    stream_types = (DecodedStreamObject, EncodedStreamObject)
    for page_number in range(0, pdf.getNumPages()):
        page = pdf.getPage(page_number)
        contents = page.getContents()
        if contents is None:
            # Page without a content stream: nothing to replace, keep it as is.
            pass
        elif isinstance(contents, stream_types):
            process_data(contents, replacements)
        elif len(contents) > 0:
            # The page's contents are a sequence of stream objects.
            for obj in contents:
                if isinstance(obj, stream_types):
                    streamObj = obj.getObject()
                    process_data(streamObj, replacements)
        # Force content replacement - but only when a decoded stream actually
        # exists. Content arrays have no decodedSelf, and a None value must
        # not overwrite the page's /Contents entry.
        decoded = getattr(contents, "decodedSelf", None)
        if decoded is not None:
            page[NameObject("/Contents")] = decoded
        writer.addPage(page)
    with open(filename_base + ".result.pdf", 'wb') as out_file:
        writer.write(out_file)
Important: from PyPDF2.generic import NameObject
Decompress the pdf to make parsing easier (solves many of the issues in the previous answer). I use pdftk. (If this step fails, one hack to pre-process the pdf is to open the pdf in OSX Preview, print it, and then choose save as pdf from the print menu. Then retry the command below.)
pdftk original.pdf output uncompressed.pdf uncompress
Parse and replace using PyPDF2.
from PyPDF2 import PdfFileReader, PdfFileWriter
# (old, new) pairs; both sides are encoded to UTF-8 bytes before replacing.
replacements = [
    ("old string", "new string")
]
# Keep the source file open until the writer has finished, then let the
# with-block close it; the original never closed this handle.
with open("uncompressed.pdf", "rb") as source:
    pdf = PdfFileReader(source)
    writer = PdfFileWriter()
    for page in pdf.pages:
        stream = page.getContents()
        contents = stream.getData()
        for (a, b) in replacements:
            contents = contents.replace(a.encode('utf-8'), b.encode('utf-8'))
        stream.setData(contents)
        writer.addPage(page)
    with open("modified.pdf", "wb") as f:
        writer.write(f)
[Optional] Re-compress the pdf.
pdftk modified.pdf output recompressed.pdf compress
Here is a solution using the MS Word source file.
As trying to edit the pdf itself turned out to be too complicated for me because of the encoding errors, I went with the MS Word >> Pdf option.
Prepare MS Word template with {{input_fields}}
Fill in the template with data
Convert the filled in MS Word file to PDF
The DocxTemplate module uses jinja like syntax: {{variable_name}}
In my solution I use an intermediate temp file. I tried to get rid of this step by using BytesIO/StringIO to keep it in memory only, but haven't made that work yet.
Here is an easy and working solution to perform the required task:
import os
import comtypes.client
from pathlib import Path
from docxtpl import DocxTemplate
import random
# CFG
in_file_path = "files/template.docx"
temp_file_path = "files/"+str(random.randint(0,50))+".docx"
out_file_path = "files/output.pdf"
# Fill in text
data_to_fill = {'Field_name' : "John Tester",
                'Field_ocupation' : "Test tester",
                'Field_address' : "Test Address 123",
                }
template = DocxTemplate(Path(in_file_path))
template.render(data_to_fill)
template.save(Path(temp_file_path))
try:
    # Convert to PDF
    wdFormatPDF = 17
    in_file = os.path.abspath(Path(temp_file_path))
    out_file = os.path.abspath(Path(out_file_path))
    word = comtypes.client.CreateObject('Word.Application')
    try:
        doc = word.Documents.Open(in_file)
        try:
            doc.SaveAs(out_file, FileFormat=wdFormatPDF)
        finally:
            doc.Close()
    finally:
        # Always shut Word down, even when opening or saving fails, so no
        # Word instance is left running.
        word.Quit()
finally:
    # Get rid of the temp file
    os.remove(Path(temp_file_path))

finding on which page a search string is located in a pdf document using python

Which Python packages can I use to find out on which page a specific "search string" is located?
I looked into several python pdf packages but couldn't figure out which one I should use.
PyPDF does not seem to have this functionality and PDFMiner seems to be an overkill for such simple task.
Any advice ?
More precise:
I have several PDF documents and I would like to extract pages which are between a string “Begin” and a string “End” .
I finally figured out that pyPDF can help. I am posting it in case it can help somebody else.
(1) a function to locate the string
def fnPDF_FindText(xFile, xString):
    """Return the zero-based index of the first page matching ``xString``.

    xFile   : the PDF file in which to look
    xString : the regular expression to look for; it is matched against the
              lower-cased, ASCII-only page text, so it should be lower case

    Returns -1 when no page matches.
    """
    import pyPdf
    import re
    PageFound = -1
    # The with-block closes the PDF file; the original left it open.
    with open(xFile, "rb") as pdf_stream:
        pdfDoc = pyPdf.PdfFileReader(pdf_stream)
        for i in range(0, pdfDoc.getNumPages()):
            content = pdfDoc.getPage(i).extractText() + "\n"
            content1 = content.encode('ascii', 'ignore').lower()
            if re.search(xString, content1) is not None:
                PageFound = i
                break
    return PageFound
(2) a function to extract the pages of interest
def fnPDF_ExtractPages(xFileNameOriginal, xFileNameOutput, xPageStart, xPageEnd):
    """Copy pages ``xPageStart`` .. ``xPageEnd - 1`` into a new PDF file.

    xFileNameOriginal : path of the PDF to read
    xFileNameOutput   : path of the PDF to create
    xPageStart        : zero-based index of the first page to copy
    xPageEnd          : index one past the last page to copy
    """
    from pyPdf import PdfFileReader, PdfFileWriter
    output = PdfFileWriter()
    # Both files are closed by the with-blocks, even when writing fails; the
    # input stays open until the output has been written.
    with open(xFileNameOriginal, "rb") as inputStream:
        pdfOne = PdfFileReader(inputStream)
        for i in range(xPageStart, xPageEnd):
            output.addPage(pdfOne.getPage(i))
        with open(xFileNameOutput, "wb") as outputStream:
            output.write(outputStream)
I hope this will be helpful to somebody else
I was able to successfully get the output using the code below.
Code:
import PyPDF2
import re
# Open the pdf file (named `reader` so the builtin `object` is not shadowed)
reader = PyPDF2.PdfFileReader(r"C:\TEST.pdf")
# Get number of pages
NumPages = reader.getNumPages()
# Enter code here
# NOTE: re.search() treats this as a regular expression; wrap it in
# re.escape() to search for literal text.
String = "Enter_the_text_to_Search_here"
# Extract text and do the search
for i in range(0, NumPages):
    PageObj = reader.getPage(i)
    Text = PageObj.extractText()
    if re.search(String, Text):
        # i is the zero-based page index
        print("Pattern Found on Page: " + str(i))
Sample Output:
Pattern Found on Page: 7
Finding on which page a search string is located in a pdf document using python
PyPDF2
# import packages
import PyPDF2
import re
# open the pdf file (named `reader` so the builtin `object` is not shadowed)
reader = PyPDF2.PdfFileReader(r"source_file_path")
# get number of pages
NumPages = reader.getNumPages()
# define keyterms
String = "P4F-21B"
# extract text and do the search
for i in range(0, NumPages):
    PageObj = reader.getPage(i)
    Text = PageObj.extractText()
    ResSearch = re.search(String, Text)
    if ResSearch is not None:
        print(ResSearch)
        # i is zero-based, so add 1 for a human-readable page number
        print("Page Number" + str(i+1))
Output:
<re.Match object; span=(57, 64), match='P4F-21B'>
Page Number1
PyMuPDF
import fitz
import re
# load document
doc = fitz.open(r"C:\Users\shraddha.shetty\Desktop\OCR-pages-deleted.pdf")
# define keyterms
String = "P4F-21B"
# get text, search for string and print count on page.
for page in doc:
    text = page.get_text()
    # Run the search once per page and reuse the count.
    hit_count = len(re.findall(String, text))
    if hit_count > 0:
        print(f'count on page {page.number + 1} is: {hit_count}')
In addition to what #user1043144 mentioned,
To use with python 3.x
Use PyPDF2
import PyPDF2
Use open instead of file
PdfFileReader(open(xFile, 'rb'))
Updated answer with PyPDF2:
import re
import PyPDF2
def pdf_find_text(xfile_pdf, xsearch_string, ignore_case = False):
    '''
    find page(s) on which a given text is located in a pdf
    input: pdf file and the string to search
    (string to search can be in a regex like 'references\\n')

    ignore_case: when False (default) the match is case-sensitive; when True
    the pattern is matched with re.IGNORECASE.

    returns a dict with 'num_pages' (page count of the pdf) and 'page_hits'
    (list of zero-based indices of the pages that match).

    N.B:
    results need to be checked
    in case of pdf whose page numbers are not zero indexed ,
    the results seems off (by one page)
    '''
    # Match against the page text as extracted. The previous version
    # lower-cased the text in both branches, so a case-sensitive search for a
    # pattern containing capitals could never match.
    xflags = re.IGNORECASE if ignore_case else 0
    xreader = PyPDF2.PdfFileReader(xfile_pdf)
    xlst_res = []
    for xpage_nr, xpage in enumerate(xreader.pages):
        xpage_text = xpage.extractText()
        if re.search(xsearch_string, xpage_text, xflags):
            xlst_res.append(xpage_nr)
    return {'num_pages': xreader.numPages, 'page_hits': xlst_res}
def pdf_extract_pages(xpdf_original, xpdf_new , xpage_start, xpage_end):
    '''
    copy the pages xpage_start .. xpage_end - 1 of a pdf
    into a newly written pdf file
    '''
    with open(xpdf_original, 'rb') as xsource, open(xpdf_new , 'wb') as xtarget:
        xreader = PyPDF2.PdfFileReader(xsource)
        xwriter = PyPDF2.PdfFileWriter()
        xpage_nr = xpage_start
        while xpage_nr < xpage_end:
            xselected_page = xreader.getPage(xpage_nr)
            xwriter.addPage(xselected_page)
            xpage_nr += 1
        xwriter.write(xtarget)

Categories

Resources