Use Tesseract OCR to extract text from a scanned pdf folders

Use Tesseract OCR to extract text from a scanned pdf folders - python

I have code that extracts text from scanned or normal PDF files using Tesseract OCR. However, I want it to convert a whole folder of PDFs rather than a single PDF file, and to store the extracted text files in a folder of my choosing.
See my code below:
filePath = '/Users/CodingStark/scanned/scanned-file.pdf'
# Render every page of the PDF to an image at 500 DPI.
pages = convert_from_path(filePath, 500)

# Save each page as page_<n>.jpg (numbered from 1) in the current directory.
for image_counter, page in enumerate(pages, start=1):
    filename = "page_" + str(image_counter) + ".jpg"
    page.save(filename, 'JPEG')
# Total number of pages that were written above
filelimit = len(pages)

# Text file that receives the output; it is opened in append mode, and the
# `with` block closes it even if OCR fails part-way through.
outfile = "scanned-file.txt"
with open(outfile, "a") as f:
    # Iterate from 1 to total number of pages
    for i in range(1, filelimit + 1):
        filename = "page_" + str(i) + ".jpg"
        # Recognize the text in the image as a string using pytesseract;
        # the image file is closed again as soon as it has been read.
        with Image.open(filename) as page_image:
            text = str(pytesseract.image_to_string(page_image))
        # Remove "-\n" so words hyphenated across a line break are re-joined
        text = text.replace('-\n', '')
        f.write(text)
I want to automate my code so that it converts all the PDF files in the scanned folder and writes the extracted text files to a folder of my choosing. Also, is there a way to delete all the jpg files once the code has finished, since they take up a lot of disk space? Thank you so much!!
Updated with Answer
def tesseractOCR_pdf(pdf, dpi=500):
    """Return the OCR text of every page of a PDF file.

    Each page is rendered to an in-memory image and handed straight to
    Tesseract.  No intermediate ``page_<n>.jpg`` files are written, so there
    is nothing to clean up afterwards; the earlier clean-up step removed
    every ``*.jpg`` in the working directory, including files this function
    had not created.

    Args:
        pdf: Path of the PDF file to read.
        dpi: Resolution used when rendering the pages (default 500).

    Returns:
        The recognised text of all pages, concatenated in page order, with
        every ``"-\\n"`` sequence removed.
    """
    pages = convert_from_path(pdf, dpi)
    page_texts = [str(pytesseract.image_to_string(page)) for page in pages]
    # Remove "-\n" so words hyphenated across a line break are re-joined.
    return "".join(page_texts).replace('-\n', '')
def tesseractOCR_img(img):
    """Run Tesseract on a single image and return the recognised text.

    ``img`` is passed to ``pytesseract.image_to_string`` with ``lang='eng'``
    and ``config='--psm 6'``; every ``"-\\n"`` sequence is then removed from
    the result.
    """
    recognised = pytesseract.image_to_string(img, lang='eng', config='--psm 6')
    return str(recognised).replace('-\n', '')
def Tesseract_ALL(docDir, txtDir):
    """OCR every file in ``docDir`` and write one text file per document.

    PDF files are processed with ``tesseractOCR_pdf``; every other regular
    file is treated as an image and processed with ``tesseractOCR_img``.
    A document that fails is reported and skipped, so one bad file does not
    stop the batch.

    Output names are computed up front (``a.pdf`` -> ``a.txt``), so no
    renaming pass over ``txtDir`` is needed afterwards.

    Args:
        docDir: Directory containing the documents; ``""`` means the current
            working directory.
        txtDir: Directory that receives the ``.txt`` files; it is created if
            it does not exist yet.
    """
    if docDir == "":
        docDir = os.getcwd()  # if no docDir passed in
    if txtDir:
        os.makedirs(txtDir, exist_ok=True)

    # Extensions that are dropped from the output name (a.pdf -> a.txt).
    # Any other file keeps its full name (a.png -> a.png.txt).
    stripped_exts = (".pdf", ".tif", ".tiff", ".jpg")

    for doc in os.listdir(docDir):  # iterate through docs in doc directory
        docPath = os.path.join(docDir, doc)
        if not os.path.isfile(docPath):
            continue  # sub-directories cannot be OCR'd
        stem, ext = os.path.splitext(doc)
        ext = ext.lower()  # treat .PDF and .pdf alike
        try:
            if ext == ".pdf":
                text = tesseractOCR_pdf(docPath)  # text content of the pdf
            else:
                text = tesseractOCR_img(docPath)  # text content of the image
            outName = (stem if ext in stripped_exts else doc) + ".txt"
            textFilename = os.path.join(txtDir, outName)
            # UTF-8 so non-ASCII OCR output can always be written.
            with open(textFilename, "w", encoding="utf-8") as textFile:
                textFile.write(text)
        except Exception as exc:
            # Best-effort batch: report the failing document and carry on.
            print("Error in file: " + str(doc) + " (" + str(exc) + ")")
# Below is the code that runs the functions.
# Tell Tesseract_ALL where the documents are located and where the txt files should go.
# os.path.join(<dir>, "") appends the trailing path separator, so the directory
# names stay valid even when a file name is appended by plain string concatenation.
docDir = os.path.join("pdf_folder", "")
txtDir = os.path.join("text_folder", "")
Tesseract_ALL(docDir, txtDir)

Here is a loop that visits every PDF file in a directory:
import glob,os
import os, subprocess
pdf_dir = "dir"
# Glob against the directory path itself.  Changing into pdf_dir first would
# make a relative pattern look inside pdf_dir/pdf_dir.  The character classes
# match the extension in any letter case (.pdf, .PDF, ...).
for pdf_file in glob.glob(os.path.join(pdf_dir, "*.[pP][dD][fF]")):
    pass  # put here what you want to do for each pdf file

Related

Split/Extract PDF pages between two strings in python

I am trying to split/extract PDF pages between two strings - excluding pages that contains both strings.
For example,
String1 = "String1"
String2 = "String2"
If page 2 has "String1" and page 10 has "String2", then the output PDF should contain pages from 3 to 9.
The script below extracts all the pages that contain the string and creates a single PDF with all of those pages.
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
import fitz
import re
# Purpose: for every .pdf in `directory`, find the pages whose text matches
# `String` and write the pages from the first match onward to a new PDF in `output`.
# NOTE(review): the indentation of this snippet was lost, so the exact nesting
# of the loops below cannot be confirmed from the text alone.
# Names of the PDF files seen so far (appended to below; not read in this snippet).
nameList = list()
directory = r"C:\Users\rohitpandey\Downloads\OneDrive_1_1-25-2023\CLAIMS Analysis\Format 2(Wire)"
output = r"C:\Users\rohitpandey\Downloads\OneDrive_1_1-25-2023\New folder"
for file in os.listdir(directory):
# Skip anything that is not a .pdf file
if not file.endswith(".pdf"):
continue
with open(os.path.join(directory,file), 'rb') as pdfFileObj: # Changes here
# The same file is also opened through fitz; below it supplies the page count and the file name.
doc = fitz.open(directory+ "\\" + file)
nameList.append(str(file))
docPageCount = doc.page_count
# `reader` is the source of the pages that get copied into `writer`.
reader = PdfReader(pdfFileObj)
writer = PdfWriter()
# Page indices (stored as strings) whose text matched `String`.
pageNo = list()
# Open the pdf file
# NOTE(review): `object` shadows the Python builtin of the same name.
object = PdfReader(doc.name, 'rb') # Get number of pages
NumPages = docPageCount # Enter code here
String = "REMIT-TO-CODE" # Extract text and do the search
for i in range(0, NumPages):
PageObj = object.getPage(i)
Text = PageObj.extractText()
if re.search(String,Text):
##print("Pattern Found on Page: " + str(i))
pageNo.append(str(i))
# Lowest matching page index; the strings are compared numerically via key=float.
# NOTE(review): min() raises ValueError when pageNo is empty, i.e. when no page matched.
minPg = min(pageNo, key=float)
minPg = int(minPg)
# Copy every page from the first match to the end of the document.
for page_num in range(minPg, docPageCount):
page = reader.pages[page_num]
# This is CPU intensive! It ZIPs the contents of the page
page.compress_content_streams()
# Output path: the `output` folder plus a fixed prefix and the original file name.
end = output+"\\"+"PAYMENT_WIRE_STEP_1_2_" + file
writer.add_page(page)
with open(end, "wb") as fh:
writer.remove_links()
writer.write(fh)

reading text from PDF contains unknown encoding

I'm using PyPDF4 to read text from a PDF I downloaded. This works, but the text string is not readable:
ÓŒŁ–Ł#`#äŽ–Ł#`#Ä›¥–Ž¢–#¥ŒŒŽ—–ﬁ–Ł
Áﬁ⁄–ﬂ–Ł–#›ŁƒŒŽﬂ†£›–
As far as I know the file is not encrypted, I can open it in Acrobat Reader without problem. In reader I can also select / copy / paste the text correctly.
for reference: this is the code:
import glob
import PyPDF4
relevant_path = 'C:\\_Personal\\Mega\\PycharmProjects\\PDFHandler\\docs\\input\\'

if __name__ == '__main__':
    # Print the page count and the extracted text of every PDF in relevant_path.
    for PDFFile in glob.iglob(relevant_path + '*.pdf', recursive=True):
        print('Processing File: ' + PDFFile.split('\\')[-1])
        pdfReader = PyPDF4.PdfFileReader(PDFFile)
        num_pages = pdfReader.numPages
        print(num_pages)
        # Extract the pages in order and join their text once at the end.
        text = ''.join(
            pdfReader.getPage(page_index).extractText()
            for page_index in range(num_pages)
        )
        print(text)
any hints? other packages I could use? ...

Save files to a new folder with python

This code saves text files from a data frame of sentences, then saves each one as a ssml file.
How can I get the sentences to be saved in a new folder?
# Wrap every sentence in the SSML starter/ender and save it to its own
# numbered .ssml file.  The count is kept in `sentence_count` so the builtin
# `max` is not shadowed.
sentence_count = len(sentences)
for i in range(0, sentence_count):
    txt = sentences[i]
    new_txt = starter + txt + ender
    print(new_txt)
    num = num + 1
    with open("text" + str(num) + ".ssml", 'w+') as f:
        f.writelines(new_txt)

Add this at the start:
import os
folder_name = 'my_folder'
os.makedirs(folder_name, exist_ok=True)
Then change:
with open("text" + str(num) + ".ssml", 'w+') as f:
to:
with open(f'{folder_name}\\text{num}.ssml', 'w+') as f:

Convert Scanned PDF to Text

I am trying to extract text from PDF, but when I extract the text some words and numbers are missing.
Is it possible to extract the text without missing words and numbers?
from pdf2image import convert_from_path
import pytesseract
from PyPDF2 import PdfReader
path = "file.pdf"
reader = PdfReader(path)
# Number of pages in the PDF, used to drive the page-by-page conversion.
pages = len(reader.pages)
# The text file sits next to the PDF (file.pdf -> file.txt) and is opened in
# append mode.  The `with` block guarantees it is closed, and therefore fully
# written, even if rendering or OCR of a page raises.
with open(path.rsplit(".", 1)[0] + ".txt", "a") as output:
    for i in range(1, pages + 1):
        # Render only page i, so a single page image is held at a time.
        image = convert_from_path(path, first_page=i, last_page=i)
        print(f"Converting page: {i}/{pages}")
        say = pytesseract.image_to_string(image[0])
        output.write(say + "\n")
        # Flush after each page so finished pages reach the disk immediately.
        output.flush()
print(f"Conversion of {path} Complete")

Detect the content type of multiple PDF in a Folder

So far I have been using PyPDF2 on the Anaconda platform to place a watermark on 20,000+ PDFs. The code works for the majority of the PDF files, but in a few of them the content is a poorly scanned image of a report.
I want to know whether there is a tool in Python, or any other way, to analyse the content of a PDF and determine whether it is an image or a PDF with text characters. This would let me identify the files that have this defect and move them to another folder.
Thanks
I added my code.
import PyPDF2 #this library requires to be installed
import os
# Purpose: walk ROOT_PATH, stamp the first page of every PDF found with the
# first page of a template PDF, and write the result as 'Reviewed <name>'.
# NOTE(review): the indentation of this snippet was lost, so the exact nesting
# of the statements below cannot be confirmed from the text alone.
if __name__ == "__main__":
ROOT_PATH = "."
#STAMP_PATH = "." + "/stamped/"
# NOTE(review): both paths are "." with no trailing separator, so the plain
# concatenations below produce ".StampTemplate1109.pdf" and ".Reviewed <name>".
TEMPLATE_PATH = "."
STAMP_PATH = "."
# Running total of PDF files handled, across all directories.
count = 0
for dirName, subdirList, fileList in os.walk(ROOT_PATH):
files=[]
print('Found directory: %s' % dirName)
for fileName in fileList:
# True when '.pdf' occurs anywhere after the first character of the name.
if fileName.find('.pdf') > 0:
count += 1
print('\tHandling %s - %s %s' % (count, dirName, fileName))
files.append(fileName)
#=======================main code part ==========================================
# NOTE(review): fileName is opened without dirName, i.e. relative to the
# current working directory rather than the directory being walked.
file= open(fileName,'rb')
reader = PyPDF2.PdfFileReader(file)
page= reader.getPage(0)
# NOTE(review): the template is reopened for every PDF and `water` is never closed.
water = open(TEMPLATE_PATH + 'StampTemplate1109.pdf','rb')
reader2 = PyPDF2.PdfFileReader(water)
waterpage = reader2.getPage(0)
#command to merge parent PDF first page with PDF watermark page
page.mergeTranslatedPage(waterpage, 0, -20, expand=True)
writer =PyPDF2.PdfFileWriter()
writer.addPage(page)
#add rest of PDF pages
for pageNum in range(1, reader.numPages): # this will give length of book
pageObj = reader.getPage(pageNum)
writer.addPage(pageObj)
#return the parent PDF file with the watermark
# here we are writing so 'wb' is for write binary
resultFile = open(STAMP_PATH + 'Reviewed ' + fileName,'wb')
writer.write(resultFile)
file.close()
resultFile.close()
#==============================================================================
# NOTE(review): this is a Python 2 print statement, while the print(...) calls above use function-call syntax.
print "TOTAL OF %s PROCESSED" % count

Since you're already using PyPDF2 you may want to use the PageObject.extractText function to see if you get any text on each page of the PDF. If you get an empty string from a page then it's probably an image.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Use Tesseract OCR to extract text from a scanned pdf folders - python

here is the loop to read from a path, import glob,os import os, subprocess pdf_dir = "dir" os.chdir(pdf_dir) for pdf_file in glob.glob(os.path.join(pdf_dir, "*.PDF")): //// put here what you want to do for each pdf file

Related

Split/Extract PDF pages between two strings in python

reading text from PDF contains unknown encoding

Save files to a new folder with python

Convert Scanned PDF to Text

Detect the content type of multiple PDF in a Folder

Categories

Resources