I am trying to split/extract PDF pages between two strings — excluding the pages that contain the strings themselves.
For example,
String1 = "String1"
String2 = "String2"
If page 2 has "String1" and page 10 has "String2", then the output PDF should contain pages from 3 to 9.
The script below extracts all the pages which contain the string and creates a single PDF with all the pages containing the string.
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
import fitz
import re

# Pages are copied from the first page containing this string through
# the end of each document.
SEARCH_STRING = "REMIT-TO-CODE"

directory = r"C:\Users\rohitpandey\Downloads\OneDrive_1_1-25-2023\CLAIMS Analysis\Format 2(Wire)"
output = r"C:\Users\rohitpandey\Downloads\OneDrive_1_1-25-2023\New folder"

nameList = list()
for file in os.listdir(directory):
    if not file.endswith(".pdf"):
        continue
    nameList.append(str(file))
    with open(os.path.join(directory, file), 'rb') as pdfFileObj:
        # One reader is enough -- the original opened the same PDF with
        # fitz and with two separate PdfReader instances (one of which
        # shadowed the builtin `object`).
        reader = PdfReader(pdfFileObj)
        writer = PdfWriter()
        page_count = len(reader.pages)

        # Collect 0-based indices of pages containing the search string.
        matches = []
        for i in range(page_count):
            # extract_text() replaces the deprecated extractText();
            # it may return None for image-only pages.
            text = reader.pages[i].extract_text() or ""
            if re.search(SEARCH_STRING, text):
                matches.append(i)

        if not matches:
            # No page matched: skip this file instead of crashing on
            # min() of an empty list as the original did.
            continue

        first = min(matches)
        for page_num in range(first, page_count):
            page = reader.pages[page_num]
            # This is CPU intensive! It ZIPs the contents of the page.
            page.compress_content_streams()
            writer.add_page(page)

        # Build the output path once, with os.path.join instead of
        # manual backslash concatenation.
        end = os.path.join(output, "PAYMENT_WIRE_STEP_1_2_" + file)
        with open(end, "wb") as fh:
            writer.remove_links()
            writer.write(fh)
Related
I'm using PyPDF4 to read text from a PDF I downloaded. This works, but the text string is not readable:
ÓŒŁ–Ł#`#䎖Ł#`#Ä›¥–Ž¢–#¥ŒŒŽ—–fi–Ł
Áfi⁄–fl–Ł–#›ŁƒŒŽfl†£›–
As far as I know the file is not encrypted, I can open it in Acrobat Reader without problem. In reader I can also select / copy / paste the text correctly.
for reference: this is the code:
import glob
import PyPDF4

relevant_path = 'C:\\_Personal\\Mega\\PycharmProjects\\PDFHandler\\docs\\input\\'

if __name__ == '__main__':
    # Walk every PDF under the input directory and print its extracted text.
    for PDFFile in glob.iglob(relevant_path + '*.pdf', recursive=True):
        print('Processing File: ' + PDFFile.split('\\')[-1])
        reader = PyPDF4.PdfFileReader(PDFFile)
        total_pages = reader.numPages
        print(total_pages)
        # Concatenate the text of every page in order.
        extracted = ''
        for index in range(total_pages):
            extracted += reader.getPage(index).extractText()
        print(extracted)
any hints? other packages I could use? ...
I am trying to extract text from PDF, but when I extract the text some words and numbers are missing.
Is it possible to extract the text without missing words and numbers?
from pdf2image import convert_from_path
import pytesseract
from PyPDF2 import PdfReader

path = "file.pdf"
reader = PdfReader(path)
pages = len(reader.pages)

# Open the output file in a context manager so it is closed (and fully
# flushed) even if OCR fails part-way through -- the original handle
# was never closed. Mode "a" preserved: reruns append.
with open(path.rsplit(".", 1)[0] + ".txt", "a") as output:
    # convert_from_path page numbers are 1-based; rendering one page at
    # a time keeps memory usage bounded on large PDFs.
    for i in range(1, pages + 1):
        image = convert_from_path(path, first_page=i, last_page=i)
        print(f"Converting page: {i}/{pages}")
        say = pytesseract.image_to_string(image[0])
        output.write(say + "\n")
        output.flush()
print(f"Conversion of {path} Complete")
I have the code to extract/convert text from scanned PDF files / normal PDF files by using Tesseract OCR. But I want my code to convert a whole folder of PDFs rather than a single PDF file, with the extracted text files stored in a folder that I choose.
See my code below:
import os

filePath = '/Users/CodingStark/scanned/scanned-file.pdf'

# Render each PDF page to a JPEG at 500 DPI.
pages = convert_from_path(filePath, 500)

# Remember exactly which image files we create so we can delete them
# afterwards (they are only OCR intermediates).
page_files = []
for image_counter, page in enumerate(pages, start=1):
    filename = "page_" + str(image_counter) + ".jpg"
    page.save(filename, 'JPEG')
    page_files.append(filename)

# OCR every page image and append the text to one output file; the
# context manager closes the file even if OCR raises.
outfile = "scanned-file.txt"
with open(outfile, "a") as f:
    for filename in page_files:
        text = str(pytesseract.image_to_string(Image.open(filename)))
        # Re-join words that were hyphenated across line breaks.
        text = text.replace('-\n', '')
        f.write(text)

# Remove the intermediate JPEGs so they don't accumulate on disk.
for filename in page_files:
    os.remove(filename)
I want to automate my code so it will convert all the PDF files in the scanned folder, and the extracted text files will go into a folder that I choose. Also, is there a way to delete all the JPG files after the code runs? They take up a lot of storage space. Thank you so much!!
Updated with Answer
def tesseractOCR_pdf(pdf):
    """OCR every page of the PDF at *pdf* and return the concatenated text.

    Each page is rendered to a temporary page_N.jpg at 500 DPI, OCR'd
    with Tesseract, and the temporary images are deleted again -- even
    if OCR fails part-way through.
    """
    pages = convert_from_path(pdf, 500)
    page_files = []
    try:
        # PDF page 1 -> page_1.jpg, page 2 -> page_2.jpg, ...
        for counter, page in enumerate(pages, start=1):
            filename = "page_" + str(counter) + ".jpg"
            page.save(filename, 'JPEG')
            page_files.append(filename)

        text = ""
        for filename in page_files:
            page_text = str(pytesseract.image_to_string(Image.open(filename)))
            # Re-join words that were hyphenated across line breaks.
            text += page_text.replace('-\n', '')
    finally:
        # Delete only the images created by this call. The original
        # glob("*.jpg") would also have removed unrelated JPEGs from
        # the current working directory, and the cleanup would have
        # been skipped entirely if OCR raised.
        for filename in page_files:
            os.remove(filename)
    return text
def tesseractOCR_img(img):
    """Run Tesseract on a single image file and return its text."""
    # --psm 6: assume a single uniform block of text.
    raw = pytesseract.image_to_string(img, lang='eng', config='--psm 6')
    # Re-join words that were hyphenated across line breaks.
    return str(raw).replace('-\n', '')
def Tesseract_ALL(docDir, txtDir):
    """OCR every document in *docDir*, writing one .txt per document into *txtDir*.

    Files ending in .pdf go through tesseractOCR_pdf; everything else is
    treated as an image and goes through tesseractOCR_img. Each output
    file keeps the document's base name with a .txt extension.
    """
    if docDir == "":
        docDir = os.getcwd()  # default to the current directory if none passed in
    for doc in os.listdir(docDir):  # iterate through docs in doc directory
        try:
            fileExtension = doc.split(".")[-1]
            # os.path.join inserts the separator that the original
            # string concatenation (docDir + doc) silently dropped.
            srcPath = os.path.join(docDir, doc)
            if fileExtension == "pdf":
                text = tesseractOCR_pdf(srcPath)  # text content of the pdf
            else:
                text = tesseractOCR_img(srcPath)  # text content of the image
            # Name the output after the document's base name directly,
            # which removes the need for the original post-loop rename
            # pass (file.pdf -> file.txt, file.tif -> file.txt, ...).
            base = doc.rsplit(".", 1)[0]
            textFilename = os.path.join(txtDir, base + ".txt")
            # Context manager: the original never closed these files.
            with open(textFilename, "w") as textFile:
                textFile.write(text)
        except Exception as err:
            # Report the cause instead of silently swallowing it.
            print("Error in file: " + str(doc) + " (" + str(err) + ")")
# Entry point: tell the pipeline where the source documents live and
# where the generated .txt files should be written.
docDir = "pdf_folder"
txtDir = "text_folder"
Tesseract_ALL(docDir, txtDir)
here is the loop to read from a path,
import glob, os
import os, subprocess

pdf_dir = "dir"
os.chdir(pdf_dir)
# Note: glob patterns are case-sensitive on most platforms, so "*.PDF"
# matches only upper-case extensions.
for pdf_file in glob.glob(os.path.join(pdf_dir, "*.PDF")):
    # The original loop body was a C-style "////" comment, which is a
    # Python syntax error; a loop body needs real code or at least pass.
    pass  # put here what you want to do for each pdf file
files list
I have tried to extract text content from 24 folders; every folder has several (100+) .HTML files. I need to create 24 .txt files to save the text extracted from the .HTML files.
I have done most of the job except saving the .txt files: the code I wrote saves 24 files that all have the same content. I know something is wrong in the following part:
# NOTE(review): by the time this loop runs, `passage` already holds the
# words from *every* folder, so all 24 files receive identical content.
# The per-folder text has to be collected (or written) inside the
# folder loop instead.
for number in range(1,25):
    with open('Text'+"%02d" % number +" .txt", "w", encoding='utf-8') as text:
        for i in passage:
            text.write(i+' ')
All code is listed below
# Read files and call functions
from bs4 import BeautifulSoup
import os
import numpy as np

gap_html = os.listdir('gap-html')
print(gap_html)

# Write one numbered text file per folder. The original collected every
# folder's words into ONE shared list and then wrote that same list 24
# times, which is why all the output files had identical content.
for number, textFolders in enumerate(gap_html, start=1):
    folder = os.path.join(os.path.abspath('gap-html'), textFolders)
    # Words extracted from this folder only.
    passage = list()
    # List the file names under this folder.
    textFiles = os.listdir(folder)
    for textFile in textFiles:
        file = os.path.join(os.path.abspath(folder), textFile)
        print(file)
        # Context manager closes each HTML file after reading.
        with open(file, 'r', encoding="utf-8") as fh:
            html = fh.read()
        soup = BeautifulSoup(html, features='lxml')
        page = soup.find_all('span', {"class": "ocr_cinfo"})
        for word in page:
            passage.append(word.get_text())
    # Flush this folder's words to its own numbered file.
    with open('Text'+"%02d" % number +" .txt", "w", encoding='utf-8') as text:
        for i in passage:
            text.write(i+' ')
This question already has answers here:
How to extract text from a PDF file?
(33 answers)
Closed 2 months ago.
I'm new to Python.
I am using this code to extract text. Is it possible to extract all pages and write the output to a file?
import PyPDF2

# Context manager so the PDF handle is closed when we're done -- the
# original left it open.
with open('sample.pdf', 'rb') as pdf_file:
    read_pdf = PyPDF2.PdfFileReader(pdf_file)
    number_of_pages = read_pdf.getNumPages()
    # Page indices are 0-based, so this is the 11th page of the file.
    page = read_pdf.getPage(10)
    page_content = page.extractText()
    print(page_content)
Use a loop to extract each page's text and write each page's text to a single file.
import PyPDF2

# Stream the text of every page straight into one output file.
with open('sample.pdf', 'rb') as pdf_file, open('sample.txt', 'w') as text_file:
    reader = PyPDF2.PdfFileReader(pdf_file)
    total = reader.getNumPages()
    for page_number in range(total):  # use xrange in Py2
        content = reader.getPage(page_number).extractText()
        text_file.write(content)
I used the following code to convert multiple PDF files into txt:
# NOTE: the variable name was garbled across two lines in the original
# paste ("p" / "df_dir = ..."); it is pdf_dir.
pdf_dir = "D:/search/pdf"
txt_dir = "D:/pdf_to_text"

corpus = (f for f in os.listdir(pdf_dir) if not f.startswith('.') and isfile(join(pdf_dir, f)))

for filename in corpus:
    # Context manager closes each source PDF; the original never closed
    # them. The original also fed pages into a PdfFileWriter that was
    # never written anywhere, so that dead code is dropped.
    with open(join(pdf_dir, filename), 'rb') as pdf:
        pdfReader = PyPDF2.PdfFileReader(pdf)
        # NOTE(review): range starts at 1, so page index 0 is skipped --
        # kept as-is to match the described behaviour.
        for page in range(1, pdfReader.numPages):
            pageObj = pdfReader.getPage(page)
            text = pageObj.extractText()
            page_name = "{}-page{}.txt".format(filename[:4], page + 1)
            with open(join(txt_dir, page_name), mode="w", encoding='UTF-8') as o:
                o.write(text)
This code works, but each file has multiple pages; when I run the code above it gives me data as file1-page1.txt, file1-page2.txt, file1-page3.txt, but I want a single file.txt containing the information for all pages. How can I do that?
def getPptContent(path, text=""):
    """Return the concatenated text of every page (after the first) of the PDF at *path*.

    *text* is an optional seed string the page text is appended to. It
    defaults to "" so the function can be called with just a path -- the
    original required a second argument (the call site below passes only
    one, which raised a TypeError), read the globals pdf_dir/filename
    instead of *path*, and overwrote *text* each iteration so only the
    last page's text was ever returned.
    """
    with open(path, 'rb') as pdf:
        pdfReader = PyPDF2.PdfFileReader(pdf)
        # NOTE(review): range starts at 1, so page index 0 is skipped --
        # kept to match the surrounding code's behaviour.
        for page in range(1, pdfReader.numPages):
            # Accumulate rather than overwrite.
            text += pdfReader.getPage(page).extractText()
    return text
pdf_dir = "pdf_directory name"
corpus = [str(f) for f in os.listdir(pdf_dir) if not f.startswith('.') and
isfile(join(pdf_dir, f))]
for filename in corpus:
Path = pdf_dir + "/" +filename
print(Path)
file_content = getPptContent(Path)
f = open(pdf_dir + "/output/" + filename.split(".")[0] +".txt" ,"w+",
encoding="utf-8")
f.write(str(file_content))
f.close()
Above code works for me.