I am trying to extract text from PDF, but when I extract the text some words and numbers are missing.
Is it possible to extract the text without missing words and numbers?
from pdf2image import convert_from_path
import pytesseract
from PyPDF2 import PdfReader
path = "file.pdf"
reader = PdfReader(path)
pages = len(reader.pages)
output = open(path.rsplit(".", 1)[0] + ".txt", "a")
for i in range(1, pages + 1):
image = convert_from_path(path, first_page=i, last_page=i)
print(f"Converting page: {i}/{pages}")
say = pytesseract.image_to_string(image[0])
output.write(say + "\n")
output.flush()
print(f"Conversion of {path} Complete")
Related
I am trying to split/extract PDF pages between two strings - excluding pages that contains both strings.
For example,
String1 = "String1"
String2 = "String2"
If page 2 has "String1" and page 10 has "String2", then the output PDF should contain pages from 3 to 9.
The below script extracts all the pages which contains or have the string and creates a single PDF will all the pages containing the string.
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
import fitz
import re
nameList = list()
directory = r"C:\Users\rohitpandey\Downloads\OneDrive_1_1-25-2023\CLAIMS Analysis\Format 2(Wire)"
output = r"C:\Users\rohitpandey\Downloads\OneDrive_1_1-25-2023\New folder"
for file in os.listdir(directory):
if not file.endswith(".pdf"):
continue
with open(os.path.join(directory,file), 'rb') as pdfFileObj: # Changes here
doc = fitz.open(directory+ "\\" + file)
nameList.append(str(file))
docPageCount = doc.page_count
reader = PdfReader(pdfFileObj)
writer = PdfWriter()
pageNo = list()
# Open the pdf file
object = PdfReader(doc.name, 'rb') # Get number of pages
NumPages = docPageCount # Enter code here
String = "REMIT-TO-CODE" # Extract text and do the search
for i in range(0, NumPages):
PageObj = object.getPage(i)
Text = PageObj.extractText()
if re.search(String,Text):
##print("Pattern Found on Page: " + str(i))
pageNo.append(str(i))
minPg = min(pageNo, key=float)
minPg = int(minPg)
for page_num in range(minPg, docPageCount):
page = reader.pages[page_num]
# This is CPU intensive! It ZIPs the contents of the page
page.compress_content_streams()
end = output+"\\"+"PAYMENT_WIRE_STEP_1_2_" + file
writer.add_page(page)
with open(end, "wb") as fh:
writer.remove_links()
writer.write(fh)
I'm using PyPDF4 to read text from a PDF I downloaded. This works, but the text string is not readable:
ÓŒŁ–Ł#`#䎖Ł#`#Ä›¥–Ž¢–#¥ŒŒŽ—–fi–Ł
Áfi⁄–fl–Ł–#›ŁƒŒŽfl†£›–
As far as I know the file is not encrypted, I can open it in Acrobat Reader without problem. In reader I can also select / copy / paste the text correctly.
for reference: this is the code:
import glob
import PyPDF4
relevant_path = 'C:\\_Personal\\Mega\\PycharmProjects\\PDFHandler\\docs\\input\\'
if __name__ == '__main__':
for PDFFile in glob.iglob(relevant_path + '*.pdf', recursive=True):
print('Processing File: ' + PDFFile.split('\\')[-1])
pdfReader = PyPDF4.PdfFileReader(PDFFile)
num_pages = pdfReader.numPages
print(num_pages)
page_count = 0
text = ''
while page_count < num_pages:
pageObj = pdfReader.getPage(page_count)
page_count += 1
text += pageObj.extractText()
print(text)
any hints? other packages I could use? ...
I want to save my output data into the text file where each new line is shown in a different row. Currently each row is delimited by \n, I want new lines to be saved in different rows.
from PIL import Image
import pytesseract
import sys
from pdf2image import convert_from_path
import os
PDF_file = "F:/ABC/Doc_1.pdf"
pages = convert_from_path(PDF_file, 500)
image_counter = 1
for page in pages:
filename = "page_"+str(image_counter)+".jpg"
page.save(filename, 'JPEG')
image_counter = image_counter + 1
filelimit = image_counter-1
outfile = "F:/ABC/intermediate_steps/out_text.txt"
f = open(outfile, "a")
for i in range(1, 2):
filename = "page_"+str(i)+".jpg"
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r"\ABC\opencv-text-detection\Tesseract-OCR\tesseract.exe"
from pytesseract import pytesseract
text = str(((pytesseract.image_to_string(Image.open(filename)))))
text = text.replace('-\n', '')
#text = text.splitlines()
f.writelines("Data Extracted from next page starts now.")
f.writelines(str(text.encode('utf-8')))
f.close()
For eg :-
ABC
DEF
GHI
Current output :-
ABC\nDEF\nGHI\n
When you do
f.writelines(str(text.encode('utf-8')))
You convert the newline byte \n to its escaped version \\n. You should use just
f.writelines(text)
This question already has answers here:
How to extract text from a PDF file?
(33 answers)
Closed 2 months ago.
Im new on Python.
I am using this code to extract text. Is it possible extract all pages and have an output in a file?
import PyPDF2
pdf_file = open('sample.pdf','rb')
read_pdf = PyPDF2.PdfFileReader(pdf_file)
number_of_pages = read_pdf.getNumPages()
page = read_pdf.getPage(10)
page_content = page.extractText()
print (page_content)
Use a loop to extract each page's text and write each page's text to a single file.
import PyPDF2
with open('sample.pdf','rb') as pdf_file, open('sample.txt', 'w') as text_file:
read_pdf = PyPDF2.PdfFileReader(pdf_file)
number_of_pages = read_pdf.getNumPages()
for page_number in range(number_of_pages): # use xrange in Py2
page = read_pdf.getPage(page_number)
page_content = page.extractText()
text_file.write(page_content)
I used following code to convert multiple pdf files into txt
p
df_dir = "D:/search/pdf"
txt_dir = "D:/pdf_to_text"
corpus = (f for f in os.listdir(pdf_dir) if not f.startswith('.') and isfile(join(pdf_dir, f)))
pdfWriter = PyPDF2.PdfFileWriter()
for filename in corpus:
pdf = open(join(pdf_dir, filename),'rb')
pdfReader = PyPDF2.PdfFileReader(pdf)
for page in range(1, pdfReader.numPages):
pageObj = pdfReader.getPage(page)
pdfWriter.addPage(pageObj)
text = pageObj.extractText()
page_name = "{}-page{}.txt".format(filename[:4], page + 1)
with open(join(txt_dir, page_name), mode="w", encoding='UTF-8') as o:
o.write(text)
This code works properly, but for each file I have multiple pages , when I run above code it gives me data as file1-page1.txt, file1-page2.txt, file1-page3.txt. but I want file.txt contains information for all pages . How I can do it.
def getPptContent(path, text):
pdfWriter = PyPDF2.PdfFileWriter()
pdf = open(join(pdf_dir, filename),'rb')
pdfReader = PyPDF2.PdfFileReader(pdf)
for page in range(1, pdfReader.numPages):
pageObj = pdfReader.getPage(page)
pdfWriter.addPage(pageObj)
text = pageObj.extractText()
return text
pdf_dir = "pdf_directory name"
corpus = [str(f) for f in os.listdir(pdf_dir) if not f.startswith('.') and
isfile(join(pdf_dir, f))]
for filename in corpus:
Path = pdf_dir + "/" +filename
print(Path)
file_content = getPptContent(Path)
f = open(pdf_dir + "/output/" + filename.split(".")[0] +".txt" ,"w+",
encoding="utf-8")
f.write(str(file_content))
f.close()
Above code works for me.
Which python packages can I use to find out out on which page a specific “search string” is located ?
I looked into several python pdf packages but couldn't figure out which one I should use.
PyPDF does not seem to have this functionality and PDFMiner seems to be an overkill for such simple task.
Any advice ?
More precise:
I have several PDF documents and I would like to extract pages which are between a string “Begin” and a string “End” .
I finally figured out that pyPDF can help. I am posting it in case it can help somebody else.
(1) a function to locate the string
def fnPDF_FindText(xFile, xString):
# xfile : the PDF file in which to look
# xString : the string to look for
import pyPdf, re
PageFound = -1
pdfDoc = pyPdf.PdfFileReader(file(xFile, "rb"))
for i in range(0, pdfDoc.getNumPages()):
content = ""
content += pdfDoc.getPage(i).extractText() + "\n"
content1 = content.encode('ascii', 'ignore').lower()
ResSearch = re.search(xString, content1)
if ResSearch is not None:
PageFound = i
break
return PageFound
(2) a function to extract the pages of interest
def fnPDF_ExtractPages(xFileNameOriginal, xFileNameOutput, xPageStart, xPageEnd):
from pyPdf import PdfFileReader, PdfFileWriter
output = PdfFileWriter()
pdfOne = PdfFileReader(file(xFileNameOriginal, "rb"))
for i in range(xPageStart, xPageEnd):
output.addPage(pdfOne.getPage(i))
outputStream = file(xFileNameOutput, "wb")
output.write(outputStream)
outputStream.close()
I hope this will be helpful to somebody else
I was able to successfully get the output using the code below.
Code:
import PyPDF2
import re
# Open the pdf file
object = PyPDF2.PdfFileReader(r"C:\TEST.pdf")
# Get number of pages
NumPages = object.getNumPages()
# Enter code here
String = "Enter_the_text_to_Search_here"
# Extract text and do the search
for i in range(0, NumPages):
PageObj = object.getPage(i)
Text = PageObj.extractText()
if re.search(String,Text):
print("Pattern Found on Page: " + str(i))
Sample Output:
Pattern Found on Page: 7
Finding on which page a search string is located in a pdf document using python
PyPDF2
# import packages
import PyPDF2
import re
# open the pdf file
object = PyPDF2.PdfFileReader(r"source_file_path")
# get number of pages
NumPages = object.getNumPages()
# define keyterms
String = "P4F-21B"
# extract text and do the search
for i in range(0, NumPages):
PageObj = object.getPage(i)
Text = PageObj.extractText()
ResSearch = re.search(String, Text)
if ResSearch != None:
print(ResSearch)
print("Page Number" + str(i+1))
Output:
<re.Match object; span=(57, 64), match='P4F-21B'>
Page Number1
PyMuPDF
import fitz
import re
# load document
doc = fitz.open(r"C:\Users\shraddha.shetty\Desktop\OCR-pages-deleted.pdf")
# define keyterms
String = "P4F-21B"
# get text, search for string and print count on page.
for page in doc:
text = ''
text += page.get_text()
if len(re.findall(String, text)) > 0:
print(f'count on page {page.number + 1} is: {len(re.findall(String, text))}')
In addition to what #user1043144 mentioned,
To use with python 3.x
Use PyPDF2
import PyPDF2
Use open instead of file
PdfFileReader(open(xFile, 'rb'))
updated answer with PYDF2
import re
import PyPDF2
def pdf_find_text(xfile_pdf, xsearch_string, ignore_case = False):
'''
find page(s) on which a given text is located in a pdf
input: pdf file and the string to search
(string to search can be in a regex like 'references\n')
N.B:
results need to be checked
in case of pdf whose page numbers are not zero indexed ,
the results seems off (by one page)
'''
xlst_res = []
xreader = PyPDF2.PdfFileReader(xfile_pdf)
for xpage_nr, xpage in enumerate(xreader.pages):
xpage_text = xpage.extractText()
xhits = None
if ignore_case == False:
xhits = re.search(xsearch_string, xpage_text.lower())
else:
xhits = re.search(xsearch_string, xpage_text.lower(), re.IGNORECASE)
if xhits:
xlst_res.append(xpage_nr)
return {'num_pages': xreader.numPages, 'page_hits': xlst_res}
def pdf_extract_pages(xpdf_original, xpdf_new , xpage_start, xpage_end):
'''
given a pdf extract a page range and save it in a new pdf file
'''
with open(xpdf_original, 'rb') as xfile_1, open(xpdf_new , 'wb') as xfile_2:
xreader = PyPDF2.PdfFileReader(xfile_1)
xwriter = PyPDF2.PdfFileWriter()
for xpage_nr in range(xpage_start, xpage_end ):
xwriter.addPage(xreader.getPage(xpage_nr))
xwriter.write(xfile_2)