Loop script to extract multiple PDFs to text files using Python PDFMiner

Loop script to extract multiple PDFs to text files using Python PDFMiner - python

Grateful for your help. I found this sample script to extract a PDF to a text file:
https://gist.github.com/vinovator/c78c2cb63d62fdd9fb67
This works, and it is probably the most accurate extraction I've found. I would like to edit it to loop through multiple PDFs and write them to multiple text files, all with the same name as the PDF they were created from. I'm struggling to do so and keep either only writing one text file, or overwriting the PDFs I'm trying to extract from. Anyone able just to help me with a loop that will loop through all PDFs in a single folder and extract them to individual text files of the same name as the PDF?
Thanks in advance for your help!
import os
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
# From PDFInterpreter import both PDFResourceManager and PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
# Import this to raise exception whenever text extraction from PDF is not allowed
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator
# Extract the text of one PDF (my_file) and write it, UTF-8 encoded, to log_file.
base_path = "C://some_folder"
# NOTE: os.path.join receives a single, already concatenated string here, so it
# returns that string unchanged; the "/" separators are added by hand.
my_file = os.path.join(base_path + "/" + "test_pdf.pdf")
log_file = os.path.join(base_path + "/" + "pdf_log.txt")
password = ""

# Open the PDF in binary mode; the with-block closes it even if parsing fails.
with open(my_file, "rb") as fp:
    # Create parser object to parse the pdf content
    parser = PDFParser(fp)
    # Store the parsed content in PDFDocument object
    document = PDFDocument(parser, password)
    # Check if document is extractable, if not abort
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Resource manager stores shared resources such as fonts or images
    rsrcmgr = PDFResourceManager()
    # Set parameters for layout analysis
    laparams = LAParams()
    # The page aggregator is the device that exposes the LT layout objects
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # The interpreter processes page content and feeds it to the device
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Collect the text pieces in a list and join once at the end instead of
    # growing a string with += for every layout object.
    text_parts = []
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # The device renders the layout produced by the interpreter
        layout = device.get_result()
        # Of the LT objects in the layout, only text boxes and text lines matter
        for lt_obj in layout:
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                text_parts.append(lt_obj.get_text())

extracted_text = "".join(text_parts)
# print (extracted_text.encode("utf-8"))
with open(log_file, "wb") as my_log:
    my_log.write(extracted_text.encode("utf-8"))
print("Done !!")

Assuming you have the following directory structure:
script.py
pdfs
├─a.pdf
├─b.pdf
└─c.pdf
txts
Where script.py is your Python script, pdfs is a folder containing your PDF documents, and txts is an empty folder where the extracted text files should go.
We can use pathlib.Path.glob to discover the paths of all PDF documents in a given directory. We iterate over the paths, and for each path we open the corresponding PDF document, parse it, extract the text and save the text in a text document (with the same name) in the txts folder.
def main():
    """Convert every PDF in ./pdfs into a same-named .txt file in ./txts.

    PDFs that do not allow text extraction are skipped. The output folder is
    created when it is missing, and the text is written as UTF-8 so that
    non-ASCII characters do not depend on the platform's default encoding.

    Returns:
        0, so the result can be handed to sys.exit().
    """
    from pathlib import Path
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTTextBox, LTTextLine
    from pdfminer.converter import PDFPageAggregator

    output_dir = Path("txts")
    output_dir.mkdir(exist_ok=True)

    for path in Path("pdfs").glob("*.pdf"):
        with path.open("rb") as file:
            parser = PDFParser(file)
            document = PDFDocument(parser, "")
            if not document.is_extractable:
                continue

            manager = PDFResourceManager()
            params = LAParams()
            device = PDFPageAggregator(manager, laparams=params)
            interpreter = PDFPageInterpreter(manager, device)

            # Gather the pieces and join once rather than using += per object.
            pieces = []
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                for obj in device.get_result():
                    if isinstance(obj, (LTTextBox, LTTextLine)):
                        pieces.append(obj.get_text())

        # path.stem is the file name without ".pdf", so a.pdf becomes a.txt.
        target = output_dir / "{}.txt".format(path.stem)
        with target.open("w", encoding="utf-8") as out_file:
            out_file.write("".join(pieces))

    return 0
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    import sys

    exit_status = main()
    sys.exit(exit_status)

The script author specifies the input and output files at the start with two parameters: my_file and log_file
You can convert the script to a function that takes these as inputs and performs the extraction, then loop this function multiple times.
# import statements as in the original script
base_path = "C://some_folder"
# Names of the PDFs to read and of the text files to write, paired by position
my_files = ("pdf1.pdf", "pdf2.pdf")
log_files = ("log1.txt", "log2.txt")
# These list comprehensions take each of the file names listed above
# and generate the complete file path
my_files = [os.path.join(base_path, x) for x in my_files]
log_files = [os.path.join(base_path, x) for x in log_files]


# Function to extract the file
def extract(my_file, log_file):
    """Write the text of the PDF at my_file to log_file, UTF-8 encoded.

    Performs the same steps as the original script and relies on the same
    pdfminer imports.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    with open(my_file, "rb") as fp:
        document = PDFDocument(PDFParser(fp), "")
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        text_parts = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())
    with open(log_file, "wb") as my_log:
        my_log.write("".join(text_parts).encode("utf-8"))


# loop through the file names;
# zip pairs each input path with the output path at the same position
for pdf_path, txt_path in zip(my_files, log_files):
    extract(pdf_path, txt_path)
You should also check the documentation for os.path.join as your usage is not best practice (it may break when switching operating systems).

Related

How to extract only specific text from PDF file using python

How to extract some of the specific text only from PDF files using python and store the output data into particular columns of Excel.
Here is the sample input PDF file (File.pdf)
Link to the full PDF file File.pdf
We need to extract the value of Invoice Number, Due Date and Total Due from the whole PDF file.
Script I have used so far:
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
# Render every page of file.pdf as plain text into an in-memory buffer,
# then print the collected text.
output_string = StringIO()
with open('file.pdf', 'rb') as in_file:
    doc = PDFDocument(PDFParser(in_file))
    resource_manager = PDFResourceManager()
    layout_params = LAParams()
    text_device = TextConverter(resource_manager, output_string, laparams=layout_params)
    page_interpreter = PDFPageInterpreter(resource_manager, text_device)
    for pdf_page in PDFPage.create_pages(doc):
        page_interpreter.process_page(pdf_page)
print(output_string.getvalue())
But I am not getting the specific values I need from the PDF file.

If you want to find the data your way (with pdfminer), you can search the extracted text for a pattern like the following (the only new part is the regex at the end, which is based on your sample data):
from io import StringIO
import re
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
# Render testfile.pdf to plain text, then pull the invoice fields out of it.
output_string = StringIO()
with open('testfile.pdf', 'rb') as in_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)

# Five consecutive lines: invoice number, order number, one line that is
# ignored, the due date, and the total due (a dollar amount).
finding = re.search(r"INV-\d+\n\d+\n.+\n.+\n\$\d+\.\d+", output_string.getvalue())
if finding is None:
    # re.search returns None when the text does not contain the pattern;
    # report that instead of failing on finding.group().
    print("No invoice data found in the extracted text")
else:
    invoice_no, order_no, _, due_date, total_due = finding.group(0).split("\n")
    print(invoice_no, order_no, due_date, total_due)
If you want to store the data in excel, you may have to be more specific (or open a new question) or look on these pages:
Writing to an Excel spreadsheet
https://www.geeksforgeeks.org/writing-excel-sheet-using-python/
https://xlsxwriter.readthedocs.io/
PS: the other answer looks like a good solution, you only have to filter the data
EDIT:
Second solution. Here I use another package, PyPDF2, because it returns the text in a different order (maybe this is possible with PDFMiner, too). If the text before the values is always the same, you can find the data like this:
import re
import PyPDF2
def parse_pdf() -> list:
    """Return [invoice_no, due_date, total_due] read from page 1 of testfile.pdf.

    Each value is the text captured right after its label ("Invoice Number",
    "Due Date", "Total Due"). A field whose pattern does not occur in the
    page text is returned as None instead of raising AttributeError.
    """
    with open("testfile.pdf", "rb") as file:
        fr = PyPDF2.PdfFileReader(file)
        data = fr.getPage(0).extractText()

    def first_group(pattern):
        # re.search returns None when nothing matches, so guard before .group().
        match = re.search(pattern, data)
        return match.group(1) if match else None

    invoice_no = first_group(r"Invoice Number\s*(INV-\d+)")
    due_date = first_group(r"Due Date(\S+ \d{1,2}, \d{4})")
    total_due = first_group(r"Total Due(\$\d+\.\d{1,2})")
    return [invoice_no, due_date, total_due]
# When run as a script, show the extracted invoice fields.
if __name__ == '__main__':
    fields = parse_pdf()
    print(fields)
You may have to adjust the regexes, because they are based only on the given example. Each pattern yields a value only when it actually matches: re.search returns None otherwise, so the no-match case has to be handled for every regex ;)
If this does not answer your question, you have to provide more information/example pdfs.

You can extract data using tabula and using that data you can create an excel file using python:
# Convert the tables of a PDF into a CSV file with tabula.
# NOTE(review): assumes `import tabula` has been done beforehand - the snippet does not show it.
pdf_path = "./Downloads/folder/myfile.pdf"
csv_path = "./Downloads/folder/test.csv"
tabula.convert_into(pdf_path, csv_path, output_format="csv", stream=True)
excel file creation:
https://www.geeksforgeeks.org/python-create-and-write-on-excel-file-using-xlsxwriter-module/

How to parse PDF text into sentences

I'm wondering how to parse PDF text into sentences, I've found a great variety of solutions here, but quite frankly I do not understand them or they do not solve the problem.
The texts I'm trying to parse are IMF reports, and I found that the best library to use is pdfminer. The ultimate goal is to perform sentiment analysis on the reports.
link to text: https://www.imf.org/en/Publications/WEO/Issues/2019/03/28/world-economic-outlook-april-2019
The biggest problems I've encountered are the diverse layout and filtering them, such as frontpage, table of content, graphs etc. The second problem is special characters and characters that it can't read properly making them apostrophes.
Here is what I've got and what I have tried:
def Parse_PDF(file) is used to read the PDF.
def text_to_sentence() is supposed to convert the text into a list of sentences, but doesn't.
The other two solutions I have found here, for the purpose of reading the PDF, but haven't found them to work properly on the text as explained above. What am I missing here, what can be done for this?
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer
import PyPDF2 as pdf
import nltk
def Parse_PDF(file):
    """Return the text of all pages of the named IMF publication PDF.

    file: base name (without ".pdf") of a document in the IMF_Publications
          folder.

    The text of every page, as produced by extractText(), is concatenated in
    page order. The file is closed as soon as the pages have been read.
    """
    filename = '/Users/andreas/Desktop/Python/Portfolio/Text_analasis/IMF_Publications/{}.pdf'.format(file)
    with open(filename, mode='rb') as Myfile:  # Opens PDF
        pdfReader = pdf.PdfFileReader(Myfile)  # Reads file
        page_texts = [
            pdfReader.getPage(page_no).extractText()  # Extracts the text
            for page_no in range(pdfReader.numPages)
        ]
    return "".join(page_texts)
def text_to_sentence():
    """Split the text of the 'text' PDF into sentences.

    Loads NLTK's English punkt tokenizer, tokenizes the text returned by
    Parse_PDF('text') and returns the sentences as one string, separated by
    a line of five dashes.
    """
    sentence_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = sentence_splitter.tokenize(Parse_PDF('text'))
    separator = '\n-----\n'
    return separator.join(sentences)
# Walk the PDF page by page and print the layout object of each page.
# The with-block closes the file once all pages have been processed.
with open('/Users/andreas/Desktop/Python/Portfolio/Text_analasis/IMF_Publications/text.pdf', 'rb') as fp:
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.get_pages(fp)
    for page in pages:
        print('Processing next page...')
        interpreter.process_page(page)
        layout = device.get_result()
        print(layout)
        #for lobj in layout:
        #    if isinstance(lobj, LTTextBox):
        #        x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text()
        #        print('At %r is text: %s' % ((x, y), text))
# NOTE(review): fp is left open on purpose here - the document created below is
# presumably still read from it when the pages are processed later; confirm
# before closing it earlier.
fp = open('/Users/andreas/Desktop/Python/Portfolio/Text_analasis/IMF_Publications/text.pdf', 'rb')
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
# A password could be passed as 2nd parameter; none is given here.
document = PDFDocument(parser)
# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed
# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()
# BEGIN LAYOUT ANALYSIS
# Set parameters for analysis.
laparams = LAParams()
# Create a PDF page aggregator object. It is the only device needed: the plain
# PDFDevice that used to be created first was overwritten before any use.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
def parse_obj(lt_objs):
    """Print the position and text of every horizontal text box in lt_objs.

    lt_objs: iterable of pdfminer layout objects.

    For each LTTextBoxHorizontal, prints the first two bbox values followed by
    its text with newlines replaced by underscores. LTFigure objects are
    containers, so their children are visited recursively. Other objects are
    ignored.
    """
    # loop over the object list
    for obj in lt_objs:
        # if it's a textbox, print text and location
        if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
            print("%6d, %6d, %s" % (obj.bbox[0], obj.bbox[1], obj.get_text().replace('\n', '_')))
        # if it's a container, recurse; iterating the figure yields its
        # children, so the private _objs attribute is not needed
        elif isinstance(obj, pdfminer.layout.LTFigure):
            parse_obj(obj)
# loop over all pages in the document
for page in PDFPage.create_pages(document):
    # read the page into a layout object
    interpreter.process_page(page)
    layout = device.get_result()
    # extract text from this object; iterating the layout yields its children
    parse_obj(layout)
# all pages have been read, so the PDF file can be released
fp.close()

PDFPage does not exist in Python PDFMiner library

So I pip-installed pdfminer3k for Python 3.6. I was trying to follow some examples of opening PDF files and converting them to text, and they all require a PDFPage import. This does not exist for me. Is there any workaround for this? I tried copying a PDFPage.py from online and saving it to the directory where Python searches for pdfminer, but I just got... "Import Error: cannot import name PDFObjectNotFound".
Thanks!

Ah. I guess the PDFPage is not meant for python 3.6. Following example from How to read pdf file using pdfminer3k? solved my issues!

import io
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfparser import PDFParser, PDFDocument
def extract_text_from_pdf(pdf_path):
    '''
    Iterator: extract the plain text from pdf-files with pdfminer3k

    pdf_path: path to pdf-file to be extracted
    return: iterator of string of extracted text (by page)

    The per-page converter and buffer are closed in a finally block, so they
    are released even when the caller stops iterating early or a page fails.
    '''
    # pdfminer.six-version can be found at:
    # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        for page in doc.get_pages():  # pdfminer.six: PDFPage.get_pages(fh, caching=True, check_extractable=True):
            rsrcmgr = PDFResourceManager()
            fake_file_handle = io.StringIO()
            device = TextConverter(rsrcmgr, fake_file_handle, laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                yield fake_file_handle.getvalue()
            finally:
                # close open handles
                device.close()
                fake_file_handle.close()
# Print the text of the first maxPages pages; for every later page only
# report that it was skipped.
maxPages = 1
for i, t in enumerate(extract_text_from_pdf(fPath)):
    message = f"Page {i}:\n{t}" if i < maxPages else f"Page {i} skipped!"
    print(message)

Iterate through .PDFs and convert them to .txt using PDFMiner

I'm trying to merge two different things I've been able to accomplish independently. Unfortunately the PDFMiner docs are just not useful at all.
I have a folder that has hundred of PDFs, named: "[0-9].pdf", in it, in no particular order and I don't care to sort them. I just need a way to go through them and convert them to text.
Using this post: Extracting text from a PDF file using PDFMiner in python? - I was able to extract the text from one PDF successfully.
Some of this post: batch process text to csv using python - was useful in determining how to open a folder full of PDFs and work with them.
Now, I just don't know how I can combine them to one-by-one open a PDF, convert it to a text object, save that to a text file with the same original-filename.txt, and then move onto the next PDF in the directory.
Here's my code:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import os
import glob
# Convert every PDF in `directory` to a text file named "<pdf name>.txt".
directory = r'./Documents/003/' #path
pdfFiles = glob.glob(os.path.join(directory, '*.pdf'))
resourceManager = PDFResourceManager()
codec = 'utf-8'
laParams = LAParams()
password = ""
maxPages = 0
caching = True
pageNums = set()
for one_pdf in pdfFiles:
    print("Processing file: " + str(one_pdf))
    # Use a fresh buffer and converter per PDF; with one shared buffer every
    # output file would also contain the text of all PDFs processed before it.
    returnString = StringIO()
    device = TextConverter(resourceManager, returnString, codec=codec, laparams=laParams)
    try:
        interpreter = PDFPageInterpreter(resourceManager, device)
        # open() replaces the Python-2-only file() builtin
        with open(one_pdf, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pageNums, maxpages=maxPages, password=password, caching=caching, check_extractable=True):
                interpreter.process_page(page)
        text = returnString.getvalue()
    finally:
        device.close()
        returnString.close()
    filenameString = str(one_pdf) + ".txt"
    with open(filenameString, "w") as text_file:
        text_file.write(text)
I get no compilation errors, but my code doesn't do anything.
Thanks for your help!

Just answering my own question with the solution idea from #LaurentLAPORTE that worked.
Set directory to an absolute path using os like this: os.path.abspath("../Documents/003/"). And then it'll work.

Extract text from PDF

I have a bunch of PDF files that I need to convert to TXT. Unfortunately, when I use one of the many available utilities to do this, it loses all formatting and all the tabulated data in the PDF gets jumbled up. Is it possible to use Python to extract the text from the PDF by specifying positions, etc.?
Thanks.

PDFs do not contain tabular data unless it contains structured content. Some tools include heuristics to try and guess the data structure and put it back. I wrote a blog article explaining the issues with PDF text extraction at http://www.jpedal.org/PDFblog/2009/04/pdf-text/

$ pdftotext -layout thingwithtablesinit.pdf
will produce a text file thingwithtablesinit.txt with the tables right.

I had a similar problem and ended up using XPDF from http://www.foolabs.com/xpdf/
One of the utils is PDFtoText, but I guess it all comes down to how the PDF was produced.

As explained in other answers, extracting text from PDF is not a straightforward task. However, there are certain Python libraries such as pdfminer (pdfminer3k for Python 3) that are reasonably efficient.
The code snippet below shows a Python class which can be instantiated to extract text from PDF. This will work in most of the cases.
(source - https://gist.github.com/vinovator/a46341c77273760aa2bb)
# Python 2.7.6
# PdfAdapter.py
""" Reusable library to extract text from pdf file
Uses pdfminer library; For Python 3.x use pdfminer3k module
Below links have useful information on components of the program
https://euske.github.io/pdfminer/programming.html
http://denis.papathanasiou.org/posts/2010.08.04.post.html
"""
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
# From PDFInterpreter import both PDFResourceManager and PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
# from pdfminer.pdfdevice import PDFDevice
# To raise exception whenever text extraction from PDF is not allowed
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator
import logging
# Runtime module description (this assignment replaces any module docstring).
__doc__ = "Reusable library to extract text from pdf file"
# NOTE(review): rebinding __name__ also changes what an
# `if __name__ == "__main__"` check in this module would see; kept because the
# logger below is named after it.
__name__ = "pdfAdapter"
# Basic logging config: a module logger with a NullHandler attached, so this
# module adds no output handler of its own.
log = logging.getLogger(__name__)
log.addHandler(logging.NullHandler())
class pdf_text_extractor:
    """ Extracts the text of one PDF file with pdfminer.

    Modules overview:
     - PDFParser: fetches data from pdf file
     - PDFDocument: stores data parsed by PDFParser
     - PDFPageInterpreter: processes page contents from PDFDocument
     - PDFDevice: translates processed information from PDFPageInterpreter
        to whatever you need
     - PDFResourceManager: Stores shared resources such as fonts or images
        used by both PDFPageInterpreter and PDFDevice
     - LAParams: parameters for the layout analyzer, which returns a LTPage
        object for each page in the PDF document
     - PDFPageAggregator: the device used here; it collects the LT object
        elements of each page
    """

    def __init__(self, pdf_file_path, password=""):
        """ Class initialization block.
        pdf_file_path - Full path of pdf including name
        password - If not passed, an empty password is used
        """
        self.pdf_file_path = pdf_file_path
        self.password = password

    def getText(self):
        """ Return the text of the PDF as UTF-8 encoded bytes.

        Algorithm:
        1) Open the PDF file
        2) Parse the file using a PDFParser object
        3) Assign the parsed content to a PDFDocument object
        4) Process the document with PDFPageInterpreter, a device
           (PDFPageAggregator) and PDFResourceManager
        5) Go through the file page by page and collect the text of all
           LTTextBox and LTTextLine objects

        Raises PDFTextExtractionNotAllowed if the document is not extractable.
        """
        # Open and read the pdf file in binary mode
        with open(self.pdf_file_path, "rb") as fp:
            # Create parser object to parse the pdf content
            parser = PDFParser(fp)

            # Store the parsed content in PDFDocument object
            document = PDFDocument(parser, self.password)

            # Check if document is extractable, if not abort
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            # Create PDFResourceManager object that stores shared resources
            # such as fonts or images
            rsrcmgr = PDFResourceManager()

            # set parameters for analysis
            laparams = LAParams()

            # The page aggregator is the device that exposes LT object elements
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)

            # Create interpreter object to process content from PDFDocument
            # Interpreter needs to be connected to resource manager for shared
            # resources and device
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Collect the text pieces and join them once at the end instead of
            # growing a string with += for every layout object
            text_parts = []

            # Process the document page by page
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                # The device renders the layout from interpreter
                layout = device.get_result()
                # Out of the many LT objects within layout, we are interested
                # in LTTextBox and LTTextLine
                for lt_obj in layout:
                    if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                        text_parts.append(lt_obj.get_text())

        return "".join(text_parts).encode("utf-8")
Note - There are other libraries such as PyPDF2 which are good at transforming a PDF, such as merging PDF pages, splitting or cropping specific pages out of PDF etc.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Loop script to extract multiple PDFs to text files using Python PDFMiner - python

Related

How to extract only specific text from PDF file using python

How to parse PDF text into sentences

PDFPage does not exist in Python PDFMiner library

Iterate through .PDFs and convert them to .txt using PDFMiner

Extract text from PDF

Categories

Resources