How to parse PDF text into sentences - python

I'm wondering how to parse PDF text into sentences, I've found a great variety of solutions here, but quite frankly I do not understand them or they do not solve the problem.
The text I'm trying to parse is IMF reports, and I found that the best library to use is pdfminer. The ultimate goal is to perform sentiment analysis on the reports.
link to text: https://www.imf.org/en/Publications/WEO/Issues/2019/03/28/world-economic-outlook-april-2019
The biggest problems I've encountered are the diverse layouts and the need to filter out parts such as the front page, table of contents, graphs etc. The second problem is special characters, and characters that can't be read properly and end up rendered as apostrophes.
Here is what I've got and what I have tried:
Def Parse_PDF(file) is used to read the PDF
def text_to_sentence() is supposed to convert the text into a list of sentences, but doesn't.
The other two solutions I have found here, for the purpose of reading the PDF, but haven't found them to work properly on the text as explained above. What am I missing here, what can be done for this?
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer
import PyPDF2 as pdf
import nltk
def Parse_PDF(file):
    """Read every page of the named IMF report PDF and return its text.

    Parameters
    ----------
    file : str
        Base name of the PDF (without the ".pdf" extension) inside the
        IMF_Publications folder.

    Returns
    -------
    str
        The concatenated text of all pages, as extracted by PyPDF2.
    """
    filename = '/Users/andreas/Desktop/Python/Portfolio/Text_analasis/IMF_Publications/{}.pdf'.format(file)
    text = ""
    # Use a context manager so the handle is closed even on error
    # (the original left the file open for the life of the process).
    with open(filename, mode='rb') as my_file:
        pdf_reader = pdf.PdfFileReader(my_file)
        # Append each page's text in page order.
        # (The original also computed text.lower() into an unused local
        # `report`; that dead assignment has been removed — lower-case
        # where you consume the text, e.g. in the sentiment step.)
        for page_number in range(pdf_reader.numPages):
            text += pdf_reader.getPage(page_number).extractText()
    return text
def text_to_sentence(file='text'):
    """Split a report's extracted text into sentences with NLTK's Punkt model.

    Parameters
    ----------
    file : str, optional
        Base name of the PDF forwarded to Parse_PDF. Defaults to 'text',
        preserving the original hard-coded behaviour.

    Returns
    -------
    str
        The sentences joined with a visible "-----" separator line, which
        makes tokenizer mistakes easy to spot when printed.
    """
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    data = Parse_PDF(file)
    return '\n-----\n'.join(tokenizer.tokenize(data))
# --- Layout-analysis walkthrough: print each page's aggregated layout ---
fp = open('/Users/andreas/Desktop/Python/Portfolio/Text_analasis/IMF_Publications/text.pdf', 'rb')
rsrcmgr = PDFResourceManager()
laparams = LAParams()
# PDFPageAggregator collects LT* layout objects instead of emitting raw text.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
pages = PDFPage.get_pages(fp)
for page in pages:
    print('Processing next page...')
    interpreter.process_page(page)
    # get_result() returns the LTPage built for the page just processed.
    layout = device.get_result()
    print(layout)
    #for lobj in layout:
        #if isinstance(lobj, LTTextBox):
            #x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text()
            #print('At %r is text: %s' % ((x, y), text))
fp = open('/Users/andreas/Desktop/Python/Portfolio/Text_analasis/IMF_Publications/text.pdf', 'rb')
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
# Password for initialization as 2nd parameter.
document = PDFDocument(parser)
# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed
# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()
# Create a PDF device object.
device = PDFDevice(rsrcmgr)
# BEGIN LAYOUT ANALYSIS
# Set parameters for analysis.
laparams = LAParams()
# Create a PDF page aggregator object.
# NOTE: this rebinds `device`, so the plain PDFDevice above is never used.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
def parse_obj(lt_objs):
    """Recursively walk a list of pdfminer layout objects, printing the
    bounding-box origin and (newline-flattened) text of every horizontal
    text box encountered."""
    for item in lt_objs:
        if isinstance(item, pdfminer.layout.LTTextBoxHorizontal):
            # Replace embedded newlines so each box prints on one line.
            flattened = item.get_text().replace('\n', '_')
            print("%6d, %6d, %s" % (item.bbox[0], item.bbox[1], flattened))
        elif isinstance(item, pdfminer.layout.LTFigure):
            # Figures can contain nested layout objects — recurse into them.
            parse_obj(item._objs)
# loop over all pages in the document
for page in PDFPage.create_pages(document):
    # read the page into a layout object
    interpreter.process_page(page)
    layout = device.get_result()
    # extract and print text from this page's layout tree
    parse_obj(layout._objs)

Related

Loop script to extract multiple PDFs to text files using Python PDFMiner

Grateful for your help. I found this sample script to extract a PDF to a text file:
https://gist.github.com/vinovator/c78c2cb63d62fdd9fb67
This works, and it is probably the most accurate extraction I've found. I would like to edit it to loop through multiple PDFs and write them to multiple text files, all with the same name as the PDF they were created from. I'm struggling to do so and keep either only writing one text file, or overwriting the PDFs I'm trying to extract from. Anyone able just to help me with a loop that will loop through all PDFs in a single folder and extract them to individual text files of the same name as the PDF?
Thanks in advance for your help!
import os
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
# From PDFInterpreter import both PDFResourceManager and PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
# Import this to raise exception whenever text extraction from PDF is not allowed
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator
base_path = "C://some_folder"
my_file = os.path.join(base_path + "/" + "test_pdf.pdf")
log_file = os.path.join(base_path + "/" + "pdf_log.txt")
password = ""
extracted_text = ""
# Open and read the pdf file in binary mode
fp = open(my_file, "rb")
# Create parser object to parse the pdf content
parser = PDFParser(fp)
# Store the parsed content in PDFDocument object
document = PDFDocument(parser, password)
# Check if document is extractable, if not abort
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed
# Create PDFResourceManager object that stores shared resources such as fonts or images
rsrcmgr = PDFResourceManager()
# set parameters for analysis
laparams = LAParams()
# Create a PDFDevice object which translates interpreted information into desired format
# Device needs to be connected to resource manager to store shared resources
# device = PDFDevice(rsrcmgr)
# Extract the device to page aggregator to get LT object elements
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
# Create interpreter object to process page content from PDFDocument
# Interpreter needs to be connected to resource manager for shared resources and device
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Ok now that we have everything to process a pdf document, lets process it page by page
for page in PDFPage.create_pages(document):
    # As the interpreter processes the page stored in PDFDocument object
    interpreter.process_page(page)
    # The device renders the layout from interpreter
    layout = device.get_result()
    # Out of the many LT objects within layout, we are interested in LTTextBox and LTTextLine
    for lt_obj in layout:
        if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
            extracted_text += lt_obj.get_text()
# close the pdf file
fp.close()
# print (extracted_text.encode("utf-8"))
# Write raw UTF-8 bytes so the text survives regardless of platform default encoding.
with open(log_file, "wb") as my_log:
    my_log.write(extracted_text.encode("utf-8"))
print("Done !!")
Assuming you have the following directory structure:
script.py
pdfs
├─a.pdf
├─b.pdf
└─c.pdf
txts
Where script.py is your Python script, pdfs is a folder containing your PDF documents, and txts is an empty folder where the extracted text files should go.
We can use pathlib.Path.glob to discover the paths of all PDF documents in a given directory. We iterate over the paths, and for each path we open the corresponding PDF document, parse it, extract the text and save the text in a text document (with the same name) in the txts folder.
def main():
    """Extract the text of every PDF in ./pdfs into a same-named .txt in ./txts.

    Returns
    -------
    int
        0 on completion, so the exit code signals success to the shell.
    """
    from pathlib import Path
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.layout import LAParams, LTTextBox, LTTextLine
    from pdfminer.converter import PDFPageAggregator

    for path in Path("pdfs").glob("*.pdf"):
        with path.open("rb") as file:
            parser = PDFParser(file)
            document = PDFDocument(parser, "")
            # Skip documents that forbid text extraction rather than aborting
            # the whole batch.
            if not document.is_extractable:
                continue
            manager = PDFResourceManager()
            params = LAParams()
            device = PDFPageAggregator(manager, laparams=params)
            interpreter = PDFPageInterpreter(manager, device)
            text = ""
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                # Only text-bearing layout objects contribute to the output.
                for obj in device.get_result():
                    if isinstance(obj, (LTTextBox, LTTextLine)):
                        text += obj.get_text()
        # Fix: write with an explicit encoding. PDF text routinely contains
        # non-ASCII characters, and the platform default codec (e.g. cp1252
        # on Windows) would raise UnicodeEncodeError on them.
        with open("txts/{}.txt".format(path.stem), "w", encoding="utf-8") as file:
            file.write(text)
    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())
The script author specifies the input and output files at the start with two parameters: my_file and log_file
You can convert the script to a function that takes these as inputs and performs the extraction, then loop this function multiple times.
# import statements as in the original script
base_path = "C://some_folder"
# Define a pair of tuples with lists of your file names
my_files = ("pdf1.pdf","pdf2.pdf")
log_files = ("log1.txt","log2.txt")
# This is called a list comprehension, it takes each of the
# files listed above and generates the complete file path
my_files = [os.path.join(base_path,x) for x in my_files]
log_files = [os.path.join(base_path,x) for x in log_files]
# Function to extract the file
def extract(my_file,log_file):
    """Extract one PDF (my_file) and write its text to log_file.

    Placeholder: paste the body of the original extraction script here,
    replacing its hard-coded paths with these two parameters.
    """
    # code to perform the file extraction as in the original script

# loop through the file names,
# as we have two lists, use a range of indices instead of `for name in my_files`
for i in range(len(my_files)):
    extract(my_files[i],log_files[i])
You should also check the documentation for os.path.join as your usage is not best practice (it may break when switching operating systems).

How to extract only specific text from PDF file using python

How to extract some of the specific text only from PDF files using python and store the output data into particular columns of Excel.
Here is the sample input PDF file (File.pdf)
Link to the full PDF file File.pdf
We need to extract the value of Invoice Number, Due Date and Total Due from the whole PDF file.
Script i have used so far:
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
# Stream every page's plain text into an in-memory buffer, then print it all.
output_string = StringIO()
with open('file.pdf', 'rb') as in_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    # TextConverter writes plain text into output_string as pages are processed.
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
print(output_string.getvalue())
But not getting the specific output value from the PDF file .
If you want to find the data in in your way (pdfminer), you can search for a pattern to extract the data like the following (new is the regex at the end, based on your given data):
from io import StringIO
import re
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
output_string = StringIO()
with open('testfile.pdf', 'rb') as in_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
# Pattern: invoice number, order number, one skipped line, due date and
# total due — each on its own line in the extracted text.
# NOTE(review): re.search returns None when the pattern is absent, which
# makes .group(0) raise AttributeError — guard or try/except in production.
finding = re.search(r"INV-\d+\n\d+\n.+\n.+\n\$\d+\.\d+", output_string.getvalue())
invoice_no, order_no, _, due_date, total_due = finding.group(0).split("\n")
print(invoice_no, order_no, due_date, total_due)
If you want to store the data in excel, you may have to be more specific (or open a new question) or look on these pages:
Writing to an Excel spreadsheet
https://www.geeksforgeeks.org/writing-excel-sheet-using-python/
https://xlsxwriter.readthedocs.io/
PS: the other answer looks like a good solution, you only have to filter the data
EDIT:
Second solution. Here I use another package, PyPDF2, because there you get the data in another order (maybe this is possible with PDFMiner, too). If the text before the values is always the same, you can find the data like this:
import re
import PyPDF2
def parse_pdf() -> list:
    """Extract invoice fields from page 0 of testfile.pdf via PyPDF2 + regexes.

    Returns
    -------
    list
        [invoice_no, due_date, total_due] as strings.

    Raises
    ------
    AttributeError
        If any regex fails to match (re.search returns None).
    """
    with open("testfile.pdf", "rb") as file:
        fr = PyPDF2.PdfFileReader(file)
        # Assumes extractText() yields label-value runs such as
        # "Invoice NumberINV-3337" — TODO confirm against your PDF.
        data = fr.getPage(0).extractText()
        regex_invoice_no = re.compile(r"Invoice Number\s*(INV-\d+)")
        regex_order_no = re.compile(r"Order Number(\d+)")
        regex_invoice_date = re.compile(r"Invoice Date(\S+ \d{1,2}, \d{4})")
        regex_due_date = re.compile(r"Due Date(\S+ \d{1,2}, \d{4})")
        regex_total_due = re.compile(r"Total Due(\$\d+\.\d{1,2})")
        invoice_no = re.search(regex_invoice_no, data).group(1)
        # order_no and invoice_date are extracted for completeness but are
        # not part of the returned triple.
        order_no = re.search(regex_order_no, data).group(1)
        invoice_date = re.search(regex_invoice_date, data).group(1)
        due_date = re.search(regex_due_date, data).group(1)
        total_due = re.search(regex_total_due, data).group(1)
        return [invoice_no, due_date, total_due]


if __name__ == '__main__':
    print(parse_pdf())
Maybe you have to change the regexes, because they are only based on the given example. The regexes are only working if they find the regex, so you have to work with try: except per regex ;)
If this does not answer your question, you have to provide more information/example pdfs.
You can extract data using tabula and using that data you can create an excel file using python:
# tabula-py: despite the name, `df` here is just the input PDF path string,
# not a DataFrame.
df = ("./Downloads/folder/myfile.pdf")
output = "./Downloads/folder/test.csv"
# stream=True selects tabula's "stream" extraction mode — presumably for
# tables without ruling lines; check the tabula-py docs for your layout.
tabula.convert_into(df, output, output_format="csv", stream=True)
excel file creation:
https://www.geeksforgeeks.org/python-create-and-write-on-excel-file-using-xlsxwriter-module/

How do I run through PDF files in a path, format and clean each one, and spit out regex with specific text from the individual files?

I have a script that takes a PDF and formats it to HTML, cleans up the HTML tags and spits out a clean text. Then runs some regex to extract data from each PDF. Basically I'm having trouble figuring out how to iterate through all the files and run the cleanup, THEN run regex for each. My code looks something like this:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from bs4 import BeautifulSoup
from cStringIO import StringIO
from pandas import DataFrame
import pandas as pd
import glob
import re
# Fix: a raw string literal cannot end with a backslash (r'...\' is a
# SyntaxError), so drop the trailing separator — glob.glob adds "/" below.
path = 'F:\\Desktop\\Metadata'
allFiles = glob.glob(path + "/*.pdf")
# NOTE(review): convert_pdf_to_html is defined further down the script;
# in a top-to-bottom script this loop must run AFTER that definition or
# it raises NameError.
for file_ in allFiles:
    convert_pdf_to_html(file_)
def convert_pdf_to_html(path):
    """Render the PDF at *path* to HTML and return it as a string.

    Uses pdfminer's HTMLConverter so markup survives for later cleaning
    with BeautifulSoup.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = HTMLConverter(rsrcmgr, retstr, codec = codec, laparams = laparams)
    fp = file(path, 'rb')  # Python 2: `file` is the built-in open
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0  # 0 = no page limit
    caching = True
    pagenos = set()  # empty set = all pages
    for page in PDFPage.get_pages(fp, pagenos, maxpages = maxpages, password = password, caching = caching, check_extractable = True):
        interpreter.process_page(page)
    fp.close()
    device.close()
    # Fix: do not shadow the built-in `str` with the result variable.
    html = retstr.getvalue()
    retstr.close()
    return html
Now here is the part I'm confused about. I have to set the convert_pdf_to_html as a variable such as "text" and then take that text and input it through beautiful soup to clean it up. I need to do this for each PDF file in my folder.
# Convert one PDF to HTML, then strip the tags to get plain text.
text = convert_pdf_to_html(path)
soup = BeautifulSoup(text, 'lxml') #remove HTML tags
document_text = soup.get_text() #cleaned up text
Then I need to run a few regex queries like so and output them each to a new file with the format 'filename.csv'
print " Alt : "  # Python 2 print statement
# Find tokens like "LE...:", "BA...:", etc.; re.X makes the regex ignore
# the literal whitespace inside the triple-quoted pattern.
# NOTE(review): `list` shadows the built-in — prefer another name.
list = (re.findall(r"""
((?:LE|BA|BE|BM|BC)[\w]+\:)
""",document_text, re.X))
print list
The good way will be create a class which can handle each of your function for each instance of file you send to the class.
class PDFParser:
    """Process one PDF file: convert it to HTML, then run regex extraction.

    The original sketch was not valid Python: `Class` must be lower-case
    `class`, comments use `#` (not `//`), and `file = myfile` in __init__
    only bound a local — it never stored the path on the instance.
    """

    def __init__(self, myfile):
        # Store the path of the PDF this instance is responsible for.
        self.file = myfile

    def get_html_response(self):
        """Convert self.file from PDF to HTML (your pdf-to-html code here)."""
        pass

    def run_regx(self):
        """Run your regex extraction over the converted text."""
        pass
# Drive the pipeline: one PDFParser instance per discovered file.
for file_ in allFiles:
    my_parse = PDFParser(file_)
    my_parse.get_html_response()
    my_parse.run_regx()

PDFMiner - export pages as List of Strings

I'm looking to export text from pdf as a list of strings where the list is the whole document and strings are the pages of the PDF. I'm using PDFMiner for this task but it is very complicated and I'm on a tight deadline.
So far I've gotten the code to extract the full pdf as string but I need it in the form of list of strings.
my code is as follows
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
f = file('./PDF/' + file_name, 'rb')  # Python 2: `file` is the built-in open
data = []
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Process each page contained in the document.
# NOTE(review): `pdf` is not defined above — presumably this was meant to
# be `f`, the file object opened on the first line.
for page in PDFPage.get_pages(pdf):
    interpreter.process_page(page)
    # NOTE(review): getvalue() returns ALL text written so far, so `data`
    # is overwritten each page with the cumulative string (the bug the
    # answer below addresses).
    data = retstr.getvalue()
print data
help please.
The issue with your current script is StringIO.getvalue always returns a string, and this string contains all the data read so far. Moreover, with each page, you're overwriting the variable data where you're storing it.
One fix is to store the position of StringIO before it writes, and then reading from this position to the end of the string stream:
# A list for each page's text
pages_text = []
for page in PDFPage.get_pages(pdf):
    # Get (and store) the "cursor" position of stream before reading from PDF
    # On the first page, this will be zero
    read_position = retstr.tell()
    # Read PDF page, write text into stream
    interpreter.process_page(page)
    # Move the "cursor" to the position stored
    retstr.seek(read_position, 0)
    # Read the text (from the "cursor" to the end)
    page_text = retstr.read()
    # Add this page's text to a convenient list
    pages_text.append(page_text)
Think of StringIO as a text document. You need to manage the cursor position as text is added and store the newly-added text one page at a time. Here, we're storing text in a list.

Extracting tables from a pdf

I'm trying to get the data from the tables in this PDF. I've tried pdfminer and pypdf with a little luck but I can't really get the data from the tables.
This is what one of the tables looks like:
As you can see, some columns are marked with an 'x'. I'm trying to turn this table into a list of objects.
This is the code so far, I'm using pdfminer now.
# pdfminer test
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, PDFPageAggregator
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage
from pdfminer.image import ImageWriter
from cStringIO import StringIO
import sys
import os
def pdfToText(path):
    """Extract the table rows that lie between the line containing 'Tool'
    and the line containing "1 The 'All'" from every page of the PDF at
    *path*.

    Returns a list of raw text lines (one per table row).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = file(path, 'rb')  # Python 2: `file` is the built-in open
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ''
    maxpages = 0  # 0 = no page limit
    caching = True
    pagenos = set()  # empty set = all pages
    records = []
    i = 1  # page counter; incremented but otherwise unused
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,
                                  caching=caching, check_extractable=True):
        # process page
        interpreter.process_page(page)
        # NOTE(review): retstr accumulates across pages, so each iteration
        # re-scans ALL text extracted so far, not just the current page —
        # earlier pages' rows can be appended repeatedly.
        # only select lines from the line containing 'Tool' to the line containing "1 The 'All'"
        lines = retstr.getvalue().splitlines()
        idx = containsSubString(lines, 'Tool')
        lines = lines[idx+1:]
        idx = containsSubString(lines, "1 The 'All'")
        lines = lines[:idx]
        for line in lines:
            records.append(line)
        i += 1
    fp.close()
    device.close()
    retstr.close()
    return records
def containsSubString(list, substring):
    """Return the index of the first element of *list* that contains
    *substring*, or -1 when no element matches."""
    matches = (index for index, element in enumerate(list) if substring in element)
    # next() with a default gives us the sentinel without an explicit loop.
    return next(matches, -1)
# process pdf
fn = '../test1.pdf'
ft = 'test.txt'
text = pdfToText(fn)
outFile = open(ft, 'w')
# NOTE(review): splitlines() stripped the newlines, so this writes all
# rows as one continuous line — append '\n' per row if that matters.
for i in range(0, len(text)):
    outFile.write(text[i])
outFile.close()
That produces a text file and it gets all of the text but, the x's don't have the spacing preserved. The output looks like this:
The x's are just single spaced in the text document
Right now, I'm just producing text output but my goal is to produce an html document with the data from the tables. I've been searching for OCR examples, and most of them seem confusing or incomplete. I'm open to using C# or any other language that might produce the results I'm looking for.
EDIT: There will be multiple PDFs like this that I need to get the table data from. The headers will be the same for all PDFs (as far as I know).
I figured it out, I was going in the wrong direction. What I did was create pngs of each table in the pdf and now I'm processing the images using opencv & python.
Give Tabula a try, and if it works, use the tabula-extractor library (written in Ruby) to programmatically extract the data.

Categories

Resources