How can I extract only specific text from PDF files using Python and store the output in particular columns of an Excel file?
Here is the sample input PDF file (File.pdf)
Link to the full PDF file File.pdf
We need to extract the value of Invoice Number, Due Date and Total Due from the whole PDF file.
Script i have used so far:
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
# Collect the text of every page of file.pdf into one in-memory buffer.
output_string = StringIO()
with open('file.pdf', 'rb') as in_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    # The converter writes the extracted text of each processed page
    # into output_string.
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)

# Dump the whole document text; no field extraction happens here.
print(output_string.getvalue())
But I am not getting the specific output values from the PDF file.
If you want to find the data your way (with pdfminer), you can search for a pattern to extract the data as follows (the only new part is the regex at the end, which is based on your sample data):
from io import StringIO
import re
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
# Collect the text of every page of testfile.pdf into one in-memory buffer.
output_string = StringIO()
with open('testfile.pdf', 'rb') as in_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)

# The pattern matches five consecutive lines: "INV-<digits>", a digits-only
# line, two arbitrary lines, and a "$<digits>.<digits>" amount.
finding = re.search(r"INV-\d+\n\d+\n.+\n.+\n\$\d+\.\d+", output_string.getvalue())
if finding is None:
    # re.search returns None when the layout differs; fail with a clear
    # message instead of an AttributeError on None.
    raise ValueError("invoice block (INV-..., order no., dates, total) not found in testfile.pdf")

# The third line of the matched block is not needed and is discarded.
invoice_no, order_no, _, due_date, total_due = finding.group(0).split("\n")
print(invoice_no, order_no, due_date, total_due)
If you want to store the data in excel, you may have to be more specific (or open a new question) or look on these pages:
Writing to an Excel spreadsheet
https://www.geeksforgeeks.org/writing-excel-sheet-using-python/
https://xlsxwriter.readthedocs.io/
PS: the other answer looks like a good solution, you only have to filter the data
EDIT:
Second solution. Here I use another package, PyPDF2, because it returns the data in a different order (maybe this is possible with PDFMiner, too). If the text before the values is always the same, you can find the data like this:
import re
import PyPDF2
def _first_group(pattern, text, field):
    """Return capture group 1 of the first match of *pattern* in *text*.

    Raises:
        ValueError: if *pattern* does not match; the message names *field*.
    """
    match = re.search(pattern, text)
    if match is None:
        raise ValueError("{} not found in the PDF text".format(field))
    return match.group(1)


def parse_pdf() -> list:
    """Extract invoice number, due date and total due from testfile.pdf.

    Only the first page is read. Each value is located by the label text
    that precedes it in the extracted page text.

    Returns:
        ``[invoice_no, due_date, total_due]`` as strings.

    Raises:
        ValueError: if one of the three fields cannot be found.
    """
    # NOTE(review): PdfFileReader/getPage/extractText are the legacy PyPDF2
    # names; newer PyPDF2 releases reportedly rename them -- confirm against
    # the installed version.
    with open("testfile.pdf", "rb") as file:
        fr = PyPDF2.PdfFileReader(file)
        data = fr.getPage(0).extractText()

    regex_invoice_no = re.compile(r"Invoice Number\s*(INV-\d+)")
    regex_due_date = re.compile(r"Due Date(\S+ \d{1,2}, \d{4})")
    regex_total_due = re.compile(r"Total Due(\$\d+\.\d{1,2})")

    # Only the returned fields are searched, so an absent, unused field
    # (order number, invoice date) can no longer abort the extraction.
    invoice_no = _first_group(regex_invoice_no, data, "Invoice Number")
    due_date = _first_group(regex_due_date, data, "Due Date")
    total_due = _first_group(regex_total_due, data, "Total Due")
    return [invoice_no, due_date, total_due]


if __name__ == '__main__':
    print(parse_pdf())
You may have to change the regexes, because they are based only on the given example. The code only works when every regex finds a match, so you have to handle the no-match case for each regex (e.g. with try/except) ;)
If this does not answer your question, you have to provide more information/example pdfs.
You can extract data using tabula and using that data you can create an excel file using python:
import tabula

# Source PDF and destination CSV; adjust both paths to your own files.
pdf_path = "./Downloads/folder/myfile.pdf"
output = "./Downloads/folder/test.csv"
# Convert the PDF's tables straight into a CSV file.
tabula.convert_into(pdf_path, output, output_format="csv", stream=True)
excel file creation:
https://www.geeksforgeeks.org/python-create-and-write-on-excel-file-using-xlsxwriter-module/
Related
So i pip installed pdfminer3k for python 3.6. I was trying to follow some examples in opening and converting PDF files to text and they all require a PDFPage import. This does not exist for me. Is there any work around for this? I tried copying a PDFPage.py from online and saving to the directory where python searches pdfminer but I just got... "Import Error: cannot import name PDFObjectNotFound".
Thanks!
Ah. I guess the PDFPage is not meant for python 3.6. Following example from How to read pdf file using pdfminer3k? solved my issues!
import io
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfparser import PDFParser, PDFDocument
def extract_text_from_pdf(pdf_path):
    """
    Iterator: extract the plain text from pdf-files with pdfminer3k

    pdf_path: path to pdf-file to be extracted
    return: iterator of string of extracted text (by page)
    """
    # pdfminer.six-version can be found at:
    # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        for page in doc.get_pages():  # pdfminer.six: PDFPage.get_pages(fh, caching=True, check_extractable=True):
            rsrcmgr = PDFResourceManager()
            fake_file_handle = io.StringIO()
            device = TextConverter(rsrcmgr, fake_file_handle, laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                text = fake_file_handle.getvalue()
            finally:
                # Close the per-page handles before yielding: code placed
                # after a yield never runs if the consumer stops early.
                device.close()
                fake_file_handle.close()
            yield text
# Print the text of the first maxPages pages; later pages are only reported.
maxPages = 1
# NOTE(review): fPath is not defined in this snippet -- set it to the path of
# the PDF before running.
for i, t in enumerate(extract_text_from_pdf(fPath)):
    if i < maxPages:
        print(f"Page {i}:\n{t}")
    else:
        print(f"Page {i} skipped!")
I'm trying to merge two different things I've been able to accomplish independently. Unfortunately the PDFMiner docs are just not useful at all.
I have a folder that has hundreds of PDFs, named "[0-9].pdf", in no particular order, and I don't care to sort them. I just need a way to go through them and convert them to text.
Using this post: Extracting text from a PDF file using PDFMiner in python? - I was able to extract the text from one PDF successfully.
Some of this post: batch process text to csv using python - was useful in determining how to open a folder full of PDFs and work with them.
Now, I just don't know how I can combine them to one-by-one open a PDF, convert it to a text object, save that to a text file with the same original-filename.txt, and then move onto the next PDF in the directory.
Here's my code:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import os
import glob
# Convert every PDF in `directory` to a text file named "<pdf path>.txt".
directory = r'./Documents/003/'  # path
pdfFiles = glob.glob(os.path.join(directory, '*.pdf'))
if not pdfFiles:
    # A relative path resolved against an unexpected working directory
    # silently matches nothing; show where the search actually happened.
    print("No PDF files found in: " + os.path.abspath(directory))

resourceManager = PDFResourceManager()
codec = 'utf-8'
laParams = LAParams()
password = ""
maxPages = 0
caching = True
pageNums = set()

for one_pdf in pdfFiles:
    print("Processing file: " + str(one_pdf))
    # Fresh buffer and converter per PDF: with one shared buffer every
    # output file would also contain the text of all earlier PDFs.
    returnString = StringIO()
    device = TextConverter(resourceManager, returnString, codec=codec, laparams=laParams)
    interpreter = PDFPageInterpreter(resourceManager, device)
    try:
        with open(one_pdf, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pageNums, maxpages=maxPages, password=password,
                                          caching=caching, check_extractable=True):
                interpreter.process_page(page)
        text = returnString.getvalue()
    finally:
        device.close()
        returnString.close()

    filenameString = str(one_pdf) + ".txt"
    with open(filenameString, "w") as text_file:
        text_file.write(text)
I get no compilation errors, but my code doesn't do anything.
Thanks for your help!
Just answering my own question with the solution idea from #LaurentLAPORTE that worked.
Set directory to an absolute path using os like this: os.path.abspath("../Documents/003/"). And then it'll work.
I have a script that takes a PDF and formats it to HTML, cleans up the HTML tags and spits out a clean text. Then runs some regex to extract data from each PDF. Basically I'm having trouble figuring out how to iterate through all the files and run the cleanup, THEN run regex for each. My code looks something like this:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from bs4 import BeautifulSoup
from cStringIO import StringIO
from pandas import DataFrame
import pandas as pd
import glob
import re
def convert_pdf_to_html(path):
    """Convert the PDF file at *path* to HTML and return the markup.

    path: path of a single PDF file.
    return: the content written by pdfminer's HTMLConverter for all pages.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = HTMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    try:
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,
                                          caching=caching, check_extractable=True):
                interpreter.process_page(page)
        # Close the converter before reading the buffer, as the original
        # did (presumably this flushes trailing markup -- confirm).
        device.close()
        return retstr.getvalue()
    finally:
        retstr.close()


# A raw string literal cannot end in a backslash, so the trailing separator
# is left off; the "/" in the glob pattern below separates the parts.
path = r'F:\Desktop\Metadata'
allFiles = glob.glob(path + "/*.pdf")

# The function is defined above so that it exists when this loop runs.
# NOTE(review): the returned HTML is discarded here, as in the original.
for file_ in allFiles:
    convert_pdf_to_html(file_)
Now here is the part I'm confused about. I have to set the convert_pdf_to_html as a variable such as "text" and then take that text and input it through beautiful soup to clean it up. I need to do this for each PDF file in my folder.
# Convert one PDF to HTML, then strip the markup to get plain text.
# NOTE(review): `path` must be the path of a single PDF file here -- confirm
# it is not still the folder path used for the glob.
text = convert_pdf_to_html(path)
soup = BeautifulSoup(text, 'lxml') # parse the HTML with the lxml parser
document_text = soup.get_text() # text content with the HTML tags removed
Then I need to run a few regex queries like so and output them each to a new file with the format 'filename.csv'
# print(...) with a single argument behaves the same on Python 2 and 3.
print(" Alt : ")
# Tokens that start with LE, BA, BE, BM or BC, followed by word characters
# and a colon. re.X ignores the whitespace around the pattern.
alt_matches = re.findall(r"""
((?:LE|BA|BE|BM|BC)[\w]+\:)
""", document_text, re.X)
print(alt_matches)
A good way would be to create a class that handles each of your functions for every file you pass to it.
class PDFParser:
    """Skeleton that bundles the per-file processing steps for one PDF.

    Fill in get_html_response() and run_regx() with the real conversion
    and extraction code.
    """

    # Class-level default; every instance overrides it in __init__.
    file = ''

    def __init__(self, myfile):
        # Store on the instance; a bare `file = myfile` would only bind a
        # local name and leave the attribute empty.
        self.file = myfile

    def get_html_response(self):
        """Convert self.file from PDF to HTML (placeholder)."""
        # your code for pdf to html
        raise NotImplementedError("add the PDF-to-HTML conversion here")

    def run_regx(self):
        """Run the regex extraction for self.file (placeholder)."""
        # run your regex here.
        raise NotImplementedError("add the regex extraction here")
# One parser instance per PDF: convert first, then run the regexes.
for file_ in allFiles:
    my_parse = PDFParser(file_)
    my_parse.get_html_response()
    my_parse.run_regx()
When I copy and paste a pdf document into a text file using ctrl+a, ctrl+c, ctrl+v I get a result like
this:
but when I use pdfminer with the code below i get this:
from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
*....*
def scrub(self):
    """Convert self.inFile to text and write it to self.WBOutputFile."""
    text = self.convert(self.inFile)
    with open(self.WBOutputFile, "w") as WBOut:
        WBOut.write(text)
#code from Tim Arnold at https://www.binpress.com/tutorial/manipulating-pdfs-with-python/167
def convert(self, fname):
    """Return the text of every page of the PDF file *fname*.

    fname: path of the PDF file to read.
    return: the text pdfminer's TextConverter wrote for all pages.
    """
    pagenums = set()
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    try:
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
        converter.close()
        return output.getvalue()
    finally:
        # The original wrote `output.close` without parentheses, which
        # never closed the buffer.
        output.close()
*....*
The code takes several seconds longer than doing it manually but I want to automate this pdf to text process because I have a lot of documents. Is there a way to get similar result (in terms of speed and formatting) similarly to copy and paste? I am using chrome as my pdf viewer, sublime text as my text editor, and windows 8 as my OS.
I am using pdf from http:// www. supremecourt.gov/oral_arguments/argument_transcripts/14-8349_n648 .pdf
try setting the char_margin in the laparams to 50.
i.e.
# Build the layout parameters with char_margin raised to 50 and hand them to
# the converter.
# NOTE(review): presumably a larger char_margin keeps widely spaced
# characters on the same line -- confirm against the pdfminer docs.
laparams = LAParams(char_margin=50.0)
converter = TextConverter(manager, output, laparams=laparams)
interpreter = PDFPageInterpreter(manager, converter)
I'm trying to get the data from the tables in this PDF. I've tried pdfminer and pypdf with a little luck but I can't really get the data from the tables.
This is what one of the tables looks like:
As you can see, some columns are marked with an 'x'. I'm trying to convert this table into a list of objects.
This is the code so far, I'm using pdfminer now.
# pdfminer test
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, PDFPageAggregator
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage
from pdfminer.image import ImageWriter
from cStringIO import StringIO
import sys
import os
def pdfToText(path):
    """Collect the table lines from every page of the PDF at *path*.

    On each page, the lines after the first line containing 'Tool' and
    before the next line containing "1 The 'All'" are kept. A page with no
    'Tool' line contributes nothing; if the end marker is missing, the
    lines up to the end of the page are kept.

    path: path of the PDF file to read.
    return: list of the selected text lines, in page order.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ''
    maxpages = 0
    caching = True
    pagenos = set()

    records = []
    consumed = 0  # length of buffer text already handled by earlier pages
    try:
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,
                                          caching=caching, check_extractable=True):
                # process page
                interpreter.process_page(page)

                # retstr accumulates the text of all pages so far; look
                # only at what this page added so earlier tables are not
                # found (and appended) again.
                everything = retstr.getvalue()
                lines = everything[consumed:].splitlines()
                consumed = len(everything)

                # only select lines from the line containing 'Tool' to the
                # line containing "1 The 'All'"
                start = containsSubString(lines, 'Tool')
                if start == -1:
                    continue
                lines = lines[start + 1:]
                end = containsSubString(lines, "1 The 'All'")
                if end != -1:
                    # -1 means "not found"; used as a slice bound it would
                    # silently drop the last line instead.
                    lines = lines[:end]
                records.extend(lines)
    finally:
        device.close()
        retstr.close()
    return records
def containsSubString(list, substring):
    """Return the index of the first item of *list* that contains *substring*.

    list: sequence of strings to search (the parameter name is kept for
        existing callers even though it shadows the builtin).
    substring: text to look for inside each item.
    return: index of the first matching item, or -1 if none matches.
    """
    return next((i for i, s in enumerate(list) if substring in s), -1)
# process pdf
fn = '../test1.pdf'
ft = 'test.txt'
text = pdfToText(fn)
with open(ft, 'w') as outFile:
    # The records come from splitlines(), so they carry no line endings;
    # add one per record so they do not run together on a single line.
    outFile.writelines(line + '\n' for line in text)
That produces a text file and it gets all of the text but, the x's don't have the spacing preserved. The output looks like this:
The x's are just single spaced in the text document
Right now, I'm just producing text output but my goal is to produce an html document with the data from the tables. I've been searching for OCR examples, and most of them seem confusing or incomplete. I'm open to using C# or any other language that might produce the results I'm looking for.
EDIT: There will be multiple PDFs like this that I need to get the table data from. The headers will be the same for all PDFs (as far as I know).
I figured it out, I was going in the wrong direction. What I did was create pngs of each table in the pdf and now I'm processing the images using opencv & python.
Give Tabula a try and, if it works, use the tabula-extractor library (written in Ruby) to programmatically extract the data.