Extract only specific text from PDF using Python

Extract only specific text from PDF using Python - python

Need to extract the specific text only from Invoice PDF file having different PDF structure using python and store the output data into particular excel columns. All the PDF files have different structure but same content values.
Tried to solve it but not able to extract the specific text values only.
Sample PDF file :
Click to view the sample file
Need to Extract Invoice ID, Issue Date, Subject, Amount Due from the whole PDF file.
Script i have used so far:
import PyPDF2
import re
pdfFileObj = open('test.pdf','rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
pageObj = pdfReader.getPage(0)
text = str(pageObj.extractText())
quotes = re.findall(r'"[^"]*"',text)
print(quotes)

You have a very nice pdf document, because your pdf has form fields, so you can use them directly to read the data:
import PyPDF2
pdfFileObj = open('test.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
fields = pdfReader.getFormTextFields()
print(fields["Invoice ID"])
print(fields["Issue Date"])
print(fields["Subject"])
print(fields["Amount Due"])
EDIT:
I combined your requested data (from here: How to extract only specific text from PDF file using python) in a little script with 3 opportunities of parsing the pdf (for your 3 pdfs). The problem is your pdfs have a lot of differences and the packages have some advantages on different pdfs, so i think you have to combine this stuff. The thing is, that you try all functions, till it gets a result. I hope this is an good start for you. You may have to change the regexes, if you have more different pdfs and may you have to store all regex (per field) in an array and use them on the different functions so you have 3 functions for parsing and 4 lists of regexes to use in 2 of the functions.
import PyPDF2
import re
import os
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
def parse_pdf_by_regex_2(filename: str) -> dict:
output_string = StringIO()
with open(filename, 'rb') as in_file:
parser = PDFParser(in_file)
doc = PDFDocument(parser)
rsrcmgr = PDFResourceManager()
device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(doc):
interpreter.process_page(page)
regex_invoice_no = re.compile(r"Invoice No.:\s*(\w+)\s")
regex_order_no = re.compile(r"IRN:\s*(\d+)")
regex_due_date = re.compile(r"Due Date: (\d{2}\.\d{2}\.\d{4})")
regex_total_due = re.compile(r"([\d,.]+) \n\nTotal Invoice Value\(in words\)")
try:
return {"invoice_id": re.search(regex_invoice_no, output_string.getvalue()).group(1),
"issue_date": re.search(regex_due_date, output_string.getvalue()).group(1),
"subject": re.search(regex_order_no, output_string.getvalue()).group(1),
"amount": re.search(regex_total_due, output_string.getvalue()).group(1)}
except AttributeError as err:
print("Not all elements have been found")
return {}
def parse_pdf_by_form_fields(filename: str) -> dict:
with open(filename, 'rb') as file:
pdf_reader = PyPDF2.PdfFileReader(file)
try:
fields = pdf_reader.getFormTextFields()
except TypeError as err:
# print("No FormFields available")
return {}
try:
# You can also check if onyly missing some values, maybe this can happen, but this is up to your data
return {"invoice_id": fields["Invoice ID"],
"issue_date": fields["Issue Date"],
"subject": fields["Subject"],
"amount": fields["Amount Due"]}
except KeyError as err:
# print(f"Key not found: '{err.args[0]}'")
return {}
def parse_pdf_by_regex(filename: str) -> dict:
with open(filename, 'rb') as file:
pdf_reader = PyPDF2.PdfFileReader(file)
text_data = ""
for page_no in range(pdf_reader.getNumPages()):
text_data += pdf_reader.getPage(page_no).extractText()
regex_invoice_no = re.compile(r"Invoice Number\s*(INV-\d+)")
regex_order_no = re.compile(r"Order Number(\d+)")
regex_due_date = re.compile(r"Due Date(\S+ \d{1,2}, \d{4})")
regex_total_due = re.compile(r"Total Due(\$\d+\.\d{1,2})")
try:
return {"invoice_id": re.search(regex_invoice_no, text_data).group(1),
"issue_date": re.search(regex_due_date, text_data).group(1),
"subject": re.search(regex_order_no, text_data).group(1),
"amount": re.search(regex_total_due, text_data).group(1)}
except AttributeError as err:
# print("Not all elements have been found")
return {}
def parse_pdf(filename: str) -> dict:
# Hint: ':=' is available since pythoon 3.8
if data := parse_pdf_by_form_fields(filename=fname):
return data
elif data := parse_pdf_by_regex(filename=fname):
return data
elif data := parse_pdf_by_regex_2(filename=fname):
return data
else:
print("No data found")
return {}
if __name__ == '__main__':
for fname in os.listdir("."):
if fname.startswith("testfile"):
print(f"check {fname}")
print(parse_pdf(filename=fname))

Related

Incorrect output: Extracting text from pdf's,docx's pptx's will not output in their own spearte line

I created a function that will open each file in a directory and extract the text from each file and output it in an excel sheet using Pandas. The indexing for each file type seems to be working just fine.However the extracted text from each file comes out next to each other in a list and not separated and next to their corresponding file.
See bottom of script for current output and the out put I want.
** I believe the problem lies in the loader() function which takes in a path, goes through each directory file checks the file .ext and extracts the text.
Thank you!
import re
#import PyPDF4
import pathlib
from pathlib import Path
import shutil
from datetime import datetime
import time
from configparser import ConfigParser
import glob
import fileinput
import pandas as pd
import os
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import docx2txt
from pptx import Presentation
import more_itertools as mit
p = Path('C:/Users/XXXX/Desktop/test')
txt_files = list(p.rglob('*txt'))
PDF_files = list(p.rglob('*pdf'))
csv_files = list(p.rglob('*csv'))
docx_files = list(p.rglob('*docx'))
pptx_files = list(p.rglob('*pptx'))
#excel_files = list(p.rglob('xls'))
def pdf_to_text(x):
# PDFMiner
rsrcmgr = PDFResourceManager()
sio = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, sio, codec=codec, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Extract text
fp = open(x, 'rb')
for page in PDFPage.get_pages(fp):
interpreter.process_page(page)
fp.close()
# Get text from StringIO
text = sio.getvalue()
# Cleanup
device.close()
sio.close()
return text
#-------------------------------------------------------------------------------
def loader(path):
with open(str(path.resolve()),"r",encoding = "ISO-8859-1") as f:
docx_out,pptx_out,pdf_out = [],[],[]
if path.suffix == ".pdf":
for name1 in PDF_files:
pdf_out.append(pdf_to_text(name1))
return pdf_out
elif path.suffix == ".docx":
for name2 in docx_files:
docx_out.append(docx2txt.process(name2))
return docx_out
elif path.suffix == ".pptx":
for file in pptx_files:
prs = Presentation(file)
for slide in prs.slides:
for shape in slide.shapes:
if not shape.has_text_frame:
continue
for paragraph in shape.text_frame.paragraphs:
for run in paragraph.runs:
pptx_out.append(run.text)
return pptx_out
else:
return f.readlines()
print(pdf_out)
def file_generator():
files = txt_files+PDF_files+csv_files+docx_files+pptx_files
for item in files:
yield {
"path": item,
"name": item.name[0:],
"created": time.ctime(item.stat().st_ctime),
"modified": time.ctime(item.stat().st_mtime),
"content": loader(item)
}
def to_xlsx():
df = pd.DataFrame.from_dict(file_generator())
df.head()
df.to_excel("tester4.xlsx")
if __name__ == "__main__":
to_xlsx()
#------------------------------------------------------------
OUTPUT EXAMPLE
current output:
content
["content_test1","content_test2"] test1.pdf
["content_test1","content_test2"] test2.pdf
What I want:
["content_test1"] test1.pdf
["content_test2"] test2.pdf

The appends called by each filetype_out function look like they are adding the contents of each file to the end of the list pertaining to that filetype. If you want to generate a unique list with the contents of each individual file, I'd recommend creating a separate dict for each filetype, which then includes individual lists for each file processed. Taking the PDFs as an example:
def loader(path):
with open(str(path.resolve()),"r",encoding = "ISO-8859-1") as f:
docx_out,pptx_out,pdf_out = {},{},{}
if path.suffix == ".pdf":
for name1 in PDF_files:
name1_contents = []
name1_contents.append(pdf_to_text(name1))
pdf_out[name1] = name1_contents
return pdf_out
To then print out your results in a similar way as you have been:
for name, contents in pdf_out:
print(contents + ' ' + name)

Downloading image from url and assign it with a specified id from csv file

I have a csv file with columns: image_id, image_url
I need to download all the images from the URL and save it as the corresponding image_id as the name. Is there a way to do so?
I'm aware you can do so with python from codes that i've seen online such as
import cStringIO # *much* faster than StringIO
import urllib
import Image
try:
file =
urllib.urlopen('http://freegee.sourceforge.net/FG_EN/src/teasers_en/t_gee-power_en.gif')
im = cStringIO.StringIO(file.read()) # constructs a StringIO holding the image
img = Image.open(im)
img.save('/home/wenbert/uploaderx_files/test.gif')
except IOError, e:
raise e
but how do you reference the url and the filename from the csv
even better if i can automate the process to uploading to a GCP bucket
Appreciate any help i can get.
Cheers!

This should help. Use the csv module to parse through your CSV file.
Ex:
# -*- coding: utf-8 -*-
import csv
import cStringIO # *much* faster than StringIO
import urllib
import Image
def downloadFile(imageID, url):
try:
file = urllib.urlopen(url)
im = cStringIO.StringIO(file.read()) # constructs a StringIO holding the image
img = Image.open(im)
img.save('/home/wenbert/uploaderx_files/{0}.gif'.format(imageID))
except IOError, e:
raise e
with open('PATH_TO_.csv', 'rb') as csvfile:
reader = csv.reader(csvfile, delimiter=',')
next(reader, None) # skip the headers
for row in reader:
print row
downloadFile(row[0], row[1])

I have produced a python script below. I have only tested this in python 3.4.3 but should do the trick.
Hope this helps.
import urllib, csv, requests, os
from pathlib import Path
spreadsheetAddress = 'C:\\SOURCE\\CSV\\FILE.csv'
targetDirectory = 'C:\\TARGET\\IMAGE\\SAVE\\LOCATION\\'
def getSpreadsheetContents(spreadsheetAddress):
with open(spreadsheetAddress) as csvfile:
readCSV = csv.reader(csvfile, delimiter=',')
imageSet = {}
for row in readCSV:
if 'image_id' not in row:
imageSet[row[0]] = row[1]
return imageSet
if __name__ == "__main__":
if os.path.exists(spreadsheetAddress) and os.path.exists(targetDirectory):
imageDict = getSpreadsheetContents(spreadsheetAddress)
for key, value in imageDict.items():
if requests.get(value).status_code == 200:
filename, file_extension = os.path.splitext(value)
address = str(targetDirectory + "\\" + key + file_extension)
urllib.request.urlretrieve(value, address)
else:
raise Exception("File not found")

Extract data from PDF using python [duplicate]

I'm trying to extract the text included in this PDF file using Python.
I'm using the PyPDF2 package (version 1.27.2), and have the following script:
import PyPDF2
with open("sample.pdf", "rb") as pdf_file:
read_pdf = PyPDF2.PdfFileReader(pdf_file)
number_of_pages = read_pdf.getNumPages()
page = read_pdf.pages[0]
page_content = page.extractText()
print(page_content)
When I run the code, I get the following output which is different from that included in the PDF document:
! " # $ % # $ % &% $ &' ( ) * % + , - % . / 0 1 ' * 2 3% 4
5
' % 1 $ # 2 6 % 3/ % 7 / ) ) / 8 % &) / 2 6 % 8 # 3" % 3" * % 31 3/ 9 # &)
%
How can I extract the text as is in the PDF document?

I was looking for a simple solution to use for python 3.x and windows. There doesn't seem to be support from textract, which is unfortunate, but if you are looking for a simple solution for windows/python 3 checkout the tika package, really straight forward for reading pdfs.
Tika-Python is a Python binding to the Apache Tika™ REST services allowing Tika to be called natively in the Python community.
from tika import parser # pip install tika
raw = parser.from_file('sample.pdf')
print(raw['content'])
Note that Tika is written in Java so you will need a Java runtime installed

PyPDF2 recently improved a lot. Depending on the data, it is on-par or better than pdfminer.six.
pymupdf / tika / PDFium are better than PyPDF2, but the difference became rather small -
(mostly when to set a new line). The core part is that they are way faster. But they are not pure-Python which can mean that you cannot execute it. And some might have too restrictive licenses so that you may not use it.
Have a look at the benchmark.
Results from November 2022:
PyPDF2
Edit: I recently became the maintainer of PyPDF2! 😁 The community improved the text extraction a lot. Give it a try :-)
from PyPDF2 import PdfReader
reader = PdfReader("example.pdf")
text = ""
for page in reader.pages:
text += page.extract_text() + "\n"
Please note that those packages are not maintained:
pyPdf, PyPDF3, PyPDF4
pdfminer (without .six)
pymupdf
import fitz # install using: pip install PyMuPDF
with fitz.open("my.pdf") as doc:
text = ""
for page in doc:
text += page.get_text()
print(text)
Other PDF libraries
pikepdf does not support text extraction (source)

Use textract.
http://textract.readthedocs.io/en/latest/
https://github.com/deanmalmgren/textract
It supports many types of files including PDFs
import textract
text = textract.process("path/to/file.extension")

Look at this code for PyPDF2<=1.26.0:
import PyPDF2
pdf_file = open('sample.pdf', 'rb')
read_pdf = PyPDF2.PdfFileReader(pdf_file)
page = read_pdf.getPage(0)
page_content = page.extractText()
print page_content.encode('utf-8')
The output is:
!"#$%#$%&%$&'()*%+,-%./01'*23%4
5'%1$#26%3/%7/))/8%&)/26%8#3"%3"*%313/9#&)
%
Using the same code to read a pdf from 201308FCR.pdf
.The output is normal.
Its documentation explains why:
def extractText(self):
"""
Locate all text drawing commands, in the order they are provided in the
content stream, and extract the text. This works well for some PDF
files, but poorly for others, depending on the generator used. This will
be refined in the future. Do not rely on the order of text coming out of
this function, as it will change if this function is made more
sophisticated.
:return: a unicode string object.
"""

After trying textract (which seemed to have too many dependencies) and pypdf2 (which could not extract text from the pdfs I tested with) and tika (which was too slow) I ended up using pdftotext from xpdf (as already suggested in another answer) and just called the binary from python directly (you may need to adapt the path to pdftotext):
import os, subprocess
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
args = ["/usr/local/bin/pdftotext",
'-enc',
'UTF-8',
"{}/my-pdf.pdf".format(SCRIPT_DIR),
'-']
res = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output = res.stdout.decode('utf-8')
There is pdftotext which does basically the same but this assumes pdftotext in /usr/local/bin whereas I am using this in AWS lambda and wanted to use it from the current directory.
Btw: For using this on lambda you need to put the binary and the dependency to libstdc++.so into your lambda function. I personally needed to compile xpdf. As instructions for this would blow up this answer I put them on my personal blog.

I've try many Python PDF converters, and I like to update this review. Tika is one of the best. But PyMuPDF is a good news from #ehsaneha user.
I did a code to compare them in: https://github.com/erfelipe/PDFtextExtraction I hope to help you.
Tika-Python is a Python binding to the Apache Tika™ REST services
allowing Tika to be called natively in the Python community.
from tika import parser
raw = parser.from_file("///Users/Documents/Textos/Texto1.pdf")
raw = str(raw)
safe_text = raw.encode('utf-8', errors='ignore')
safe_text = str(safe_text).replace("\n", "").replace("\\", "")
print('--- safe text ---' )
print( safe_text )

You may want to use time proved xPDF and derived tools to extract text instead as pyPDF2 seems to have various issues with the text extraction still.
The long answer is that there are lot of variations how a text is encoded inside PDF and that it may require to decoded PDF string itself, then may need to map with CMAP, then may need to analyze distance between words and letters etc.
In case the PDF is damaged (i.e. displaying the correct text but when copying it gives garbage) and you really need to extract text, then you may want to consider converting PDF into image (using ImageMagik) and then use Tesseract to get text from image using OCR.

PyPDF2 in some cases ignores the white spaces and makes the result text a mess, but I use PyMuPDF and I'm really satisfied
you can use this link for more info

pdftotext is the best and simplest one!
pdftotext also reserves the structure as well.
I tried PyPDF2, PDFMiner and a few others but none of them gave a satisfactory result.

In 2020 the solutions above were not working for the particular pdf I was working with. Below is what did the trick. I am on Windows 10 and Python 3.8
Test pdf file: https://drive.google.com/file/d/1aUfQAlvq5hA9kz2c9CyJADiY3KpY3-Vn/view?usp=sharing
#pip install pdfminer.six
import io
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
def convert_pdf_to_txt(path):
'''Convert pdf content from a file path to text
:path the file path
'''
rsrcmgr = PDFResourceManager()
codec = 'utf-8'
laparams = LAParams()
with io.StringIO() as retstr:
with TextConverter(rsrcmgr, retstr, codec=codec,
laparams=laparams) as device:
with open(path, 'rb') as fp:
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos = set()
for page in PDFPage.get_pages(fp,
pagenos,
maxpages=maxpages,
password=password,
caching=caching,
check_extractable=True):
interpreter.process_page(page)
return retstr.getvalue()
if __name__ == "__main__":
print(convert_pdf_to_txt('C:\\Path\\To\\Test_PDF.pdf'))

I found a solution here PDFLayoutTextStripper
It's good because it can keep the layout of the original PDF.
It's written in Java but I have added a Gateway to support Python.
Sample code:
from py4j.java_gateway import JavaGateway
gw = JavaGateway()
result = gw.entry_point.strip('samples/bus.pdf')
# result is a dict of {
# 'success': 'true' or 'false',
# 'payload': pdf file content if 'success' is 'true'
# 'error': error message if 'success' is 'false'
# }
print result['payload']
Sample output from PDFLayoutTextStripper:
You can see more details here Stripper with Python

The below code is a solution to the question in Python 3. Before running the code, make sure you have installed the PyPDF2 library in your environment. If not installed, open the command prompt and run the following command:
pip3 install PyPDF2
Solution Code using PyPDF2 <= 1.26.0:
import PyPDF2
pdfFileObject = open('sample.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObject)
count = pdfReader.numPages
for i in range(count):
page = pdfReader.getPage(i)
print(page.extractText())

pdfplumber is one of the better libraries to read and extract data from pdf. It also provides ways to read table data and after struggling with a lot of such libraries, pdfplumber worked best for me.
Mind you, it works best for machine-written pdf and not scanned pdf.
import pdfplumber
with pdfplumber.open(r'D:\examplepdf.pdf') as pdf:
first_page = pdf.pages[0]
print(first_page.extract_text())

I've got a better work around than OCR and to maintain the page alignment while extracting the text from a PDF. Should be of help:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
def convert_pdf_to_txt(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = open(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
text = retstr.getvalue()
fp.close()
device.close()
retstr.close()
return text
text= convert_pdf_to_txt('test.pdf')
print(text)

Multi - page pdf can be extracted as text at single stretch instead of giving individual page number as argument using below code
import PyPDF2
import collections
pdf_file = open('samples.pdf', 'rb')
read_pdf = PyPDF2.PdfFileReader(pdf_file)
number_of_pages = read_pdf.getNumPages()
c = collections.Counter(range(number_of_pages))
for i in c:
page = read_pdf.getPage(i)
page_content = page.extractText()
print page_content.encode('utf-8')

You can use PDFtoText
https://github.com/jalan/pdftotext
PDF to text keeps text format indentation, doesn't matter if you have tables.

If wanting to extract text from a table, I've found tabula to be easily implemented, accurate, and fast:
to get a pandas dataframe:
import tabula
df = tabula.read_pdf('your.pdf')
df
By default, it ignores page content outside of the table. So far, I've only tested on a single-page, single-table file, but there are kwargs to accommodate multiple pages and/or multiple tables.
install via:
pip install tabula-py
# or
conda install -c conda-forge tabula-py
In terms of straight-up text extraction see:
https://stackoverflow.com/a/63190886/9249533

As of 2021 I would like to recommend pdfreader due to the fact that PyPDF2/3 seems to be troublesome now and tika is actually written in java and needs a jre in the background. pdfreader is pythonic, currently well maintained and has extensive documentation here.
Installation as usual: pip install pdfreader
Short example of usage:
from pdfreader import PDFDocument, SimplePDFViewer
# get raw document
fd = open(file_name, "rb")
doc = PDFDocument(fd)
# there is an iterator for pages
page_one = next(doc.pages())
all_pages = [p for p in doc.pages()]
# and even a viewer
fd = open(file_name, "rb")
viewer = SimplePDFViewer(fd)

Here is the simplest code for extracting text
code:
# importing required modules
import PyPDF2
# creating a pdf file object
pdfFileObj = open('filename.pdf', 'rb')
# creating a pdf reader object
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
# printing number of pages in pdf file
print(pdfReader.numPages)
# creating a page object
pageObj = pdfReader.getPage(5)
# extracting text from page
print(pageObj.extractText())
# closing the pdf file object
pdfFileObj.close()

Use pdfminer.six. Here is the the doc : https://pdfminersix.readthedocs.io/en/latest/index.html
To convert pdf to text :
def pdf_to_text():
from pdfminer.high_level import extract_text
text = extract_text('test.pdf')
print(text)

You can simply do this using pytessaract and OpenCV. Refer the following code. You can get more details from this article.
import os
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
filePath = ‘021-DO-YOU-WONDER-ABOUT-RAIN-SNOW-SLEET-AND-HAIL-Free-Childrens-Book-By-Monkey-Pen.pdf’
doc = convert_from_path(filePath)
path, fileName = os.path.split(filePath)
fileBaseName, fileExtension = os.path.splitext(fileName)
for page_number, page_data in enumerate(doc):
txt = pytesseract.image_to_string(page_data).encode(“utf-8”)
print(“Page # {} — {}”.format(str(page_number),txt))

Go through the official documentation there it is given
from PyPDF2 import PdfReader
reader = PdfReader("example.pdf")
page = reader.pages[0]
print(page.extract_text())

I am adding code to accomplish this:
It is working fine for me:
# This works in python 3
# required python packages
# tabula-py==1.0.0
# PyPDF2==1.26.0
# Pillow==4.0.0
# pdfminer.six==20170720
import os
import shutil
import warnings
from io import StringIO
import requests
import tabula
from PIL import Image
from PyPDF2 import PdfFileWriter, PdfFileReader
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
warnings.filterwarnings("ignore")
def download_file(url):
local_filename = url.split('/')[-1]
local_filename = local_filename.replace("%20", "_")
r = requests.get(url, stream=True)
print(r)
with open(local_filename, 'wb') as f:
shutil.copyfileobj(r.raw, f)
return local_filename
class PDFExtractor():
def __init__(self, url):
self.url = url
# Downloading File in local
def break_pdf(self, filename, start_page=-1, end_page=-1):
pdf_reader = PdfFileReader(open(filename, "rb"))
# Reading each pdf one by one
total_pages = pdf_reader.numPages
if start_page == -1:
start_page = 0
elif start_page < 1 or start_page > total_pages:
return "Start Page Selection Is Wrong"
else:
start_page = start_page - 1
if end_page == -1:
end_page = total_pages
elif end_page < 1 or end_page > total_pages - 1:
return "End Page Selection Is Wrong"
else:
end_page = end_page
for i in range(start_page, end_page):
output = PdfFileWriter()
output.addPage(pdf_reader.getPage(i))
with open(str(i + 1) + "_" + filename, "wb") as outputStream:
output.write(outputStream)
def extract_text_algo_1(self, file):
pdf_reader = PdfFileReader(open(file, 'rb'))
# creating a page object
pageObj = pdf_reader.getPage(0)
# extracting extract_text from page
text = pageObj.extractText()
text = text.replace("\n", "").replace("\t", "")
return text
def extract_text_algo_2(self, file):
pdfResourceManager = PDFResourceManager()
retstr = StringIO()
la_params = LAParams()
device = TextConverter(pdfResourceManager, retstr, codec='utf-8', laparams=la_params)
fp = open(file, 'rb')
interpreter = PDFPageInterpreter(pdfResourceManager, device)
password = ""
max_pages = 0
caching = True
page_num = set()
for page in PDFPage.get_pages(fp, page_num, maxpages=max_pages, password=password, caching=caching,
check_extractable=True):
interpreter.process_page(page)
text = retstr.getvalue()
text = text.replace("\t", "").replace("\n", "")
fp.close()
device.close()
retstr.close()
return text
def extract_text(self, file):
text1 = self.extract_text_algo_1(file)
text2 = self.extract_text_algo_2(file)
if len(text2) > len(str(text1)):
return text2
else:
return text1
def extarct_table(self, file):
# Read pdf into DataFrame
try:
df = tabula.read_pdf(file, output_format="csv")
except:
print("Error Reading Table")
return
print("\nPrinting Table Content: \n", df)
print("\nDone Printing Table Content\n")
def tiff_header_for_CCITT(self, width, height, img_size, CCITT_group=4):
tiff_header_struct = '<' + '2s' + 'h' + 'l' + 'h' + 'hhll' * 8 + 'h'
return struct.pack(tiff_header_struct,
b'II', # Byte order indication: Little indian
42, # Version number (always 42)
8, # Offset to first IFD
8, # Number of tags in IFD
256, 4, 1, width, # ImageWidth, LONG, 1, width
257, 4, 1, height, # ImageLength, LONG, 1, lenght
258, 3, 1, 1, # BitsPerSample, SHORT, 1, 1
259, 3, 1, CCITT_group, # Compression, SHORT, 1, 4 = CCITT Group 4 fax encoding
262, 3, 1, 0, # Threshholding, SHORT, 1, 0 = WhiteIsZero
273, 4, 1, struct.calcsize(tiff_header_struct), # StripOffsets, LONG, 1, len of header
278, 4, 1, height, # RowsPerStrip, LONG, 1, lenght
279, 4, 1, img_size, # StripByteCounts, LONG, 1, size of extract_image
0 # last IFD
)
def extract_image(self, filename):
number = 1
pdf_reader = PdfFileReader(open(filename, 'rb'))
for i in range(0, pdf_reader.numPages):
page = pdf_reader.getPage(i)
try:
xObject = page['/Resources']['/XObject'].getObject()
except:
print("No XObject Found")
return
for obj in xObject:
try:
if xObject[obj]['/Subtype'] == '/Image':
size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
data = xObject[obj]._data
if xObject[obj]['/ColorSpace'] == '/DeviceRGB':
mode = "RGB"
else:
mode = "P"
image_name = filename.split(".")[0] + str(number)
print(xObject[obj]['/Filter'])
if xObject[obj]['/Filter'] == '/FlateDecode':
data = xObject[obj].getData()
img = Image.frombytes(mode, size, data)
img.save(image_name + "_Flate.png")
# save_to_s3(imagename + "_Flate.png")
print("Image_Saved")
number += 1
elif xObject[obj]['/Filter'] == '/DCTDecode':
img = open(image_name + "_DCT.jpg", "wb")
img.write(data)
# save_to_s3(imagename + "_DCT.jpg")
img.close()
number += 1
elif xObject[obj]['/Filter'] == '/JPXDecode':
img = open(image_name + "_JPX.jp2", "wb")
img.write(data)
# save_to_s3(imagename + "_JPX.jp2")
img.close()
number += 1
elif xObject[obj]['/Filter'] == '/CCITTFaxDecode':
if xObject[obj]['/DecodeParms']['/K'] == -1:
CCITT_group = 4
else:
CCITT_group = 3
width = xObject[obj]['/Width']
height = xObject[obj]['/Height']
data = xObject[obj]._data # sorry, getData() does not work for CCITTFaxDecode
img_size = len(data)
tiff_header = self.tiff_header_for_CCITT(width, height, img_size, CCITT_group)
img_name = image_name + '_CCITT.tiff'
with open(img_name, 'wb') as img_file:
img_file.write(tiff_header + data)
# save_to_s3(img_name)
number += 1
except:
continue
return number
def read_pages(self, start_page=-1, end_page=-1):
# Downloading file locally
downloaded_file = download_file(self.url)
print(downloaded_file)
# breaking PDF into number of pages in diff pdf files
self.break_pdf(downloaded_file, start_page, end_page)
# creating a pdf reader object
pdf_reader = PdfFileReader(open(downloaded_file, 'rb'))
# Reading each pdf one by one
total_pages = pdf_reader.numPages
if start_page == -1:
start_page = 0
elif start_page < 1 or start_page > total_pages:
return "Start Page Selection Is Wrong"
else:
start_page = start_page - 1
if end_page == -1:
end_page = total_pages
elif end_page < 1 or end_page > total_pages - 1:
return "End Page Selection Is Wrong"
else:
end_page = end_page
for i in range(start_page, end_page):
# creating a page based filename
file = str(i + 1) + "_" + downloaded_file
print("\nStarting to Read Page: ", i + 1, "\n -----------===-------------")
file_text = self.extract_text(file)
print(file_text)
self.extract_image(file)
self.extarct_table(file)
os.remove(file)
print("Stopped Reading Page: ", i + 1, "\n -----------===-------------")
os.remove(downloaded_file)
# I have tested on these 3 pdf files
# url = "http://s3.amazonaws.com/NLP_Project/Original_Documents/Healthcare-January-2017.pdf"
url = "http://s3.amazonaws.com/NLP_Project/Original_Documents/Sample_Test.pdf"
# url = "http://s3.amazonaws.com/NLP_Project/Original_Documents/Sazerac_FS_2017_06_30%20Annual.pdf"
# creating the instance of class
pdf_extractor = PDFExtractor(url)
# Getting desired data out
pdf_extractor.read_pages(15, 23)

You can download tika-app-xxx.jar(latest) from Here.
Then put this .jar file in the same folder of your python script file.
then insert the following code in the script:
import os
import os.path
tika_dir=os.path.join(os.path.dirname(__file__),'<tika-app-xxx>.jar')
def extract_pdf(source_pdf:str,target_txt:str):
os.system('java -jar '+tika_dir+' -t {} > {}'.format(source_pdf,target_txt))
The advantage of this method:
fewer dependency. Single .jar file is easier to manage that a python package.
multi-format support. The position source_pdf can be the directory of any kind of document. (.doc, .html, .odt, etc.)
up-to-date. tika-app.jar always release earlier than the relevant version of tika python package.
stable. It is far more stable and well-maintained (Powered by Apache) than PyPDF.
disadvantage:
A jre-headless is necessary.

If you try it in Anaconda on Windows, PyPDF2 might not handle some of the PDFs with non-standard structure or unicode characters. I recommend using the following code if you need to open and read a lot of pdf files - the text of all pdf files in folder with relative path .//pdfs// will be stored in list pdf_text_list.
from tika import parser
import glob
def read_pdf(filename):
text = parser.from_file(filename)
return(text)
all_files = glob.glob(".\\pdfs\\*.pdf")
pdf_text_list=[]
for i,file in enumerate(all_files):
text=read_pdf(file)
pdf_text_list.append(text['content'])
print(pdf_text_list)

For extracting Text from PDF use below code
import PyPDF2
pdfFileObj = open('mypdf.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
print(pdfReader.numPages)
pageObj = pdfReader.getPage(0)
a = pageObj.extractText()
print(a)

A more robust way, supposing there are multiple PDF's or just one !
import os
from PyPDF2 import PdfFileWriter, PdfFileReader
from io import BytesIO
mydir = # specify path to your directory where PDF or PDF's are
for arch in os.listdir(mydir):
buffer = io.BytesIO()
archpath = os.path.join(mydir, arch)
with open(archpath) as f:
pdfFileObj = open(archpath, 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
pdfReader.numPages
pageObj = pdfReader.getPage(0)
ley = pageObj.extractText()
file1 = open("myfile.txt","w")
file1.writelines(ley)
file1.close()

Camelot seems a fairly powerful solution to extract tables from PDFs in Python.
At first sight it seems to achieve almost as accurate extraction as the tabula-py package suggested by CreekGeek, which is already waaaaay above any other posted solution as of today in terms of reliability, but it is supposedly much more configurable. Furthermore it has its own accuracy indicator (results.parsing_report), and great debugging features.
Both Camelot and Tabula provide the results as Pandas’ DataFrames, so it is easy to adjust tables afterwards.
pip install camelot-py
(Not to be confused with the camelot package.)
import camelot
df_list = []
results = camelot.read_pdf("file.pdf", ...)
for table in results:
print(table.parsing_report)
df_list.append(results[0].df)
It can also output results as CSV, JSON, HTML or Excel.
Camelot comes at the expense of a number of dependencies.
NB : Since my input is pretty complex with many different tables I ended up using both Camelot and Tabula, depending on the table, to achieve the best results.

Try out borb, a pure python PDF library
import typing
from borb.pdf.document import Document
from borb.pdf.pdf import PDF
from borb.toolkit.text.simple_text_extraction import SimpleTextExtraction
def main():
# variable to hold Document instance
doc: typing.Optional[Document] = None
# this implementation of EventListener handles text-rendering instructions
l: SimpleTextExtraction = SimpleTextExtraction()
# open the document, passing along the array of listeners
with open("input.pdf", "rb") as in_file_handle:
doc = PDF.loads(in_file_handle, [l])
# were we able to read the document?
assert doc is not None
# print the text on page 0
print(l.get_text(0))
if __name__ == "__main__":
main()

It includes creating a new sheet for each PDF page being set dynamically based on number of pages in the document.
import PyPDF2 as p2
import xlsxwriter
pdfFileName = "sample.pdf"
pdfFile = open(pdfFileName, 'rb')
pdfread = p2.PdfFileReader(pdfFile)
number_of_pages = pdfread.getNumPages()
workbook = xlsxwriter.Workbook('pdftoexcel.xlsx')
for page_number in range(number_of_pages):
print(f'Sheet{page_number}')
pageinfo = pdfread.getPage(page_number)
rawInfo = pageinfo.extractText().split('\n')
row = 0
column = 0
worksheet = workbook.add_worksheet(f'Sheet{page_number}')
for line in rawInfo:
worksheet.write(row, column, line)
row += 1
workbook.close()

Downloading files concurrently in Python

This code downloads metadata from a repository, writes that data to file, downloads a pdf, turns that pdf to text, then deletes the original pdf:
for record in records:
record_data = [] # data is stored in record_data
for name, metadata in record.metadata.items():
for i, value in enumerate(metadata):
if value:
record_data.append(value)
fulltext = ''
file_path = ''
file_path_metadata = ''
unique_id = str(uuid.uuid4())
for data in record_data:
if 'Fulltext' in data:
# the link to the pdf
fulltext = data.replace('Fulltext ', '')
# path where the txt file will be stored
file_path = '/' + os.path.basename(data).replace('.pdf', '') + unique_id + '.pdf'
# path where the metadata will be stored
file_path_metadata = '/' + os.path.basename(data).replace('.pdf', '') + unique_id + '_metadata.txt'
print fulltext, file_path
# Write metadata to file
if fulltext:
try:
write_metadata = open(path_to_institute + file_path_metadata, 'w')
for i, data in enumerate(record_data):
write_metadata.write('MD_' + str(i) + ': ' + data.encode('utf8') + '\n')
write_metadata.close()
except Exception as e:
# Exceptions due to missing path to file
print 'Exception when writing metadata: {}'.format(e)
print fulltext, path_to_institute, file_path_metadata
# Download pdf
download_pdf(fulltext, path_to_institute + file_path)
# Create text file and delete pdf
pdf2text(path_to_institute + file_path)
Doing some measurements, the download_pdf method and pdf2text method takes quite a long time.
Here are those methods:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
import os
def remove_file(path):
try:
os.remove(path)
except OSError, e:
print ("Error: %s - %s." % (e.filename,e.strerror))
def pdf2text(path):
string_handling = StringIO()
parser = PDFParser(open(path, 'r'))
save_file = open(path.replace('.pdf', '.txt'), 'w')
try:
document = PDFDocument(parser)
except Exception as e:
print '{} is not a readable document. Exception {}'.format(path, e)
return
if document.is_extractable:
recourse_manager = PDFResourceManager()
device = TextConverter(recourse_manager,
string_handling,
codec='ascii',
laparams=LAParams())
interpreter = PDFPageInterpreter(recourse_manager, device)
for page in PDFPage.create_pages(document):
interpreter.process_page(page)
# write to file
save_file.write(string_handling.getvalue())
save_file.close()
# deletes pdf
remove_file(path)
else:
print(path, "Warning: could not extract text from pdf file.")
return
def download_pdf(url, path):
try:
f = urllib2.urlopen(url)
except Exception as e:
print e
f = None
if f:
data = f.read()
with open(path, "wb") as code:
code.write(data)
code.close()
So I'm thinking I should run those in parallel.
I tried this, but it did not word:
pool = mp.Pool(processes=len(process_data))
for i in process_data:
print i
pool.apply(download_pdf, args=(i[0], i[1]))
pool = mp.Pool(processes=len(process_data))
for i in process_data:
print i[1]
pool.apply(pdf2text, args=(i[1],))
It takes just as long time? The printing happens as if the processes are run one at a time...

I finally found out a way to run the code in parallel. Unbelievable how much faster it got.
import multiprocessing as mp
jobs = []
for i in process_data:
p = mp.Process(target=download_pdf, args=(i[0], i[1]))
jobs.append(p)
p.start()
for i, data in enumerate(process_data):
print data
p = mp.Process(target=pdf2text, args=(data[1],))
jobs[i].join()
p.start()

here is a great article on how to build stuff in parallel,
it uses multiprocessing.dummy to run things in different threads
here is a little example:
from urllib2 import urlopen
from multiprocessing.dummy import Pool
urls = [url_a,
url_b,
url_c
]
pool = Pool()
res = pool.map(urlopen, urls)
pool.close()
pool.join()
for python >= 3.3 I suggest concurrent.futures
example:
import functools
import urllib.request
import futures
URLS = ['http://www.foxnews.com/',
'http://www.cnn.com/',
'http://europe.wsj.com/',
'http://www.bbc.co.uk/',
'http://some-made-up-domain.com/']
def load_url(url, timeout):
return urllib.request.urlopen(url, timeout=timeout).read()
with futures.ThreadPoolExecutor(50) as executor:
future_list = executor.run_to_futures(
[functools.partial(load_url, url, 30) for url in URLS])
example taken from: here

PyPDF2 - Returning only blank lines. En(de)code issue? [duplicate]

I'm trying to extract the text included in this PDF file using Python.
I'm using the PyPDF2 package (version 1.27.2), and have the following script:
import PyPDF2
with open("sample.pdf", "rb") as pdf_file:
read_pdf = PyPDF2.PdfFileReader(pdf_file)
number_of_pages = read_pdf.getNumPages()
page = read_pdf.pages[0]
page_content = page.extractText()
print(page_content)
When I run the code, I get the following output which is different from that included in the PDF document:
! " # $ % # $ % &% $ &' ( ) * % + , - % . / 0 1 ' * 2 3% 4
5
' % 1 $ # 2 6 % 3/ % 7 / ) ) / 8 % &) / 2 6 % 8 # 3" % 3" * % 31 3/ 9 # &)
%
How can I extract the text as is in the PDF document?

I was looking for a simple solution to use for python 3.x and windows. There doesn't seem to be support from textract, which is unfortunate, but if you are looking for a simple solution for windows/python 3 checkout the tika package, really straight forward for reading pdfs.
Tika-Python is a Python binding to the Apache Tika™ REST services allowing Tika to be called natively in the Python community.
from tika import parser # pip install tika
raw = parser.from_file('sample.pdf')
print(raw['content'])
Note that Tika is written in Java so you will need a Java runtime installed

PyPDF2 recently improved a lot. Depending on the data, it is on-par or better than pdfminer.six.
pymupdf / tika / PDFium are better than PyPDF2, but the difference became rather small -
(mostly when to set a new line). The core part is that they are way faster. But they are not pure-Python which can mean that you cannot execute it. And some might have too restrictive licenses so that you may not use it.
Have a look at the benchmark.
Results from November 2022:
PyPDF2
Edit: I recently became the maintainer of PyPDF2! 😁 The community improved the text extraction a lot. Give it a try :-)
from PyPDF2 import PdfReader
reader = PdfReader("example.pdf")
text = ""
for page in reader.pages:
text += page.extract_text() + "\n"
Please note that those packages are not maintained:
pyPdf, PyPDF3, PyPDF4
pdfminer (without .six)
pymupdf
import fitz # install using: pip install PyMuPDF
with fitz.open("my.pdf") as doc:
text = ""
for page in doc:
text += page.get_text()
print(text)
Other PDF libraries
pikepdf does not support text extraction (source)

Use textract.
http://textract.readthedocs.io/en/latest/
https://github.com/deanmalmgren/textract
It supports many types of files including PDFs
import textract
text = textract.process("path/to/file.extension")

Look at this code for PyPDF2<=1.26.0:
import PyPDF2
pdf_file = open('sample.pdf', 'rb')
read_pdf = PyPDF2.PdfFileReader(pdf_file)
page = read_pdf.getPage(0)
page_content = page.extractText()
print page_content.encode('utf-8')
The output is:
!"#$%#$%&%$&'()*%+,-%./01'*23%4
5'%1$#26%3/%7/))/8%&)/26%8#3"%3"*%313/9#&)
%
Using the same code to read a pdf from 201308FCR.pdf
.The output is normal.
Its documentation explains why:
def extractText(self):
"""
Locate all text drawing commands, in the order they are provided in the
content stream, and extract the text. This works well for some PDF
files, but poorly for others, depending on the generator used. This will
be refined in the future. Do not rely on the order of text coming out of
this function, as it will change if this function is made more
sophisticated.
:return: a unicode string object.
"""

After trying textract (which seemed to have too many dependencies) and pypdf2 (which could not extract text from the pdfs I tested with) and tika (which was too slow) I ended up using pdftotext from xpdf (as already suggested in another answer) and just called the binary from python directly (you may need to adapt the path to pdftotext):
import os, subprocess
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
args = ["/usr/local/bin/pdftotext",
'-enc',
'UTF-8',
"{}/my-pdf.pdf".format(SCRIPT_DIR),
'-']
res = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output = res.stdout.decode('utf-8')
There is pdftotext which does basically the same but this assumes pdftotext in /usr/local/bin whereas I am using this in AWS lambda and wanted to use it from the current directory.
Btw: For using this on lambda you need to put the binary and the dependency to libstdc++.so into your lambda function. I personally needed to compile xpdf. As instructions for this would blow up this answer I put them on my personal blog.

I've try many Python PDF converters, and I like to update this review. Tika is one of the best. But PyMuPDF is a good news from #ehsaneha user.
I did a code to compare them in: https://github.com/erfelipe/PDFtextExtraction I hope to help you.
Tika-Python is a Python binding to the Apache Tika™ REST services
allowing Tika to be called natively in the Python community.
from tika import parser
raw = parser.from_file("///Users/Documents/Textos/Texto1.pdf")
raw = str(raw)
safe_text = raw.encode('utf-8', errors='ignore')
safe_text = str(safe_text).replace("\n", "").replace("\\", "")
print('--- safe text ---' )
print( safe_text )

You may want to use time proved xPDF and derived tools to extract text instead as pyPDF2 seems to have various issues with the text extraction still.
The long answer is that there are lot of variations how a text is encoded inside PDF and that it may require to decoded PDF string itself, then may need to map with CMAP, then may need to analyze distance between words and letters etc.
In case the PDF is damaged (i.e. displaying the correct text but when copying it gives garbage) and you really need to extract text, then you may want to consider converting PDF into image (using ImageMagik) and then use Tesseract to get text from image using OCR.

PyPDF2 in some cases ignores the white spaces and makes the result text a mess, but I use PyMuPDF and I'm really satisfied
you can use this link for more info

pdftotext is the best and simplest one!
pdftotext also reserves the structure as well.
I tried PyPDF2, PDFMiner and a few others but none of them gave a satisfactory result.

In 2020 the solutions above were not working for the particular pdf I was working with. Below is what did the trick. I am on Windows 10 and Python 3.8
Test pdf file: https://drive.google.com/file/d/1aUfQAlvq5hA9kz2c9CyJADiY3KpY3-Vn/view?usp=sharing
#pip install pdfminer.six
import io
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
def convert_pdf_to_txt(path):
'''Convert pdf content from a file path to text
:path the file path
'''
rsrcmgr = PDFResourceManager()
codec = 'utf-8'
laparams = LAParams()
with io.StringIO() as retstr:
with TextConverter(rsrcmgr, retstr, codec=codec,
laparams=laparams) as device:
with open(path, 'rb') as fp:
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos = set()
for page in PDFPage.get_pages(fp,
pagenos,
maxpages=maxpages,
password=password,
caching=caching,
check_extractable=True):
interpreter.process_page(page)
return retstr.getvalue()
if __name__ == "__main__":
print(convert_pdf_to_txt('C:\\Path\\To\\Test_PDF.pdf'))

I found a solution here PDFLayoutTextStripper
It's good because it can keep the layout of the original PDF.
It's written in Java but I have added a Gateway to support Python.
Sample code:
from py4j.java_gateway import JavaGateway
gw = JavaGateway()
result = gw.entry_point.strip('samples/bus.pdf')
# result is a dict of {
# 'success': 'true' or 'false',
# 'payload': pdf file content if 'success' is 'true'
# 'error': error message if 'success' is 'false'
# }
print result['payload']
Sample output from PDFLayoutTextStripper:
You can see more details here Stripper with Python

The below code is a solution to the question in Python 3. Before running the code, make sure you have installed the PyPDF2 library in your environment. If not installed, open the command prompt and run the following command:
pip3 install PyPDF2
Solution Code using PyPDF2 <= 1.26.0:
import PyPDF2
pdfFileObject = open('sample.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObject)
count = pdfReader.numPages
for i in range(count):
page = pdfReader.getPage(i)
print(page.extractText())

pdfplumber is one of the better libraries to read and extract data from pdf. It also provides ways to read table data and after struggling with a lot of such libraries, pdfplumber worked best for me.
Mind you, it works best for machine-written pdf and not scanned pdf.
import pdfplumber
with pdfplumber.open(r'D:\examplepdf.pdf') as pdf:
first_page = pdf.pages[0]
print(first_page.extract_text())

I've got a better work around than OCR and to maintain the page alignment while extracting the text from a PDF. Should be of help:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
def convert_pdf_to_txt(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = open(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
text = retstr.getvalue()
fp.close()
device.close()
retstr.close()
return text
text= convert_pdf_to_txt('test.pdf')
print(text)

Multi - page pdf can be extracted as text at single stretch instead of giving individual page number as argument using below code
import PyPDF2
import collections
pdf_file = open('samples.pdf', 'rb')
read_pdf = PyPDF2.PdfFileReader(pdf_file)
number_of_pages = read_pdf.getNumPages()
c = collections.Counter(range(number_of_pages))
for i in c:
page = read_pdf.getPage(i)
page_content = page.extractText()
print page_content.encode('utf-8')

You can use PDFtoText
https://github.com/jalan/pdftotext
PDF to text keeps text format indentation, doesn't matter if you have tables.

If wanting to extract text from a table, I've found tabula to be easily implemented, accurate, and fast:
to get a pandas dataframe:
import tabula
df = tabula.read_pdf('your.pdf')
df
By default, it ignores page content outside of the table. So far, I've only tested on a single-page, single-table file, but there are kwargs to accommodate multiple pages and/or multiple tables.
install via:
pip install tabula-py
# or
conda install -c conda-forge tabula-py
In terms of straight-up text extraction see:
https://stackoverflow.com/a/63190886/9249533

As of 2021 I would like to recommend pdfreader due to the fact that PyPDF2/3 seems to be troublesome now and tika is actually written in java and needs a jre in the background. pdfreader is pythonic, currently well maintained and has extensive documentation here.
Installation as usual: pip install pdfreader
Short example of usage:
from pdfreader import PDFDocument, SimplePDFViewer
# get raw document
fd = open(file_name, "rb")
doc = PDFDocument(fd)
# there is an iterator for pages
page_one = next(doc.pages())
all_pages = [p for p in doc.pages()]
# and even a viewer
fd = open(file_name, "rb")
viewer = SimplePDFViewer(fd)

Here is the simplest code for extracting text
code:
# importing required modules
import PyPDF2
# creating a pdf file object
pdfFileObj = open('filename.pdf', 'rb')
# creating a pdf reader object
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
# printing number of pages in pdf file
print(pdfReader.numPages)
# creating a page object
pageObj = pdfReader.getPage(5)
# extracting text from page
print(pageObj.extractText())
# closing the pdf file object
pdfFileObj.close()

Use pdfminer.six. Here is the the doc : https://pdfminersix.readthedocs.io/en/latest/index.html
To convert pdf to text :
def pdf_to_text():
from pdfminer.high_level import extract_text
text = extract_text('test.pdf')
print(text)

You can simply do this using pytessaract and OpenCV. Refer the following code. You can get more details from this article.
import os
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
filePath = ‘021-DO-YOU-WONDER-ABOUT-RAIN-SNOW-SLEET-AND-HAIL-Free-Childrens-Book-By-Monkey-Pen.pdf’
doc = convert_from_path(filePath)
path, fileName = os.path.split(filePath)
fileBaseName, fileExtension = os.path.splitext(fileName)
for page_number, page_data in enumerate(doc):
txt = pytesseract.image_to_string(page_data).encode(“utf-8”)
print(“Page # {} — {}”.format(str(page_number),txt))

Go through the official documentation there it is given
from PyPDF2 import PdfReader
reader = PdfReader("example.pdf")
page = reader.pages[0]
print(page.extract_text())

I am adding code to accomplish this:
It is working fine for me:
# This works in python 3
# required python packages
# tabula-py==1.0.0
# PyPDF2==1.26.0
# Pillow==4.0.0
# pdfminer.six==20170720
import os
import shutil
import warnings
from io import StringIO
import requests
import tabula
from PIL import Image
from PyPDF2 import PdfFileWriter, PdfFileReader
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
warnings.filterwarnings("ignore")
def download_file(url):
local_filename = url.split('/')[-1]
local_filename = local_filename.replace("%20", "_")
r = requests.get(url, stream=True)
print(r)
with open(local_filename, 'wb') as f:
shutil.copyfileobj(r.raw, f)
return local_filename
class PDFExtractor():
def __init__(self, url):
self.url = url
# Downloading File in local
def break_pdf(self, filename, start_page=-1, end_page=-1):
pdf_reader = PdfFileReader(open(filename, "rb"))
# Reading each pdf one by one
total_pages = pdf_reader.numPages
if start_page == -1:
start_page = 0
elif start_page < 1 or start_page > total_pages:
return "Start Page Selection Is Wrong"
else:
start_page = start_page - 1
if end_page == -1:
end_page = total_pages
elif end_page < 1 or end_page > total_pages - 1:
return "End Page Selection Is Wrong"
else:
end_page = end_page
for i in range(start_page, end_page):
output = PdfFileWriter()
output.addPage(pdf_reader.getPage(i))
with open(str(i + 1) + "_" + filename, "wb") as outputStream:
output.write(outputStream)
def extract_text_algo_1(self, file):
pdf_reader = PdfFileReader(open(file, 'rb'))
# creating a page object
pageObj = pdf_reader.getPage(0)
# extracting extract_text from page
text = pageObj.extractText()
text = text.replace("\n", "").replace("\t", "")
return text
def extract_text_algo_2(self, file):
pdfResourceManager = PDFResourceManager()
retstr = StringIO()
la_params = LAParams()
device = TextConverter(pdfResourceManager, retstr, codec='utf-8', laparams=la_params)
fp = open(file, 'rb')
interpreter = PDFPageInterpreter(pdfResourceManager, device)
password = ""
max_pages = 0
caching = True
page_num = set()
for page in PDFPage.get_pages(fp, page_num, maxpages=max_pages, password=password, caching=caching,
check_extractable=True):
interpreter.process_page(page)
text = retstr.getvalue()
text = text.replace("\t", "").replace("\n", "")
fp.close()
device.close()
retstr.close()
return text
def extract_text(self, file):
text1 = self.extract_text_algo_1(file)
text2 = self.extract_text_algo_2(file)
if len(text2) > len(str(text1)):
return text2
else:
return text1
def extarct_table(self, file):
# Read pdf into DataFrame
try:
df = tabula.read_pdf(file, output_format="csv")
except:
print("Error Reading Table")
return
print("\nPrinting Table Content: \n", df)
print("\nDone Printing Table Content\n")
def tiff_header_for_CCITT(self, width, height, img_size, CCITT_group=4):
tiff_header_struct = '<' + '2s' + 'h' + 'l' + 'h' + 'hhll' * 8 + 'h'
return struct.pack(tiff_header_struct,
b'II', # Byte order indication: Little indian
42, # Version number (always 42)
8, # Offset to first IFD
8, # Number of tags in IFD
256, 4, 1, width, # ImageWidth, LONG, 1, width
257, 4, 1, height, # ImageLength, LONG, 1, lenght
258, 3, 1, 1, # BitsPerSample, SHORT, 1, 1
259, 3, 1, CCITT_group, # Compression, SHORT, 1, 4 = CCITT Group 4 fax encoding
262, 3, 1, 0, # Threshholding, SHORT, 1, 0 = WhiteIsZero
273, 4, 1, struct.calcsize(tiff_header_struct), # StripOffsets, LONG, 1, len of header
278, 4, 1, height, # RowsPerStrip, LONG, 1, lenght
279, 4, 1, img_size, # StripByteCounts, LONG, 1, size of extract_image
0 # last IFD
)
def extract_image(self, filename):
number = 1
pdf_reader = PdfFileReader(open(filename, 'rb'))
for i in range(0, pdf_reader.numPages):
page = pdf_reader.getPage(i)
try:
xObject = page['/Resources']['/XObject'].getObject()
except:
print("No XObject Found")
return
for obj in xObject:
try:
if xObject[obj]['/Subtype'] == '/Image':
size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
data = xObject[obj]._data
if xObject[obj]['/ColorSpace'] == '/DeviceRGB':
mode = "RGB"
else:
mode = "P"
image_name = filename.split(".")[0] + str(number)
print(xObject[obj]['/Filter'])
if xObject[obj]['/Filter'] == '/FlateDecode':
data = xObject[obj].getData()
img = Image.frombytes(mode, size, data)
img.save(image_name + "_Flate.png")
# save_to_s3(imagename + "_Flate.png")
print("Image_Saved")
number += 1
elif xObject[obj]['/Filter'] == '/DCTDecode':
img = open(image_name + "_DCT.jpg", "wb")
img.write(data)
# save_to_s3(imagename + "_DCT.jpg")
img.close()
number += 1
elif xObject[obj]['/Filter'] == '/JPXDecode':
img = open(image_name + "_JPX.jp2", "wb")
img.write(data)
# save_to_s3(imagename + "_JPX.jp2")
img.close()
number += 1
elif xObject[obj]['/Filter'] == '/CCITTFaxDecode':
if xObject[obj]['/DecodeParms']['/K'] == -1:
CCITT_group = 4
else:
CCITT_group = 3
width = xObject[obj]['/Width']
height = xObject[obj]['/Height']
data = xObject[obj]._data # sorry, getData() does not work for CCITTFaxDecode
img_size = len(data)
tiff_header = self.tiff_header_for_CCITT(width, height, img_size, CCITT_group)
img_name = image_name + '_CCITT.tiff'
with open(img_name, 'wb') as img_file:
img_file.write(tiff_header + data)
# save_to_s3(img_name)
number += 1
except:
continue
return number
def read_pages(self, start_page=-1, end_page=-1):
# Downloading file locally
downloaded_file = download_file(self.url)
print(downloaded_file)
# breaking PDF into number of pages in diff pdf files
self.break_pdf(downloaded_file, start_page, end_page)
# creating a pdf reader object
pdf_reader = PdfFileReader(open(downloaded_file, 'rb'))
# Reading each pdf one by one
total_pages = pdf_reader.numPages
if start_page == -1:
start_page = 0
elif start_page < 1 or start_page > total_pages:
return "Start Page Selection Is Wrong"
else:
start_page = start_page - 1
if end_page == -1:
end_page = total_pages
elif end_page < 1 or end_page > total_pages - 1:
return "End Page Selection Is Wrong"
else:
end_page = end_page
for i in range(start_page, end_page):
# creating a page based filename
file = str(i + 1) + "_" + downloaded_file
print("\nStarting to Read Page: ", i + 1, "\n -----------===-------------")
file_text = self.extract_text(file)
print(file_text)
self.extract_image(file)
self.extarct_table(file)
os.remove(file)
print("Stopped Reading Page: ", i + 1, "\n -----------===-------------")
os.remove(downloaded_file)
# I have tested on these 3 pdf files
# url = "http://s3.amazonaws.com/NLP_Project/Original_Documents/Healthcare-January-2017.pdf"
url = "http://s3.amazonaws.com/NLP_Project/Original_Documents/Sample_Test.pdf"
# url = "http://s3.amazonaws.com/NLP_Project/Original_Documents/Sazerac_FS_2017_06_30%20Annual.pdf"
# creating the instance of class
pdf_extractor = PDFExtractor(url)
# Getting desired data out
pdf_extractor.read_pages(15, 23)

You can download tika-app-xxx.jar(latest) from Here.
Then put this .jar file in the same folder of your python script file.
then insert the following code in the script:
import os
import os.path
tika_dir=os.path.join(os.path.dirname(__file__),'<tika-app-xxx>.jar')
def extract_pdf(source_pdf:str,target_txt:str):
os.system('java -jar '+tika_dir+' -t {} > {}'.format(source_pdf,target_txt))
The advantage of this method:
fewer dependency. Single .jar file is easier to manage that a python package.
multi-format support. The position source_pdf can be the directory of any kind of document. (.doc, .html, .odt, etc.)
up-to-date. tika-app.jar always release earlier than the relevant version of tika python package.
stable. It is far more stable and well-maintained (Powered by Apache) than PyPDF.
disadvantage:
A jre-headless is necessary.

If you try it in Anaconda on Windows, PyPDF2 might not handle some of the PDFs with non-standard structure or unicode characters. I recommend using the following code if you need to open and read a lot of pdf files - the text of all pdf files in folder with relative path .//pdfs// will be stored in list pdf_text_list.
from tika import parser
import glob
def read_pdf(filename):
text = parser.from_file(filename)
return(text)
all_files = glob.glob(".\\pdfs\\*.pdf")
pdf_text_list=[]
for i,file in enumerate(all_files):
text=read_pdf(file)
pdf_text_list.append(text['content'])
print(pdf_text_list)

For extracting Text from PDF use below code
import PyPDF2
pdfFileObj = open('mypdf.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
print(pdfReader.numPages)
pageObj = pdfReader.getPage(0)
a = pageObj.extractText()
print(a)

A more robust way, supposing there are multiple PDF's or just one !
import os
from PyPDF2 import PdfFileWriter, PdfFileReader
from io import BytesIO
mydir = # specify path to your directory where PDF or PDF's are
for arch in os.listdir(mydir):
buffer = io.BytesIO()
archpath = os.path.join(mydir, arch)
with open(archpath) as f:
pdfFileObj = open(archpath, 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
pdfReader.numPages
pageObj = pdfReader.getPage(0)
ley = pageObj.extractText()
file1 = open("myfile.txt","w")
file1.writelines(ley)
file1.close()

Camelot seems a fairly powerful solution to extract tables from PDFs in Python.
At first sight it seems to achieve almost as accurate extraction as the tabula-py package suggested by CreekGeek, which is already waaaaay above any other posted solution as of today in terms of reliability, but it is supposedly much more configurable. Furthermore it has its own accuracy indicator (results.parsing_report), and great debugging features.
Both Camelot and Tabula provide the results as Pandas’ DataFrames, so it is easy to adjust tables afterwards.
pip install camelot-py
(Not to be confused with the camelot package.)
import camelot
df_list = []
results = camelot.read_pdf("file.pdf", ...)
for table in results:
print(table.parsing_report)
df_list.append(results[0].df)
It can also output results as CSV, JSON, HTML or Excel.
Camelot comes at the expense of a number of dependencies.
NB : Since my input is pretty complex with many different tables I ended up using both Camelot and Tabula, depending on the table, to achieve the best results.

Try out borb, a pure python PDF library
import typing
from borb.pdf.document import Document
from borb.pdf.pdf import PDF
from borb.toolkit.text.simple_text_extraction import SimpleTextExtraction
def main():
# variable to hold Document instance
doc: typing.Optional[Document] = None
# this implementation of EventListener handles text-rendering instructions
l: SimpleTextExtraction = SimpleTextExtraction()
# open the document, passing along the array of listeners
with open("input.pdf", "rb") as in_file_handle:
doc = PDF.loads(in_file_handle, [l])
# were we able to read the document?
assert doc is not None
# print the text on page 0
print(l.get_text(0))
if __name__ == "__main__":
main()

It includes creating a new sheet for each PDF page being set dynamically based on number of pages in the document.
import PyPDF2 as p2
import xlsxwriter
pdfFileName = "sample.pdf"
pdfFile = open(pdfFileName, 'rb')
pdfread = p2.PdfFileReader(pdfFile)
number_of_pages = pdfread.getNumPages()
workbook = xlsxwriter.Workbook('pdftoexcel.xlsx')
for page_number in range(number_of_pages):
print(f'Sheet{page_number}')
pageinfo = pdfread.getPage(page_number)
rawInfo = pageinfo.extractText().split('\n')
row = 0
column = 0
worksheet = workbook.add_worksheet(f'Sheet{page_number}')
for line in rawInfo:
worksheet.write(row, column, line)
row += 1
workbook.close()

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Extract only specific text from PDF using Python - python

Related

Incorrect output: Extracting text from pdf's,docx's pptx's will not output in their own spearte line

Downloading image from url and assign it with a specified id from csv file

Extract data from PDF using python [duplicate]

Downloading files concurrently in Python

PyPDF2 - Returning only blank lines. En(de)code issue? [duplicate]

Categories

Resources