Read pdf page by page

Read pdf page by page - python

I searched for my question and did not get my answer in the two available questions
Extract text per page with Python pdfMiner?
PDFMiner - Iterating through pages and converting them to text
Basically I want to iterate over each page because I want to select only that page which has a certain text.
I have used pyPdf. It works for almost i can say 90% of the pdfs but sometimes it does not extract the information from a page.
I have used the below code:
import pyPdf
extract = ""
pdf = pyPdf.PdfFileReader(open('filename.pdf', "rb"))
num_of_pages = pdf.getNumPages()
for p in range(num_of_pages):
ex = pdf.getPage(6)
ex = ex.extractText()
if re.search(r"to be held (at|on)",ex.lower()):
print 'yes'
print ex ,"\n"
extract = extract + ex + "\n"
continue
The above code works but sometimes some pages don't get extracted.
I also tried using pdfminer, but i could not find how to iterate the pdf in it page by page. pdfminer returns the entire text of the pdf.
I used the below code:
def convert_pdf_to_txt(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = file(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
text = retstr.getvalue()
fp.close()
device.close()
retstr.close()
return text
In the above code the text from the pdf comes from the for loop
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
text = retstr.getvalue()
In this how can I iterated on one page at a time.
The documentation on pdfminer is not understandable. Also there are many versions of the same.
So are there any other packages available for my question or can pdfminer be used for it?

Because retstr will retain each page, you might consider altering your code by calling retstr.truncate(0) which clears the string each time, otherwise you're printing the entirety of what's already been read each time:
import pyPdf
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
path = "filename.pdf"
pdf = pyPdf.PdfFileReader(open(path, "rb"))
fp = file(path, 'rb')
num_of_pages = pdf.getNumPages()
extract = ""
for i in range(num_of_pages):
inside = [i]
pagenos=set(inside)
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
text = ""
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
text = retstr.getvalue()
retstr.truncate(0)
text = text.decode("ascii","replace")
if re.search(r"to be held (at|on)",text.lower()):
print text
extract = extract + text + "\n"
continue

I know it is not good to answer your own question but i think i may have figured out an answer for this question.
I think it is not the best way to do it, but still it helps me.
I used a combination of pypdf and pdfminer
The code is as below:
import pyPdf
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
path = "filename.pdf"
pdf = pyPdf.PdfFileReader(open(path, "rb"))
fp = file(path, 'rb')
num_of_pages = pdf.getNumPages()
extract = ""
for i in range(num_of_pages):
inside = [i]
pagenos=set(inside)
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
text = ""
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
text = retstr.getvalue()
text = text.decode("ascii","replace")
if re.search(r"to be held (at|on)",text.lower()):
print text
extract = extract + text + "\n"
continue
There may be a better way to do it, but currently i found out this to be pretty good.

You can refer the following link to extract page by page text from PDF.
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
for page_layout in extract_pages("test.pdf"):
for element in page_layout:
if isinstance(element, LTTextContainer):
print(element.get_text())
PDFMiner Page by Page text Extraction

Related

Converting data from PDF to XML with PDFminer?

I used the code below to convert PDF data to XML data and write the conversion to a XML file. It is quite well known (it uses the PDFminer module) and works very well for PDF to text and HTML conversions but I have a problem when I do PDF to XML conversion. I'm quite a novice and some help from you would be super nice :)
Voici le code :
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter, XMLConverter, HTMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO
def convert_pdf(path, format='text', codec='utf-8', password=''):
rsrcmgr = PDFResourceManager()
retstr = BytesIO()
laparams = LAParams()
if format == 'text':
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
elif format == 'html':
device = HTMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
elif format == 'xml':
device = XMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
else:
raise ValueError('provide format, either text, html or xml!')
fp = open(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
maxpages = 0
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
text = retstr.getvalue().decode()
fp.close()
device.close()
retstr.close()
return text
path_pdf = ...
path_xml = ...
open(path_xml, "w").close()
text_output = convert_pdf(path_pdf)
open(path_xml, "a", encoding="utf-8").write(text_output)
And here's the error I get:
Thank you in advance!

If you are using python2.7, the following works
from io import BytesIO
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
def get_xml_py2(file_path):
in_fp = BytesIO()
with open(file_path, 'rb') as x:
in_fp.write(x.read())
laparams = LAParams(all_texts=True)
rsrcmgr = PDFResourceManager()
for page in PDFPage.get_pages(in_fp):
outfp = BytesIO()
device = XMLConverter(rsrcmgr, outfp, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
interpreter.process_page(page)
yield outfp.getvalue()
device.close()
outfp.close()
in_fp.close()

Working with singe pages with PDFMiner

I have some PDF documents from which I can not extract text with PyPDF, only with PDFMiner. The following code works fine to extract all text from the PDFs, it goes through the whole document, then returns all the text.
Is there a way to only work with certain pages of the PDF?
The PDFs I have are all 2000-3000 long and I only need to work with every second page.
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec,laparams=laparams)
fp = file(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
text = retstr.getvalue()
fp.close()
device.close()
retstr.close()
return text

Couldn't you use enumerate to get the page number and the page content while iterating through all the pages? If you only want every second page, use modulus. If you want specific pages only, use ranges.
Example:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec,laparams=laparams)
fp = file(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
for pagenumber, page in enumerate(PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True)):
print pagenumber
if pagenumber % 2 == 0:
print("even page number")
interpreter.process_page(page)
else:
print("odd page number")
if 5 <= pagenumber <= 10:
print("pages 5 to 10")
text = retstr.getvalue()
fp.close()
device.close()
retstr.close()
return text

How can I adjust the 'word_margin' for reading PDFs with pdfminer in python?

I've tried to manipulate the 'word_margin' with python using the following code but it throws me an error TypeError: get_pages() got an unexpected keyword argument 'word_margin'. PDFminer reads the document fine if I remove the word_margin=word_marginfrom the arguments.
Code:
def convert_pdf_to_txt(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = open(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
word_margin = 1
for page in PDFPage.get_pages(fp, pagenos, word_margin=word_margin,maxpages=maxpages,password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
text = retstr.getvalue()
fp.close()
device.close()
retstr.close()
return text

word_margin is a parameter of LAParams class. If I understand correctly, the code should look like this:
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import TextConverter
from StringIO import StringIO
def convert_pdf_to_txt(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
word_margin = 1
laparams = LAParams(word_margin=word_margin)
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = open(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
text = retstr.getvalue()
fp.close()
device.close()
retstr.close()
return text

pdfminer/poppler - how to set encoding

I have a file, i.e. http://www.agfl.cs.ru.nl/papers/manual28.pdf
(it's english)
Pdfminer and poppler shows the same result in most parsed pages, like:
¾º¿ Â ÒÙ ÖÝ¸ ¾¼¼ Èº ¾º ÂÙÒ ¸ ¾¼¼ Åº Ë ÙØØ Ö¸ Çº Ë
It seems it can't read font custom encodings. How to specify it?
Here's code samples:
# poppler
input_filename = '/tmp/manual28.pdf'
document = poppler.document_new_from_file('file://%s' % urllib.pathname2url(os.path.abspath(input_filename)), None)
n_pages = document.get_n_pages()
for i in range(n_pages):
page = document.get_page(i)
print page.get_text()
# chardet.detect(page.get_text()) # utf8 all time
# pdfminer
def pdf_to_html(in_fp, out_fp, codec='utf-8', maxpages=0, pagenos=None, html=True):
rsrcmgr = PDFResourceManager()
laparams = LAParams()
if isinstance(in_fp, basestring):
in_fp = open(in_fp, 'rb')
if isinstance(out_fp, basestring):
out_fp = open(out_fp, 'wb')
if html:
device = HTMLConverter(rsrcmgr, out_fp, codec=codec, laparams=laparams)
else:
device = TextConverter(rsrcmgr, out_fp, codec=codec, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(in_fp, pagenos, maxpages=maxpages):
interpreter.process_page(page)
in_fp.close()
device.close()
out_fp.close()

How do I use pdfminer as a library

I am trying to get text data from a pdf using pdfminer. I am able to extract this data to a .txt file successfully with the pdfminer command line tool pdf2txt.py. I currently do this and then use a python script to clean up the .txt file. I would like to incorporate the pdf extract process into the script and save myself a step.
I thought I was on to something when I found this link, but I didn't have success with any of the solutions. Perhaps the function listed there needs to be updated again because I am using a newer version of pdfminer.
I also tried the function shown here, but it also did not work.
Another approach I tried was to call the script within a script using os.system. This was also unsuccessful.
I am using Python version 2.7.1 and pdfminer version 20110227.

Here is a new solution that works with the latest version:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = file(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
fp.close()
device.close()
str = retstr.getvalue()
retstr.close()
return str

Here is a cleaned up version I finally produced that worked for me. The following just simply returns the string in a PDF, given its filename. I hope this saves someone time.
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
def convert_pdf(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = file(path, 'rb')
process_pdf(rsrcmgr, device, fp)
fp.close()
device.close()
str = retstr.getvalue()
retstr.close()
return str
This solution was valid until API changes in November 2013.

I know it is poor taste to answer your own question, but I think I may have figured this out and I don't want anyone else to waste their time looking for a solution to my problem.
I followed the suggestion in a one of the links posted in my question and re-purposed the current pdf2txt.py script included with pdfminer. Here is the function in case it is useful to anyone else. Thanks to the user skyl for posting that answer, all I had to to was make a couple of changes to make it work with the current version of pdfminer.
This function take a pdf and creates a .txt file in the same directory with the same name.
def convert_pdf(path, outtype='txt', opts={}):
import sys
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
import getopt
outfile = path[:-3] + outtype
outdir = '/'.join(path.split('/')[:-1])
# debug option
debug = 0
# input option
password = ''
pagenos = set()
maxpages = 0
# output option
# ?outfile = None
# ?outtype = None
outdir = None
#layoutmode = 'normal'
codec = 'utf-8'
pageno = 1
scale = 1
showpageno = True
laparams = LAParams()
for (k, v) in opts:
if k == '-d': debug += 1
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-m': maxpages = int(v)
elif k == '-P': password = v
elif k == '-o': outfile = v
elif k == '-n': laparams = None
elif k == '-A': laparams.all_texts = True
elif k == '-V': laparams.detect_vertical = True
elif k == '-M': laparams.char_margin = float(v)
elif k == '-L': laparams.line_margin = float(v)
elif k == '-W': laparams.word_margin = float(v)
elif k == '-F': laparams.boxes_flow = float(v)
elif k == '-Y': layoutmode = v
elif k == '-O': outdir = v
elif k == '-t': outtype = v
elif k == '-c': codec = v
elif k == '-s': scale = float(v)
#
#PDFDocument.debug = debug
#PDFParser.debug = debug
CMapDB.debug = debug
PDFResourceManager.debug = debug
PDFPageInterpreter.debug = debug
PDFDevice.debug = debug
#
rsrcmgr = PDFResourceManager()
outtype = 'text'
if outfile:
outfp = file(outfile, 'w')
else:
outfp = sys.stdout
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
fp = file(path, 'rb')
process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
check_extractable=True)
fp.close()
device.close()
outfp.close()
return

Here's my solution
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import os
def convert_pdf_to_txt(path, pages=None):
if not pages:
pagenums = set()
else:
pagenums = set(pages)
output = StringIO()
manager = PDFResourceManager()
converter = TextConverter(manager, output, laparams=LAParams())
interpreter = PDFPageInterpreter(manager, converter)
infile = open(path, 'rb')
for page in PDFPage.get_pages(infile, pagenums):
interpreter.process_page(page)
infile.close()
converter.close()
text = output.getvalue()
output.close()
return text
For example you just want to read the first 3 pages of a pdf file:
text = convert_pdf_to_txt('../Data/EN-FINAL Table 9.pdf', pages=[0,1,2])
pdfminer.six==20160614
python: 3.x

This worked for me using the most recent version of pdfminer (as of September 2014):
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
import unicodedata, codecs
from io import StringIO
def getPDFText(pdfFilenamePath):
retstr = StringIO()
parser = PDFParser(open(pdfFilenamePath,'r'))
try:
document = PDFDocument(parser)
except Exception as e:
print(pdfFilenamePath,'is not a readable pdf')
return ''
if document.is_extractable:
rsrcmgr = PDFResourceManager()
device = TextConverter(rsrcmgr,retstr, codec='ascii' , laparams = LAParams())
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(document):
interpreter.process_page(page)
return retstr.getvalue()
else:
print(pdfFilenamePath,"Warning: could not extract text from pdf file.")
return ''
if __name__ == '__main__':
words = getPDFText(path)

Here's an answer that works with pdfminer.six running python 3.6. It uses the pdfminer.high_level module that abstracts away a lot of the underlying detail if you just want to get out the raw text from a simple PDF file.
import pdfminer
import io
def extract_raw_text(pdf_filename):
output = io.StringIO()
laparams = pdfminer.layout.LAParams() # Using the defaults seems to work fine
with open(pdf_filename, "rb") as pdffile:
pdfminer.high_level.extract_text_to_fp(pdffile, output, laparams=laparams)
return output.getvalue()

The following modification of the non-process_pdf answers pulls the text straight from a URL string name and works with version 20140328 and Python 2.7:
from urllib2 import urlopen
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(url):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
scrape = urlopen(url).read()
fp = StringIO(scrape)
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
fp.close()
device.close()
textstr = retstr.getvalue()
retstr.close()
return textstr

If you are working with scraped data via urllib2, try this (which is developed and explained here):
def pdf_to_text(scraped_pdf_data):
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
import StringIO
fp = StringIO.StringIO()
fp.write(scraped_pdf_data)
fp.seek(0)
outfp = StringIO.StringIO()
rsrcmgr = PDFResourceManager()
device = TextConverter(rsrcmgr, outfp, laparams=LAParams())
process_pdf(rsrcmgr, device, fp)
device.close()
t = outfp.getvalue()
outfp.close()
fp.close()
return t
Like the other answers, the code here adapts the pdf2txt utility that PDFMiner itself provides. You can thus also convert to html or xml -- just sub HTMLConverter or XMLConverter for TextConverter everywhere above.

The following code works for me with latest version of PDFMiner it takes path of pdf and return text in .txt format.
P.S: This is a modification of above answer.
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path, outtype='txt'):
outfile = path[:-3] + outtype
rsrcmgr = PDFResourceManager()
codec = 'utf-8'
laparams = LAParams()
if outfile:
outfp = file(outfile, 'w')
else:
outfp = sys.stdout
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
fp = file(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
fp.close()
device.close()
outfp.close()
return

just in case anyone still needs this,
got it working with requests and python 3.4.
thanks to #bahmait for his answer above :)
import requests
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
def pdf_to_text(url=None):
text = None
pdf = requests.get(url)
if pdf.ok:
fp = StringIO(str(pdf.content, 'utf-8'))
outfp = StringIO()
rsrcmgr = PDFResourceManager()
device = TextConverter(rsrcmgr, outfp, laparams=LAParams())
process_pdf(rsrcmgr, device, fp)
device.close()
text = outfp.getvalue()
outfp.close()
fp.close()
return text
if __name__ == "__main__":
hello_world_text = pdf_to_text("https://bytebucket.org/hsoft/pdfminer3k/raw/28edfc91caed830674ca0b928f42571f7dee6091/samples/simple1.pdf")
no_pdf = pdf_to_text('http://www.google.com/404')
print(hello_world_text)
print(no_pdf)

Here is a cleaned up version I finally produced that worked for me. The following just simply returns the string in a PDF, given its filename. I hope this saves someone time.
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
def convert_pdf(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = file(path, 'rb')
process_pdf(rsrcmgr, device, fp)
fp.close()
device.close()
str = retstr.getvalue()
retstr.close()
return str
can anybody say me : is there any specific place where the pdf file is to be placed??

Only if someone still needs it: How to print the HTML from a PDF using PDFMiner:
import sys
import getopt
from Core.Interfaces.IReader import IReader
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from cStringIO import StringIO
class PdfReader(object):
def __init__(self):
pass
def readText(self,path, outtype='text', opts={}):
outfile = path[:-3] + outtype
outdir = '/'.join(path.split('/')[:-1])
# debug option
debug = 0
# input option
password = ''
pagenos = set()
maxpages = 0
# output option
# ?outfile = None
# ?outtype = None
outdir = None
#layoutmode = 'normal'
codec = 'utf-8'
pageno = 1
scale = 1
showpageno = True
laparams = LAParams()
for (k, v) in opts:
if k == '-d': debug += 1
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-m': maxpages = int(v)
elif k == '-P': password = v
elif k == '-o': outfile = v
elif k == '-n': laparams = None
elif k == '-A': laparams.all_texts = True
elif k == '-V': laparams.detect_vertical = True
elif k == '-M': laparams.char_margin = float(v)
elif k == '-L': laparams.line_margin = float(v)
elif k == '-W': laparams.word_margin = float(v)
elif k == '-F': laparams.boxes_flow = float(v)
elif k == '-Y': layoutmode = v
elif k == '-O': outdir = v
elif k == '-t': outtype = v
elif k == '-c': codec = v
elif k == '-s': scale = float(v)
print laparams
#
#PDFDocument.debug = debug
#PDFParser.debug = debug
CMapDB.debug = debug
PDFResourceManager.debug = debug
PDFPageInterpreter.debug = debug
PDFDevice.debug = debug
#
rsrcmgr = PDFResourceManager()
#outtype = 'text'
outfp = StringIO()
device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
fp = file(path, 'rb')
process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
check_extractable=True)
fp.close()
device.close()
print outfp.getvalue()
outfp.close()
return
reader = PdfReader()
opt = map(None,['-W','-L','-t'],[0.5,0.4,'html'])
reader.readText("/test_data/test.pdf","html",opt)

This one worked for me in python 3.
It requires the PDFMiner.six package
pip install pdfminer.six
The code is as follows (same code as everyone, with minor fixes):
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from six import StringIO
def convert_pdf_to_txt(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = open(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
fp.close()
device.close()
str = retstr.getvalue()
retstr.close()
return str.replace("\\n","\n")

Full disclosure, I am one of the maintainers of pdfminer.six. It is a community-maintained version of pdfminer for python 3.
Nowadays, it has multiple api's to extract text from a PDF, depending on your needs. Behind the scenes, all of these api's use the same logic for parsing and analyzing the layout.
(All the examples assume your PDF file is called example.pdf)
Commandline
If you want to extract text just once you can use the commandline tool pdf2txt.py:
$ pdf2txt.py example.pdf
High-level api
If you want to extract text (properties) with Python, you can use the high-level api. This approach is the go-to solution if you want to programmatically extract information from a PDF.
from pdfminer.high_level import extract_text
# Extract text from a pdf.
text = extract_text('example.pdf')
# Extract iterable of LTPage objects.
pages = extract_pages('example.pdf')
Composable api
There is also a composable api that gives a lot of flexibility in handling the resulting objects. For example, it allows you to create your own layout algorithm. This method is suggested in the other answers, but I would only recommend this when you need to customize some component.
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
output_string = StringIO()
with open('example.pdf', 'rb') as in_file:
parser = PDFParser(in_file)
doc = PDFDocument(parser)
rsrcmgr = PDFResourceManager()
device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(doc):
interpreter.process_page(page)
print(output_string.getvalue())

The following code snippets is able to extract plain text from pdf documents using the latest version of pdfminer(as of 23-Mar-2016). Hope this helps.
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = file(path, 'rb')
parser = PDFParser(fp)
doc = PDFDocument(parser)
parser.set_document(doc)
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
text = retstr.getvalue()
fp.close()
device.close()
retstr.close()
print text
return text
convert_pdf_to_txt(<path_of_the_pdf_file>)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Read pdf page by page - python

Related

Converting data from PDF to XML with PDFminer?

Working with singe pages with PDFMiner

How can I adjust the 'word_margin' for reading PDFs with pdfminer in python?

pdfminer/poppler - how to set encoding

How do I use pdfminer as a library

Categories

Resources