pdfminer/poppler - how to set encoding - python

I have a file, i.e. http://www.agfl.cs.ru.nl/papers/manual28.pdf
(it's english)
Pdfminer and poppler show the same result on most parsed pages, like:
¾º¿  ÒÙ Öݸ ¾¼¼ Ⱥ ¾º ÂÙÒ ¸ ¾¼¼ ź Ë ÙØØ Ö¸ Ǻ Ë
It seems they can't read the fonts' custom encodings. How can I specify the encoding?
Here's code samples:
# poppler
# Open the PDF through a file:// URI and print the text of every page.
input_filename = '/tmp/manual28.pdf'
absolute_path = os.path.abspath(input_filename)
document_uri = 'file://%s' % urllib.pathname2url(absolute_path)
document = poppler.document_new_from_file(document_uri, None)
n_pages = document.get_n_pages()
for i in range(n_pages):
    page = document.get_page(i)
    print(page.get_text())
    # chardet.detect(page.get_text()) # utf8 all time
# pdfminer
def pdf_to_html(in_fp, out_fp, codec='utf-8', maxpages=0, pagenos=None, html=True):
    """Convert a PDF to HTML, or to plain text when ``html`` is false.

    ``in_fp`` and ``out_fp`` may each be a path string or an already open
    file object; path strings are opened here in binary mode.  Both streams
    are closed before the function returns.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()

    # Accept either a path or a file object for input and output.
    if isinstance(in_fp, basestring):
        in_fp = open(in_fp, 'rb')
    if isinstance(out_fp, basestring):
        out_fp = open(out_fp, 'wb')

    converter_class = HTMLConverter if html else TextConverter
    device = converter_class(rsrcmgr, out_fp, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for page in PDFPage.get_pages(in_fp, pagenos, maxpages=maxpages):
        interpreter.process_page(page)

    in_fp.close()
    device.close()
    out_fp.close()

Related

Converting data from PDF to XML with PDFminer?

I used the code below to convert PDF data to XML data and write the conversion to a XML file. It is quite well known (it uses the PDFminer module) and works very well for PDF to text and HTML conversions but I have a problem when I do PDF to XML conversion. I'm quite a novice and some help from you would be super nice :)
Here is the code:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter, XMLConverter, HTMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO
def convert_pdf(path, format='text', codec='utf-8', password=''):
    """Convert the PDF at ``path`` to text, HTML or XML and return it as a string.

    Args:
        path: Path of the PDF file to read.
        format: Output format, one of ``'text'``, ``'html'`` or ``'xml'``.
        codec: Encoding the converter writes with; the result is decoded
            with the same encoding.
        password: Password forwarded to ``PDFPage.get_pages``.

    Returns:
        The converted document as a decoded string.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    # Validate first so a bad format never opens the file or builds a device.
    if format not in ('text', 'html', 'xml'):
        raise ValueError('provide format, either text, html or xml!')

    converter_classes = {
        'text': TextConverter,
        'html': HTMLConverter,
        'xml': XMLConverter,
    }
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    laparams = LAParams()
    device = converter_classes[format](rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password=password,
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
    finally:
        # The HTML and XML converters write their closing markup when the
        # device is closed, so close it *before* reading the buffer;
        # otherwise the returned document is truncated.
        device.close()

    # Decode with the codec the converter wrote with, not the default one.
    text = retstr.getvalue().decode(codec)
    retstr.close()
    return text
path_pdf = ...
path_xml = ...
# NOTE(review): convert_pdf is called with its default format here; pass
# format='xml' if XML output is what should land in path_xml.
text_output = convert_pdf(path_pdf)
# Opening in "w" mode already truncates the file, and the context manager
# guarantees the handle is flushed and closed.
with open(path_xml, "w", encoding="utf-8") as xml_file:
    xml_file.write(text_output)
And here's the error I get:
Thank you in advance!
If you are using python2.7, the following works
from io import BytesIO
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
def get_xml_py2(file_path):
    """Yield the XML rendering of each page of the PDF at ``file_path``.

    The file is read fully into memory first, so the on-disk file is not
    kept open while the generator is being consumed.

    Args:
        file_path: Path of the PDF file to read.

    Yields:
        One byte string per page, produced by a fresh ``XMLConverter``.
    """
    # Imported locally so the snippet is self-contained: the imports listed
    # with it do not include XMLConverter.
    from pdfminer.converter import XMLConverter

    with open(file_path, 'rb') as x:
        in_fp = BytesIO(x.read())
    laparams = LAParams(all_texts=True)
    rsrcmgr = PDFResourceManager()
    try:
        for page in PDFPage.get_pages(in_fp):
            outfp = BytesIO()
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            interpreter.process_page(page)
            # Close the device before reading the buffer: the converter
            # writes its closing markup on close.
            device.close()
            page_xml = outfp.getvalue()
            outfp.close()
            yield page_xml
    finally:
        # Runs even when the caller stops iterating early.
        in_fp.close()

Separate pdf to pages using pdfminer

I am trying to extract a pdf page by page and store the results in a dictionary as follows:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import re
def convert_pdf_to_txt(path):
    """Extract text from the PDF at ``path`` into a dict keyed by page number.

    Keys start at 1.  One output buffer is shared by all pages, and each
    stored value is whatever that buffer holds after the page has been
    processed, with runs of spaces collapsed to a single space.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                              caching=True, check_extractable=True)
    ps = dict()
    for number, page in enumerate(pages, 1):
        interpreter.process_page(page)
        ps[number] = re.sub(' +', ' ', retstr.getvalue())
    return ps


print(convert_pdf_to_txt('Aak.pdf')[3])
But whichever page I access I get all the previous pages. Please do tell me how I can fix this?
This should work.
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import os
def set_interpreter():
    """Build a fresh text-extraction pipeline around a new in-memory buffer.

    Returns:
        A dict with the keys ``'retstr'`` (the output buffer), ``'device'``
        (the ``TextConverter`` writing into it) and ``'interpreter'`` (the
        ``PDFPageInterpreter`` driving that device).
    """
    manager = PDFResourceManager()
    buffer_ = StringIO()
    converter = TextConverter(manager, buffer_, codec='utf-8', laparams=LAParams())
    page_interpreter = PDFPageInterpreter(manager, converter)
    return {
        'retstr': buffer_,
        'device': converter,
        'interpreter': page_interpreter,
    }
def convert_pdf_to_txt(path):
    """Write each page of the PDF at ``path`` to its own text file.

    Page ``n`` (zero-based) is written to ``pagetext_<n>.txt`` in the current
    directory.  A fresh pipeline from ``set_interpreter()`` is used for every
    page so that each buffer holds the text of that page only.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order.
    """
    page_texts = []
    with open(path, 'rb') as fp:
        pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                  caching=True, check_extractable=True)
        for page_counter, page in enumerate(pages):
            si = set_interpreter()
            retstr = si['retstr']
            device = si['device']
            si['interpreter'].process_page(page)
            page_text = retstr.getvalue()
            # Release the per-page device and buffer right away instead of
            # leaking one pair per page.
            device.close()
            retstr.close()

            with open('pagetext_%d.txt' % page_counter, 'w+') as fpp:
                fpp.write(page_text)
            page_texts.append(page_text)
    # The original ended with ``return text``, a name that was never
    # assigned; return the collected text instead.
    return ''.join(page_texts)


print(convert_pdf_to_txt(os.path.dirname(os.path.realpath('filename.pdf')) + "/filename.pdf"))
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import resolve1
from io import StringIO
import numpy as np
def read_pdf(file_path):
    """
    Read a PDF file and return its text page by page.

    The pages are walked in a single pass; every page gets its own buffer
    and converter so its text is isolated from the other pages.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A dictionary mapping 1-based page numbers to the text of that page,
        inserted in ascending page order.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    pages_dict = {}
    with open(file_path, 'rb') as fp:
        # One pass over the document instead of one get_pages() call per
        # page, which walked the page list again for every page.
        pages = PDFPage.get_pages(fp, maxpages=0, password="", caching=True,
                                  check_extractable=True)
        for page_number, page in enumerate(pages, start=1):
            retstr = StringIO()
            device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            interpreter.process_page(page)
            pages_dict[page_number] = retstr.getvalue()
            device.close()
            retstr.close()
    return pages_dict


d = read_pdf("your_document.pdf")
for k, v in d.items():
    print(f"\n----------------------------------------------PAGE {k}----------------------------------------------\n")
    print(v)

Working with single pages with PDFMiner

I have some PDF documents from which I can not extract text with PyPDF, only with PDFMiner. The following code works fine to extract all text from the PDFs, it goes through the whole document, then returns all the text.
Is there a way to only work with certain pages of the PDF?
The PDFs I have are all 2000-3000 pages long and I only need to work with every second page.
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
    """Return the text extracted from the PDF at ``path`` as one string.

    Every page yielded by ``PDFPage.get_pages`` is rendered into the same
    in-memory buffer, whose content is returned after the file, the device
    and the buffer have been closed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    all_pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                  caching=True, check_extractable=True)
    for page in all_pages:
        interpreter.process_page(page)

    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()
    return text
Couldn't you use enumerate to get the page number and the page content while iterating through all the pages? If you only want every second page, use modulus. If you want specific pages only, use ranges.
Example:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
    """Return the text of the pages whose zero-based index is even.

    The index of every page is printed together with an even/odd notice;
    only even-indexed pages are passed to the interpreter.  Indexes 5 to 10
    additionally print a range notice, as an example of selecting by range.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    all_pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                  caching=True, check_extractable=True)
    for pagenumber, page in enumerate(all_pages):
        print(pagenumber)
        is_odd = pagenumber % 2 != 0
        if is_odd:
            print("odd page number")
        else:
            print("even page number")
            interpreter.process_page(page)
        if 5 <= pagenumber <= 10:
            print("pages 5 to 10")

    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()
    return text

Read pdf page by page

I searched for my question and did not get my answer in the two available questions
Extract text per page with Python pdfMiner?
PDFMiner - Iterating through pages and converting them to text
Basically I want to iterate over each page because I want to select only that page which has a certain text.
I have used pyPdf. It works for roughly 90% of the PDFs, but sometimes it does not extract the information from a page.
I have used the below code:
import re

import pyPdf

# Collect the text of every page that mentions "to be held at/on".
extract = ""
pdf_file = open('filename.pdf', "rb")
try:
    pdf = pyPdf.PdfFileReader(pdf_file)
    num_of_pages = pdf.getNumPages()
    for p in range(num_of_pages):
        # Read page ``p``: a fixed index here would examine the same page on
        # every iteration and never look at the others.
        ex = pdf.getPage(p).extractText()
        if re.search(r"to be held (at|on)", ex.lower()):
            print('yes')
            print(ex + " \n")
            extract = extract + ex + "\n"
finally:
    pdf_file.close()
The above code works but sometimes some pages don't get extracted.
I also tried using pdfminer, but i could not find how to iterate the pdf in it page by page. pdfminer returns the entire text of the pdf.
I used the below code:
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at ``path`` and return it as a single string.

    All pages are processed through one interpreter writing to one buffer;
    the buffer is read once after the loop, then the file, the device and
    the buffer are closed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    page_options = dict(maxpages=0, password="", caching=True, check_extractable=True)
    for page in PDFPage.get_pages(fp, set(), **page_options):
        interpreter.process_page(page)

    text = retstr.getvalue()
    for resource in (fp, device, retstr):
        resource.close()
    return text
In the above code the text from the pdf comes from the for loop
# Excerpt of the page loop: each page is processed in turn, then the
# accumulated output is read back from retstr.
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
    interpreter.process_page(page)
text = retstr.getvalue()
How can I iterate over one page at a time in this loop?
The documentation on pdfminer is not understandable. Also there are many versions of the same.
So are there any other packages available for my question or can pdfminer be used for it?
Because retstr will retain each page, you might consider altering your code by calling retstr.truncate(0) which clears the string each time, otherwise you're printing the entirety of what's already been read each time:
import re

import pyPdf
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO

# Collect the text of every page that mentions "to be held at/on".
# pyPdf is used only for the page count; pdfminer does the extraction.
path = "filename.pdf"
extract = ""
with open(path, "rb") as count_fp, open(path, 'rb') as fp:
    pdf = pyPdf.PdfFileReader(count_fp)
    num_of_pages = pdf.getNumPages()
    for i in range(num_of_pages):
        pagenos = set([i])
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        text = ""
        for page in PDFPage.get_pages(fp, pagenos, maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
            text = retstr.getvalue()
            # Clear the buffer so later reads never include earlier output.
            retstr.truncate(0)
        # Release the per-page device and buffer instead of leaking them.
        device.close()
        retstr.close()
        text = text.decode("ascii", "replace")
        if re.search(r"to be held (at|on)", text.lower()):
            print(text)
            extract = extract + text + "\n"
I know it is not good to answer your own question but i think i may have figured out an answer for this question.
I think it is not the best way to do it, but still it helps me.
I used a combination of pypdf and pdfminer
The code is as below:
import re

import pyPdf
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO

# Collect the text of every page that mentions "to be held at/on".
# pyPdf supplies the page count; pdfminer extracts one page per iteration
# into a fresh buffer, so each ``text`` holds a single page.
path = "filename.pdf"
extract = ""
with open(path, "rb") as count_fp, open(path, 'rb') as fp:
    pdf = pyPdf.PdfFileReader(count_fp)
    num_of_pages = pdf.getNumPages()
    for i in range(num_of_pages):
        pagenos = set([i])
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        text = ""
        for page in PDFPage.get_pages(fp, pagenos, maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
            text = retstr.getvalue()
        # Release the per-page device and buffer instead of leaking them.
        device.close()
        retstr.close()
        text = text.decode("ascii", "replace")
        if re.search(r"to be held (at|on)", text.lower()):
            print(text)
            extract = extract + text + "\n"
There may be a better way to do it, but currently i found out this to be pretty good.
You can refer to the following link to extract text page by page from a PDF.
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
# Walk the layout of each page and print the text of its text containers.
for page_layout in extract_pages("test.pdf"):
    for element in page_layout:
        if not isinstance(element, LTTextContainer):
            continue
        print(element.get_text())
PDFMiner Page by Page text Extraction

How can I adjust the 'word_margin' for reading PDFs with pdfminer in python?

I've tried to set 'word_margin' from Python using the following code, but it throws TypeError: get_pages() got an unexpected keyword argument 'word_margin'. PDFMiner reads the document fine if I remove word_margin=word_margin from the arguments.
Code:
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at ``path``, passing ``word_margin`` to ``get_pages``.

    ``word_margin`` is handed to ``PDFPage.get_pages`` as a keyword argument
    together with the other page options.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    page_options = dict(word_margin=1, maxpages=0, password="",
                        caching=True, check_extractable=True)
    for page in PDFPage.get_pages(fp, set(), **page_options):
        interpreter.process_page(page)

    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()
    return text
word_margin is a parameter of LAParams class. If I understand correctly, the code should look like this:
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import TextConverter
from StringIO import StringIO
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at ``path`` using a layout ``word_margin`` of 1.

    ``word_margin`` is supplied to ``LAParams``, and those layout parameters
    are given to the ``TextConverter``.  The text of all pages is returned
    as one string after the file, the device and the buffer are closed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    layout_params = LAParams(word_margin=1)
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=layout_params)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    all_pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                  caching=True, check_extractable=True)
    for page in all_pages:
        interpreter.process_page(page)

    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()
    return text

Categories

Resources