I am getting an unexpected error when using this. The first section is from a script that I found online, and I am trying to use it to pull a particular section identified in the PDF's outline. Everything works fine until the call to output.write(outputfile1), which raises:
PdfReadError: multiple definitions in dictionary.
Has anybody else run into this? Please forgive all the unnecessary prints at the end. :)
import pyPdf
import glob

class Darrell(pyPdf.PdfFileReader):
    def getDestinationPageNumbers(self):
        def _setup_outline_page_ids(outline, _result=None):
            if _result is None:
                _result = {}
            for obj in outline:
                if isinstance(obj, pyPdf.pdf.Destination):
                    _result[(id(obj), obj.title)] = obj.page.idnum
                elif isinstance(obj, list):
                    _setup_outline_page_ids(obj, _result)
            return _result

        def _setup_page_id_to_num(pages=None, _result=None, _num_pages=None):
            if _result is None:
                _result = {}
            if pages is None:
                _num_pages = []
                pages = self.trailer["/Root"].getObject()["/Pages"].getObject()
            t = pages["/Type"]
            if t == "/Pages":
                for page in pages["/Kids"]:
                    _result[page.idnum] = len(_num_pages)
                    _setup_page_id_to_num(page.getObject(), _result, _num_pages)
            elif t == "/Page":
                _num_pages.append(1)
            return _result

        outline_page_ids = _setup_outline_page_ids(self.getOutlines())
        page_id_to_page_numbers = _setup_page_id_to_num()
        result = {}
        for (_, title), page_idnum in outline_page_ids.iteritems():
            result[title] = page_id_to_page_numbers.get(page_idnum, '???')
        return result
for fileName in glob.glob("*.pdf"):
    output = pyPdf.PdfFileWriter()
    print fileName
    pdf = Darrell(open(fileName, 'rb'))
    template = '%-5s %s'
    print template % ('page', 'title')
    for p,t in sorted([(v,k) for k,v in pdf.getDestinationPageNumbers().iteritems()]):
        print template % (p+1,t)
    for p,t in sorted([(v,k) for k,v in pdf.getDestinationPageNumbers().iteritems()]):
        if t == "CATEGORY 1":
            startpg = p+1
            print p+1,'is the first page of Category 1.'
        if t == "CATEGORY 2":
            endpg = p+1
            print p+1,'is the last page of Category 1.'
    print startpg, endpg
    pagenums = range(startpg,endpg)
    print pagenums
    for i in pagenums:
        output.addPage(pdf.getPage(i))
    fileName2 = "%sCategory1_data.pdf" % (str(fileName[:-13]))
    print "%s has %s pages." % (fileName2,output.getNumPages())
    outputfile1 = file(r"%s" % (fileName2), 'wb')
    output.write(outputfile1)
    outputfile1.close()
I know it might be too late for you, but for anyone else who stumbles here looking for the answer:
I had the same problem today. Setting:
export_reader = PdfFileReader(filename, strict=False)
fixed it. If you are just merging, then use:
merger = PdfFileMerger(strict=False)
This way, you will get only a warning, rather than an exception.
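For example, a minimal sketch with PyPDF2 (the maintained fork of pyPdf; the file names are placeholders):

from PyPDF2 import PdfFileReader, PdfFileWriter

# strict=False downgrades structural complaints such as
# "multiple definitions in dictionary" from exceptions to warnings
reader = PdfFileReader(open('input.pdf', 'rb'), strict=False)

writer = PdfFileWriter()
for i in range(reader.getNumPages()):
    writer.addPage(reader.getPage(i))

with open('output.pdf', 'wb') as out:
    writer.write(out)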
Related
I have a bot(query, key) function to post data, a dicts(query, answer) function to wrap the returned result, and a query_pipe(query_list) function to process a list of query requests. But when I put that in a multiprocessing.Process, I found that bot(query, key) returns nothing. Here's my code.
import requests
from multiprocessing import Process

def bot(query, key):
    # url is defined elsewhere in the original script
    data = {
        'key' : key,
        'info' : query,
        'userid' : 'wechat-robot',
    }
    try:
        apiUrl = url
        page = requests.post(apiUrl, data=data)
        if page.json()['code'] == '100000':
            answer = page.json()['text']
            return dicts(query, answer)
        else:
            return dicts(query, 'failed')
    except Exception as e:
        return '500 Error'

def dicts(query, answer):
    return {'query': query, 'answer': answer}

def query_pipe(query_list):
    keys_pool = []
    with open('keys.txt', 'r') as f:
        lines = f.readlines()
        for line in lines:
            keys_pool.append(line.strip('\n'))
    idx = 0
    print(bot(query_list[0], keys_pool[0]))

# query_data is defined elsewhere in the original script
p = Process(target=query_pipe, args=(query_data,))
p.start()
p.join()
But when I run query_pipe(query_list) directly, without multiprocessing.Process, it prints the correct output. I feel confused, so anyone who could give me a hint would be highly appreciated.
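One thing worth knowing here: a target function's return value is simply discarded by multiprocessing, so results have to be sent back explicitly, e.g. through a multiprocessing.Queue. A minimal sketch reusing the names from the question (query_pipe_q is a hypothetical variant of query_pipe):

from multiprocessing import Process, Queue

def query_pipe_q(query_list, out_q):
    # same work as query_pipe, but the result is put on a queue
    # instead of being printed/returned inside the child process
    keys_pool = []
    with open('keys.txt', 'r') as f:
        for line in f:
            keys_pool.append(line.strip('\n'))
    out_q.put(bot(query_list[0], keys_pool[0]))

q = Queue()
p = Process(target=query_pipe_q, args=(query_data, q))
p.start()
print(q.get())  # blocks until the child puts its result
p.join()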
I have been working on a script to ingest a file (accounts.txt) which contains email addresses, each of which is then checked against an API to see if it appears in a data dump. The script appears to work, however there is a bug whereby once it finds a positive hit, it disregards any other match...
For example, if my "accounts.txt" file contains the following entries:
a@a.com
b@b.com
Even though both of those should return results, when the script is run the match on a@a.com is found, but b@b.com does not return anything.
I cannot seem to figure out why this is happening; ideally I want all of the hits outputted.
FYI, the script is querying 'haveibeenpwned' which is a site that locates email addresses found in credential dumps.
Any help finding my bug would be greatly appreciated. Below is my current script.
#!/usr/bin/env python
import argparse
import json
import requests
import time

breaches_by_date = {}
breaches_by_account = {}
breaches_by_name = {}

class Breach(object):
    def __init__(self, e, n, d):
        self.email = e
        self.name = n
        self.date = d

    def __repr__(self):
        return "%s: %s breached on %s" % (self.email, self.name, self.date)

def accCheck(acc):
    global breaches_by_date, breaches_by_account, breaches_by_name
    r = requests.get('https://haveibeenpwned.com/api/v2/breachedaccount/%s?truncateResponse=false' % acc)
    try:
        data = json.loads(r.text)
    except ValueError:
        print("No breach information for %s" % acc)
        return
    for i in data:
        name, date = (i['Name'], i['BreachDate'])
        breach = Breach(acc, name, date)
        try: breaches_by_account[acc].append(breach)
        except: breaches_by_account[acc] = [breach]
        try: breaches_by_name[name].append(breach)
        except: breaches_by_name[name] = [breach]
        try: breaches_by_date[date].append(breach)
        except: breaches_by_date[date] = [breach]

def readFromFile(fname="accounts.txt"):
    accounts = []
    with open(fname, "r+") as f:
        accounts = [l.strip() for l in f.readlines()]
    return accounts

if __name__ == '__main__':
    accounts = readFromFile()
    for email_addr in accounts:
        accCheck(email_addr)
    print
    print("Breaches by date")
    for date, breaches in breaches_by_date.items():
        for breach in breaches:
            print(breach)
    print
    print("Breaches by account")
    for acc, breaches in breaches_by_account.items():
        print(acc)
        for breach in breaches:
            print("%s breached on %s" % (breach.name, breach.date))
    print
    print("Breaches by name")
    for name, breaches in breaches_by_name.items():
        print("%s breached for the following accounts:" % name)
        for breach in breaches:
            print("%s on %s" % (breach.email, breach.date))
    print
I am not 100% sure where your problem comes from, but I would opt for code like:
emails_to_check = open("/path/to/yourfile").read().split("\n")

for email in emails_to_check:
    if is_email_blacklisted(email):
        do_something()
I want to diff HTML files by structure and not by content. For example, b and a are identical under this diff because their structures are equal.
Does anyone know a tool (preferably in Python) or an implementation that does this?
You need to parse the HTML/XML into a DOM tree and then compare those trees. The preferred library for parsing in Python here is lxml. For the comparison I am not sure a library exists, but below is source code to use as a guideline.
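For instance, a minimal sketch of getting two documents into element trees with lxml (the file names are placeholders; the xml_compare function below works on any ElementTree-style elements, which lxml provides):

import sys
from lxml import html

# lxml's HTML parser is forgiving about real-world markup
tree_a = html.parse('a.html').getroot()
tree_b = html.parse('b.html').getroot()

# xml_compare is defined in the code below
print xml_compare(tree_a, tree_b, reporter=lambda msg: sys.stderr.write(msg + '\n'))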
Here is one XML comparison function from Ian Bicking (original source, under the Python Software Foundation License: https://bitbucket.org/ianb/formencode/src/tip/formencode/doctest_xml_compare.py?fileviewer=file-view-default#doctest_xml_compare.py-70):
try:
    import doctest
    doctest.OutputChecker
except AttributeError:  # Python < 2.4
    import util.doctest24 as doctest
try:
    import xml.etree.ElementTree as ET
except ImportError:
    import elementtree.ElementTree as ET
from xml.parsers.expat import ExpatError as XMLParseError

RealOutputChecker = doctest.OutputChecker

def debug(*msg):
    import sys
    print >> sys.stderr, ' '.join(map(str, msg))

class HTMLOutputChecker(RealOutputChecker):
    def check_output(self, want, got, optionflags):
        normal = RealOutputChecker.check_output(self, want, got, optionflags)
        if normal or not got:
            return normal
        try:
            want_xml = make_xml(want)
        except XMLParseError:
            pass
        else:
            try:
                got_xml = make_xml(got)
            except XMLParseError:
                pass
            else:
                if xml_compare(want_xml, got_xml):
                    return True
        return False

    def output_difference(self, example, got, optionflags):
        actual = RealOutputChecker.output_difference(
            self, example, got, optionflags)
        want_xml = got_xml = None
        try:
            want_xml = make_xml(example.want)
            want_norm = make_string(want_xml)
        except XMLParseError, e:
            if example.want.startswith('<'):
                want_norm = '(bad XML: %s)' % e
                # '<xml>%s</xml>' % example.want
            else:
                return actual
        try:
            got_xml = make_xml(got)
            got_norm = make_string(got_xml)
        except XMLParseError, e:
            if example.want.startswith('<'):
                got_norm = '(bad XML: %s)' % e
            else:
                return actual
        s = '%s\nXML Wanted: %s\nXML Got : %s\n' % (
            actual, want_norm, got_norm)
        if got_xml and want_xml:
            result = []
            xml_compare(want_xml, got_xml, result.append)
            s += 'Difference report:\n%s\n' % '\n'.join(result)
        return s

def xml_compare(x1, x2, reporter=None):
    if x1.tag != x2.tag:
        if reporter:
            reporter('Tags do not match: %s and %s' % (x1.tag, x2.tag))
        return False
    for name, value in x1.attrib.items():
        if x2.attrib.get(name) != value:
            if reporter:
                reporter('Attributes do not match: %s=%r, %s=%r'
                         % (name, value, name, x2.attrib.get(name)))
            return False
    for name in x2.attrib.keys():
        if name not in x1.attrib:
            if reporter:
                reporter('x2 has an attribute x1 is missing: %s'
                         % name)
            return False
    if not text_compare(x1.text, x2.text):
        if reporter:
            reporter('text: %r != %r' % (x1.text, x2.text))
        return False
    if not text_compare(x1.tail, x2.tail):
        if reporter:
            reporter('tail: %r != %r' % (x1.tail, x2.tail))
        return False
    cl1 = x1.getchildren()
    cl2 = x2.getchildren()
    if len(cl1) != len(cl2):
        if reporter:
            reporter('children length differs, %i != %i'
                     % (len(cl1), len(cl2)))
        return False
    i = 0
    for c1, c2 in zip(cl1, cl2):
        i += 1
        if not xml_compare(c1, c2, reporter=reporter):
            if reporter:
                reporter('children %i do not match: %s'
                         % (i, c1.tag))
            return False
    return True

def text_compare(t1, t2):
    if not t1 and not t2:
        return True
    if t1 == '*' or t2 == '*':
        return True
    return (t1 or '').strip() == (t2 or '').strip()

def make_xml(s):
    return ET.XML('<xml>%s</xml>' % s)

def make_string(xml):
    if isinstance(xml, (str, unicode)):
        xml = make_xml(xml)
    s = ET.tostring(xml)
    if s == '<xml />':
        return ''
    assert s.startswith('<xml>') and s.endswith('</xml>'), repr(s)
    return s[5:-6]

def install():
    doctest.OutputChecker = HTMLOutputChecker
Sidenote: <\head> is not a valid HTML tag and will be interpreted as text. HTML close tags look like this: </head>
As other answerers may tell you, using a library that actually knows what a DOM is is probably the most reliable option if you're comparing well-structured, complete HTML documents or fragments. A simpler solution than using a DOM is to use regex to match HTML tags.
It's simple (can be done in two lines).
It's reliable in everything I've tested so far, but can give unexpected results when, for example, HTML tags appear in <pre> or <textarea> elements.
Will work with partial HTML fragments like </head>, while DOM/parsing libraries might complain that a <head> tag is missing.
Demo
Following is some code that normalizes HTML input (the HTML of this page, actually) by finding all the tags and printing them in succession.
import re, urllib
f = urllib.urlopen('http://stackoverflow.com/questions/33204018/html-structure-diff-in-python')
html = f.read()
for m in re.finditer(r'''</?\w+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>''', html):
    print m.group(0)
You can take the output from the above and use whatever command-line diff tool you prefer to compare them.
Or maybe you want to compare them using Python. Instead of printing out all the lines, you might be interested in concatenating them into a single string:
tags_as_string = ''
for m in re.finditer(r'''</?\w+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>''', html):
    tags_as_string += m.group(0) + '\n'  # the newline makes diff output look nicer
or list:
tags_as_list = []
for m in re.finditer(r'''</?(\w+)((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>''', html):
    tags_as_list.append(m.group(0))
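Once both documents have been reduced to tag lists this way, Python's difflib can produce the structural diff directly (tags_a and tags_b are hypothetical names for the two lists):

import difflib

# only structural changes (tags added, removed, or altered) appear here,
# since the text between tags was never captured
for line in difflib.unified_diff(tags_a, tags_b, lineterm=''):
    print line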
Further steps to consider (can be done inside the for loop):
Perhaps you're only interested in the tag name and not the attributes. The tag name can be accessed with m.group(1) (the first regex group in parentheses) in the for-loop.
Tags that mean the same thing still might be different due to whitespace. You might want to normalize out the whitespace within each tag using a similar technique (see the sketch below).
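For example, a minimal sketch of that normalization, applied to each match inside the loop (one reasonable approach, not the author's code):

import re

def normalize_tag(tag):
    # collapse runs of whitespace inside a tag to single spaces, so that
    # '<div   class="x">' and '<div class="x">' compare as equal
    return re.sub(r'\s+', ' ', tag).strip()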
Credit: The actual regex is from http://haacked.com/archive/2004/10/25/usingregularexpressionstomatchhtml.aspx/
I tried to use the calais.py library, and I ran the following code:
API_KEY = 'token'
calais = Calais(api_key=API_KEY, submitter="my app")
print calais.analyze_url('https://www.python.org/download/releases/2.5.1/')
I get the following error:
ValueError: Invalid request format - the request has missing or invalid parameters
calais.py is here:
"""
python-calais v.1.4 -- Python interface to the OpenCalais API
Author: Jordan Dimov (jdimov#mlke.net)
Last-Update: 01/12/2009
"""
import httplib, urllib, re
import simplejson as json
from StringIO import StringIO
PARAMS_XML = """
<c:params xmlns:c="http://s.opencalais.com/1/pred/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> <c:processingDirectives %s> </c:processingDirectives> <c:userDirectives %s> </c:userDirectives> <c:externalMetadata %s> </c:externalMetadata> </c:params>
"""
STRIP_RE = re.compile('<script.*?</script>|<noscript.*?</noscript>|<style.*?</style>', re.IGNORECASE)
__version__ = "1.4"
class AppURLopener(urllib.FancyURLopener):
version = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.0.5) Gecko/2008121623 Ubuntu/8.10 (intrepid)Firefox/3.0.5" # Lie shamelessly to Wikipedia.
urllib._urlopener = AppURLopener()
class Calais():
"""
Python class that knows how to talk to the OpenCalais API. Use the analyze() and analyze_url() methods, which return CalaisResponse objects.
"""
api_key = None
processing_directives = {"contentType":"TEXT/RAW", "outputFormat":"application/json", "reltagBaseURL":None, "calculateRelevanceScore":"true", "enableMetadataType":None, "discardMetadata":None, "omitOutputtingOriginalText":"true"}
user_directives = {"allowDistribution":"false", "allowSearch":"false", "externalID":None}
external_metadata = {}
def __init__(self, api_key, submitter="python-calais client v.%s" % __version__):
self.api_key = api_key
self.user_directives["submitter"]=submitter
def _get_params_XML(self):
return PARAMS_XML % (" ".join('c:%s="%s"' % (k,v) for (k,v) in self.processing_directives.items() if v), " ".join('c:%s="%s"' % (k,v) for (k,v) in self.user_directives.items() if v), " ".join('c:%s="%s"' % (k,v) for (k,v) in self.external_metadata.items() if v))
def rest_POST(self, content):
params = urllib.urlencode({'licenseID':self.api_key, 'content':content, 'paramsXML':self._get_params_XML()})
headers = {"Content-type":"application/x-www-form-urlencoded"}
conn = httplib.HTTPConnection("api.opencalais.com:80")
conn.request("POST", "/enlighten/rest/", params, headers)
response = conn.getresponse()
data = response.read()
conn.close()
return (data)
def get_random_id(self):
"""
Creates a random 10-character ID for your submission.
"""
import string
from random import choice
chars = string.letters + string.digits
np = ""
for i in range(10):
np = np + choice(chars)
return np
def get_content_id(self, text):
"""
Creates a SHA1 hash of the text of your submission.
"""
import hashlib
h = hashlib.sha1()
h.update(text)
return h.hexdigest()
def preprocess_html(self, html):
html = html.replace('\n', '')
html = STRIP_RE.sub('', html)
return html
def analyze(self, content, content_type="TEXT/RAW", external_id=None):
if not (content and len(content.strip())):
return None
self.processing_directives["contentType"]=content_type
if external_id:
self.user_directives["externalID"] = external_id
return CalaisResponse(self.rest_POST(content))
def analyze_url(self, url):
f = urllib.urlopen(url)
html = self.preprocess_html(f.read())
return self.analyze(html, content_type="TEXT/HTML", external_id=url)
def analyze_file(self, fn):
import mimetypes
try:
filetype = mimetypes.guess_type(fn)[0]
except:
raise ValueError("Can not determine file type for '%s'" % fn)
if filetype == "text/plain":
content_type="TEXT/RAW"
f = open(fn)
content = f.read()
f.close()
elif filetype == "text/html":
content_type = "TEXT/HTML"
f = open(fn)
content = self.preprocess_html(f.read())
f.close()
else:
raise ValueError("Only plaintext and HTML files are currently supported. ")
return self.analyze(content, content_type=content_type, external_id=fn)
class CalaisResponse():
"""
Encapsulates a parsed Calais response and provides easy pythonic access to the data.
"""
raw_response = None
simplified_response = None
def __init__(self, raw_result):
try:
self.raw_response = json.load(StringIO(raw_result))
except:
raise ValueError(raw_result)
self.simplified_response = self._simplify_json(self.raw_response)
self.__dict__['doc'] = self.raw_response['doc']
for k,v in self.simplified_response.items():
self.__dict__[k] = v
def _simplify_json(self, json):
result = {}
# First, resolve references
for element in json.values():
for k,v in element.items():
if isinstance(v, unicode) and v.startswith("http://") and json.has_key(v):
element[k] = json[v]
for k, v in json.items():
if v.has_key("_typeGroup"):
group = v["_typeGroup"]
if not result.has_key(group):
result[group]=[]
del v["_typeGroup"]
v["__reference"] = k
result[group].append(v)
return result
def print_summary(self):
if not hasattr(self, "doc"):
return None
info = self.doc['info']
print "Calais Request ID: %s" % info['calaisRequestID']
if info.has_key('externalID'):
print "External ID: %s" % info['externalID']
if info.has_key('docTitle'):
print "Title: %s " % info['docTitle']
print "Language: %s" % self.doc['meta']['language']
print "Extractions: "
for k,v in self.simplified_response.items():
print "\t%d %s" % (len(v), k)
def print_entities(self):
if not hasattr(self, "entities"):
return None
for item in self.entities:
print "%s: %s (%.2f)" % (item['_type'], item['name'], item['relevance'])
def print_topics(self):
if not hasattr(self, "topics"):
return None
for topic in self.topics:
print topic['categoryName']
def print_relations(self):
if not hasattr(self, "relations"):
return None
for relation in self.relations:
print relation['_type']
for k,v in relation.items():
if not k.startswith("_"):
if isinstance(v, unicode):
print "\t%s:%s" % (k,v)
elif isinstance(v, dict) and v.has_key('name'):
print "\t%s:%s" % (k, v['name'])
The problem is solved. It was a little complicated because I was using an old version. Thank you.
I would like to use pyPdf to split a PDF file based on the outline, where each destination in the outline refers to a different page within the PDF.
example outline:
main --> points to page 1
sect1 --> points to page 1
sect2 --> points to page 15
sect3 --> points to page 22
It is easy within pyPdf to iterate over each page of the document or over each destination in the document's outline; however, I cannot figure out how to get the page number that a destination points to.
Does anybody know how to find the page number that each destination in the outline refers to?
I figured it out:
class Darrell(pyPdf.PdfFileReader):
    def getDestinationPageNumbers(self):
        def _setup_outline_page_ids(outline, _result=None):
            if _result is None:
                _result = {}
            for obj in outline:
                if isinstance(obj, pyPdf.pdf.Destination):
                    _result[(id(obj), obj.title)] = obj.page.idnum
                elif isinstance(obj, list):
                    _setup_outline_page_ids(obj, _result)
            return _result

        def _setup_page_id_to_num(pages=None, _result=None, _num_pages=None):
            if _result is None:
                _result = {}
            if pages is None:
                _num_pages = []
                pages = self.trailer["/Root"].getObject()["/Pages"].getObject()
            t = pages["/Type"]
            if t == "/Pages":
                for page in pages["/Kids"]:
                    _result[page.idnum] = len(_num_pages)
                    _setup_page_id_to_num(page.getObject(), _result, _num_pages)
            elif t == "/Page":
                _num_pages.append(1)
            return _result

        outline_page_ids = _setup_outline_page_ids(self.getOutlines())
        page_id_to_page_numbers = _setup_page_id_to_num()
        result = {}
        for (_, title), page_idnum in outline_page_ids.iteritems():
            result[title] = page_id_to_page_numbers.get(page_idnum, '???')
        return result

pdf = Darrell(open(PATH-TO-PDF, 'rb'))
template = '%-5s %s'
print template % ('page', 'title')
for p,t in sorted([(v,k) for k,v in pdf.getDestinationPageNumbers().iteritems()]):
    print template % (p+1,t)
This is just what I was looking for. Darrell's additions to PdfFileReader should be part of PyPDF2.
I wrote a little recipe that uses PyPDF2 and sejda-console to split a PDF by bookmarks. In my case there are several Level 1 sections that I want to keep together. This script allows me to do that and give the resulting files meaningful names.
import operator
import os
import subprocess
import sys
import time

import PyPDF2 as pyPdf

# need to have sejda-console installed
# change this to point to your installation
sejda = 'C:\\sejda-console-1.0.0.M2\\bin\\sejda-console.bat'

class Darrell(pyPdf.PdfFileReader):
    ...

if __name__ == '__main__':
    t0 = time.time()

    # get the name of the file to split as a command line arg
    pdfname = sys.argv[1]

    # open up the pdf
    pdf = Darrell(open(pdfname, 'rb'))

    # build list of (pagenumbers, newFileNames)
    splitlist = [(1, 'FrontMatter')]  # Customize name of first section
    template = '%-5s %s'
    print template % ('Page', 'Title')
    print '-' * 72
    for t, p in sorted(pdf.getDestinationPageNumbers().iteritems(),
                       key=operator.itemgetter(1)):
        # Customize this to get it to split where you want
        if t.startswith('Chapter') or \
                t.startswith('Preface') or \
                t.startswith('References'):
            print template % (p + 1, t)
            # this customizes how files are renamed
            new = t.replace('Chapter ', 'Chapter')\
                   .replace(': ', '-')\
                   .replace(': ', '-')\
                   .replace(' ', '_')
            splitlist.append((p + 1, new))

    # call sejda tools and split document
    call = sejda
    call += ' splitbypages'
    call += ' -f "%s"' % pdfname
    call += ' -o ./'
    call += ' -n '
    call += ' '.join([str(p) for p, t in splitlist[1:]])
    print '\n', call
    subprocess.call(call)
    print '\nsejda-console has completed.\n\n'

    # rename the split files
    for p, t in splitlist:
        old = './%i_' % p + pdfname
        new = './' + t + '.pdf'
        print 'renaming "%s"\n to "%s"...' % (old, new),
        try:
            os.remove(new)
        except OSError:
            pass
        try:
            os.rename(old, new)
            print ' succeeded.\n'
        except:
            print ' failed.\n'

    print '\ndone. Splitting took %.2f seconds' % (time.time() - t0)
A small update to @darrell's class to be able to parse UTF-8 outlines, which I post as an answer because a comment would be hard to read.
The problem is in pyPdf.pdf.Destination.title, which may be returned in two flavors:
pyPdf.generic.TextStringObject
pyPdf.generic.ByteStringObject
so the output of the _setup_outline_page_ids() function also contains two different types for the title object, which fails with a UnicodeDecodeError if an outline title contains anything other than ASCII.
I added this code to solve the problem:
if isinstance(title, pyPdf.generic.TextStringObject):
    title = title.encode('utf-8')
Here is the whole class:
class PdfOutline(pyPdf.PdfFileReader):
    def getDestinationPageNumbers(self):
        def _setup_outline_page_ids(outline, _result=None):
            if _result is None:
                _result = {}
            for obj in outline:
                if isinstance(obj, pyPdf.pdf.Destination):
                    _result[(id(obj), obj.title)] = obj.page.idnum
                elif isinstance(obj, list):
                    _setup_outline_page_ids(obj, _result)
            return _result

        def _setup_page_id_to_num(pages=None, _result=None, _num_pages=None):
            if _result is None:
                _result = {}
            if pages is None:
                _num_pages = []
                pages = self.trailer["/Root"].getObject()["/Pages"].getObject()
            t = pages["/Type"]
            if t == "/Pages":
                for page in pages["/Kids"]:
                    _result[page.idnum] = len(_num_pages)
                    _setup_page_id_to_num(page.getObject(), _result, _num_pages)
            elif t == "/Page":
                _num_pages.append(1)
            return _result

        outline_page_ids = _setup_outline_page_ids(self.getOutlines())
        page_id_to_page_numbers = _setup_page_id_to_num()
        result = {}
        for (_, title), page_idnum in outline_page_ids.iteritems():
            if isinstance(title, pyPdf.generic.TextStringObject):
                title = title.encode('utf-8')
            result[title] = page_id_to_page_numbers.get(page_idnum, '???')
        return result
Darrell's class can be modified slightly to produce a multi-level table of contents for a PDF (in the manner of pdftoc in the pdftk toolkit).
My modification adds one more parameter to _setup_page_id_to_num, an integer "level" which defaults to 1. Each invocation increments the level. Instead of storing just the page number in the result, we store the pair of page number and level. Appropriate modifications should be applied when using the returned result. A sketch of the idea is shown below.
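A minimal sketch of the idea (an illustration rather than the author's exact code; it tracks the level in the outline walker, since outline nesting is what determines TOC depth):

def _setup_outline_page_ids(outline, _result=None, level=1):
    if _result is None:
        _result = {}
    for obj in outline:
        if isinstance(obj, pyPdf.pdf.Destination):
            # store (page object id, outline depth) instead of just the id
            _result[(id(obj), obj.title)] = (obj.page.idnum, level)
        elif isinstance(obj, list):
            # a nested list is a sub-outline: recurse one level deeper
            _setup_outline_page_ids(obj, _result, level + 1)
    return _result

The final loop in getDestinationPageNumbers then unpacks (page_idnum, level) pairs and can emit page number and level together.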
I am using this to implement the "PDF Hacks" browser-based page-at-a-time document viewer with a sidebar table of contents which reflects LaTeX section, subsection etc bookmarks. I am working on a shared system where pdftk can not be installed but where python is available.
A solution 10 years later, for newer Python and PyPDF2:
from PyPDF2 import PdfReader, PdfWriter

filename = "main.pdf"

with open(filename, "rb") as f:
    r = PdfReader(f)
    bookmarks = list(map(lambda x: (x.title, r.get_destination_page_number(x)), r.outline))
    print(bookmarks)
    for i, b in enumerate(bookmarks):
        begin = b[1]
        end = bookmarks[i+1][1] if i < len(bookmarks) - 1 else len(r.pages)
        # print(len(r.pages[begin:end]))
        name = b[0] + ".pdf"
        print(f"{name=}: {begin=}, {end=}")
        with open(name, "wb") as f:
            w = PdfWriter(f)
            for p in r.pages[begin:end]:
                w.add_page(p)
            w.write(f)