Execute a Jupyter notebook including inline markdown with nbconvert - python

I have a Jupyter notebook that includes python variables in markdown cells like this:
code cell:
x = 10
markdown cell:
The value of x is {{x}}.
The IPython-notebook-extension Python Markdown allows me to dynamically display these variables if I execute the markdown cell with shift-enter in the notebook.
markdown cell:
The value of x is 10.
I would like to programmatically execute all cells in the notebook and save them to a new notebook using something like this:
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
with open('report.ipynb') as f:
nb = nbformat.read(f, as_version=4)
ep = ExecutePreprocessor(timeout=600, kernel_name='python3')
ep.preprocess(nb, {})
with open('report_executed.ipynb', 'wt') as f:
nbformat.write(nb, f)
This will execute the code cells but not the markdown cells. They still look like this:
The value of x is {{x}}.
I think the issue is that the notebook is not trusted. Is there a way to tell ExecutePreprocessor to trust the notebook? Is there another way to programmatically execute a notebook including python variables in the markdown cells?

The ExecutePreprocessor only looks at code cells, so your markdown cells are completely untouched. To do markdown processing, you need the Python Markdown preprocessor, as you have stated.
Unfortunately, the Python Markdown preprocessor system only executes the code in a live notebook, which it does by modifying the javascript involved with rendering cells. The modification stores the results of executing the code snippets in the cell metadata.
The PyMarkdownPreprocessor class (in pre_pymarkdown.py) was designed to be used with nbconvert operating on notebooks that had been rendered first in a live notebook setting. It processes markdown cells, replacing {{}} patterns with the values stored in the metadata.
In your situation, however, you don't have the live notebook metadata. I had a similar problem, and I solved it by writing my own execution preprocessor that also included logic to handle the markdown cells:
from nbconvert.preprocessors import ExecutePreprocessor, Preprocessor
import nbformat, nbconvert
from textwrap import dedent
class ExecuteCodeMarkdownPreprocessor(ExecutePreprocessor):
def __init__(self, **kw):
self.sections = {'default': True} # maps section ID to true or false
self.EmptyCell = nbformat.v4.nbbase.new_raw_cell("")
return super().__init__(**kw)
def preprocess_cell(self, cell, resources, cell_index):
"""
Executes a single code cell. See base.py for details.
To execute all cells see :meth:`preprocess`.
"""
if cell.cell_type not in ['code','markdown']:
return cell, resources
if cell.cell_type == 'code':
# Do code stuff
return self.preprocess_code_cell(cell, resources, cell_index)
elif cell.cell_type == 'markdown':
# Do markdown stuff
return self.preprocess_markdown_cell(cell, resources, cell_index)
else:
# Don't do anything
return cell, resources
def preprocess_code_cell(self, cell, resources, cell_index):
''' Process code cell.
'''
outputs = self.run_cell(cell)
cell.outputs = outputs
if not self.allow_errors:
for out in outputs:
if out.output_type == 'error':
pattern = u"""\
An error occurred while executing the following cell:
------------------
{cell.source}
------------------
{out.ename}: {out.evalue}
"""
msg = dedent(pattern).format(out=out, cell=cell)
raise nbconvert.preprocessors.execute.CellExecutionError(msg)
return cell, resources
def preprocess_markdown_cell(self, cell, resources, cell_index):
# Find and execute snippets of code
cell['metadata']['variables'] = {}
for m in re.finditer("{{(.*?)}}", cell.source):
# Execute code
fakecell = nbformat.v4.nbbase.new_code_cell(m.group(1))
fakecell, resources = self.preprocess_code_cell(fakecell, resources, cell_index)
# Output found in cell.outputs
# Put output in cell['metadata']['variables']
for output in fakecell.outputs:
html = self.convert_output_to_html(output)
if html is not None:
cell['metadata']['variables'][fakecell.source] = html
break
return cell, resources
def convert_output_to_html(self, output):
'''Convert IOpub output to HTML
See https://github.com/ipython-contrib/IPython-notebook-extensions/blob/master/nbextensions/usability/python-markdown/main.js
'''
if output['output_type'] == 'error':
text = '**' + output.ename + '**: ' + output.evalue;
return text
elif output.output_type == 'execute_result' or output.output_type == 'display_data':
data = output.data
if 'text/latex' in data:
html = data['text/latex']
return html
elif 'image/svg+xml' in data:
# Not supported
#var svg = ul['image/svg+xml'];
#/* embed SVG in an <img> tag, still get eaten by sanitizer... */
#svg = btoa(svg);
#html = '<img src="data:image/svg+xml;base64,' + svg + '"/>';
return None
elif 'image/jpeg' in data:
jpeg = data['image/jpeg']
html = '<img src="data:image/jpeg;base64,' + jpeg + '"/>'
return html
elif 'image/png' in data:
png = data['image/png']
html = '<img src="data:image/png;base64,' + png + '"/>'
return html
elif 'text/markdown' in data:
text = data['text/markdown']
return text
elif 'text/html' in data:
html = data['text/html']
return html
elif 'text/plain' in data:
text = data['text/plain']
# Strip <p> and </p> tags
# Strip quotes
# html.match(/<p>([\s\S]*?)<\/p>/)[1]
text = re.sub(r'<p>([\s\S]*?)<\/p>', r'\1', text)
text = re.sub(r"'([\s\S]*?)'",r'\1', text)
return text
else:
# Some tag we don't support
return None
else:
return None
You can then process you notebook with logic similar to your posted code:
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
import ExecuteCodeMarkdownPreprocessor # from wherever you put it
import PyMarkdownPreprocessor # from pre_pymarkdown.py
with open('report.ipynb') as f:
nb = nbformat.read(f, as_version=4)
ep = ExecuteCodeMarkdownPreprocessor(timeout=600, kernel_name='python3')
ep.preprocess(nb, {})
pymk = PyMarkdownPreprocessor()
pymk.preprocess(nb, {})
with open('report_executed.ipynb', 'wt') as f:
nbformat.write(nb, f)
Note that by including the Python Markdown preprocessing, your resultant notebook file will no longer have the {{}} syntax in the markdown cells - the markdown will have static content. If the recipient of the resultant notebook changes the code and executes again, the markdown will not be updated. However, if you are exporting to a different format (such as HTML), then you do want to replace the {{}} syntax with static content.

Update 2021.06.29
This again needs to be updated due to changes in nbconvert to using nbclient call (https://github.com/jupyter/nbconvert/commit/e7bf8350435a66cc50faf29ff12df492be5d7f57#diff-bee04d71b1dfc0202a0239b1513fd81d983edc339a9734ca4f4813276feed032). Since run_cell is no longer available, both the code and markdown cell processing needs to be modified. This works:
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
from nbconvert.preprocessors import execute
import re
# Taken from:
# https://stackoverflow.com/questions/35805121/execute-a-jupyter-notebook-including-inline-markdown-with-nbconvert, modified to avoid using superseeded run_cell calls.
class ExecuteCodeMarkdownPreprocessor(ExecutePreprocessor):
def __init__(self, **kw):
self.sections = {'default': True} # maps section ID to true or false
self.EmptyCell = nbformat.v4.nbbase.new_raw_cell("")
super().__init__(**kw)
def preprocess_cell(self, cell, resources, cell_index, store_history=True):
"""
Executes a single code cell. See base.py for details.
To execute all cells see :meth:`preprocess`.
"""
if cell.cell_type not in ['code', 'markdown']:
return cell, resources
if cell.cell_type == 'code':
# Do code stuff
return self.preprocess_code_cell(cell, resources, cell_index, store_history)
elif cell.cell_type == 'markdown':
# Do markdown stuff
cell, resources = self.preprocess_markdown_cell(cell, resources, cell_index, store_history)
return cell, resources
else:
# Don't do anything
return cell, resources
def preprocess_code_cell(self, cell, resources, cell_index, store_history):
""" Process code cell. Follow preprocess_cell from ExecutePreprocessor
"""
self._check_assign_resources(resources)
cell = self.execute_cell(cell, cell_index, store_history=True)
return cell, self.resources
def preprocess_markdown_cell(self, cell, resources, cell_index, store_history):
# Find and execute snippets of code
cell['metadata']['variables'] = {}
for m in re.finditer("{{(.*?)}}", cell.source):
# Execute code
self.nb.cells.append(nbformat.v4.nbbase.new_code_cell(m.group(1)))
fakecell, resources = self.preprocess_code_cell(self.nb.cells[-1], resources, len(self.nb.cells)-1, store_history)
self.nb.cells.pop()
# Output found in cell.outputs
# Put output in cell['metadata']['variables']
for output in fakecell.outputs:
html = self.convert_output_to_html(output)
if html is not None:
cell['metadata']['variables'][fakecell.source] = html
break
return cell, resources
def convert_output_to_html(self, output):
"""Convert IOpub output to HTML
See https://github.com/ipython-contrib/IPython-notebook-extensions/blob/master/nbextensions/usability/python-markdown/main.js
"""
if output['output_type'] == 'error':
text = '**' + output.ename + '**: ' + output.evalue
return text
elif output.output_type == 'execute_result' or output.output_type == 'display_data':
data = output.data
if 'text/latex' in data:
html = data['text/latex']
return html
elif 'image/svg+xml' in data:
# Not supported
#var svg = ul['image/svg+xml'];
#/* embed SVG in an <img> tag, still get eaten by sanitizer... */
#svg = btoa(svg);
#html = '<img src="data:image/svg+xml;base64,' + svg + '"/>';
return None
elif 'image/jpeg' in data:
jpeg = data['image/jpeg']
html = '<img src="data:image/jpeg;base64,' + jpeg + '"/>'
return html
elif 'image/png' in data:
png = data['image/png']
html = '<img src="data:image/png;base64,' + png + '"/>'
return html
elif 'text/markdown' in data:
text = data['text/markdown']
return text
elif 'text/html' in data:
html = data['text/html']
return html
elif 'text/plain' in data:
text = data['text/plain']
# Strip <p> and </p> tags
# Strip quotes
# html.match(/<p>([\s\S]*?)<\/p>/)[1]
text = re.sub(r'<p>([\s\S]*?)<\/p>', r'\1', text)
text = re.sub(r"'([\s\S]*?)'",r'\1', text)
return text
else:
# Some tag we don't support
return None
else:
return None
Usage remains the same.

Update 2020-07-08
The answer provided by #gordon-bean was a life-saver for me. In my final round of googling before giving up I found this answer, so before I continue I just want to say thanks!
However, slightly over 4 years after the original answer jupyter / nbconvert went through some changes and the provided code needs to be updated. So here it is:
from nbconvert.preprocessors import ExecutePreprocessor
from nbconvert.preprocessors import execute
import nbformat
import re
# Taken from:
# https://stackoverflow.com/questions/35805121/execute-a-jupyter-notebook-including-inline-markdown-with-nbconvert
class ExecuteCodeMarkdownPreprocessor(ExecutePreprocessor):
def __init__(self, **kw):
self.sections = {'default': True} # maps section ID to true or false
self.EmptyCell = nbformat.v4.nbbase.new_raw_cell("")
super().__init__(**kw)
def preprocess_cell(self, cell, resources, cell_index, store_history=True):
"""
Executes a single code cell. See base.py for details.
To execute all cells see :meth:`preprocess`.
"""
if cell.cell_type not in ['code', 'markdown']:
return cell, resources
if cell.cell_type == 'code':
# Do code stuff
return self.preprocess_code_cell(cell, resources, cell_index, store_history)
elif cell.cell_type == 'markdown':
# Do markdown stuff
return self.preprocess_markdown_cell(cell, resources, cell_index, store_history)
else:
# Don't do anything
return cell, resources
def preprocess_code_cell(self, cell, resources, cell_index, store_history):
""" Process code cell.
"""
# outputs = self.run_cell(cell)
reply, outputs = self.run_cell(cell, cell_index, store_history)
cell.outputs = outputs
cell_allows_errors = (self.allow_errors or "raises-exception"
in cell.metadata.get("tags", []))
if self.force_raise_errors or not cell_allows_errors:
for out in cell.outputs:
if out.output_type == 'error':
raise execute.CellExecutionError.from_cell_and_msg(cell, out)
if (reply is not None) and reply['content']['status'] == 'error':
raise execute.CellExecutionError.from_cell_and_msg(cell, reply['content'])
return cell, resources
def preprocess_markdown_cell(self, cell, resources, cell_index, store_history):
# Find and execute snippets of code
cell['metadata']['variables'] = {}
for m in re.finditer("{{(.*?)}}", cell.source):
# Execute code
fakecell = nbformat.v4.nbbase.new_code_cell(m.group(1))
fakecell, resources = self.preprocess_code_cell(fakecell, resources, cell_index, store_history)
# Output found in cell.outputs
# Put output in cell['metadata']['variables']
for output in fakecell.outputs:
html = self.convert_output_to_html(output)
if html is not None:
cell['metadata']['variables'][fakecell.source] = html
break
return cell, resources
def convert_output_to_html(self, output):
"""Convert IOpub output to HTML
See https://github.com/ipython-contrib/IPython-notebook-extensions/blob/master/nbextensions/usability/python-markdown/main.js
"""
if output['output_type'] == 'error':
text = '**' + output.ename + '**: ' + output.evalue
return text
elif output.output_type == 'execute_result' or output.output_type == 'display_data':
data = output.data
if 'text/latex' in data:
html = data['text/latex']
return html
elif 'image/svg+xml' in data:
# Not supported
#var svg = ul['image/svg+xml'];
#/* embed SVG in an <img> tag, still get eaten by sanitizer... */
#svg = btoa(svg);
#html = '<img src="data:image/svg+xml;base64,' + svg + '"/>';
return None
elif 'image/jpeg' in data:
jpeg = data['image/jpeg']
html = '<img src="data:image/jpeg;base64,' + jpeg + '"/>'
return html
elif 'image/png' in data:
png = data['image/png']
html = '<img src="data:image/png;base64,' + png + '"/>'
return html
elif 'text/markdown' in data:
text = data['text/markdown']
return text
elif 'text/html' in data:
html = data['text/html']
return html
elif 'text/plain' in data:
text = data['text/plain']
# Strip <p> and </p> tags
# Strip quotes
# html.match(/<p>([\s\S]*?)<\/p>/)[1]
text = re.sub(r'<p>([\s\S]*?)<\/p>', r'\1', text)
text = re.sub(r"'([\s\S]*?)'",r'\1', text)
return text
else:
# Some tag we don't support
return None
else:
return None
Usage of this code remains exactly the same as reported by Gordon Bean.

Related

Trouble converting PDF Join Script from 2.7 to 3.10

The Python script below has worked for me in converting multiple PDF documents to a single PDF when running on Python 2.7. I'm now trying to use it on 3.10 and am getting errors.
I believe I've conquered most of them, especially changing print to print( and disabling imports of CoreFoundation and Quartz.CoreGraphics.
It seems that the only remaining error is:
line 85, in main
writeContext = CGPDFContextCreateWithURL(CFURLCreateFromFileSystemRepresentation(kCFAllocatorDefault,
arg, len(arg), False), None, None) NameError: name
'CGPDFContextCreateWithURL' is not defined
If I declare CGPDFContextCreateWithURL as an empty global, the error shifts to CFURLCreateFromFileSystemRepresentation. Declare that and the error is kCFAllocatorDefault.
Here's how I'm trying to handle those.
global CGPDFContextCreateWithURL CGPDFContextCreateWithURL = ""
I don't understand that entire line so any help in making it right would be appreciated.
#
# join
# Joing pages from a a collection of PDF files into a single PDF file.
#
# join [--output <file>] [--shuffle] [--verbose]"
#
# Parameter:
#
# --shuffle
# Take a page from each PDF input file in turn before taking another from each file.
# If this option is not specified then all of the pages from a PDF file are appended
# to the output PDF file before the next input PDF file is processed.
#
# --verbose
# Write information about the doings of this tool to stderr.
#
import sys
import os
import getopt
import tempfile
import shutil
# from CoreFoundation import *
# from Quartz.CoreGraphics import *
global verbose
verbose = False
def createPDFDocumentWithPath(path):
if verbose:
print("Creating PDF document from file %s" % (path))
return CGPDFDocumentCreateWithURL(CFURLCreateFromFileSystemRepresentation(kCFAllocatorDefault, path, len(path), False))
def writePageFromDoc(writeContext, doc, pageNum):
page = CGPDFDocumentGetPage(doc, pageNum)
if page:
mediaBox = CGPDFPageGetBoxRect(page, kCGPDFMediaBox)
if CGRectIsEmpty(mediaBox):
mediaBox = None
CGContextBeginPage(writeContext, mediaBox)
CGContextDrawPDFPage(writeContext, page)
CGContextEndPage(writeContext)
if verbose:
print("Copied page %d from %s" % (pageNum, doc))
def shufflePages(writeContext, docs, maxPages):
for pageNum in xrange(1, maxPages + 1):
for doc in docs:
writePageFromDoc(writeContext, doc, pageNum)
def append(writeContext, docs, maxPages):
for doc in docs:
for pageNum in xrange(1, maxPages + 1) :
writePageFromDoc(writeContext, doc, pageNum)
def main(argv):
global verbose
# The PDF context we will draw into to create a new PDF
writeContext = None
# If True then generate more verbose information
source = None
shuffle = False
# Parse the command line options
try:
options, args = getopt.getopt(argv, "o:sv", ["output=", "shuffle", "verbose"])
except getopt.GetoptError:
usage()
sys.exit(2)
for option, arg in options:
if option in ("-o", "--output") :
if verbose:
print("Setting %s as the destination." % (arg))
writeContext = CGPDFContextCreateWithURL(CFURLCreateFromFileSystemRepresentation(kCFAllocatorDefault, arg, len(arg), False), None, None)
elif option in ("-s", "--shuffle") :
if verbose :
print("Shuffle pages to the output file.")
shuffle = True
elif option in ("-v", "--verbose") :
print("Verbose mode enabled.")
verbose = True
else :
print("Unknown argument: %s" % (option))
if writeContext:
# create PDFDocuments for all of the files.
docs = map(createPDFDocumentWithPath, args)
# find the maximum number of pages.
maxPages = 0
for doc in docs:
if CGPDFDocumentGetNumberOfPages(doc) > maxPages:
maxPages = CGPDFDocumentGetNumberOfPages(doc)
if shuffle:
shufflePages(writeContext, docs, maxPages)
else:
append(writeContext, docs, maxPages)
CGPDFContextClose(writeContext)
del writeContext
#CGContextRelease(writeContext)
def usage():
print("Usage: join [--output <file>] [--shuffle] [--verbose]")
if __name__ == "__main__":
main(sys.argv[1:])

Python script - Blogger2Wordpress - how to save file?

I use the blogger2wordpress python script that Google released back in 2010 (https://code.google.com/archive/p/google-blog-converters-appengine/downloads), to convert a 95mb blogger export file to wordpress wxr format.
However, the script has this code:
#!/usr/bin/env python
# Copyright 2008 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path
import logging
import re
import sys
import time
from xml.sax.saxutils import unescape
import BeautifulSoup
import gdata
from gdata import atom
import iso8601
import wordpress
__author__ = 'JJ Lueck (EMAIL#gmail.com)'
###########################
# Constants
###########################
BLOGGER_URL = 'http://www.blogger.com/'
BLOGGER_NS = 'http://www.blogger.com/atom/ns#'
KIND_SCHEME = 'http://schemas.google.com/g/2005#kind'
YOUTUBE_RE = re.compile('http://www.youtube.com/v/([^&]+)&?.*')
YOUTUBE_FMT = r'[youtube=http://www.youtube.com/watch?v=\1]'
GOOGLEVIDEO_RE = re.compile('(http://video.google.com/googleplayer.swf.*)')
GOOGLEVIDEO_FMT = r'[googlevideo=\1]'
DAILYMOTION_RE = re.compile('http://www.dailymotion.com/swf/(.*)')
DAILYMOTION_FMT = r'[dailymotion id=\1]'
###########################
# Translation class
###########################
class Blogger2Wordpress(object):
"""Performs the translation of a Blogger export document to WordPress WXR."""
def __init__(self, doc):
"""Constructs a translator for a Blogger export file.
Args:
doc: The WXR file as a string
"""
# Ensure UTF8 chars get through correctly by ensuring we have a
# compliant UTF8 input doc.
self.doc = doc.decode('utf-8', 'replace').encode('utf-8')
# Read the incoming document as a GData Atom feed.
self.feed = atom.FeedFromString(self.doc)
self.next_id = 1
def Translate(self):
"""Performs the actual translation to WordPress WXR export format.
Returns:
A WordPress WXR export document as a string, or None on error.
"""
# Create the top-level document and the channel associated with it.
channel = wordpress.Channel(
title = self.feed.title.text,
link = self.feed.GetAlternateLink().href,
base_blog_url = self.feed.GetAlternateLink().href,
pubDate = self._ConvertPubDate(self.feed.updated.text))
posts_map = {}
for entry in self.feed.entry:
# Grab the information about the entry kind
entry_kind = ""
for category in entry.category:
if category.scheme == KIND_SCHEME:
entry_kind = category.term
if entry_kind.endswith("#comment"):
# This entry will be a comment, grab the post that it goes to
in_reply_to = entry.FindExtensions('in-reply-to')
post_item = None
# Check to see that the comment has a corresponding post entry
if in_reply_to:
post_id = self._ParsePostId(in_reply_to[0].attributes['ref'])
post_item = posts_map.get(post_id, None)
# Found the post for the comment, add the commment to it
if post_item:
# The author email may not be included in the file
author_email = ''
if entry.author[0].email:
author_email = entry.author[0].email.text
# Same for the the author's url
author_url = ''
if entry.author[0].uri:
author_url = entry.author[0].uri.text
post_item.comments.append(wordpress.Comment(
comment_id = self._GetNextId(),
author = entry.author[0].name.text,
author_email = author_email,
author_url = author_url,
date = self._ConvertDate(entry.published.text),
content = self._ConvertContent(entry.content.text)))
elif entry_kind.endswith('#post'):
# This entry will be a post
post_item = self._ConvertEntry(entry, False)
posts_map[self._ParsePostId(entry.id.text)] = post_item
channel.items.append(post_item)
elif entry_kind.endswith('#page'):
# This entry will be a static page
page_item = self._ConvertEntry(entry, True)
posts_map[self._ParsePageId(entry.id.text)] = page_item
channel.items.append(page_item)
wxr = wordpress.WordPressWxr(channel=channel)
return wxr.WriteXml()
def _ConvertEntry(self, entry, is_page):
"""Converts the contents of an Atom entry into a WXR post Item element."""
# A post may have an empty title, in which case the text element is None.
title = ''
if entry.title.text:
title = entry.title.text
# Check here to see if the entry points to a draft or regular post
status = 'publish'
if entry.control and entry.control.draft:
status = 'draft'
# If no link is present in the Blogger entry, just link
if entry.GetAlternateLink():
link = entry.GetAlternateLink().href
else:
link = BLOGGER_URL
# Declare whether this is a post of a page
post_type = 'post'
if is_page:
post_type = 'page'
blogger_blog = ''
blogger_permalink = ''
if entry.GetAlternateLink():
blogger_path_full = entry.GetAlternateLink().href.replace('http://', '')
blogger_blog = blogger_path_full.split('/')[0]
blogger_permalink = blogger_path_full[len(blogger_blog):]
# Create the actual item element
post_item = wordpress.Item(
title = title,
link = link,
pubDate = self._ConvertPubDate(entry.published.text),
creator = entry.author[0].name.text,
content = self._ConvertContent(entry.content.text),
post_id = self._GetNextId(),
post_date = self._ConvertDate(entry.published.text),
status = status,
post_type = post_type,
blogger_blog = blogger_blog,
blogger_permalink = blogger_permalink,
blogger_author = entry.author[0].name.text)
# Convert the categories which specify labels into wordpress labels
for category in entry.category:
if category.scheme == BLOGGER_NS:
post_item.labels.append(category.term)
return post_item
def _ConvertContent(self, text):
"""Unescapes the post/comment text body and replaces video content.
All <object> and <embed> tags in the post that relate to video must be
changed into the WordPress tags for embedding video,
e.g. [youtube=http://www.youtube.com/...]
If no text is provided, the empty string is returned.
"""
if not text:
return ''
# First unescape all XML tags as they'll be escaped by the XML emitter
content = unescape(text)
# Use an HTML parser on the body to look for video content
content_tree = BeautifulSoup.BeautifulSoup(content)
# Find the object tag
objs = content_tree.findAll('object')
for obj_tag in objs:
# Find the param tag within which contains the URL to the movie
param_tag = obj_tag.find('param', { 'name': 'movie' })
if not param_tag:
continue
# Get the video URL
video = param_tag.attrMap.get('value', None)
if not video:
continue
# Convert the video URL if necessary
video = YOUTUBE_RE.subn(YOUTUBE_FMT, video)[0]
video = GOOGLEVIDEO_RE.subn(GOOGLEVIDEO_FMT, video)[0]
video = DAILYMOTION_RE.subn(DAILYMOTION_FMT, video)[0]
# Replace the portion of the contents with the video
obj_tag.replaceWith(video)
return str(content_tree)
def _ConvertPubDate(self, date):
"""Translates to a pubDate element's time/date format."""
date_tuple = iso8601.parse_date(date)
return date_tuple.strftime('%a, %d %b %Y %H:%M:%S %z')
def _ConvertDate(self, date):
"""Translates to a wordpress date element's time/date format."""
date_tuple = iso8601.parse_date(date)
return date_tuple.strftime('%Y-%m-%d %H:%M:%S')
def _GetNextId(self):
"""Returns the next identifier to use in the export document as a string."""
next_id = self.next_id;
self.next_id += 1
return str(next_id)
def _ParsePostId(self, text):
"""Extracts the post identifier from a Blogger entry ID."""
matcher = re.compile('post-(\d+)')
matches = matcher.search(text)
return matches.group(1)
def _ParsePageId(self, text):
"""Extracts the page identifier from a Blogger entry ID."""
matcher = re.compile('page-(\d+)')
matches = matcher.search(text)
return matches.group(1)
if __name__ == '__main__':
if len(sys.argv) <= 1:
print 'Usage: %s <blogger_export_file>' % os.path.basename(sys.argv[0])
print
print ' Outputs the converted WordPress export file to standard out.'
sys.exit(-1)
wp_xml_file = open(sys.argv[1])
wp_xml_doc = wp_xml_file.read()
translator = Blogger2Wordpress(wp_xml_doc)
print translator.Translate()
wp_xml_file.close()
This scripts outputs the wxr file in the terminal window which is useless for me when the import file has tons of entries.
As I am not familiar with python, how can I modify the script to output the data into a .xml file?
Edit:
I did changed the end of the script to:
wp_xml_file = open(sys.argv[1])
wp_xml_doc = wp_xml_file.read()
translator = Blogger2Wordpress(wp_xml_doc)
print translator.Translate()
fh = open("testoutput.xml", "w")
fh.write(wp_xml_doc);
fh.close();
wp_xml_file.close()
But the produced file is an "invalid wxr file" :/
Can anybody help? Thanks!
Quick and dirty answer:
Output to the stdout is normal behaviour.
You might want to redirect it to a file for instance:
python2 blogger2wordpress your_blogger_export_file > backup
The output will be saved in the file named backup.
Or you can replace print translator.Translate() by
with open('output_file', 'w') as fd:
fd.write(translator.Translate())
This should do the trick (haven't tried).

Parsing Index page in a PDF text book with Python

I have to extract text from PDF pages as it is with the indentation into a CSV file.
Index page from PDF text book:
I should split the text into class and subclass type hierarchy along with the page numbers. For example in the image,
Application server is the class and Apache Tomcat is the subclass in the page number 275
This is the expected output of the CSV:
I have used Tika parser to parse the PDF, but the indentation is not maintained properly (not unique) in the parsed content for splitting the text into class and subclasses.
This is how the parsed text looks like:
Can anyone suggest me the right approach for this requirement?
despite I have no knowledge of pdf extraction, but it is possible to reconstruct the hierarchy from "the parsed text", because the "subclass" part always starts and ends with an extra newline character.
with following test text:
app architect . 50
app logic . 357
app server . 275
tomcat . 275
websphere . 275
jboss . 164
architect
acceptance . 303
development path . 304
architecting . 48
architectural activity . 25, 320
following code:
import csv
import sys
import re
def gen():
is_subclass = False
p_class = None
with open('test.data') as f:
s = f.read()
lines = re.findall(r'[^\n]+\n+', s)
for line in lines:
if ' . ' in line:
class_name, page_no = map(lambda s: s.strip(), line.split('.'))
else:
class_name, page_no = line.strip(), ''
if line.endswith('\n\n'):
if not is_subclass:
p_class = class_name
is_subclass = True
continue
if is_subclass:
yield (p_class, class_name, page_no)
else:
yield (class_name, '', page_no)
if line.endswith('\n\n'):
is_subclass = False
writer = csv.writer(sys.stdout)
writer.writerows(gen())
yields:
app architect,,50
app logic,,357
app server,tomcat,275
app server,websphere,275
app server,jboss,164
architect,acceptance,303
architect,development path,304
architecting,,48
architectural activity,,"25, 320"
hope this helps.
So here is the solution:
Install Fitz(PyMuPDF) https://github.com/rk700/PyMuPDF
Run the code below in the same folder than your PDF file with Python 2.7
Compare the result
Code:
import fitz
import json
import re
import csv
class MyClass:
def __init__(self, text, main_class):
my_arr = re.split("[.]*", text)
if main_class != my_arr[0].strip():
main_class = my_arr[0].strip()
self.main_class = main_class
self.sub_class = my_arr[0].strip()
try:
self.page = my_arr[1].strip()
except:
self.page = ""
def add_line(text, is_recording, main_class):
if(is_recording):
obj = MyClass(text, main_class)
if obj.sub_class == "Glossary":
return False, main_class
table.append(obj)
return True, obj.main_class
elif text == "Contents":
return True, main_class
return False, main_class
last_text = ""
is_recording = False
main_class = ""
table = []
doc = fitz.open("TCS_1.pdf")
page = doc.getPageText(2, output="json")
blocks = json.loads(page)["blocks"]
for block in blocks:
if "lines" in block:
for line in block["lines"]:
line_text = ""
for span in block["lines"]:
line_text += span["spans"][0]["text"].encode("utf-8")
if last_text != line_text:
is_recording, main_class = add_line(line_text, is_recording, main_class)
last_text = line_text
writer = csv.writer(open("output.csv", 'w'), delimiter=',', lineterminator='\n')
for my_class in table:
writer.writerow([my_class.main_class, my_class.sub_class, my_class.page])
# print(my_class.main_class, my_class.sub_class, my_class.page)
Here is the CSV output of the file provided:

How to write these Pandoc Haskell filters in Python?

Question
I need to convert these Pandoc Haskell filters to Python using [pandocfilters].
#!/usr/bin/env runhaskell
import Text.Pandoc.JSON
main :: IO ()
main = toJSONFilter separator
where separator (Para [Span ("",[],[]) [Str "___separator___"]])
= RawBlock (Format "html") "<div class=\"separator\">***</div>"
separator x = x
#!/usr/bin/env runhaskel
import Text.Pandoc.JSON
main :: IO ()
main = toJSONFilter separator
where separator (Para [Span ("",[],[]) [Str "___separator___"]])
= (Para [Span ("",[],[]) [Str "***"]])
separator x = x
I expect it will be of the general form
#!/usr/bin/env python
from pandocfilters import toJSONFilter, Str
def separator(key, value, format, meta):
"""Need to write this."""
pass
if __name__ == '__main__':
toJSONFilter(separator)
Bonus if someone knows how to add "centered" formatting to the second filter for the docx format.
Background
In LaTeX I have a \separator{} macro which makes three centered ***. When processing this with Pandoc to html and docx, I use an alternate macro definition for \separator{} which just creates the text ___separator___. I then replace ___separator___ with content that works correctly in the new format. I need to switch from Haskell to Python filters for cross system compatibility reasons.
Example
Input file looks like
\documentclass{memoir}
\begin{document}
\newcommand{\separator}{\_\_\_separator\_\_\_}
First paragraph.
\separator{}
Second paragraph.
\end{document}
Default pandoc html output with no filter
<p>First paragraph.</p>
<p><span>___separator___</span></p>
<p>Second paragraph.</p>
Required html output when filtered:
<p>First paragraph.</p>
<div class="separator">***</div>
<p>Second paragraph.</p>
The docx filter shoud ideally produce a centered paragraph with ***.
Have a look at this filter:
#!/usr/bin/env python
import sys
from pandocfilters import toJSONFilter, Str, Para
def sep(key, value, format, meta):
if key == 'Para':
sys.stderr.write("--- Found a Para with value: " + str(value) + "\n")
if len(value) == 1:
if value[0]['t'] == 'Str' and value[0]['c'] == '---separator---':
return Para( [ Str("FOUND A SEPARATOR") ] )
return None
if __name__ == "__main__":
toJSONFilter(sep)
When given this input markdown:
This is a paragraph.
---separator---
This is another paragraph.
---separator---
it will produce this output HTML via pandoc --filter ... input.md -o output.html:
<p>This is a paragraph.</p>
<p>FOUND A SEPARATOR</p>
<p>This is another paragraph.</p>
<p>FOUND A SEPARATOR</p>
It also prints to stderr the structure of the Para nodes so that you can see exactly what they look like.
For LaTeX to HTML:
#!/usr/bin/env python
from pandocfilters import toJSONFilter, Str, Para, RawBlock
def separator(key, value, format, meta):
if key == 'Para':
if len(value) == 1:
try:
if value[0]['c'][1][0]['c'] == '___separator___':
return RawBlock('html', '<div class="separator">***</div>')
except KeyError:
return None
return Nonepprint
if __name__ == '__main__':
toJSONFilter(separator)
For LaTeX to docx:
#!/usr/bin/env python
from pandocfilters import toJSONFilter, Str, Para, RawBlock
def separator(key, value, format, meta):
if key == 'Para':
if len(value) == 1:
try:
if value[0]['c'][1][0]['c'] == '___separator___':
return Para([Str('***')])
except KeyError:
return None
return None
if __name__ == '__main__':
toJSONFilter(separator)
Note that this doesn't center the *** as I can't seem to work out how to inject that formatting.

split a pdf based on outline

i would like to use pyPdf to split a pdf file based on the outline where each destination in the outline refers to a different page within the pdf.
example outline:
main --> points to page 1
sect1 --> points to page 1
sect2 --> points to page 15
sect3 --> points to page 22
it is easy within pyPdf to iterate over each page of the document or each destination in the document's outline; however, i cannot figure out how to get the page number where the destination points.
does anybody know how to find the referencing page number for each destination in the outline?
I figured it out:
class Darrell(pyPdf.PdfFileReader):
def getDestinationPageNumbers(self):
def _setup_outline_page_ids(outline, _result=None):
if _result is None:
_result = {}
for obj in outline:
if isinstance(obj, pyPdf.pdf.Destination):
_result[(id(obj), obj.title)] = obj.page.idnum
elif isinstance(obj, list):
_setup_outline_page_ids(obj, _result)
return _result
def _setup_page_id_to_num(pages=None, _result=None, _num_pages=None):
if _result is None:
_result = {}
if pages is None:
_num_pages = []
pages = self.trailer["/Root"].getObject()["/Pages"].getObject()
t = pages["/Type"]
if t == "/Pages":
for page in pages["/Kids"]:
_result[page.idnum] = len(_num_pages)
_setup_page_id_to_num(page.getObject(), _result, _num_pages)
elif t == "/Page":
_num_pages.append(1)
return _result
outline_page_ids = _setup_outline_page_ids(self.getOutlines())
page_id_to_page_numbers = _setup_page_id_to_num()
result = {}
for (_, title), page_idnum in outline_page_ids.iteritems():
result[title] = page_id_to_page_numbers.get(page_idnum, '???')
return result
pdf = Darrell(open(PATH-TO-PDF, 'rb'))
template = '%-5s %s'
print template % ('page', 'title')
for p,t in sorted([(v,k) for k,v in pdf.getDestinationPageNumbers().iteritems()]):
print template % (p+1,t)
This is just what I was looking for. Darrell's additions to PdfFileReader should be part of PyPDF2.
I wrote a little recipe that uses PyPDF2 and sejda-console to split a PDF by bookmarks. In my case there are several Level 1 sections that I want to keep together. This script allows me to do that and give the resulting files meaningful names.
import operator
import os
import subprocess
import sys
import time
import PyPDF2 as pyPdf
# need to have sejda-console installed
# change this to point to your installation
sejda = 'C:\\sejda-console-1.0.0.M2\\bin\\sejda-console.bat'
class Darrell(pyPdf.PdfFileReader):
...
if __name__ == '__main__':
t0= time.time()
# get the name of the file to split as a command line arg
pdfname = sys.argv[1]
# open up the pdf
pdf = Darrell(open(pdfname, 'rb'))
# build list of (pagenumbers, newFileNames)
splitlist = [(1,'FrontMatter')] # Customize name of first section
template = '%-5s %s'
print template % ('Page', 'Title')
print '-'*72
for t,p in sorted(pdf.getDestinationPageNumbers().iteritems(),
key=operator.itemgetter(1)):
# Customize this to get it to split where you want
if t.startswith('Chapter') or \
t.startswith('Preface') or \
t.startswith('References'):
print template % (p+1, t)
# this customizes how files are renamed
new = t.replace('Chapter ', 'Chapter')\
.replace(': ', '-')\
.replace(': ', '-')\
.replace(' ', '_')
splitlist.append((p+1, new))
# call sejda tools and split document
call = sejda
call += ' splitbypages'
call += ' -f "%s"'%pdfname
call += ' -o ./'
call += ' -n '
call += ' '.join([str(p) for p,t in splitlist[1:]])
print '\n', call
subprocess.call(call)
print '\nsejda-console has completed.\n\n'
# rename the split files
for p,t in splitlist:
old ='./%i_'%p + pdfname
new = './' + t + '.pdf'
print 'renaming "%s"\n to "%s"...'%(old, new),
try:
os.remove(new)
except OSError:
pass
try:
os.rename(old, new)
print' succeeded.\n'
except:
print' failed.\n'
print '\ndone. Spliting took %.2f seconds'%(time.time() - t0)
Small update to #darrell class to be able to parse UTF-8 outlines, which I post as answer because comment would be hard to read.
Problem is in pyPdf.pdf.Destination.title which may be returned in two flavors:
pyPdf.generic.TextStringObject
pyPdf.generic.ByteStringObject
so that output from _setup_outline_page_ids() function returns also two different types for title object, which fails with UnicodeDecodeError if outline title contains anything then ASCII.
I added this code to solve the problem:
if isinstance(title, pyPdf.generic.TextStringObject):
title = title.encode('utf-8')
of whole class:
class PdfOutline(pyPdf.PdfFileReader):
def getDestinationPageNumbers(self):
def _setup_outline_page_ids(outline, _result=None):
if _result is None:
_result = {}
for obj in outline:
if isinstance(obj, pyPdf.pdf.Destination):
_result[(id(obj), obj.title)] = obj.page.idnum
elif isinstance(obj, list):
_setup_outline_page_ids(obj, _result)
return _result
def _setup_page_id_to_num(pages=None, _result=None, _num_pages=None):
if _result is None:
_result = {}
if pages is None:
_num_pages = []
pages = self.trailer["/Root"].getObject()["/Pages"].getObject()
t = pages["/Type"]
if t == "/Pages":
for page in pages["/Kids"]:
_result[page.idnum] = len(_num_pages)
_setup_page_id_to_num(page.getObject(), _result, _num_pages)
elif t == "/Page":
_num_pages.append(1)
return _result
outline_page_ids = _setup_outline_page_ids(self.getOutlines())
page_id_to_page_numbers = _setup_page_id_to_num()
result = {}
for (_, title), page_idnum in outline_page_ids.iteritems():
if isinstance(title, pyPdf.generic.TextStringObject):
title = title.encode('utf-8')
result[title] = page_id_to_page_numbers.get(page_idnum, '???')
return result
Darrell's class can be modified slightly to produce a multi-level table of contents for a pdf (in the manner of pdftoc in the pdftk toolkit.)
My modification adds one more parameter to _setup_page_id_to_num, an integer "level" which defaults to 1. Each invocation increments the level. Instead of storing just the page number in the result, we store the pair of page number and level. Appropriate modifications should be applied when using the returned result.
I am using this to implement the "PDF Hacks" browser-based page-at-a-time document viewer with a sidebar table of contents which reflects LaTeX section, subsection etc bookmarks. I am working on a shared system where pdftk can not be installed but where python is available.
A solution 10 years later for newer python and PyPDF:
from PyPDF2 import PdfReader, PdfWriter
filename = "main.pdf"
with open(filename, "rb") as f:
r = PdfReader(f)
bookmarks = list(map(lambda x: (x.title, r.get_destination_page_number(x)), r.outline))
print(bookmarks)
for i, b in enumerate(bookmarks):
begin = b[1]
end = bookmarks[i+1][1] if i < len(bookmarks) - 1 else len(r.pages)
# print(len(r.pages[begin:end]))
name = b[0] + ".pdf"
print(f"{name=}: {begin=}, {end=}")
with open(name, "wb") as f:
w = PdfWriter(f)
for p in r.pages[begin:end]:
w.add_page(p)
w.write(f)

Categories

Resources