I have a program which logs on to a specified gmail account and gets all the emails in a selected inbox that were sent from an email that you input at runtime.
I would like to be able to grab all the links from each email and append them to a list so that i can then filter out the ones i don't need before outputting them to another file. I was using a regex to do this which requires me to convert the payload to a string. The problem is that the regex i am using doesn't work for findall(), it only works when i use search() (I am not too familiar with regexes). I was wondering if there was a better way to extract all links from an email that doesn't involve me messing around with regexes?
My code currently looks like this:
print(f'[{Mail.timestamp}] Scanning inbox')
sys.stdout.write(Style.RESET)
self.search_mail_status, self.amount_matching_criteria = self.login_session.search(Mail.CHARSET,search_criteria)
if self.amount_matching_criteria == 0 or self.amount_matching_criteria == '0':
print(f'[{Mail.timestamp}] No mails from that email address could be found...')
Mail.enter_to_continue()
import main
main.main_wrapper()
else:
pattern = '(?P<url>https?://[^\s]+)'
prog = re.compile(pattern)
self.amount_matching_criteria = self.amount_matching_criteria[0]
self.amount_matching_criteria_str = str(self.amount_matching_criteria)
num_mails = re.search(r"\d.+",self.amount_matching_criteria_str)
num_mails = ((num_mails.group())[:-1]).split(' ')
sys.stdout.write(Style.GREEN)
print(f'[{Mail.timestamp}] Status code of {self.search_mail_status}')
sys.stdout.write(Style.RESET)
sys.stdout.write(Style.YELLOW)
print(f'[{Mail.timestamp}] Found {len(num_mails)} emails')
sys.stdout.write(Style.RESET)
num_mails = self.amount_matching_criteria.split()
for message_num in num_mails:
individual_response_code, individual_response_data = self.login_session.fetch(message_num, '(RFC822)')
message = email.message_from_bytes(individual_response_data[0][1])
if message.is_multipart():
print('multipart')
multipart_payload = message.get_payload()
for sub_message in multipart_payload:
string_payload = str(sub_message.get_payload())
print(prog.search(string_payload).group("url"))
Ended up using this for loop with a recursive function and a regex to get the links, i then removed all links without a the substring that you can input earlier on in the program before appending to a set
for message_num in self.amount_matching_criteria.split():
counter += 1
_, self.individual_response_data = self.login_session.fetch(message_num, '(RFC822)')
self.raw = email.message_from_bytes(self.individual_response_data[0][1])
raw = self.raw
self.scraped_email_value = email.message_from_bytes(Mail.scrape_email(raw))
self.scraped_email_value = str(self.scraped_email_value)
self.returned_links = prog.findall(self.scraped_email_value)
for i in self.returned_links:
if self.substring_filter in i:
self.link_set.add(i)
self.timestamp = time.strftime('%H:%M:%S')
print(f'[{self.timestamp}] Links scraped: [{counter}/{len(num_mails)}]')
The function used:
def scrape_email(raw):
if raw.is_multipart():
return Mail.scrape_email(raw.get_payload(0))
else:
return raw.get_payload(None,True)
I'm looking at parsing out just the most recent reply/message from an email thread as part of a zap.
I've found this link but how to I use it within a Zap? https://github.com/zapier/email-reply-parser
i.e. when I pick up a thread from gmail how do I just extract the most recent message?
Is this possible in Code by Zapier and if so how?
E.g.
Input:
Yes that is fine, I will email you in the morning.
On Fri, Nov 16, 2012 at 1:48 PM, Zapier wrote:
Our support team just commented on your open Ticket:
"Hi Royce, can we chat in the morning about your question?"
Ouput: i.e. the parsed email:
Yes that is fine, I will email you in the morning.
First off: it's not possible to use that in a code step directly. Python code steps don't have access to external packages.
That said, that package is just Python code, and there's nothing stopping you copying all of the important code into the Code step and using it that way.
It's worth noting that the linked code is pretty old and looks to be unmaintained, so it's unlikely to work without modifications.
I had a go at adapting this https://github.com/zapier/email-reply-parser which seemed to work as well.
"""
email_reply_parser is a python library port of GitHub's Email Reply Parser.
For more information, visit https://github.com/zapier/email-reply-parser
"""
import re
class EmailReplyParser(object):
""" Represents a email message that is parsed.
"""
#staticmethod
def read(text):
""" Factory method that splits email into list of fragments
text - A string email body
Returns an EmailMessage instance
"""
return EmailMessage(text).read()
#staticmethod
def parse_reply(text):
""" Provides the reply portion of email.
text - A string email body
Returns reply body message
"""
return EmailReplyParser.read(text).reply
class EmailMessage(object):
""" An email message represents a parsed email body.
"""
SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})')
QUOTE_HDR_REGEX = re.compile('On.*wrote:$')
QUOTED_REGEX = re.compile(r'(>+)')
HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+')
_MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)'
MULTI_QUOTE_HDR_REGEX = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE)
MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL)
def __init__(self, text):
self.fragments = []
self.fragment = None
self.text = text.replace('\r\n', '\n')
self.found_visible = False
def read(self):
""" Creates new fragment for each line
and labels as a signature, quote, or hidden.
Returns EmailMessage instance
"""
self.found_visible = False
is_multi_quote_header = self.MULTI_QUOTE_HDR_REGEX_MULTILINE.search(self.text)
if is_multi_quote_header:
self.text = self.MULTI_QUOTE_HDR_REGEX.sub(is_multi_quote_header.groups()[0].replace('\n', ''), self.text)
# Fix any outlook style replies, with the reply immediately above the signature boundary line
# See email_2_2.txt for an example
self.text = re.sub('([^\n])(?=\n ?[_-]{7,})', '\\1\n', self.text, re.MULTILINE)
self.lines = self.text.split('\n')
self.lines.reverse()
for line in self.lines:
self._scan_line(line)
self._finish_fragment()
self.fragments.reverse()
return self
#property
def reply(self):
""" Captures reply message within email
"""
reply = []
for f in self.fragments:
if not (f.hidden or f.quoted):
reply.append(f.content)
return '\n'.join(reply)
def _scan_line(self, line):
""" Reviews each line in email message and determines fragment type
line - a row of text from an email message
"""
is_quote_header = self.QUOTE_HDR_REGEX.match(line) is not None
is_quoted = self.QUOTED_REGEX.match(line) is not None
is_header = is_quote_header or self.HEADER_REGEX.match(line) is not None
if self.fragment and len(line.strip()) == 0:
if self.SIG_REGEX.match(self.fragment.lines[-1].strip()):
self.fragment.signature = True
self._finish_fragment()
if self.fragment \
and ((self.fragment.headers == is_header and self.fragment.quoted == is_quoted) or
(self.fragment.quoted and (is_quote_header or len(line.strip()) == 0))):
self.fragment.lines.append(line)
else:
self._finish_fragment()
self.fragment = Fragment(is_quoted, line, headers=is_header)
def quote_header(self, line):
""" Determines whether line is part of a quoted area
line - a row of the email message
Returns True or False
"""
return self.QUOTE_HDR_REGEX.match(line[::-1]) is not None
def _finish_fragment(self):
""" Creates fragment
"""
if self.fragment:
self.fragment.finish()
if self.fragment.headers:
# Regardless of what's been seen to this point, if we encounter a headers fragment,
# all the previous fragments should be marked hidden and found_visible set to False.
self.found_visible = False
for f in self.fragments:
f.hidden = True
if not self.found_visible:
if self.fragment.quoted \
or self.fragment.headers \
or self.fragment.signature \
or (len(self.fragment.content.strip()) == 0):
self.fragment.hidden = True
else:
self.found_visible = True
self.fragments.append(self.fragment)
self.fragment = None
class Fragment(object):
""" A Fragment is a part of
an Email Message, labeling each part.
"""
def __init__(self, quoted, first_line, headers=False):
self.signature = False
self.headers = headers
self.hidden = False
self.quoted = quoted
self._content = None
self.lines = [first_line]
def finish(self):
""" Creates block of content with lines
belonging to fragment.
"""
self.lines.reverse()
self._content = '\n'.join(self.lines)
self.lines = None
#property
def content(self):
return self._content.strip()
return {'emailstring': EmailReplyParser.parse_reply(input_data['body'])}
I use the blogger2wordpress python script that Google released back in 2010 (https://code.google.com/archive/p/google-blog-converters-appengine/downloads), to convert a 95mb blogger export file to wordpress wxr format.
However, the script has this code:
#!/usr/bin/env python
# Copyright 2008 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path
import logging
import re
import sys
import time
from xml.sax.saxutils import unescape
import BeautifulSoup
import gdata
from gdata import atom
import iso8601
import wordpress
__author__ = 'JJ Lueck (EMAIL#gmail.com)'
###########################
# Constants
###########################
BLOGGER_URL = 'http://www.blogger.com/'
BLOGGER_NS = 'http://www.blogger.com/atom/ns#'
KIND_SCHEME = 'http://schemas.google.com/g/2005#kind'
YOUTUBE_RE = re.compile('http://www.youtube.com/v/([^&]+)&?.*')
YOUTUBE_FMT = r'[youtube=http://www.youtube.com/watch?v=\1]'
GOOGLEVIDEO_RE = re.compile('(http://video.google.com/googleplayer.swf.*)')
GOOGLEVIDEO_FMT = r'[googlevideo=\1]'
DAILYMOTION_RE = re.compile('http://www.dailymotion.com/swf/(.*)')
DAILYMOTION_FMT = r'[dailymotion id=\1]'
###########################
# Translation class
###########################
class Blogger2Wordpress(object):
"""Performs the translation of a Blogger export document to WordPress WXR."""
def __init__(self, doc):
"""Constructs a translator for a Blogger export file.
Args:
doc: The WXR file as a string
"""
# Ensure UTF8 chars get through correctly by ensuring we have a
# compliant UTF8 input doc.
self.doc = doc.decode('utf-8', 'replace').encode('utf-8')
# Read the incoming document as a GData Atom feed.
self.feed = atom.FeedFromString(self.doc)
self.next_id = 1
def Translate(self):
"""Performs the actual translation to WordPress WXR export format.
Returns:
A WordPress WXR export document as a string, or None on error.
"""
# Create the top-level document and the channel associated with it.
channel = wordpress.Channel(
title = self.feed.title.text,
link = self.feed.GetAlternateLink().href,
base_blog_url = self.feed.GetAlternateLink().href,
pubDate = self._ConvertPubDate(self.feed.updated.text))
posts_map = {}
for entry in self.feed.entry:
# Grab the information about the entry kind
entry_kind = ""
for category in entry.category:
if category.scheme == KIND_SCHEME:
entry_kind = category.term
if entry_kind.endswith("#comment"):
# This entry will be a comment, grab the post that it goes to
in_reply_to = entry.FindExtensions('in-reply-to')
post_item = None
# Check to see that the comment has a corresponding post entry
if in_reply_to:
post_id = self._ParsePostId(in_reply_to[0].attributes['ref'])
post_item = posts_map.get(post_id, None)
# Found the post for the comment, add the commment to it
if post_item:
# The author email may not be included in the file
author_email = ''
if entry.author[0].email:
author_email = entry.author[0].email.text
# Same for the the author's url
author_url = ''
if entry.author[0].uri:
author_url = entry.author[0].uri.text
post_item.comments.append(wordpress.Comment(
comment_id = self._GetNextId(),
author = entry.author[0].name.text,
author_email = author_email,
author_url = author_url,
date = self._ConvertDate(entry.published.text),
content = self._ConvertContent(entry.content.text)))
elif entry_kind.endswith('#post'):
# This entry will be a post
post_item = self._ConvertEntry(entry, False)
posts_map[self._ParsePostId(entry.id.text)] = post_item
channel.items.append(post_item)
elif entry_kind.endswith('#page'):
# This entry will be a static page
page_item = self._ConvertEntry(entry, True)
posts_map[self._ParsePageId(entry.id.text)] = page_item
channel.items.append(page_item)
wxr = wordpress.WordPressWxr(channel=channel)
return wxr.WriteXml()
def _ConvertEntry(self, entry, is_page):
"""Converts the contents of an Atom entry into a WXR post Item element."""
# A post may have an empty title, in which case the text element is None.
title = ''
if entry.title.text:
title = entry.title.text
# Check here to see if the entry points to a draft or regular post
status = 'publish'
if entry.control and entry.control.draft:
status = 'draft'
# If no link is present in the Blogger entry, just link
if entry.GetAlternateLink():
link = entry.GetAlternateLink().href
else:
link = BLOGGER_URL
# Declare whether this is a post of a page
post_type = 'post'
if is_page:
post_type = 'page'
blogger_blog = ''
blogger_permalink = ''
if entry.GetAlternateLink():
blogger_path_full = entry.GetAlternateLink().href.replace('http://', '')
blogger_blog = blogger_path_full.split('/')[0]
blogger_permalink = blogger_path_full[len(blogger_blog):]
# Create the actual item element
post_item = wordpress.Item(
title = title,
link = link,
pubDate = self._ConvertPubDate(entry.published.text),
creator = entry.author[0].name.text,
content = self._ConvertContent(entry.content.text),
post_id = self._GetNextId(),
post_date = self._ConvertDate(entry.published.text),
status = status,
post_type = post_type,
blogger_blog = blogger_blog,
blogger_permalink = blogger_permalink,
blogger_author = entry.author[0].name.text)
# Convert the categories which specify labels into wordpress labels
for category in entry.category:
if category.scheme == BLOGGER_NS:
post_item.labels.append(category.term)
return post_item
def _ConvertContent(self, text):
"""Unescapes the post/comment text body and replaces video content.
All <object> and <embed> tags in the post that relate to video must be
changed into the WordPress tags for embedding video,
e.g. [youtube=http://www.youtube.com/...]
If no text is provided, the empty string is returned.
"""
if not text:
return ''
# First unescape all XML tags as they'll be escaped by the XML emitter
content = unescape(text)
# Use an HTML parser on the body to look for video content
content_tree = BeautifulSoup.BeautifulSoup(content)
# Find the object tag
objs = content_tree.findAll('object')
for obj_tag in objs:
# Find the param tag within which contains the URL to the movie
param_tag = obj_tag.find('param', { 'name': 'movie' })
if not param_tag:
continue
# Get the video URL
video = param_tag.attrMap.get('value', None)
if not video:
continue
# Convert the video URL if necessary
video = YOUTUBE_RE.subn(YOUTUBE_FMT, video)[0]
video = GOOGLEVIDEO_RE.subn(GOOGLEVIDEO_FMT, video)[0]
video = DAILYMOTION_RE.subn(DAILYMOTION_FMT, video)[0]
# Replace the portion of the contents with the video
obj_tag.replaceWith(video)
return str(content_tree)
def _ConvertPubDate(self, date):
"""Translates to a pubDate element's time/date format."""
date_tuple = iso8601.parse_date(date)
return date_tuple.strftime('%a, %d %b %Y %H:%M:%S %z')
def _ConvertDate(self, date):
"""Translates to a wordpress date element's time/date format."""
date_tuple = iso8601.parse_date(date)
return date_tuple.strftime('%Y-%m-%d %H:%M:%S')
def _GetNextId(self):
"""Returns the next identifier to use in the export document as a string."""
next_id = self.next_id;
self.next_id += 1
return str(next_id)
def _ParsePostId(self, text):
"""Extracts the post identifier from a Blogger entry ID."""
matcher = re.compile('post-(\d+)')
matches = matcher.search(text)
return matches.group(1)
def _ParsePageId(self, text):
"""Extracts the page identifier from a Blogger entry ID."""
matcher = re.compile('page-(\d+)')
matches = matcher.search(text)
return matches.group(1)
if __name__ == '__main__':
if len(sys.argv) <= 1:
print 'Usage: %s <blogger_export_file>' % os.path.basename(sys.argv[0])
print
print ' Outputs the converted WordPress export file to standard out.'
sys.exit(-1)
wp_xml_file = open(sys.argv[1])
wp_xml_doc = wp_xml_file.read()
translator = Blogger2Wordpress(wp_xml_doc)
print translator.Translate()
wp_xml_file.close()
This scripts outputs the wxr file in the terminal window which is useless for me when the import file has tons of entries.
As I am not familiar with python, how can I modify the script to output the data into a .xml file?
Edit:
I did changed the end of the script to:
wp_xml_file = open(sys.argv[1])
wp_xml_doc = wp_xml_file.read()
translator = Blogger2Wordpress(wp_xml_doc)
print translator.Translate()
fh = open("testoutput.xml", "w")
fh.write(wp_xml_doc);
fh.close();
wp_xml_file.close()
But the produced file is an "invalid wxr file" :/
Can anybody help? Thanks!
Quick and dirty answer:
Output to the stdout is normal behaviour.
You might want to redirect it to a file for instance:
python2 blogger2wordpress your_blogger_export_file > backup
The output will be saved in the file named backup.
Or you can replace print translator.Translate() by
with open('output_file', 'w') as fd:
fd.write(translator.Translate())
This should do the trick (haven't tried).
I know this topic is talked many times in Stackoverflow but it concerns many different methods and I need help. I'm stuck since four hours ^^'
Here is the message : local variable 'menuItem' referenced before assignment
def B2BpartnerMenuDetailModify(request, partnerId, menuId, menuItemId):
message = ''
e = B2BpartnerUser(request, partnerId)
try:
menuDetail = Menu.objects.get(id=menuId)
except Menu.DoesNotExist:
return logoutUser(request)
if request.method == 'POST':
form = MenuDetailForm(request.POST, mySelf=partnerId)
if form.is_valid():
descrShort = form.cleaned_data['descrShort']
paragraph = form.cleaned_data['paragraph']
producteur = form.cleaned_data['producteur']
position = MenuItem.objects.filter(menuId = menuDetail).filter(paragraph = paragraph).count() + 1
menuItem = MenuItem(menuId = menuDetail)
menuItem.descrShort = descrShort
menuItem.paragraph = paragraph
menuItem.producteur = producteur
menuItem.save()
if producteur > 0:
menuItemProd = MenuItemProd(menuItemId = menuItem)
menuItemProd.entrepriseId = producteur
menuItemProd.save()
message = _('Details modified successfuly')
else:
data = {'descrShort': menuItem.descrShort, 'paragraph': menuItem.paragraph, 'producteur': menuItem.producteur}
form = MenuDetailForm(initial=data)
menuItems = MenuItem.objects.filter(menuId = menuDetail).select_related()
menus = Menu.objects.filter(entrepriseId=e)
menuParagraph = MenuParagraph.objects.filter(actif=1)
modifier = True
#detail = False
return render (request, 'front/B2Bmenu.html', {'MenuDetailForm': form, 'menus': menus, 'message': message, 'partnerId': partnerId, 'modifier': modifier, 'detail': detail, 'menuDetail': menuDetail, 'menuParagraph': menuParagraph, 'menuId': menuId, 'menuItems': menuItems})
I'm sure I can get my page when this error is resolved. I'm sure it's a little error, I'm a beginner at Python but I love the language :)
If you want I can give you more details but I don't think it's necessary ^^
Have a nice day and thank you for your help :)
I found it !
I just forgot to add another try for this variable, just after the first try.
try:
menuItem = MenuItem.objects.get(id=menuItemId)
except MenuItem.DoesNotExist:
return logoutUser(request)
I have a Jupyter notebook that includes python variables in markdown cells like this:
code cell:
x = 10
markdown cell:
The value of x is {{x}}.
The IPython-notebook-extension Python Markdown allows me to dynamically display these variables if I execute the markdown cell with shift-enter in the notebook.
markdown cell:
The value of x is 10.
I would like to programmatically execute all cells in the notebook and save them to a new notebook using something like this:
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
with open('report.ipynb') as f:
nb = nbformat.read(f, as_version=4)
ep = ExecutePreprocessor(timeout=600, kernel_name='python3')
ep.preprocess(nb, {})
with open('report_executed.ipynb', 'wt') as f:
nbformat.write(nb, f)
This will execute the code cells but not the markdown cells. They still look like this:
The value of x is {{x}}.
I think the issue is that the notebook is not trusted. Is there a way to tell ExecutePreprocessor to trust the notebook? Is there another way to programmatically execute a notebook including python variables in the markdown cells?
The ExecutePreprocessor only looks at code cells, so your markdown cells are completely untouched. To do markdown processing, you need the Python Markdown preprocessor, as you have stated.
Unfortunately, the Python Markdown preprocessor system only executes the code in a live notebook, which it does by modifying the javascript involved with rendering cells. The modification stores the results of executing the code snippets in the cell metadata.
The PyMarkdownPreprocessor class (in pre_pymarkdown.py) was designed to be used with nbconvert operating on notebooks that had been rendered first in a live notebook setting. It processes markdown cells, replacing {{}} patterns with the values stored in the metadata.
In your situation, however, you don't have the live notebook metadata. I had a similar problem, and I solved it by writing my own execution preprocessor that also included logic to handle the markdown cells:
from nbconvert.preprocessors import ExecutePreprocessor, Preprocessor
import nbformat, nbconvert
from textwrap import dedent
class ExecuteCodeMarkdownPreprocessor(ExecutePreprocessor):
def __init__(self, **kw):
self.sections = {'default': True} # maps section ID to true or false
self.EmptyCell = nbformat.v4.nbbase.new_raw_cell("")
return super().__init__(**kw)
def preprocess_cell(self, cell, resources, cell_index):
"""
Executes a single code cell. See base.py for details.
To execute all cells see :meth:`preprocess`.
"""
if cell.cell_type not in ['code','markdown']:
return cell, resources
if cell.cell_type == 'code':
# Do code stuff
return self.preprocess_code_cell(cell, resources, cell_index)
elif cell.cell_type == 'markdown':
# Do markdown stuff
return self.preprocess_markdown_cell(cell, resources, cell_index)
else:
# Don't do anything
return cell, resources
def preprocess_code_cell(self, cell, resources, cell_index):
''' Process code cell.
'''
outputs = self.run_cell(cell)
cell.outputs = outputs
if not self.allow_errors:
for out in outputs:
if out.output_type == 'error':
pattern = u"""\
An error occurred while executing the following cell:
------------------
{cell.source}
------------------
{out.ename}: {out.evalue}
"""
msg = dedent(pattern).format(out=out, cell=cell)
raise nbconvert.preprocessors.execute.CellExecutionError(msg)
return cell, resources
def preprocess_markdown_cell(self, cell, resources, cell_index):
# Find and execute snippets of code
cell['metadata']['variables'] = {}
for m in re.finditer("{{(.*?)}}", cell.source):
# Execute code
fakecell = nbformat.v4.nbbase.new_code_cell(m.group(1))
fakecell, resources = self.preprocess_code_cell(fakecell, resources, cell_index)
# Output found in cell.outputs
# Put output in cell['metadata']['variables']
for output in fakecell.outputs:
html = self.convert_output_to_html(output)
if html is not None:
cell['metadata']['variables'][fakecell.source] = html
break
return cell, resources
def convert_output_to_html(self, output):
'''Convert IOpub output to HTML
See https://github.com/ipython-contrib/IPython-notebook-extensions/blob/master/nbextensions/usability/python-markdown/main.js
'''
if output['output_type'] == 'error':
text = '**' + output.ename + '**: ' + output.evalue;
return text
elif output.output_type == 'execute_result' or output.output_type == 'display_data':
data = output.data
if 'text/latex' in data:
html = data['text/latex']
return html
elif 'image/svg+xml' in data:
# Not supported
#var svg = ul['image/svg+xml'];
#/* embed SVG in an <img> tag, still get eaten by sanitizer... */
#svg = btoa(svg);
#html = '<img src="data:image/svg+xml;base64,' + svg + '"/>';
return None
elif 'image/jpeg' in data:
jpeg = data['image/jpeg']
html = '<img src="data:image/jpeg;base64,' + jpeg + '"/>'
return html
elif 'image/png' in data:
png = data['image/png']
html = '<img src="data:image/png;base64,' + png + '"/>'
return html
elif 'text/markdown' in data:
text = data['text/markdown']
return text
elif 'text/html' in data:
html = data['text/html']
return html
elif 'text/plain' in data:
text = data['text/plain']
# Strip <p> and </p> tags
# Strip quotes
# html.match(/<p>([\s\S]*?)<\/p>/)[1]
text = re.sub(r'<p>([\s\S]*?)<\/p>', r'\1', text)
text = re.sub(r"'([\s\S]*?)'",r'\1', text)
return text
else:
# Some tag we don't support
return None
else:
return None
You can then process you notebook with logic similar to your posted code:
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
import ExecuteCodeMarkdownPreprocessor # from wherever you put it
import PyMarkdownPreprocessor # from pre_pymarkdown.py
with open('report.ipynb') as f:
nb = nbformat.read(f, as_version=4)
ep = ExecuteCodeMarkdownPreprocessor(timeout=600, kernel_name='python3')
ep.preprocess(nb, {})
pymk = PyMarkdownPreprocessor()
pymk.preprocess(nb, {})
with open('report_executed.ipynb', 'wt') as f:
nbformat.write(nb, f)
Note that by including the Python Markdown preprocessing, your resultant notebook file will no longer have the {{}} syntax in the markdown cells - the markdown will have static content. If the recipient of the resultant notebook changes the code and executes again, the markdown will not be updated. However, if you are exporting to a different format (such as HTML), then you do want to replace the {{}} syntax with static content.
Update 2021.06.29
This again needs to be updated due to changes in nbconvert to using nbclient call (https://github.com/jupyter/nbconvert/commit/e7bf8350435a66cc50faf29ff12df492be5d7f57#diff-bee04d71b1dfc0202a0239b1513fd81d983edc339a9734ca4f4813276feed032). Since run_cell is no longer available, both the code and markdown cell processing needs to be modified. This works:
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
from nbconvert.preprocessors import execute
import re
# Taken from:
# https://stackoverflow.com/questions/35805121/execute-a-jupyter-notebook-including-inline-markdown-with-nbconvert, modified to avoid using superseeded run_cell calls.
class ExecuteCodeMarkdownPreprocessor(ExecutePreprocessor):
def __init__(self, **kw):
self.sections = {'default': True} # maps section ID to true or false
self.EmptyCell = nbformat.v4.nbbase.new_raw_cell("")
super().__init__(**kw)
def preprocess_cell(self, cell, resources, cell_index, store_history=True):
"""
Executes a single code cell. See base.py for details.
To execute all cells see :meth:`preprocess`.
"""
if cell.cell_type not in ['code', 'markdown']:
return cell, resources
if cell.cell_type == 'code':
# Do code stuff
return self.preprocess_code_cell(cell, resources, cell_index, store_history)
elif cell.cell_type == 'markdown':
# Do markdown stuff
cell, resources = self.preprocess_markdown_cell(cell, resources, cell_index, store_history)
return cell, resources
else:
# Don't do anything
return cell, resources
def preprocess_code_cell(self, cell, resources, cell_index, store_history):
""" Process code cell. Follow preprocess_cell from ExecutePreprocessor
"""
self._check_assign_resources(resources)
cell = self.execute_cell(cell, cell_index, store_history=True)
return cell, self.resources
def preprocess_markdown_cell(self, cell, resources, cell_index, store_history):
# Find and execute snippets of code
cell['metadata']['variables'] = {}
for m in re.finditer("{{(.*?)}}", cell.source):
# Execute code
self.nb.cells.append(nbformat.v4.nbbase.new_code_cell(m.group(1)))
fakecell, resources = self.preprocess_code_cell(self.nb.cells[-1], resources, len(self.nb.cells)-1, store_history)
self.nb.cells.pop()
# Output found in cell.outputs
# Put output in cell['metadata']['variables']
for output in fakecell.outputs:
html = self.convert_output_to_html(output)
if html is not None:
cell['metadata']['variables'][fakecell.source] = html
break
return cell, resources
def convert_output_to_html(self, output):
"""Convert IOpub output to HTML
See https://github.com/ipython-contrib/IPython-notebook-extensions/blob/master/nbextensions/usability/python-markdown/main.js
"""
if output['output_type'] == 'error':
text = '**' + output.ename + '**: ' + output.evalue
return text
elif output.output_type == 'execute_result' or output.output_type == 'display_data':
data = output.data
if 'text/latex' in data:
html = data['text/latex']
return html
elif 'image/svg+xml' in data:
# Not supported
#var svg = ul['image/svg+xml'];
#/* embed SVG in an <img> tag, still get eaten by sanitizer... */
#svg = btoa(svg);
#html = '<img src="data:image/svg+xml;base64,' + svg + '"/>';
return None
elif 'image/jpeg' in data:
jpeg = data['image/jpeg']
html = '<img src="data:image/jpeg;base64,' + jpeg + '"/>'
return html
elif 'image/png' in data:
png = data['image/png']
html = '<img src="data:image/png;base64,' + png + '"/>'
return html
elif 'text/markdown' in data:
text = data['text/markdown']
return text
elif 'text/html' in data:
html = data['text/html']
return html
elif 'text/plain' in data:
text = data['text/plain']
# Strip <p> and </p> tags
# Strip quotes
# html.match(/<p>([\s\S]*?)<\/p>/)[1]
text = re.sub(r'<p>([\s\S]*?)<\/p>', r'\1', text)
text = re.sub(r"'([\s\S]*?)'",r'\1', text)
return text
else:
# Some tag we don't support
return None
else:
return None
Usage remains the same.
Update 2020-07-08
The answer provided by #gordon-bean was a life-saver for me. In my final round of googling before giving up I found this answer, so before I continue I just want to say thanks!
However, slightly over 4 years after the original answer jupyter / nbconvert went through some changes and the provided code needs to be updated. So here it is:
from nbconvert.preprocessors import ExecutePreprocessor
from nbconvert.preprocessors import execute
import nbformat
import re
# Taken from:
# https://stackoverflow.com/questions/35805121/execute-a-jupyter-notebook-including-inline-markdown-with-nbconvert
class ExecuteCodeMarkdownPreprocessor(ExecutePreprocessor):
def __init__(self, **kw):
self.sections = {'default': True} # maps section ID to true or false
self.EmptyCell = nbformat.v4.nbbase.new_raw_cell("")
super().__init__(**kw)
def preprocess_cell(self, cell, resources, cell_index, store_history=True):
"""
Executes a single code cell. See base.py for details.
To execute all cells see :meth:`preprocess`.
"""
if cell.cell_type not in ['code', 'markdown']:
return cell, resources
if cell.cell_type == 'code':
# Do code stuff
return self.preprocess_code_cell(cell, resources, cell_index, store_history)
elif cell.cell_type == 'markdown':
# Do markdown stuff
return self.preprocess_markdown_cell(cell, resources, cell_index, store_history)
else:
# Don't do anything
return cell, resources
def preprocess_code_cell(self, cell, resources, cell_index, store_history):
""" Process code cell.
"""
# outputs = self.run_cell(cell)
reply, outputs = self.run_cell(cell, cell_index, store_history)
cell.outputs = outputs
cell_allows_errors = (self.allow_errors or "raises-exception"
in cell.metadata.get("tags", []))
if self.force_raise_errors or not cell_allows_errors:
for out in cell.outputs:
if out.output_type == 'error':
raise execute.CellExecutionError.from_cell_and_msg(cell, out)
if (reply is not None) and reply['content']['status'] == 'error':
raise execute.CellExecutionError.from_cell_and_msg(cell, reply['content'])
return cell, resources
def preprocess_markdown_cell(self, cell, resources, cell_index, store_history):
# Find and execute snippets of code
cell['metadata']['variables'] = {}
for m in re.finditer("{{(.*?)}}", cell.source):
# Execute code
fakecell = nbformat.v4.nbbase.new_code_cell(m.group(1))
fakecell, resources = self.preprocess_code_cell(fakecell, resources, cell_index, store_history)
# Output found in cell.outputs
# Put output in cell['metadata']['variables']
for output in fakecell.outputs:
html = self.convert_output_to_html(output)
if html is not None:
cell['metadata']['variables'][fakecell.source] = html
break
return cell, resources
def convert_output_to_html(self, output):
"""Convert IOpub output to HTML
See https://github.com/ipython-contrib/IPython-notebook-extensions/blob/master/nbextensions/usability/python-markdown/main.js
"""
if output['output_type'] == 'error':
text = '**' + output.ename + '**: ' + output.evalue
return text
elif output.output_type == 'execute_result' or output.output_type == 'display_data':
data = output.data
if 'text/latex' in data:
html = data['text/latex']
return html
elif 'image/svg+xml' in data:
# Not supported
#var svg = ul['image/svg+xml'];
#/* embed SVG in an <img> tag, still get eaten by sanitizer... */
#svg = btoa(svg);
#html = '<img src="data:image/svg+xml;base64,' + svg + '"/>';
return None
elif 'image/jpeg' in data:
jpeg = data['image/jpeg']
html = '<img src="data:image/jpeg;base64,' + jpeg + '"/>'
return html
elif 'image/png' in data:
png = data['image/png']
html = '<img src="data:image/png;base64,' + png + '"/>'
return html
elif 'text/markdown' in data:
text = data['text/markdown']
return text
elif 'text/html' in data:
html = data['text/html']
return html
elif 'text/plain' in data:
text = data['text/plain']
# Strip <p> and </p> tags
# Strip quotes
# html.match(/<p>([\s\S]*?)<\/p>/)[1]
text = re.sub(r'<p>([\s\S]*?)<\/p>', r'\1', text)
text = re.sub(r"'([\s\S]*?)'",r'\1', text)
return text
else:
# Some tag we don't support
return None
else:
return None
Usage of this code remains exactly the same as reported by Gordon Bean.