PURPOSE: I am currently working with rdkit to colour the structures of my molecules according to rdkit.Chem.Draw.SimilarityMaps. Now, I would like to use the matplotlib images SimilarityMaps function to introduce them in a pandas dataframe and export this table in the form of an html file.
CODE: I tried to do that with the following code
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import SimilarityMaps
from rdkit.Chem.Draw import IPythonConsole #Needed to show molecules
from rdkit.Chem.Draw.MolDrawing import MolDrawing, DrawingOptions
df = pd.DataFrame({'smiles':['Nc1nc(NC2CC2)c3ncn([C##H]4C[C#H](CO)C=C4)c3n1','CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(c1)C(C)=O','CCN(CC)CCNC(=O)C1=CC=C(C=C1)NC(=O)C','CC(=O)NC1=CC=C(C=C1)O','CC(=O)Nc1sc(nn1)[S](N)(=O)=O']})
def getSim(smi):
mol = Chem.MolFromSmiles(smi)
refmol = Chem.MolFromSmiles('c1ccccc1')
fp = SimilarityMaps.GetMorganFingerprint(mol, fpType='bv')
fig, maxweight = SimilarityMaps.GetSimilarityMapForFingerprint(refmol, mol, SimilarityMaps.GetMorganFingerprint)
return fig
df['map'] = df['smiles'].map(getSim)
df.to_html('/.../test.html')
When I open the file test.html, the map column contains the information "Figure (200x200)". I check if my dataframe map column contains object: it's OK in python but not in html file.
QUESTION: I'm not sure how to get a dataframe with images and I'd like to have the help of the community to clarify this subject.
Thanks in advance
What you see as Figure (200x200) is the __repr__ string of the matplotlib Figure class. It is the text representation of that python object (the same that you would see when doing print(fig)).
What you want instead is to have an actual image in the table. An easy option would be to save the matplotlib figure as png image, create an html tag, <img src="some.png" /> and hence show the table.
import pandas as pd
import numpy as np;np.random.seed(1)
import matplotlib.pyplot as plt
import matplotlib.colors
df = pd.DataFrame({"info" : np.random.randint(0,10,10),
"status" : np.random.randint(0,3,10)})
cmap = matplotlib.colors.ListedColormap(["crimson","orange","limegreen"])
def createFigure(i):
fig, ax = plt.subplots(figsize=(.4,.4))
fig.subplots_adjust(0,0,1,1)
ax.axis("off")
ax.axis([0,1,0,1])
c = plt.Circle((.5,.5), .4, color=cmap(i))
ax.add_patch(c)
ax.text(.5,.5, str(i), ha="center", va="center")
return fig
def mapping(i):
fig = createFigure(i)
fname = "data/map_{}.png".format(i)
fig.savefig(fname)
imgstr = '<img src="{}" /> '.format(fname)
return imgstr
df['image'] = df['status'].map(mapping)
df.to_html('test.html', escape=False)
The drawback of this is that you have a lot of images saved somewhere on disk. If this is not desired, you may store the image encoded as base64 in the html file, <img src="data:image/png;base64,iVBORw0KGgoAAAAN..." />.
import pandas as pd
import numpy as np;np.random.seed(1)
import matplotlib.pyplot as plt
import matplotlib.colors
from io import BytesIO
import base64
df = pd.DataFrame({"info" : np.random.randint(0,10,10),
"status" : np.random.randint(0,3,10)})
cmap = matplotlib.colors.ListedColormap(["crimson","orange","limegreen"])
def createFigure(i):
fig, ax = plt.subplots(figsize=(.4,.4))
fig.subplots_adjust(0,0,1,1)
ax.axis("off")
ax.axis([0,1,0,1])
c = plt.Circle((.5,.5), .4, color=cmap(i))
ax.add_patch(c)
ax.text(.5,.5, str(i), ha="center", va="center")
return fig
def fig2inlinehtml(fig,i):
figfile = BytesIO()
fig.savefig(figfile, format='png')
figfile.seek(0)
# for python 2.7:
#figdata_png = base64.b64encode(figfile.getvalue())
# for python 3.x:
figdata_png = base64.b64encode(figfile.getvalue()).decode()
imgstr = '<img src="data:image/png;base64,{}" />'.format(figdata_png)
return imgstr
def mapping(i):
fig = createFigure(i)
return fig2inlinehtml(fig,i)
with pd.option_context('display.max_colwidth', -1):
df.to_html('test.html', escape=False, formatters=dict(status=mapping))
The output looks the same, but there are no images saved to disk.
This also works nicely in a Jupyter Notebook, with a small modification,
from IPython.display import HTML
# ...
pd.set_option('display.max_colwidth', -1)
HTML(df.to_html(escape=False, formatters=dict(status=mapping)))
Related
The following code display the image and audio in the top-bottom style:
Here is the test code:
import librosa
import matplotlib.pyplot as plt
import IPython.display as ipd
def plot_it(name, audio, sample_rate):
plt.figure(figsize=(8, 1))
plt.plot(audio)
plt.gca().set_title(name)
plt.show()
ipd.display(ipd.Audio(data=audio, rate=sample_rate))
Is it possible for changing the "top-bottom" style to "left-right" style for displaying the audio at the right side of the plt figure?
You can use a GridspecLayout which is similar to matplotlib's GridSpec. In order to direct to output into the needed grid cells, you can capture it using the Output widget:
import librosa
import matplotlib.pyplot as plt
import IPython.display as ipd
from ipywidgets import Output, GridspecLayout
def plot_it(name, audio, sample_rate):
grid = GridspecLayout(1, 2, align_items='center')
out = Output()
with out:
fig, ax = plt.subplots(figsize=(8, 1))
ax.plot(audio)
ax.set_title(name)
plt.close(fig)
ipd.display(ax.figure)
grid[0, 0] = out
out = Output()
with out:
ipd.display(ipd.Audio(data=audio, rate=sample_rate))
grid[0, 1] = out
ipd.display(grid)
name = 'nutcracker'
filename = librosa.example(name)
y, sr = librosa.load(filename)
plot_it(name, y, sr)
(It is essential to close the figure, otherwise you'll have double output of the figure. This is easier to do this using the OOP than the pyplot interface, that's why I changed your matplotlib code a bit)
I am iterating through files in folder and for each file I am plotting the close_price on x-axis and date on y-axis.
here is code.Everything is working fine except I want title "abc" to appear on each page but it not coming. What am I doing wrong here.
import os
from matplotlib.backends.backend_pdf import PdfPages
import pandas as pd
import matplotlib.pyplot as plt
pp = PdfPages('multipage.pdf')
pth = "D:/Technical_Data/"
for fle in os.listdir(pth):
df = pd.read_csv(os.path.join(pth, fle),usecols=(0, 4))
if not df.empty:
df=df.astype(float)
plt.title("abc")
df.plot()
pp.savefig()
pp.close()
You should pass the title as an argument of the plot() method, like:
import os
from matplotlib.backends.backend_pdf import PdfPages
import pandas as pd
import matplotlib.pyplot as plt
pp = PdfPages('multipage.pdf')
pth = "D:/Technical_Data/"
for fle in os.listdir(pth):
df = pd.read_csv(os.path.join(pth, fle),usecols=(0, 4))
if not df.empty:
df=df.astype(float)
df.plot(title="abc")
pp.savefig()
pp.close()
Another way would be to put plt.title("abc") after df.plot(). Currently, your title "abc" was overwritten by the default title of df.plot()… which is None.
I have to create a group of matplotlib figures, which I would like to directly present in a PDF report without saving them as a file.
The data for my plots is stored in a Pandas DataFrame:
Right now I do not know other better option than first save the image and use it later.
I am doing something like that:
import matplotlib.pylab as plt
from reportlab.platypus import BaseDocTemplate, Image
for index, row in myDataFrame.iterrows():
fig = plt.figure()
plt.plot(row['Xvalues'], row['Yvalues'],'o', color='r')
fig.savefig('figure_%s.png' % (row['ID']))
plt.close(fig)
text = []
doc = BaseDocTemplate(pageName, pagesize=landscape(A4))
for f in listdir(myFolder):
if f.endswith('png'):
image1 = Image(f)
text.append(image1)
doc.build(text)
Here is the best solution provided by matplotlib itself:
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
with PdfPages('foo.pdf') as pdf:
#As many times as you like, create a figure fig and save it:
fig = plt.figure()
pdf.savefig(fig)
....
fig = plt.figure()
pdf.savefig(fig)
Voilà
Find a full example here: multipage pdf matplotlib
I think you can save the figure into a buffer using io.BytessIO and use that in platypus. Something like this perhaps?
import io
import matplotlib.pylab as plt
from reportlab.platypus import BaseDocTemplate, Image
buffers = []
for index, row in myDataFrame.iterrows():
fig = plt.figure()
plt.plot(row['Xvalues'], row['Yvalues'],'o', color='r')
mybuffer = io.BytesIO()
fig.savefig(mybuffer, format = 'pdf')
mybuffer.seek(0)
buffers.append(mybuffer)
plt.close(fig)
text = []
doc = BaseDocTemplate(pageName, pagesize=landscape(A4))
doc.build(buffers)
using my package autobasedoc https://pypi.org/project/autobasedoc/ your example would look like that:
from autobasedoc import autorpt as ar
from autobasedoc import autoplot as ap
#ap.autoPdfImg
def my_plot(index, row, canvaswidth=5): #[inch]
fig, ax = ap.plt.subplots(figsize=(canvaswidth,canvaswidth))
fig.suptitle(f"My simple plot {index}", fontproperties=fontprop)
ax.plot(row['Xvalues'], row['Yvalues'],label=f"legendlabel{index}")
return fig
doc = ar.AutoDocTemplate(pageName)
content = []
for index, row in myDataFrame.iterrows():
content.append(my_plot(index, row))
doc.build(content)
This may seem to be a useless feature but it would be very helpful for me. I would like to save the output I get inside Canopy IDE. I would not think this is specific to Canopy but for the sake of clarity that is what I use. For example, my console Out[2] is what I would want from this:
I think that the formatting is quite nice and to reproduce this each time instead of just saving the output would be a waste of time. So my question is, how can I get a handle on this figure? Ideally the implimentation would be similar to standard methods, such that it could be done like this:
from matplotlib.backends.backend_pdf import PdfPages
pp = PdfPages('Output.pdf')
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
df.plot(how='table')
pp.savefig()
pp.close()
NOTE: I realize that a very similar question has been asked before ( How to save the Pandas dataframe/series data as a figure? ) but it never received an answer and I think I have stated the question more clearly.
Here is a somewhat hackish solution but it gets the job done. You wanted a .pdf but you get a bonus .png. :)
import numpy as np
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
from PySide.QtGui import QImage
from PySide.QtGui import QPainter
from PySide.QtCore import QSize
from PySide.QtWebKit import QWebPage
arrays = [np.hstack([ ['one']*3, ['two']*3]), ['Dog', 'Bird', 'Cat']*2]
columns = pd.MultiIndex.from_arrays(arrays, names=['foo', 'bar'])
df =pd.DataFrame(np.zeros((3,6)),columns=columns,index=pd.date_range('20000103',periods=3))
h = "<!DOCTYPE html> <html> <body> <p> " + df.to_html() + " </p> </body> </html>";
page = QWebPage()
page.setViewportSize(QSize(5000,5000))
frame = page.mainFrame()
frame.setHtml(h, "text/html")
img = QImage(1000,700, QImage.Format(5))
painter = QPainter(img)
frame.render(painter)
painter.end()
a = img.save("html.png")
pp = PdfPages('html.pdf')
fig = plt.figure(figsize=(8,6),dpi=1080)
ax = fig.add_subplot(1, 1, 1)
img2 = plt.imread("html.png")
plt.axis('off')
ax.imshow(img2)
pp.savefig()
pp.close()
Edits welcome.
It is, I believe, an HTML table that your IDE is rendering. This is what ipython notebook does.
You can get a handle to it thusly:
from IPython.display import HTML
import pandas as pd
data = pd.DataFrame({'spam':['ham','green','five',0,'kitties'],
'eggs':[0,1,2,3,4]})
h = HTML(data.to_html())
h
and save to an HTML file:
my_file = open('some_file.html', 'w')
my_file.write(h.data)
my_file.close()
I think what is needed here is a consistent way of outputting a table to a pdf file amongst graphs output to pdf.
My first thought is not to use the matplotlib backend i.e.
from matplotlib.backends.backend_pdf import PdfPages
because it seemed somewhat limited in formatting options and leaned towards formatting the table as an image (thus rendering the text of the table in a non-selectable format)
If you want to mix dataframe output and matplotlib plots in a pdf without using the matplotlib pdf backend, I can think of two ways.
Generate your pdf of matplotlib figures as before and then insert pages containing the dataframe table afterwards. I view this as a difficult option.
Use a different library to generate the pdf. I illustrate one option to do this below.
First, install xhtml2pdf library. This seems a little patchily supported, but is active on Github and has some basic usage documentation here. You can install it via pip i.e. pip install xhtml2pdf
Once you've done that, here is a barebones example embedding a matplotlib figure, then the table (all text selectable), then another figure. You can play around with CSS etc to alter the formatting to your exact specifications, but I think this fulfils the brief:
from xhtml2pdf import pisa # this is the module that will do the work
import numpy as np
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
# Utility function
def convertHtmlToPdf(sourceHtml, outputFilename):
# open output file for writing (truncated binary)
resultFile = open(outputFilename, "w+b")
# convert HTML to PDF
pisaStatus = pisa.CreatePDF(
sourceHtml, # the HTML to convert
dest=resultFile, # file handle to recieve result
path='.') # this path is needed so relative paths for
# temporary image sources work
# close output file
resultFile.close() # close output file
# return True on success and False on errors
return pisaStatus.err
# Main program
if __name__=='__main__':
arrays = [np.hstack([ ['one']*3, ['two']*3]), ['Dog', 'Bird', 'Cat']*2]
columns = pd.MultiIndex.from_arrays(arrays, names=['foo', 'bar'])
df = pd.DataFrame(np.zeros((3,6)),columns=columns,index=pd.date_range('20000103',periods=3))
# Define your data
sourceHtml = '<html><head>'
# add some table CSS in head
sourceHtml += '''<style>
table, td, th {
border-style: double;
border-width: 3px;
}
td,th {
padding: 5px;
}
</style>'''
sourceHtml += '</head><body>'
#Add a matplotlib figure(s)
plt.plot(range(20))
plt.savefig('tmp1.jpg')
sourceHtml += '\n<p><img src="tmp1.jpg"></p>'
# Add the dataframe
sourceHtml += '\n<p>' + df.to_html() + '</p>'
#Add another matplotlib figure(s)
plt.plot(range(70,100))
plt.savefig('tmp2.jpg')
sourceHtml += '\n<p><img src="tmp2.jpg"></p>'
sourceHtml += '</body></html>'
outputFilename = 'test.pdf'
convertHtmlToPdf(sourceHtml, outputFilename)
Note There seems to be a bug in xhtml2pdf at the time of writing which means that some CSS is not respected. Particularly pertinent to this question is that it seems impossible to get double borders around the table
EDIT
In response comments, it became obvious that some users (well, at least #Keith who both answered and awarded a bounty!) want the table selectable, but definitely on a matplotlib axis. This is somewhat more in keeping with the original method. Hence - here is a method using the pdf backend for matplotlib and matplotlib objects only. I do not think the table looks as good - in particular the display of hierarchical column headers, but that's a matter of choice, I guess. I'm indebted to this answer and comments for the way to format axes for table display.
import numpy as np
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
# Main program
if __name__=='__main__':
pp = PdfPages('Output.pdf')
arrays = [np.hstack([ ['one']*3, ['two']*3]), ['Dog', 'Bird', 'Cat']*2]
columns = pd.MultiIndex.from_arrays(arrays, names=['foo', 'bar'])
df =pd.DataFrame(np.zeros((3,6)),columns=columns,index=pd.date_range('20000103',periods=3))
plt.plot(range(20))
pp.savefig()
plt.close()
# Calculate some sizes for formatting - constants are arbitrary - play around
nrows, ncols = len(df)+1, len(df.columns) + 10
hcell, wcell = 0.3, 1.
hpad, wpad = 0, 0
#put the table on a correctly sized figure
fig=plt.figure(figsize=(ncols*wcell+wpad, nrows*hcell+hpad))
plt.gca().axis('off')
matplotlib_tab = pd.tools.plotting.table(plt.gca(),df, loc='center')
pp.savefig()
plt.close()
#Add another matplotlib figure(s)
plt.plot(range(70,100))
pp.savefig()
plt.close()
pp.close()
I am writing an application in PyQt where I display some graphs with matplotlib. To do so, I use the following code :
from matplotlib.backends.backend_qt4agg import FigureCanvasQTAgg
import matplotlib.figure as fig
self.IntFig = fig.Figure()
self.IntBeamCanvas = FigureCanvasQTAgg(self.IntFig)
self.AxesIntInit = self.IntFig.add_subplot(111)
self.AxesIntInit.hold(False)
self.AxesIntInit.imshow(self.Int,extent =[-xx/2+xx/N,xx/2,-xx/2+xx/N,xx/2])
self.IntBeamCanvas.draw()
Later in the code I manage to save the figure created with the following code :
fname = QtGui.QFileDialog.getSaveFileName(self,'Save Intensity','C:' )
self.IntFig.savefig(str(fname))
But this saves only the figures (with its axes I mean). What if I want to save only the data
self.Int
that is displayed? I know the the pyplot.imsave method but don't know how to use it here, since I do not use pyplot but figure.Figure.
Does anyone has an idea?
You can save the image by following method:
import numpy as np
import pylab as pl
y, x = np.ogrid[-1:1:50j, -1:1:100j]
z = np.sqrt(x*x + y*y)
im = pl.imshow(z)
img = im.make_image()
h, w, s = img.as_rgba_str()
a = np.fromstring(s, dtype=np.uint8).reshape(h, w, 4)
pl.imsave("tmp.png", a)
the saved image: