Visualize 3 columns as a heatmap in seaborn / pandas [duplicate] - python

I need to create MatplotLib heatmap (pcolormesh) using Pandas DataFrame TimeSeries column (df_all.ts) as my X-axis.
How to convert Pandas TimeSeries column to something which can be used as X-axis in np.meshgrid(x, y) function to create heatmap? The workaround is to create Matplotlib drange using same parameters as in pandas column, but is there a simple way?
x = pd.date_range(df_all.ts.min(),df_all.ts.max(),freq='H')
xt = mdates.drange(df_all.ts.min(), df_all.ts.max(), dt.timedelta(hours=1))
y = arange(ylen)
X,Y = np.meshgrid(xt, y)

I do not know what you mean by heat map for a time series, but for a dataframe you may do as below:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import product
from string import ascii_uppercase
from matplotlib import patheffects
m, n = 4, 7 # 4 rows, 7 columns
df = pd.DataFrame(np.random.randn(m, n),
columns=list(ascii_uppercase[:n]),
index=list(ascii_uppercase[-m:]))
ax = plt.imshow(df, interpolation='nearest', cmap='Oranges').axes
_ = ax.set_xticks(np.linspace(0, n-1, n))
_ = ax.set_xticklabels(df.columns)
_ = ax.set_yticks(np.linspace(0, m-1, m))
_ = ax.set_yticklabels(df.index)
ax.grid('off')
ax.xaxis.tick_top()
optionally, to print actual values in the middle of each square, with some shadows for readability, you may do:
path_effects = [patheffects.withSimplePatchShadow(shadow_rgbFace=(1,1,1))]
for i, j in product(range(m), range(n)):
_ = ax.text(j, i, '{0:.2f}'.format(df.iloc[i, j]),
size='medium', ha='center', va='center',
path_effects=path_effects)

Related

How to visualize variations between columns through plot pandas

I have a data frame shown in the figure; some mismatches exist in the rows. I want to plot the first column versus all the other columns to depict the variation. Can anyone tell me how I can do that.?
You could use a 2D heatmap to show the differences:
For example, if you had a dataframe like this:
Dataframe:
U V W X Y Z
0 M M M M M M
1 K K R K K K
2 A A A A B A
3 I I I I I I
4 L L L L L L
You could use the following code to identify the differences to that in column U:
Code:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.DataFrame({"U":['M','K','A','I','L'], "V":['M','K','A','I','L'], "W":['M','R','A','I','L'], "X":['M','K','A','I','L'], "Y":['M','K','B','I','L'], "Z":['M','K','A','I','L']})
# Create a new dataframe with the differences between the values in each column and the values in the first column
diff_df = df.apply(lambda x: x != df['U'])
# Convert the dataframe to a numpy array
diff_arr = diff_df.values
cmap = plt.cm.RdYlGn_r
fig, ax = plt.subplots()
im = ax.imshow(diff_arr, cmap=cmap)
cbar = ax.figure.colorbar(im, ax=ax)
cbar.ax.set_ylabel('Differences', rotation=-90, va="bottom")
ax.set_xticks(np.arange(len(df.columns)))
ax.set_yticks(np.arange(len(df)))
ax.set_xticklabels(df.columns)
ax.set_yticklabels(df.index)
for i in range(len(df)):
for j in range(len(df.columns)):
text = ax.text(j, i, df.iloc[i, j], ha="center", va="center", color="w")
ax.set_title("Differences")
fig.tight_layout()
plt.show()
Output:

How to remove NaN values from matshow 2D plot and format labels as percentages

I am plotting some 2D array data as a map with a color scale and I have some NaN values that I don't want to show up. I need these values to show up as plain white squares and for the formatting to be in percentage style. Here is what I have so far:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
filename = "data.csv"
df = pd.read_csv(filename, header=None)
fig, ax = plt.subplots()
ax.matshow(df, cmap='bwr')
for (i, j), z in np.ndenumerate(df):
ax.text(j, i, '{:0.1f}'.format(z), ha='center', va='center',
bbox=dict(boxstyle='round', facecolor='white', edgecolor='0.3'))
plt.show()
This is the data I have:
...and here is what I get out right now. This is almost exactly what I want, except for those NaN values and the percentage formatting.
Any help would be very much appreciated.
Format the text string with an if-else block.
nan can only be checked with np.isnan
See Fixed digits after decimal with f-strings for additional information about f-string formatting.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# sample data
np.random.seed(365)
df = pd.DataFrame(np.random.random((4, 4)))
df.iloc[2, -1] = np.nan
df.iloc[3, 2:] = np.nan
# plot
fig, ax = plt.subplots()
ax.matshow(df, cmap='bwr')
for (i, j), z in np.ndenumerate(df):
if np.isnan(z):
z = '' # if z is nan make it an empty string
else:
z = f'{z*100:0.1f}%' # otherwise multiply by 100 and string format
ax.text(j, i, z, ha='center', va='center',
bbox=dict(boxstyle='round', facecolor='white', edgecolor='0.3'))
plt.show()

Matplotlib plot legend shows markers twice

The legend in my plot shows the marker icon twice in the legend
The code that produced this plot is given below
import pandas as pd
import random
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
N = 15
colors = cm.rainbow(np.linspace(0, 1, N))
df = []
for i in range(N):
s = 'NAME %d' % i
df.append(dict(x=random.random(), y=random.random(), name=s))
df = pd.DataFrame(df)
c = 0
labels = []
fig, ax = plt.subplots(figsize=(12,12))
for name, group in df.groupby('name'):
x = group['x'].values[0]
y = group['y'].values[0]
color = colors[c]
c += 1
ax.plot(x, y, color=color, marker='o', linestyle='', label=name)
labels.append(name)
handels, _ = ax.get_legend_handles_labels()
ax.legend(handels, labels)
Why is this happening?
My actual df has multiple entries for each name so that's why I do a groupby. Is there something I'm missing here?
you can either set plt.legend(loc=...,numpoints =1) directly or create a style sheet and set legend.numpoints : 1
If you use a linux system: place your stylesheets in ~/.config/matplotlib/stylelib/ you can use them with plt.style.use([your_style_sheet]). Additionally, you can e.g. make one sheet for the colors etc. and one for the size: plt.style.use([my_colors,half_column_latex])

Show values over matplotlib imshow plot [duplicate]

I need to create MatplotLib heatmap (pcolormesh) using Pandas DataFrame TimeSeries column (df_all.ts) as my X-axis.
How to convert Pandas TimeSeries column to something which can be used as X-axis in np.meshgrid(x, y) function to create heatmap? The workaround is to create Matplotlib drange using same parameters as in pandas column, but is there a simple way?
x = pd.date_range(df_all.ts.min(),df_all.ts.max(),freq='H')
xt = mdates.drange(df_all.ts.min(), df_all.ts.max(), dt.timedelta(hours=1))
y = arange(ylen)
X,Y = np.meshgrid(xt, y)
I do not know what you mean by heat map for a time series, but for a dataframe you may do as below:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import product
from string import ascii_uppercase
from matplotlib import patheffects
m, n = 4, 7 # 4 rows, 7 columns
df = pd.DataFrame(np.random.randn(m, n),
columns=list(ascii_uppercase[:n]),
index=list(ascii_uppercase[-m:]))
ax = plt.imshow(df, interpolation='nearest', cmap='Oranges').axes
_ = ax.set_xticks(np.linspace(0, n-1, n))
_ = ax.set_xticklabels(df.columns)
_ = ax.set_yticks(np.linspace(0, m-1, m))
_ = ax.set_yticklabels(df.index)
ax.grid('off')
ax.xaxis.tick_top()
optionally, to print actual values in the middle of each square, with some shadows for readability, you may do:
path_effects = [patheffects.withSimplePatchShadow(shadow_rgbFace=(1,1,1))]
for i, j in product(range(m), range(n)):
_ = ax.text(j, i, '{0:.2f}'.format(df.iloc[i, j]),
size='medium', ha='center', va='center',
path_effects=path_effects)

How should I pass a matplotlib object through a function; as Axis, Axes or Figure?

Sorry in advance if this is a little long winded but if I cut it down too much the problem is lost. I am trying to make a module on top of pandas and matplotlib which will give me the ability to make profile plots and profile matrices analogous to scatter_matrix. I am pretty sure my problem comes down to what object I need to return from Profile() so that I can handle Axes manipulation in Profile_Matrix(). Then the question is what to return form Profile_Matrix() so I can edit subplots.
My module (ProfileModule.py) borrows a lot from https://github.com/pydata/pandas/blob/master/pandas/tools/plotting.py and looks like:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt
def Profile(x,y,nbins,xmin,xmax):
df = DataFrame({'x' : x , 'y' : y})
binedges = xmin + ((xmax-xmin)/nbins) * np.arange(nbins+1)
df['bin'] = np.digitize(df['x'],binedges)
bincenters = xmin + ((xmax-xmin)/nbins)*np.arange(nbins) + ((xmax-xmin)/(2*nbins))
ProfileFrame = DataFrame({'bincenters' : bincenters, 'N' : df['bin'].value_counts(sort=False)},index=range(1,nbins+1))
bins = ProfileFrame.index.values
for bin in bins:
ProfileFrame.ix[bin,'ymean'] = df.ix[df['bin']==bin,'y'].mean()
ProfileFrame.ix[bin,'yStandDev'] = df.ix[df['bin']==bin,'y'].std()
ProfileFrame.ix[bin,'yMeanError'] = ProfileFrame.ix[bin,'yStandDev'] / np.sqrt(ProfileFrame.ix[bin,'N'])
fig = plt.figure();
ax = ProfilePlot.add_subplot(1, 1, 1)
plt.errorbar(ProfileFrame['bincenters'], ProfileFrame['ymean'], yerr=ProfileFrame['yMeanError'], xerr=(xmax-xmin)/(2*nbins), fmt=None)
return ax
#or should I "return fig"
def Profile_Matrix(frame):
import pandas.core.common as com
import pandas.tools.plotting as plots
from pandas.compat import lrange
from matplotlib.artist import setp
range_padding=0.05
df = frame._get_numeric_data()
n = df.columns.size
fig, axes = plots._subplots(nrows=n, ncols=n, squeeze=False)
# no gaps between subplots
fig.subplots_adjust(wspace=0, hspace=0)
mask = com.notnull(df)
boundaries_list = []
for a in df.columns:
values = df[a].values[mask[a].values]
rmin_, rmax_ = np.min(values), np.max(values)
rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
boundaries_list.append((rmin_ - rdelta_ext, rmax_+ rdelta_ext))
for i, a in zip(lrange(n), df.columns):
for j, b in zip(lrange(n), df.columns):
ax = axes[i, j]
common = (mask[a] & mask[b]).values
nbins = 100
(xmin,xmax) = boundaries_list[i]
ax=Profile(df[b][common],df[a][common],nbins,xmin,xmax)
#Profile(df[b][common].values,df[a][common].values,nbins,xmin,xmax)
ax.set_xlabel('')
ax.set_ylabel('')
plots._label_axis(ax, kind='x', label=b, position='bottom', rotate=True)
plots._label_axis(ax, kind='y', label=a, position='left')
if j!= 0:
ax.yaxis.set_visible(False)
if i != n-1:
ax.xaxis.set_visible(False)
for ax in axes.flat:
setp(ax.get_xticklabels(), fontsize=8)
setp(ax.get_yticklabels(), fontsize=8)
return axes
This will run with something like:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt
import ProfileModule as pm
x = np.random.uniform(0, 100, size=1000)
y = x *x + 50*x*np.random.randn(1000)
z = x *y + 50*y*np.random.randn(1000)
nbins = 25
xmax = 100
xmin = 0
ProfilePlot = pm.Profile(x,y,nbins,xmin,xmax)
plt.title("Look this works!")
#This does not work as expected
frame = DataFrame({'z' : z,'x' : x , 'y' : y})
ProfileMatrix = pm.Profile_Matrix(frame)
plt.show()
This would hopefully produce a simple profile plot and a 3x3 profile matrix but it does not. I have tried various different methods to get this to work but I imagine it is not worth explaining them all.
I should mention I am using Enthought Canopy Express on Windows 7. Sorry for the long post and thanks again for any help with the code. This is my first week using Python.
You should pass around Axes objects and break your functions up to operate on a single axes at a time. You are close, but just change
import numpy as np
import matplotlib.pyplot as plt
def _profile(ax, x, y):
ln, = ax.plot(x, y)
# return the Artist created
return ln
def profile_matrix(n, m):
fig, ax_array = plt.subplots(n, m, sharex=True, sharey=True)
for ax in np.ravel(ax_array):
_profile(ax, np.arange(50), np.random.rand(50))
profile_matrix(3, 3)

Categories

Resources