How to make countour plot from dataframe [duplicate] - python

I want to draw a contour plot using table data.
I have 2 variables and response (3 columns).
I dont understand how to build this plot using it. I tried code below. But I had a next mistake: Input z must be 2D, not 1D.
feature_x = data.factor1
feature_y = data.factor2
# Creating 2-D grid of features
[X, Y] = np.meshgrid(feature_x, feature_y)
fig, ax = plt.subplots(1, 1)
Z = data.response
# plots filled contour plot
ax.contourf(X, Y, Z)
ax.set_title('Filled Contour Plot')
ax.set_xlabel('feature_x')
ax.set_ylabel('feature_y')
plt.show()
Data

To have a contour plot, z needs to be 2d matrix with all values for the points (x,y). You can think the data needed for a contour plot, as a DataFrame where index is x, columns are y and values are z. So z needs to be a 2d array of shape (x.size, y.size).
Since your z is not a 2d matrix but a 1d array, you cannot have a contour plot.
What you can do, for example, is a relplot with hue and/or size
import numpy as np
import pandas as pd
import seaborn as sns
x = np.array([1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3])
y = np.array([1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4])
z = np.array([249, 523, 603, 775, 577, 763, 808, 695, 642, 525, 795, 758])
df = pd.DataFrame({'x':x, 'y':y, 'z':z})
sns.relplot(
data=df,
x='x', y='y',
size='z', sizes=(10, 100),
hue='z',
palette='coolwarm',
);
EDIT
But... if you're looking for a continuous estimate, you can use gaussian_kde, for example
import scipy.stats as sps
import matplotlib.pyplot as plt
offset = .25
xmin = x.min()-offset
xmax = x.max()+offset
ymin = y.min()-offset
ymax = y.max()+offset
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([X.ravel(), Y.ravel()])
values = np.vstack([x, y])
kernel = sps.gaussian_kde(values, weights=z)
Z = np.reshape(kernel(positions).T, X.shape)
fig, ax = plt.subplots(figsize=(7, 7))
ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
extent=[xmin, xmax, ymin, ymax],
aspect='auto'
)
sns.scatterplot(
data=df,
x='x', y='y',
size='z', sizes=(10, 200),
color='k'
)
ax.set_xlim([xmin, xmax])
ax.set_ylim([ymin, ymax])
ax.legend(loc='upper left', bbox_to_anchor=(1,1))
plt.show()

Related

Confused about plotting interpolated 2D data with matplotlib

I have some unstructured 2D data that I would like to interpolate on a unit offset grid (ie grid indices start at 1 not 0) using scipy and plot using matplotlib. The code is below
import numpy as np
import matplotlib.pyplot as plt
import scipy.interpolate
# X Y Z
data = [[ 9, 2, 2.0],
[ 3, 3, 5.0],
[ 6, 4, 1.0],
[ 2, 6, 3.0],
[10, 7, 4.5],
[ 5, 8, 2.0]]
data = np.array(data)
coords = data[:, 0:2]
zvals = data[:, 2]
# Create the grid on which to interpolate (unit offset)
nx = 10
ny = 10
x = np.arange(nx)
x += 1
y = np.arange(ny)
y += 1
grid_x, grid_y = np.meshgrid(x, y, indexing='xy')
# Interpolate
grid_z1 = scipy.interpolate.griddata(coords, zvals, (grid_x, grid_y), method='linear')
# Plot the results
fig, axs = plt.subplots()
plt.imshow(grid_z1)
plt.plot(coords[:,0], coords[:,1], 'k.', ms=10)
plt.show()
The point data seem to be in the right place but matplotlib seems to be plotting the gridded data as zero-offset not unit-offset. I am obviously missing something - just not sure what. Thanks in advance!
Update
fig, axs = plt.subplots()
plt.imshow(grid_z1, origin='lower', extent=[1, 10, 1, 10])
plt.plot(coords[:,0], coords[:,1], 'k.', ms=10)
plt.show()
IIUC, you want to define xlim and ylim equal to the plot and not the heatmap:
plt.plot(coords[:,0], coords[:,1], 'k.', ms=10)
ax = plt.gca()
xlim, ylim = ax.get_xlim(), ax.get_ylim()
plt.imshow(grid_z1)
ax.set_xlim(xlim)
ax.set_ylim(ylim)
Output:

Difference between graphs

def PlotPolly(model, independent_variable, dependent_variabble, Name):
x_new = np.linspace(15, 55, 100)
y_new = model(x_new)
plt.plot(independent_variable, dependent_variabble, '.', x_new, y_new, '-') #4
plt.title('Polynomial Fit with Matplotlib for Price ~ Length')
ax = plt.gca()
ax.set_facecolor((0.898, 0.898, 0.898))
fig = plt.gcf()
plt.xlabel(Name)
plt.ylabel('Price of Cars')
plt.show()
plt.close()
I get this with this code:
But when from line 4 I remove x_new and y_new line becomes
plt.plot(independent_variable, dependent_variabble)
I get this graph :
Can you explain what is meaning of x_new and y_new and why absence of this results in this kind of graph
In your code x_new and y_new both of them have the continuous values but independent_variable and dependent_variabble have ā€¨discontinuous values and for plot discontinues you need scatter plot. see this example:
import numpy as np
import matplotlib.pyplot as plt
x = np.array([2, 1, 5, 3, 4, 2, 6, 4])
y = np.array([3, 1, 2, 0, 1, 2, 6, 4])
plt.plot(x, y, linestyle='-', marker='o')
Output:

How to create a step-plot with a gradient based on y-value?

In Python matplotlib, how can you get the line in a line or step plot to display a gradient based on the y-value?
Example plot (made in Tableau):
Code for step plot with a line that changes gradient according to x-value, adapted from this answer:
fig, ax = plt.subplots(figsize=(10, 4))
x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
y = [2, 3, 9, 10, 2, 9, 0, 1, 9, 1, -8]
T = np.linspace(0,1,np.size(x))**2
s = 1
for i in range(0, len(x)-s, s):
ax.step(x[i:i+s+1], y[i:i+s+1], marker='.', color=(0.0,0.5,T[i]))
ax.tick_params(axis='both', colors='lightgray', labelsize=8)
The following code is inspired by the multicolored-line example from the matplotlib docs. First the horizontal line segments are drawn and colored using their y-value. The vertical segments are subdivided in small chunks to colored individually.
vmin of the norm is set a bit lower to avoid the too-light range of the colormap.
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import numpy as np
x = np.arange(50)
y = np.random.randint(-3, 4, x.size).cumsum()
fig, ax = plt.subplots()
norm = plt.Normalize(y.min() - y.ptp() * .2, y.max())
cmap = 'inferno_r' # 'Reds'
horizontal_lines = np.array([x[:-1], y[:-1], x[1:], y[:-1]]).T.reshape(-1, 2, 2)
hor_lc = LineCollection(horizontal_lines, cmap=cmap, norm=norm)
hor_lc.set_array(y[:-1])
ax.add_collection(hor_lc)
factor = 10
long_y0 = np.linspace(y[:-1], y[1:], factor)[:-1, :].T.ravel()
long_y1 = np.linspace(y[:-1], y[1:], factor)[1:, :].T.ravel()
long_x = np.repeat(x[1:], factor - 1)
vertical_lines = np.array([long_x, long_y0, long_x, long_y1]).T.reshape(-1, 2, 2)
ver_lc = LineCollection(vertical_lines, cmap=cmap, norm=norm)
ver_lc.set_array((long_y0 + long_y1) / 2)
ax.add_collection(ver_lc)
ax.scatter(x, y, c=y, cmap=cmap, norm=norm)
plt.autoscale() # needed in case the scatter plot would be omited
plt.show()
Here is another example, with a black background. In this case the darkest part of the colormap is avoided. The changed code parts are:
y = np.random.randint(-9, 10, x.size)
ax.patch.set_color('black')
norm = plt.Normalize(y.min(), y.max() + y.ptp() * .2)
cmap = 'plasma_r'
Here is an example with a TwoSlopeNorm and the blue-white-red colormap:
from matplotlib.colors import TwoSlopeNorm
y = np.random.uniform(-1, 1, x.size * 10).cumsum()[::10]
y = (y - y.min()) / y.ptp() * 15 - 5
norm = TwoSlopeNorm(vmin=-5, vcenter=0, vmax=10)
cmap = 'bwr'

2D histogram plot from data file

How can you make a 2d histogram (or surface) plot out of a given 2d numpy array where the first index is the label for the 'x' dimension, the 2nd index for the 'y' dimension and the third is the value to be plotted? Example:
[[4.80000e+01 5.12000e+02 8.03447e+03]
[4.80000e+01 2.56000e+02 9.24963e+03]
[4.80000e+01 1.28000e+02 9.02154e+03]
[4.80000e+01 6.40000e+01 8.96437e+03]
[4.80000e+01 3.20000e+01 7.98477e+03]
[4.80000e+01 1.60000e+01 1.07688e+04]
[4.80000e+01 8.00000e+00 1.96227e+04]
[4.80000e+01 4.00000e+00 3.51944e+04]
[5.20000e+01 5.12000e+02 7.53994e+03]
[5.20000e+01 2.56000e+02 7.54521e+03]
[5.20000e+01 1.28000e+02 8.12631e+03]
[5.20000e+01 6.40000e+01 8.12542e+03]
[5.20000e+01 3.20000e+01 7.49664e+03]
[5.20000e+01 1.60000e+01 9.92450e+03]
...
The whole 2d area to be plotted is (18,8).
Since matplotlib seems to accept the data in a 2d array (meshgrid/contour) I was trying to first stuff the values into a newly created 2d numpy array with a loop but failed at getting the indices looked up. Like
xlu=np.unique(d[:,0])
ylu=np.unique(d[:,1])
z=np.empty((xdim,ydim))
for x in d:
z[ np.where(xlu==x[0])[0] , np.where(ylu==x[1])[0] ] = z[2]
Is there a quicker/more elegant way to do get to a 2d histogram/surface plot?
plt.tricontourf(d[:,0], d[:,1], d[:,2]) creates filled contours from x, y and z data which aren't organised as a grid.
plt.hist2d(d[:,0], d[:,1], weights=d[:,2]) creates a 2d histogram where the weight of each x,y is given by z.
from matplotlib import pyplot as plt
import numpy as np
d = np.random.randn(1000, 10, 3).cumsum(axis=0).reshape(-1, 3)
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 4))
sm1 = ax1.tricontourf(d[:, 0], d[:, 1], d[:, 2], cmap='hot')
plt.colorbar(sm1, ax=ax1)
ax1.set_title('plt.tricontourf()')
_, _, _, sm2 = ax2.hist2d(d[:, 0], d[:, 1], weights=d[:, 2], bins=40, cmap='hot')
ax2.set_title('plt.hist2d()')
plt.colorbar(sm2, ax=ax2)
plt.tight_layout()
plt.show()
It seems that you need to interpolate your data since it is irregularly spaced.
Here is a snippet from official documentation (link).
import matplotlib.tri as tri
import numpy as np
np.random.seed(19680801)
npts = 200
ngridx = 100
ngridy = 200
x = np.random.uniform(-2, 2, npts)
y = np.random.uniform(-2, 2, npts)
z = x * np.exp(-x**2 - y**2)
fig, (ax1, ax2) = plt.subplots(nrows=2)
# -----------------------
# Interpolation on a grid
# -----------------------
# A contour plot of irregularly spaced data coordinates
# via interpolation on a grid.
# Create grid values first.
xi = np.linspace(-2.1, 2.1, ngridx)
yi = np.linspace(-2.1, 2.1, ngridy)
# Linearly interpolate the data (x, y) on a grid defined by (xi, yi).
triang = tri.Triangulation(x, y)
interpolator = tri.LinearTriInterpolator(triang, z)
Xi, Yi = np.meshgrid(xi, yi)
zi = interpolator(Xi, Yi)
# Note that scipy.interpolate provides means to interpolate data on a grid
# as well. The following would be an alternative to the four lines above:
#from scipy.interpolate import griddata
#zi = griddata((x, y), z, (xi[None,:], yi[:,None]), method='linear')
ax1.contour(xi, yi, zi, levels=14, linewidths=0.5, colors='k')
What it essentially does, it interpolates the values of your function to a regular grid using trilinear interpolation. There are other ways to do it depending on your goals.

Limit/mask matplotlib contour to data area

I have a pandas DataFrame with non-uniformly spaced data points given by an x, y and z column, where x and y are pairs of variables and z is the dependent variable. For example:
import matplotlib.pyplot as plt
from matploblib.mlab import griddata
import numpy as np
import pandas as pd
df = pd.DataFrame({'x':[0, 0, 1, 1, 3, 3, 3, 4, 4, 4],
'y':[0, 1, 0, 1, 0.2, 0.7, 1.4, 0.2, 1.4, 2],
'z':[50, 40, 40, 30, 30, 30, 20, 20, 20, 10]})
x = df['x']
y = df['y']
z = df['z']
I want to do a contour plot of the dependent variable z over x and y. For this, I create a new grid to interpolate the data on using matplotlib.mlab's griddata function.
xi = np.linspace(x.min(), x.max(), 100)
yi = np.linspace(y.min(), y.max(), 100)
z_grid = griddata(x, y, z, xi, yi, interp='linear')
plt.contourf(xi, yi, z_grid, 15)
plt.scatter(x, y, color='k') # The original data points
plt.show()
While this works, the output is not what I want. I do not want griddata to interpolate outside of the boundaries given by the min and max values of the x and y data. The following plots are what shows up after calling plt.show(), and then highlighted in purple what area of the data I want to have interpolated and contoured. The contour outside the purple line is supposed to be blank. How could I go about masking the outlying data?
The linked question does unfortunately not answer my question, as I don't have a clear mathematical way to define the conditions on which to do a triangulation. Is it possible to define a condition to mask the data based on the data alone, taking the above Dataframe as an example?
As seen in the answer to this question one may introduce a condition to mask the values.
The sentence from the question
"I do not want griddata to interpolate outside of the boundaries given by the min and max values of the x and y data." implies that there is some min/max condition present, which can be used.
Should that not be the case, one may clip the contour using a path. The points of this path need to be specified as there is no generic way of knowing which points should be the edges. The code below does this for three different possible paths.
import matplotlib.pyplot as plt
from matplotlib.path import Path
from matplotlib.patches import PathPatch
from matplotlib.mlab import griddata
import numpy as np
import pandas as pd
df = pd.DataFrame({'x':[0, 0, 1, 1, 3, 3, 3, 4, 4, 4],
'y':[0, 1, 0, 1, 0.2, 0.7, 1.4, 0.2, 1.4, 2],
'z':[50, 40, 40, 30, 30, 30, 20, 20, 20, 10]})
x = df['x']
y = df['y']
z = df['z']
xi = np.linspace(x.min(), x.max(), 100)
yi = np.linspace(y.min(), y.max(), 100)
z_grid = griddata(x, y, z, xi, yi, interp='linear')
clipindex = [ [0,2,4,7,8,9,6,3,1,0],
[0,2,4,7,5,8,9,6,3,1,0],
[0,2,4,7,8,9,6,5,3,1,0]]
fig, axes = plt.subplots(ncols=3, sharey=True)
for i, ax in enumerate(axes):
cont = ax.contourf(xi, yi, z_grid, 15)
ax.scatter(x, y, color='k') # The original data points
ax.plot(x[clipindex[i]], y[clipindex[i]], color="crimson")
clippath = Path(np.c_[x[clipindex[i]], y[clipindex[i]]])
patch = PathPatch(clippath, facecolor='none')
ax.add_patch(patch)
for c in cont.collections:
c.set_clip_path(patch)
plt.show()
Ernest's answer is a great solution, but very slow for lots of contours. Instead of clipping every one of them, I built a mask by constructing the complement polygon of the desired clipping mask.
Here is the code based on Ernest's accepted answer:
import numpy as np
import pandas as pd
import matplotlib.tri as tri
import matplotlib.pyplot as plt
from descartes import PolygonPatch
from shapely.geometry import Polygon
df = pd.DataFrame({'x':[0, 0, 1, 1, 3, 3, 3, 4, 4, 4],
'y':[0, 1, 0, 1, 0.2, 0.7, 1.4, 0.2, 1.4, 2],
'z':[50, 40, 40, 30, 30, 30, 20, 20, 20, 10]})
points = df[['x', 'y']]
values = df[['z']]
xi = np.linspace(points.x.min(), points.x.max(), 100)
yi = np.linspace(points.y.min(), points.y.max(), 100)
triang = tri.Triangulation(points.x, points.y)
interpolator = tri.LinearTriInterpolator(triang, values.z)
Xi, Yi = np.meshgrid(xi, yi)
zi = interpolator(Xi, Yi)
clipindex = [ [0,2,4,7,8,9,6,3,1,0],
[0,2,4,7,5,8,9,6,3,1,0],
[0,2,4,7,8,9,6,5,3,1,0]]
fig, axes = plt.subplots(ncols=3, sharey=True, figsize=(10,4))
for i, ax in enumerate(axes):
ax.set_xlim(-0.5, 4.5)
ax.set_ylim(-0.2, 2.2)
xlim = ax.get_xlim()
ylim = ax.get_ylim()
cont = ax.contourf(Xi, Yi, zi, 15)
ax.scatter(points.x, points.y, color='k', zorder=2) # The original data points
ax.plot(points.x[clipindex[i]], points.y[clipindex[i]], color="crimson", zorder=1)
#### 'Universe polygon':
ext_bound = Polygon([(xlim[0], ylim[0]), (xlim[0], ylim[1]), (xlim[1], ylim[1]), (xlim[1], ylim[0]), (xlim[0], ylim[0])])
#### Clipping mask as polygon:
inner_bound = Polygon([ (row.x, row.y) for idx, row in points.iloc[clipindex[i]].iterrows() ])
#### Mask as the symmetric difference of both polygons:
mask = ext_bound.symmetric_difference(inner_bound)
ax.add_patch(PolygonPatch(mask, facecolor='white', zorder=1, edgecolor='white'))
plt.show()

Categories

Resources