While trying to loop over equations on matplotlib, I only get the last text from plt.text(). How can I iterate over matplotlib figures and annotate equation for each plot? Also the plt.savefig() function does not save the figures.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import pearsonr
df=pd.read_csv(r'C:\GISN21\Ex_04\data5\data3_4.txt',sep="\t",header=0)
df2=df.loc[:, df.columns != 'Station']
def calculate_pvalues(df):
df = df.dropna()._get_numeric_data()
dfcols = pd.DataFrame(columns=df.columns)
pvalues = dfcols.transpose().join(dfcols, how='outer')
for r in df.columns:
for c in df.columns:
pvalues[r][c] = round(pearsonr(df[r], df[c])[1], 8)
if pvalues[r][c] < 0.05:
i=0
if r != c:
#best fit line
(m,b)=np.polyfit(df[r] ,df[c] ,1)
equation = 'y = ' + str(round(m,4)) + 'x' ' + ' + str(round(b,4))
f = plt.figure()
plt.text(0.5,0.5, equation)
plt.savefig("correlation{i}.png".format(i=i))
ax = f.add_subplot(1,1,1)
p = sns.regplot(x=df[r],y=df[c],data=df,ax=ax)
return pvalues
sns.pairplot(df,kind='scatter')
plt.savefig('correlation.png')
plt.show()
By default, plt.text uses data coordinates, and matplotlib doesn't automatically adjust the limits to include text that is not within the data limits. So unless (0.5, 0.5) is within the limits when plt.text is called it will not be visible. If this is the case, you can resolve the issue by using axis coordinates. This can be done by supplying the axis transformation to the transform keyword, i.e.
plt.text(0.5, 0.5, equation, transform = plt.gca().transAxes)
Will place the Text instance in the center of the current axes.
Related
This code:
N = 1000
df = pd.DataFrame({
'distribution_1': np.random.randn(N),
'distribution_2': np.random.randn(N)
})
df['distribution_2'] = df['distribution_2'] / 2
df = df.melt(value_vars=['distribution_1', 'distribution_2'], value_name='value', var_name='distribution')
g = sns.ecdfplot(data=df, x='value', hue='distribution')
g.set(ylim=(-.1, 1.1))
yields a figure like this:
The CDFs of both distributions do not extend to the limits of the x-axis. I would like to know how to do this. In ggplot2 there is a boolean flag called pad that does this (see e.g. REF).
This is also possible in seaborn? (I was unable to find it ...)
sns.ecdfplot() doesn't seem to support such an option. As a workaround, you could loop through the generated lines, and move the endpoints. (Also note that sns.ecdfplot doesn't return a FacetGrid, but a subplot. Naming the return value g might be a bit confusing when comparing to code in example and tutorials.)
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
N = 1000
df = pd.DataFrame({'distribution_1': np.random.randn(N),
'distribution_2': np.random.randn(N) / 2})
df = df.melt(value_vars=['distribution_1', 'distribution_2'], value_name='value', var_name='distribution')
ax = sns.ecdfplot(data=df, x='value', hue='distribution', palette='winter')
ax.set_ylim(-.1, 1.1)
xmin, xmax = ax.get_xlim()
for line in ax.lines:
x = line.get_xdata()
x[0] = xmin
x[-1] = xmax
line.set_xdata(x)
plt.show()
I have a dataset with 76 features and 1 dependent variable (y). I use seaborn to draw pairplot between features and y in Jupyter notebook. Since the No. of features is high, size of plot for every feature is very small, as can be seen below:
I am looking for a way to draw pairplot in several rows. Also, I don't want to copy and paste pairplot code in several cells in notebook. I am looking for a way to make this figure automatically.
The code I am using (I cannot share dataset, so I use a sample dataset):
from sklearn.datasets import load_boston
import math
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
X, y = load_boston(return_X_y=True)
X = pd.DataFrame(X)
y = pd.DataFrame(y)
data = pd.concat([X, y], axis=1)
y_name = 'y'
features_names = [f'feature_{i}' for i in range(1, X.shape[1]+1)]
column_names = features_names + [y_name]
data.columns = column_names
plot_size=7
num_plots_x=5 # No. of plots in every row
num_plots_y = math.ceil(len(features_names)/num_plots_x) # No. of plots in y direction
fig = plt.figure(figsize=(plot_size*num_plots_y, plot_size*num_plots_x), facecolor='white')
axes = [fig.add_subplot(num_plots_y,1,i+1) for i in range(num_plots_y)]
for i, ax in enumerate(axes):
start_index = i * num_plots_x
end_index = (i+1) * num_plots_x
if end_index > len(features_names): end_index = len(features_names)
sns.pairplot(x_vars=features_names[start_index:end_index], y_vars=y_name, data = data)
plt.savefig('figure.png')
The above code has two problems. It shows empty box at the top of the figure and then it shows the pairplots. Following is part of the figure that I get.
Second problem is that it only saves the last row as png file, not the whole figure.
If you have any idea to solve this, please let me know. Thank you.
When I run it directly (python script.py) then it opens every row in separated window - so it treats it as separated objects and it saves in file only last object.
Other problem is that sns doesn't need fig and axes - it can't use subplots to put all on one image - and when I remove fig axes then it stops showing first window with empty box.
I found that FacetGrid has col_wrap to put in many rows. And I found that someone suggested to add this col_wrap in pairplot - Add parameter col_wrap to pairplot #2121 and there is also example how to FacetGrid with scatterplot instead of pairplot and then it can use col_wrap.
Here is code which use FacetGrid with col_wrap
from sklearn.datasets import load_boston
import math
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
X, y = load_boston(return_X_y=True)
X = pd.DataFrame(X)
y = pd.DataFrame(y)
data = pd.concat([X, y], axis=1)
y_name = 'y'
features_names = [f'feature_{i}' for i in range(1, X.shape[1]+1)]
column_names = features_names + [y_name]
data.columns = column_names
plot_size=7
num_plots_x=5 # No. of plots in every row
num_plots_y = math.ceil(len(features_names)/num_plots_x) # No. of plots in y direction
'''
for i in range(num_plots_y):
start = i * num_plots_x
end = start + num_plots_x
sns.pairplot(x_vars=features_names[start:end], y_vars=y_name, data=data)
'''
g = sns.FacetGrid(pd.DataFrame(features_names), col=0, col_wrap=4, sharex=False)
for ax, x_var in zip(g.axes, features_names):
sns.scatterplot(data=data, x=x_var, y=y_name, ax=ax)
g.tight_layout()
plt.savefig('figure.png')
plt.show()
Result ('figure.png'):
I would like to replace part of my plot where the function dips down to '-1' with a dashed line carrying on from the previous point (see plots below).
Here's some code I've written, along with its output:
import numpy as np
import matplotlib.pyplot as plt
y = [5,6,8,3,5,7,3,6,-1,3,8,5]
plt.plot(np.linspace(1,12,12),y,'r-o')
plt.show()
for i in range(1,len(y)):
if y[i]!=-1:
plt.plot(np.linspace(i-1,i,2),y[i-1:i+1],'r-o')
else:
y[i]=y[i-1]
plt.plot(np.linspace(i-1,i,2),y[i-1:i+1],'r--o')
plt.ylim(-1,9)
plt.show()
Here's the original plot
Modified plot:
The code I've written works (it produces the desired output), but it's inefficient and takes a long time when I actually run it on my (much larger) dataset. Is there a smarter way to go about doing this?
You can achieve something similar without the loops:
import pandas as pd
import matplotlib.pyplot as plt
# Create a data frame from the list
a = pd.DataFrame([5,6,-1,-1, 8,3,5,7,3,6,-1,3,8,5])
# Prepare a boolean mask
mask = a > 0
# New data frame with missing values filled with the last element of
# the previous segment. Choose 'bfill' to use the first element of
# the next segment.
a_masked = a[mask].fillna(method = 'ffill')
# Prepare the plot
fig, ax = plt.subplots()
line, = ax.plot(a_masked, ls = '--', lw = 1)
ax.plot(a[mask], color=line.get_color(), lw=1.5, marker = 'o')
plt.show()
You can also highlight the negative regions by choosing a different colour for the lines:
My answer is based on a great post from July, 2017. The latter also tackles the case when the first element is NaN or in your case a negative number:
Dotted lines instead of a missing value in matplotlib
I would use numpy functionality to cut your line into segments and then plot all solid and dashed lines separately. In the example below I added two additional -1s to your data to see that this works universally.
import numpy as np
import matplotlib.pyplot as plt
Y = np.array([5,6,-1,-1, 8,3,5,7,3,6,-1,3,8,5])
X = np.arange(len(Y))
idxs = np.where(Y==-1)[0]
sub_y = np.split(Y,idxs)
sub_x = np.split(X,idxs)
fig, ax = plt.subplots()
##replacing -1 values and plotting dotted lines
for i in range(1,len(sub_y)):
val = sub_y[i-1][-1]
sub_y[i][0] = val
ax.plot([sub_x[i-1][-1], sub_x[i][0]], [val, val], 'r--')
##plotting rest
for x,y in zip(sub_x, sub_y):
ax.plot(x, y, 'r-o')
plt.show()
The result looks like this:
Note, however, that this will fail if the first value is -1, as then your problem is not well defined (no previous value to copy from). Hope this helps.
Not too elegant, but here's something that doesn't use loops which I came up with (based on the above answers) which works. #KRKirov and #Thomas Kühn , thank you for your answers, I really appreciate them
import pandas as pd
import matplotlib.pyplot as plt
# Create a data frame from the list
a = pd.DataFrame([5,6,-1,-1, 8,3,5,7,3,6,-1,3,8,5])
b=a.copy()
b[2]=b[0].shift(1,axis=0)
b[4]=(b[0]!=-1) & (b[2]==-1)
b[5]=b[4].shift(-1,axis=0)
b[0] = (b[5] | b[4])
c=b[0]
d=pd.DataFrame(c)
# Prepare a boolean mask
mask = a > 0
# New data frame with missing values filled with the last element of
# the previous segment. Choose 'bfill' to use the first element of
# the next segment.
a_masked = a[mask].fillna(method = 'ffill')
# Prepare the plot
fig, ax = plt.subplots()
line, = ax.plot(a_masked, 'b:o', lw = 1)
ax.plot(a[mask], color=line.get_color(), lw=1.5, marker = 'o')
ax.plot(a_masked[d], color=line.get_color(), lw=1.5, marker = 'o')
plt.show()
Im trying to smooth a graph line out but since the x-axis values are dates im having great trouble doing this. Say we have a dataframe as follows
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
startDate = '2015-05-15'
endDate = '2015-12-5'
index = pd.date_range(startDate, endDate)
data = np.random.normal(0, 1, size=len(index))
cols = ['value']
df = pd.DataFrame(data, index=index, columns=cols)
Then we plot the data
fig, axs = plt.subplots(1,1, figsize=(18,5))
x = df.index
y = df.value
axs.plot(x, y)
fig.show()
we get
Now to smooth this line there are some usefull staekoverflow questions allready like:
Generating smooth line graph using matplotlib,
Plot smooth line with PyPlot
Creating numpy linspace out of datetime
But I just cant seem to get some code working to do this for my example, any suggestions?
You can use interpolation functionality that is shipped with pandas. Because your dataframe has a value for every index already, you can populate it with an index that is more sparse, and fill every previously non-existent indices with NaN values. Then, after choosing one of many interpolation methods available, interpolate and plot your data:
index_hourly = pd.date_range(startDate, endDate, freq='1H')
df_smooth = df.reindex(index=index_hourly).interpolate('cubic')
df_smooth = df_smooth.rename(columns={'value':'smooth'})
df_smooth.plot(ax=axs, alpha=0.7)
df.plot(ax=axs, alpha=0.7)
fig.show()
There is one workaround, we will create two plots - 1) non smoothed /interploted with date labels 2) smoothed without date labels.
Plot the 1) using argument linestyle=" " and convert the dates to be plotted on x-axis to string type.
Plot the 2) using the argument linestyle="-" and interpolating the x-axis and y-axis using np.linespace and make_interp_spline respectively.
Following is the use of the discussed workaround for your code.
# your initial code
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.interpolate import make_interp_spline
%matplotlib inline
startDate = "2015-05-15"
endDate = "2015-07-5" #reduced the end date so smoothness is clearly seen
index = pd.date_range(startDate, endDate)
data = np.random.normal(0, 1, size=len(index))
cols = ["value"]
df = pd.DataFrame(data, index=index, columns=cols)
fig, axs = plt.subplots(1, 1, figsize=(40, 12))
x = df.index
y = df.value
# workaround by creating linespace for length of your x axis
x_new = np.linspace(0, len(df.index), 300)
a_BSpline = make_interp_spline(
[i for i in range(0, len(df.index))],
df.value,
k=5,
)
y_new = a_BSpline(x_new)
# plot this new plot with linestyle = "-"
axs.plot(
x_new[:-5], # removing last 5 entries to remove noise, because interpolation outputs large values at the end.
y_new[:-5],
"-",
label="interpolated"
)
# to get the date on x axis we will keep our previous plot but linestyle will be None so it won't be visible
x = list(x.astype(str))
axs.plot(x, y, linestyle=" ", alpha=0.75, label="initial")
xt = [x[i] for i in range(0,len(x),5)]
plt.xticks(xt,rotation="vertical")
plt.legend()
fig.show()
Resulting Plot
Overalpped plot to see the smoothing.
Depending on what exactly you mean by "smoothing," the easiest way can be the use of savgol_filter or something similar. Unlike with interpolated splines, this method means that the smoothed line does not pass through the measured points, effectively filtering out higher-frequency noise.
from scipy.signal import savgol_filter
...
windowSize = 21
polyOrder = 1
smoothed = savgol_filter(values, windowSize, polyOrder)
axes.plot(datetimes, smoothed, color=chart.color)
The higher the polynomial order value, the closer the smoothed line is to the raw data.
Here is an example.
I am trying to make a profile plot for two columns of a pandas.DataFrame. I would not expect this to be in pandas directly but it seems there is nothing in matplotlib either. I have searched around and cannot find it in any package other than rootpy. Before I take the time to write this myself I thought I would ask if there was a small package that contained profile histograms, perhaps where they are known by a different name.
If you don't know what I mean by "profile histogram" have a look at the ROOT implementation. http://root.cern.ch/root/html/TProfile.html
You can easily do it using scipy.stats.binned_statistic.
import scipy.stats
import numpy
import matplotlib.pyplot as plt
x = numpy.random.rand(10000)
y = x + scipy.stats.norm(0, 0.2).rvs(10000)
means_result = scipy.stats.binned_statistic(x, [y, y**2], bins=50, range=(0,1), statistic='mean')
means, means2 = means_result.statistic
standard_deviations = numpy.sqrt(means2 - means**2)
bin_edges = means_result.bin_edges
bin_centers = (bin_edges[:-1] + bin_edges[1:])/2.
plt.errorbar(x=bin_centers, y=means, yerr=standard_deviations, linestyle='none', marker='.')
Use seaborn. Data as from #MaxNoe
import numpy as np
import seaborn as sns
# just some random numbers to get started
x = np.random.uniform(-2, 2, 10000)
y = np.random.normal(x**2, np.abs(x) + 1)
sns.regplot(x=x, y=y, x_bins=10, fit_reg=None)
You can do much more (error bands are from bootstrap, you can change the estimator on the y-axis, add regression, ...)
While #Keith's answer seems to fit what you mean, it is quite a lot of code. I think this can be done much simpler, so one gets the key concepts and can adjust and build on top of it.
Let me stress one thing: what ROOT is calling a ProfileHistogram is not a special kind of plot. It is an errorbar plot. Which can simply be done in matplotlib.
It is a special kind of computation and that's not the task of a plotting library. This lies in the pandas realm, and pandas is great at stuff like this. It's symptomatical for ROOT as the giant monolithic pile it is to have an extra class for this.
So what you want to do is: discretize in some variable x and for each bin, calculate something in another variable y.
This can easily done using np.digitize together with the pandas groupy and aggregate methods.
Putting it all together:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# just some random numbers to get startet
x = np.random.uniform(-2, 2, 10000)
y = np.random.normal(x**2, np.abs(x) + 1)
df = pd.DataFrame({'x': x, 'y': y})
# calculate in which bin row belongs base on `x`
# bins needs the bin edges, so this will give as 100 equally sized bins
bins = np.linspace(-2, 2, 101)
df['bin'] = np.digitize(x, bins=bins)
bin_centers = 0.5 * (bins[:-1] + bins[1:])
bin_width = bins[1] - bins[0]
# grouby bin, so we can calculate stuff
binned = df.groupby('bin')
# calculate mean and standard error of the mean for y in each bin
result = binned['y'].agg(['mean', 'sem'])
result['x'] = bin_centers
result['xerr'] = bin_width / 2
# plot it
result.plot(
x='x',
y='mean',
xerr='xerr',
yerr='sem',
linestyle='none',
capsize=0,
color='black',
)
plt.savefig('result.png', dpi=300)
Just like ROOT ;)
I made a module myself for this functionality.
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt
def Profile(x,y,nbins,xmin,xmax,ax):
df = DataFrame({'x' : x , 'y' : y})
binedges = xmin + ((xmax-xmin)/nbins) * np.arange(nbins+1)
df['bin'] = np.digitize(df['x'],binedges)
bincenters = xmin + ((xmax-xmin)/nbins)*np.arange(nbins) + ((xmax-xmin)/(2*nbins))
ProfileFrame = DataFrame({'bincenters' : bincenters, 'N' : df['bin'].value_counts(sort=False)},index=range(1,nbins+1))
bins = ProfileFrame.index.values
for bin in bins:
ProfileFrame.ix[bin,'ymean'] = df.ix[df['bin']==bin,'y'].mean()
ProfileFrame.ix[bin,'yStandDev'] = df.ix[df['bin']==bin,'y'].std()
ProfileFrame.ix[bin,'yMeanError'] = ProfileFrame.ix[bin,'yStandDev'] / np.sqrt(ProfileFrame.ix[bin,'N'])
ax.errorbar(ProfileFrame['bincenters'], ProfileFrame['ymean'], yerr=ProfileFrame['yMeanError'], xerr=(xmax-xmin)/(2*nbins), fmt=None)
return ax
def Profile_Matrix(frame):
#Much of this is stolen from https://github.com/pydata/pandas/blob/master/pandas/tools/plotting.py
import pandas.core.common as com
import pandas.tools.plotting as plots
from pandas.compat import lrange
from matplotlib.artist import setp
range_padding=0.05
df = frame._get_numeric_data()
n = df.columns.size
fig, axes = plots._subplots(nrows=n, ncols=n, squeeze=False)
# no gaps between subplots
fig.subplots_adjust(wspace=0, hspace=0)
mask = com.notnull(df)
boundaries_list = []
for a in df.columns:
values = df[a].values[mask[a].values]
rmin_, rmax_ = np.min(values), np.max(values)
rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
boundaries_list.append((rmin_ - rdelta_ext, rmax_+ rdelta_ext))
for i, a in zip(lrange(n), df.columns):
for j, b in zip(lrange(n), df.columns):
common = (mask[a] & mask[b]).values
nbins = 100
(xmin,xmax) = boundaries_list[i]
ax = axes[i, j]
Profile(df[a][common],df[b][common],nbins,xmin,xmax,ax)
ax.set_xlabel('')
ax.set_ylabel('')
plots._label_axis(ax, kind='x', label=b, position='bottom', rotate=True)
plots._label_axis(ax, kind='y', label=a, position='left')
if j!= 0:
ax.yaxis.set_visible(False)
if i != n-1:
ax.xaxis.set_visible(False)
for ax in axes.flat:
setp(ax.get_xticklabels(), fontsize=8)
setp(ax.get_yticklabels(), fontsize=8)
return axes
To my knowledge matplotlib doesn't still allow to directly produce profile histograms.
You can instead give a look at Hippodraw, a package developed at SLAC, that can be used as a Python extension module.
Here there is a Profile histogram example:
http://www.slac.stanford.edu/grp/ek/hippodraw/datareps_root.html#datareps_profilehist