How to save multiple Seaborn plots into single pdf file - python

So I'm trying to save multiple plots that i create in a for loop into a single pdf file. I've searched around on SO and pieced together some code that seems to work except it doesn't save the figures it creates a pdf but without anything in it.
Here's the code to reproduce it:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
dftest = pd.DataFrame(np.random.randint(low=0, high=10, size=(5, 5)),
columns=['a', 'b', 'c', 'd', 'e'])
from matplotlib.backends.backend_pdf import PdfPages
with PdfPages('count.pdf') as pdf_pages:
df1 = dftest.select_dtypes([np.int, np.float, np.object])
for i, col in enumerate(df1.columns):
plt.figure(i)
countplot = sns.countplot(x=col, data=df1)
pdf_pages.savefig(countplot.fig)

Saving the plt.figure works for me
with PdfPages('count.pdf') as pdf_pages:
df1 = dftest.select_dtypes([np.int, np.float, np.object])
for i, col in enumerate(df1.columns):
figu = plt.figure(i)
countplot = sns.countplot(x=col, data=df1)
pdf_pages.savefig(figu)

Related

Matplotlib BoxPlot Labels and Title

Thank you in advance for your help!
I am trying to create a boxplot in matplotlib and I get an error when trying to add the labels.
This is the code that pulls an error:
df_selected_station_D.boxplot(column='20 cm', by='Month',figsize=(15,5),grid=True, xlabel = 'x data');
This is the error it causes:
TypeError: boxplot() got an unexpected keyword argument 'xlabel'
What does this error mean and why am I getting it? (Complete code and images below)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
raw_data = pd.read_csv('all-deep-soil-temperatures.csv', index_col=1, parse_dates=True)
df_all_stations = raw_data.copy()
df_selected_station = df_all_stations[df_all_stations['Station'] == 'Minot']
df_selected_station.fillna(method = 'ffill', inplace=True);
df_selected_station_D=df_selected_station.resample(rule='D').mean()
df_selected_station_D['Day'] = df_selected_station_D.index.dayofyear
mean=df_selected_station_D.groupby(by='Day').mean()
mean['Day']=mean.index
df_selected_station_D['Month'] = df_selected_station_D.index.month
df_selected_station_D.head()
df_selected_station_D.boxplot(column='20 cm', by='Month',figsize=(15,5),grid=True);
The data is not the same, but adding labels and modifying titles can be accomplished with the following code
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
fig, ax1 = plt.subplots()
np.random.seed(1234)
df = pd.DataFrame(np.random.randn(10, 4), columns=['Col1', 'Col2', 'Col3', 'Col4'])
ax1 = df.boxplot(column=['Col1', 'Col2', 'Col3'], figsize=(15,5), grid=True)
ax1.set_title('test title')
ax1.set_xlabel('x data')
ax1.set_ylabel('y data')
plt.show()

How to make a distplot for each column in a pandas dataframe

I 'm using Seaborn in a Jupyter notebook to plot histograms like this:
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
df = pd.read_csv('CTG.csv', sep=',')
sns.distplot(df['LBE'])
I have an array of columns with values that I want to plot histogram for and I tried plotting a histogram for each of them:
continous = ['b', 'e', 'LBE', 'LB', 'AC']
for column in continous:
sns.distplot(df[column])
And I get this result - only one plot with (presumably) all histograms:
My desired result is multiple histograms that looks like this (one for each variable):
How can I do this?
Insert plt.figure() before each call to sns.distplot() .
Here's an example with plt.figure():
Here's an example without plt.figure():
Complete code:
# imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [6, 2]
%matplotlib inline
# sample time series data
np.random.seed(123)
df = pd.DataFrame(np.random.randint(-10,12,size=(300, 4)), columns=list('ABCD'))
datelist = pd.date_range(pd.datetime(2014, 7, 1).strftime('%Y-%m-%d'), periods=300).tolist()
df['dates'] = datelist
df = df.set_index(['dates'])
df.index = pd.to_datetime(df.index)
df.iloc[0]=0
df=df.cumsum()
# create distplots
for column in df.columns:
plt.figure() # <==================== here!
sns.distplot(df[column])
Distplot has since been deprecated in seaborn versions >= 0.14.0. You can, however, use sns.histplot() to plot histogram distributions of the entire dataframe (numerical features only) in the following way:
fig, axes = plt.subplots(2,5, figsize=(15, 5))
ax = axes.flatten()
for i, col in enumerate(df.columns):
sns.histplot(df[col], ax=ax[i]) # histogram call
ax[i].set_title(col)
# remove scientific notation for both axes
ax[i].ticklabel_format(style='plain', axis='both')
fig.tight_layout(w_pad=6, h_pad=4) # change padding
plt.show()
If, you specifically want a way to estimate the probability density function of a continuous random variable using the Kernel Density Function (mimicing the default behavior of sns.distplot()), then inside the sns.histplot() function call, add kde=True, and you will have curves overlaying the histograms.
Also works when looping with plt.show() inside:
for column in df.columns:
sns.distplot(df[column])
plt.show()

Add text annotation to plot from a pandas dataframe

My Code:
import matplotlib.pyplot as plt
import pandas as pd
import os, glob
path = r'C:/Users/New folder'
all_files = glob.glob(os.path.join(path, "*.txt"))
df = pd.DataFrame()
for file_ in all_files:
file_df = pd.read_csv(file_,sep=',', parse_dates=[0], infer_datetime_format=True,header=None, usecols=[0,1,2,3,4,5,6], names=['Date','Time','open', 'high', 'low', 'close','volume','tradingsymbol'])
df = df[['Date','Time','close','volume','tradingsymbol']]
df["Time"] = pd.to_datetime(df['Time'])
df.set_index('Time', inplace=True)
print(df)
fig, axes = plt.subplots(nrows=2, ncols=1)
################### Volume ###########################
df.groupby('tradingsymbol')['volume'].plot(legend=True, rot=0, grid=True, ax=axes[0])
################### PRICE ###########################
df.groupby('tradingsymbol')['close'].plot(legend=True, rot=0, grid=True, ax=axes[1])
plt.show()
My Current Output is like:
I need add text annotation to matplotlib plot. My desired output similar to below image:
It's hard to answer this question without access to your dataset, or a simpler example. However, I'll try my best.
Let's begin by setting up a dataframe which may or may resemble your data:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.randint(low=0, high=10, size=(5, 3)),
columns=['a', 'b', 'c'])
With the dataset we'll now proceed to plot it with
fig, ax = plt.subplots(1, 1)
df.plot(legend=True, ax=ax)
Finally, we'll loop over the columns and annotate each datapoint as
for col in df.columns:
for id, val in enumerate(df[col]):
ax.text(id, val, str(val))
This gave me the plot following plot, which resembles your desired figure.

Plot duplication in Pandas Plot()

There is an issue with the plot() function in Pandas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'A', 'B'])
ax = df.plot()
ax.legend(ncol=1, bbox_to_anchor=(1., 1, 0., 0), loc=2 , prop={'size':6})
This will make a plot with too many lines. Note however that half will be on top of each other. It seems to have something to do with the axis because when I do not use them the issue goes away.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'A', 'B'])
df.plot()
UPDATE
While not idea for my use case the issue can be fixed by using MultiIndex
columns = pd.MultiIndex.from_arrays([np.hstack([ ['left']*2, ['right']*2]), ['A', 'B']*2], names=['High', 'Low'])
df = pd.DataFrame(np.random.randn(8, 4), columns=columns)
ax = df.plot()
ax.legend(ncol=1, bbox_to_anchor=(1., 1, 0., 0), loc=2 , prop={'size':16})
It has to do with your duplication of column names, not ax at all (if you call plt.legend after your second example you see the same extra lines). Having multiple columns with the same name is confusing the call to DataFrame.plot_frame.
If you change your columns to ['A', 'B', 'C', 'D'] instead, it's fine.

Multiple histograms in Pandas

I would like to create the following histogram (see image below) taken from the book "Think Stats". However, I cannot get them on the same plot. Each DataFrame takes its own subplot.
I have the following code:
import nsfg
import matplotlib.pyplot as plt
df = nsfg.ReadFemPreg()
preg = nsfg.ReadFemPreg()
live = preg[preg.outcome == 1]
first = live[live.birthord == 1]
others = live[live.birthord != 1]
#fig = plt.figure()
#ax1 = fig.add_subplot(111)
first.hist(column = 'prglngth', bins = 40, color = 'teal', \
alpha = 0.5)
others.hist(column = 'prglngth', bins = 40, color = 'blue', \
alpha = 0.5)
plt.show()
The above code does not work when I use ax = ax1 as suggested in: pandas multiple plots not working as hists nor this example does what I need: Overlaying multiple histograms using pandas. When I use the code as it is, it creates two windows with histograms. Any ideas how to combine them?
Here's an example of how I'd like the final figure to look:
As far as I can tell, pandas can't handle this situation. That's ok since all of their plotting methods are for convenience only. You'll need to use matplotlib directly. Here's how I do it:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas
#import seaborn
#seaborn.set(style='ticks')
np.random.seed(0)
df = pandas.DataFrame(np.random.normal(size=(37,2)), columns=['A', 'B'])
fig, ax = plt.subplots()
a_heights, a_bins = np.histogram(df['A'])
b_heights, b_bins = np.histogram(df['B'], bins=a_bins)
width = (a_bins[1] - a_bins[0])/3
ax.bar(a_bins[:-1], a_heights, width=width, facecolor='cornflowerblue')
ax.bar(b_bins[:-1]+width, b_heights, width=width, facecolor='seagreen')
#seaborn.despine(ax=ax, offset=10)
And that gives me:
In case anyone wants to plot one histogram over another (rather than alternating bars) you can simply call .hist() consecutively on the series you want to plot:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas
np.random.seed(0)
df = pandas.DataFrame(np.random.normal(size=(37,2)), columns=['A', 'B'])
df['A'].hist()
df['B'].hist()
This gives you:
Note that the order you call .hist() matters (the first one will be at the back)
A quick solution is to use melt() from pandas and then plot with seaborn.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# make dataframe
df = pd.DataFrame(np.random.normal(size=(200,2)), columns=['A', 'B'])
# plot melted dataframe in a single command
sns.histplot(df.melt(), x='value', hue='variable',
multiple='dodge', shrink=.75, bins=20);
Setting multiple='dodge' makes it so the bars are side-by-side, and shrink=.75 makes it so the pair of bars take up 3/4 of the whole bin.
To help understand what melt() did, these are the dataframes df and df.melt():
From the pandas website (http://pandas.pydata.org/pandas-docs/stable/visualization.html#visualization-hist):
df4 = pd.DataFrame({'a': np.random.randn(1000) + 1, 'b': np.random.randn(1000),
'c': np.random.randn(1000) - 1}, columns=['a', 'b', 'c'])
plt.figure();
df4.plot(kind='hist', alpha=0.5)
You make two dataframes and one matplotlib axis
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
df1 = pd.DataFrame({
'data1': np.random.randn(10),
'data2': np.random.randn(10)
})
df2 = df1.copy()
fig, ax = plt.subplots()
df1.hist(column=['data1'], ax=ax)
df2.hist(column=['data2'], ax=ax)
Here is the snippet, In my case I have explicitly specified bins and range as I didn't handle outlier removal as the author of the book.
fig, ax = plt.subplots()
ax.hist([first.prglngth, others.prglngth], 10, (27, 50), histtype="bar", label=("First", "Other"))
ax.set_title("Histogram")
ax.legend()
Refer Matplotlib multihist plot with different sizes example.
this could be done with brevity
plt.hist([First, Other], bins = 40, color =('teal','blue'), label=("First", "Other"))
plt.legend(loc='best')
Note that as the number of bins increase, it may become a visual burden.
You could also try to check out the pandas.DataFrame.plot.hist() function which will plot the histogram of each column of the dataframe in the same figure.
Visibility is limited though but you can check out if it helps!
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.hist.html

Categories

Resources