#dataframe
a=
timestamp count
2021-08-16 20
2021-08-17 60
2021-08-18 35
2021-08-19 1
2021-08-20 0
2021-08-21 1
2021-08-22 50
2021-08-23 36
2021-08-24 68
2021-08-25 125
2021-08-26 54
I applied this code
# Draws a kernel density estimate of the values in the numeric column(s) of `a`;
# the asker reports below that this is not the count-vs-timestamp picture wanted.
a.plot(kind="density")
This is not what I want.
I want to put count on the Y axis and timestamp on the X axis, using density plotting,
just like I can do with plt.bar(a['timestamp'], a['count']).
Or is this not possible with density plotting?
The following code creates a density histogram. The total area sums to 1, supposing each of the timestamps counts as 1 unit. To get the timestamps as x-axis, they are set as the index. To get the total area to sum to 1, all count values are divided by their total sum.
A KDE is calculated from the same data.
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from scipy.stats import gaussian_kde
from io import StringIO
# Sample data: one row per day with the count observed on that day.
a_str = '''timestamp count
2021-08-16 20
2021-08-17 60
2021-08-18 35
2021-08-19 1
2021-08-20 0
2021-08-21 1
2021-08-22 50
2021-08-23 36
2021-08-24 68
2021-08-25 125
2021-08-26 54'''
# Split columns on runs of whitespace. `sep=r'\s+'` is the supported spelling;
# the older `delim_whitespace=True` flag is deprecated in pandas.
a = pd.read_csv(StringIO(a_str), sep=r'\s+')

# Density histogram: timestamps become the x-axis (index) and every count is
# divided by the total so that the bar heights sum to 1.
hist_density = a.set_index('timestamp') / a['count'].sum()
ax = hist_density.plot.bar(width=0.9, rot=0, figsize=(12, 5))

# Weighted KDE over the bar positions 0 .. len(a)-1, using the counts as weights.
kde = gaussian_kde(np.arange(len(a)), bw_method=0.2, weights=a['count'])
# Evaluate one position beyond the first and last bar so the curve tails are visible.
xs = np.linspace(-1, len(a), 200)
ax.plot(xs, kde(xs), lw=2, color='crimson', label='kde')
ax.set_xlim(xs[0], xs[-1])
ax.legend(labels=['kde', 'density histogram'])
ax.set_xlabel('')
ax.set_ylabel('density')
plt.tight_layout()
plt.show()
If you just want to plot the kde curve, you can leave out the histogram. Optionally you can fill the area under the curve.
# KDE-only variant: the same weighted estimate, drawn without the histogram.
bar_positions = np.arange(len(a))
kde = gaussian_kde(bar_positions, bw_method=0.2, weights=a['count'])
xs = np.linspace(-1, len(a), 200)
kde_values = kde(xs)  # evaluated once, used for both the line and the fill

fig, ax = plt.subplots(figsize=(12, 5))
# the curve itself
ax.plot(xs, kde_values, lw=2, color='crimson', label='kernel density estimation')
# optional translucent fill between the curve and zero
ax.fill_between(xs, kde_values, color='crimson', alpha=0.2)

# one tick per row of `a`, labelled with that row's timestamp
ax.set_xticks(bar_positions)
ax.set_xticklabels(a['timestamp'])
ax.set_xlim(xs[0], xs[-1])
ax.set_ylim(ymin=0)
ax.set_xlabel('')
ax.set_ylabel('density')
ax.legend()
plt.tight_layout()
plt.show()
To plot multiple similar curves, for example using more count columns, you can use a loop. A list of colors that go well together could be obtained from the Set2 colormap:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from scipy.stats import gaussian_kde
# Demo frame: one timestamp column plus four columns of random integer counts.
a = pd.DataFrame({'timestamp': ['2021-08-16', '2021-08-17', '2021-08-18', '2021-08-19', '2021-08-20', '2021-08-21',
                                '2021-08-22', '2021-08-23', '2021-08-24', '2021-08-25', '2021-08-26']})
for i in range(1, 5):
    a[f'count{i}'] = (np.random.uniform(0, 12, len(a)) ** 2).astype(int)

# Evaluation grid: one position beyond the first and last row on either side.
xs = np.linspace(-1, len(a), 200)
fig, ax = plt.subplots(figsize=(12, 4))
# One weighted KDE per count column, each with its own Set2 colour.
for column, color in zip(a.columns[1:], plt.cm.Set2.colors):
    kde = gaussian_kde(np.arange(len(a)), bw_method=0.2, weights=a[column])
    ys = kde(xs)  # evaluate once; reused for the line and the fill
    ax.plot(xs, ys, lw=2, color=color, label=f"kde of '{column}'")
    ax.fill_between(xs, ys, color=color, alpha=0.2)

# One tick per row, labelled with that row's timestamp.
ax.set_xticks(np.arange(len(a)))
ax.set_xticklabels(a['timestamp'])
ax.set_xlim(xs[0], xs[-1])
ax.set_ylim(ymin=0)
ax.legend()
ax.set_xlabel('Date')
ax.set_ylabel('Density of Counts')
plt.tight_layout()
plt.show()
Related
I am trying to plot a scatter plot on top of a bar plot using sns.scatterplot() and df.plot(kind='bar'); The figure turns out to be fine, but it would be even nicer if I can align each of the scatter points to its corresponding bar with an identical label.
I have read the document on Rectangle of matplotlib.pyplot that it has a get_x() method that can "Return the left coordinate of the rectangle";
I wonder if there is a way for me to assign these coordinates to the scatter points that'd be plotted by seaborn?
Code
# Single-axes figure, resized to 9x9 inches at 300 dpi.
fig, ax = plt.subplots(nrows=1, ncols=1)
fig.set_size_inches(9, 9)
fig.set_dpi(300)
# Bars are drawn on the primary axes.
bar_df.plot(kind='bar', ax=ax)
# The scatter points go on a twin axes that shares the x-axis with `ax`.
ax2 = ax.twinx()
sns.scatterplot(data=line_df, ax=ax2)
Dataframes
bar_df
year  apple  banana  citrus  ...
2020  12     34      56      78
2025  12     34      56      78
2030  12     34      56      78
2035  12     34      56      78
line_df
year  apple  banana  citrus  ...
2020  23     45      67      89
2025  23     45      67      89
2030  23     45      67      89
2035  23     45      67      89
It'd be really nice if I could make the points in the same vertical line as the bar with the same header;
sns.scatterplot interprets the x-axis as numeric. As such, it doesn't align well with a bar plot, nor does it have a dodge= parameter.
You can use sns.stripplot instead.
Seaborn works easiest with its data in "long form", which can be achieved via pandas pd.melt.
Here is some example code:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# The question page is expected to yield exactly two HTML tables:
# the bar data first, then the line data.
tables = pd.read_html('https://stackoverflow.com/questions/73191315')
bar_df, line_df = tables

# Long form: one row per (year, fruit) pair, which is what seaborn's hue/dodge expects.
melt_kwargs = dict(id_vars='year', var_name='fruit')
bar_df_long = bar_df.melt(value_name='bar_value', **melt_kwargs)
line_df_long = line_df.melt(value_name='line_value', **melt_kwargs)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 6), dpi=300)
sns.barplot(data=bar_df_long, x='year', y='bar_value', hue='fruit', dodge=True, ax=ax)

# Points live on a twin y-axis; dodge=True with jitter=False lines each point
# up with the bar of the same fruit.
ax2 = ax.twinx()
sns.stripplot(
    data=line_df_long,
    x='year',
    y='line_value',
    hue='fruit',
    dodge=True,
    jitter=False,
    edgecolor='black',
    linewidth=1,
    ax=ax2,
)
ax2.legend_.remove()  # remove the second legend
plt.tight_layout()
plt.show()
I have a dataframe with positive and negative values from three kinds of variables.
labels variable value
0 -10e5 nat -38
1 2e5 nat 50
2 10e5 nat 16
3 -10e5 agr -24
4 2e5 agr 35
5 10e5 agr 26
6 -10e5 art -11
7 2e5 art 43
8 10e5 art 20
when values are negative I want the barplot to follow the color sequence:
# Colour sequence wanted for negative values: red, red, green.
n_palette = ["#ff0000","#ff0000","#00ff00"]
Instead when positive I want it to reverse the palette:
# Colour sequence wanted for positive values: green, green, red.
p_palette = ["#00ff00","#00ff00","#ff0000"]
I've tried this:
# Attempt: one colour per bar, group by group. The first group follows the
# negative sequence (red, red, green); the two positive groups follow the
# reversed sequence (green, green, red).
palette = ["#ff0000", "#ff0000", "#00ff00",
           "#00ff00", "#00ff00", "#ff0000",
           "#00ff00", "#00ff00", "#ff0000"]
ax = sns.barplot(x=melted['labels'], y=melted['value'], hue=melted['variable'],
                 linewidth=1,
                 palette=palette)
But I get the following output:
what I'd like is the first two bars of the group to become green and the last one red when values are positive.
You seem to want to do the coloring depending on a criterion on two columns. It seems suitable to add a new column which uniquely labels that criterion.
Further, seaborn allows the palette to be a dictionary telling exactly which hue label gets which color. Adding barplot(..., order=[...]) would define a fixed order.
Here is some example code:
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from io import StringIO
# Sample data; the unnamed first column becomes the index when parsed.
data_str = ''' labels variable value
0 -10e5 nat -38
1 2e5 nat 50
2 10e5 nat 16
3 -10e5 agr -24
4 2e5 agr 35
5 10e5 agr 26
6 -10e5 art -11
7 2e5 art 43
8 10e5 art 20
'''
# Split on runs of whitespace (`sep=r'\s+'` replaces the deprecated
# `delim_whitespace=True`). 'labels' is forced to str so that values such as
# '-10e5' stay category names instead of being parsed as floats.
melted = pd.read_csv(StringIO(data_str), sep=r'\s+', dtype={'labels': str})

# Build a hue key that combines the variable with the sign of its value,
# e.g. 'nat-' or 'nat+'.
melted['legend'] = np.where(melted['value'] < 0, '-', '+')
melted['legend'] = melted['variable'] + melted['legend']

# Dictionary palette: each hue key is mapped to exactly one colour.
palette = {'nat-': "#ff0000", 'agr-': "#ff0000", 'art-': "#00ff00",
           'nat+': "#00ff00", 'agr+': "#00ff00", 'art+': "#ff0000"}
ax = sns.barplot(x=melted['labels'], y=melted['value'], hue=melted['legend'],
                 linewidth=1, palette=palette)
ax.axhline(0, color='black')  # baseline separating negative from positive bars
plt.show()
PS: To remove the legend: ax.legend_.remove(). Or to have a legend with multiple columns: ax.legend(ncol=3).
A different approach, directly with the original dataframe, is to create two bar plots: one for the negative values and one for the positive. For this to work well, it is necessary that the 'labels' column (the x=) is explicitly made categorical. Also adding pd.Categorical(..., categories=['nat', 'agr', 'art']) for the 'variable' column could fix an order.
This will generate a legend with the labels twice with different colors. Depending on what you want, you can remove it or create a more custom legend.
An idea is to add the labels under the positive and on top of the negative bars:
sns.set()
# Split on runs of whitespace (`sep=r'\s+'` replaces the deprecated
# `delim_whitespace=True`); keep 'labels' as strings.
melted = pd.read_csv(StringIO(data_str), sep=r'\s+', dtype={'labels': str})
palette_pos = {'nat': "#00ff00", 'agr': "#00ff00", 'art': "#ff0000"}
palette_neg = {'nat': "#ff0000", 'agr': "#ff0000", 'art': "#00ff00"}
# The x column must be categorical for the two partial plots to line up:
# each subset below contains only some of the labels.
melted['labels'] = pd.Categorical(melted['labels'])

# First pass draws the negative values, second pass the positive ones on the same axes.
ax = sns.barplot(data=melted[melted['value'] < 0], x='labels', y='value', hue='variable',
                 linewidth=1, palette=palette_neg)
sns.barplot(data=melted[melted['value'] >= 0], x='labels', y='value', hue='variable',
            linewidth=1, palette=palette_pos, ax=ax)
ax.legend_.remove()
ax.axhline(0, color='black')
ax.set_xlabel('')
ax.set_ylabel('')

# Annotate every drawn bar with its variable name at the zero line.
for bar_container in ax.containers:
    label = bar_container.get_label()
    for p in bar_container:
        x = p.get_x() + p.get_width() / 2  # horizontal centre of the bar
        h = p.get_height()
        if not np.isnan(h):  # skip placeholder bars without a height
            # The blank lines shift the centred text: on top of negative
            # bars, underneath positive ones.
            text = (label + '\n\n') if h < 0 else ('\n\n' + label)
            ax.text(x, 0, text, ha='center', va='center')
plt.show()
Still another option involves sns.catplot() which could be clearer when a lot of data is involved:
sns.set()
# Split on runs of whitespace (`sep=r'\s+'` replaces the deprecated
# `delim_whitespace=True`); keep 'labels' as strings.
melted = pd.read_csv(StringIO(data_str), sep=r'\s+', dtype={'labels': str})

# Hue key: variable name plus the sign of its value, e.g. 'nat-' or 'nat+'.
melted['legend'] = np.where(melted['value'] < 0, '-', '+')
melted['legend'] = melted['variable'] + melted['legend']
palette = {'nat-': "#ff0000", 'agr-': "#ff0000", 'art-': "#00ff00",
           'nat+': "#00ff00", 'agr+': "#00ff00", 'art+': "#ff0000"}

# One facet per label value; each facet has its own x-axis but they share y.
g = sns.catplot(kind='bar', data=melted, col='labels', y='value', x='legend',
                linewidth=1, palette=palette, sharex=False, sharey=True)
for ax in g.axes.flat:
    ax.axhline(0, color='black')
    ax.set_xlabel('')
    ax.set_ylabel('')
plt.show()
Dataframe
df
Sample Type y1 y2 y3 y4
S1 H 1000 135 220 171
S2 H 2900 1560 890 194
S3 P 678 350 127 255
S4 P 179 510 154 275
I want to plot y1, y2, y3, y4 vs Sample scatterplot with hue as Type.
Is there any way to do it in Seaborn?
Since you want just one plot, you can use sns.scatterplot:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# df = pd.read_csv('yourfile.csv')

# Reshape to long form: 'Type' and 'Sample' stay as identifiers, every other
# column is stacked into 'variable' / 'value' pairs.
df1 = df.melt(id_vars=['Type', 'Sample'])

# Single scatter plot: colour by sample, marker style by type.
sns.scatterplot(
    data=df1,
    x="Sample",
    y="value",
    hue="Sample",
    style="Type",
)
plt.show()
In case you want multiple scatter plots, you can use sns.relplot:
# Long form: 'Type' and 'Sample' are kept, the remaining columns are stacked
# into 'variable' / 'value'.
df1 = df.melt(id_vars=['Type', 'Sample'])

# One facet per original column ('variable'), coloured by type.
sns.relplot(
    data=df1,
    x="Sample",
    y="value",
    hue="Type",
    col="variable",
    height=2,
    aspect=1.5,
)
plt.show()
In case you want a 2x2 grid:
# Long form: 'Type' and 'Sample' are kept, the remaining columns are stacked
# into 'variable' / 'value'.
df1 = df.melt(id_vars=['Type', 'Sample'])

# Facets wrap after two columns.
sns.relplot(
    data=df1,
    x="Sample",
    y="value",
    hue="Type",
    col="variable",
    col_wrap=2,
    height=2,
    aspect=1.5,
)
plt.show()
In case you want a 1x4 grid (one column, four rows):
# Long form: 'Type' and 'Sample' are kept, the remaining columns are stacked
# into 'variable' / 'value'.
df1 = df.melt(id_vars=['Type', 'Sample'])

# Facets wrap after every single column.
sns.relplot(
    data=df1,
    x="Sample",
    y="value",
    hue="Type",
    col="variable",
    col_wrap=1,
    height=2,
    aspect=1.5,
)
plt.show()
Here is the sample data:
Datetime Price Data1 Data2 ShiftedPrice
0 2017-11-05 09:20:01.134 2123.0 12.23 34.12 300.0
1 2017-11-05 09:20:01.789 2133.0 32.43 45.62 330.0
2 2017-11-05 09:20:02.238 2423.0 35.43 55.62 NaN
3 2017-11-05 09:20:02.567 3423.0 65.43 56.62 NaN
4 2017-11-05 09:20:02.948 2463.0 45.43 58.62 NaN
I am trying to draw a plot between Datetime and Shiftedprice columns and horizontal lines for mean, confidence intervals of the ShiftedPrice column.
Have a look at the code below:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
# Keep only the rows that have a ShiftedPrice value.
df1 = df.dropna(subset=['ShiftedPrice'])
df1
fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(121)
# NOTE(review): `ax` is rebound to whatever df1.plot returns; no `ax=` argument is
# passed, so the subplot created above is presumably not the one drawn on — confirm.
ax = df1.plot(x='Datetime',y='ShiftedPrice')
# Plotting the mean
ax.axhline(y=df1['ShiftedPrice'].mean(), color='r', linestyle='--', lw=2)
plt.show()
# Plotting Confidence Intervals
# Lines at mean +/- 1.96 sample standard deviations (ddof=1).
# NOTE(review): these are added after plt.show() has already been called, which
# may be why they are missing from the rendered figure — confirm.
ax.axhline(y=df1['ShiftedPrice'].mean() + 1.96*np.std(df1['ShiftedPrice'],ddof=1), color='g', linestyle=':', lw=2)
ax.axhline(y=df1['ShiftedPrice'].mean() - 1.96*np.std(df1['ShiftedPrice'],ddof=1), color='g', linestyle=':', lw=2)
plt.show()
My problem is that horizontal lines are not appearing. Instead, I get the following message
ax.axhline(y=df1['ShiftedPrice'].mean(), color='r', linestyle='--', lw=2)
Out[22]: <matplotlib.lines.Line2D at 0xccc5c18>
I have the following dataframe, which I am aiming to plot both max data and min data on the same graph, using Month_Day as x-axis, but only printing 'Jan', 'Feb', 'Mar', etc...
Month_Day max min
0 Jan-01 243 86
1 Jan-02 230 90
2 Jan-03 233 104
3 Jan-04 220 73
4 Jan-05 224 71
but once I include the dates, an error popped up.
# Daily timestamps intended for the x-axis.
dates = pd.date_range('1/1/2015','31/12/2015', freq='D')
# Plots both series without explicit x values: tmax in red, tmin in blue.
# presumably tmax/tmin are the 'max'/'min' columns of the frame above — TODO confirm
plt.plot(tmax, '-r', tmin, '-b')
#plt.plot(dates, tmax, '-r', dates, tmin, '-b') <- this is the line i plot dates as axis
# Shades the band between tmin and tmax, using integer positions as x values.
# NOTE(review): if the date-based plot above is enabled, the x values here would
# still be integers rather than dates — possibly related to the reported error; confirm.
plt.fill_between(range(len(tmin)), tmin, tmax, facecolor='gray', alpha=0.25)
plt.grid(True)
gives the error:
error: ordinal must be >= 1
You could use xaxis.set_major_formatter().
Here's a simple example of this:
import datetime
import random

import matplotlib.dates as mdate  # provides DateFormatter; was used below without being imported
import matplotlib.pyplot as plt

# make up some data: 180 consecutive days starting now, with a noisy upward trend
start = datetime.datetime.now()  # taken once so all points share the same time of day
x = [start + datetime.timedelta(days=i) for i in range(180)]
y = [i + random.gauss(0, 1) for i, _ in enumerate(x)]

p1 = plt.subplot(211)
# show only the abbreviated month name ('Jan', 'Feb', ...) on the major x ticks
p1.xaxis.set_major_formatter(mdate.DateFormatter('%b', None))

# plot
plt.plot(x, y)

# beautify the x-labels
plt.gcf().autofmt_xdate()

plt.show()
Output