How to subtract 2 columns of unaligned data with Python/Pandas? - python

I have two DataFrames
df1=
x y1
0 0 0
1 1 1
2 2 2
3 4 3
df2=
x y2
0 0.0 0
1 0.5 1
2 1.5 2
3 3.0 3
4 4.0 4
I need to calculate y2-y1 (for the same x value)
(in order to see the difference between 2 graphs)
As you can see, some values are in common between them... some are not
I think I will need to resample my data, but I don't know how.
I need to align the data so that both DataFrames share the same 'x' column.
Between two points, linear interpolation should be used to get the y value at a given x.
In this case, resampling the data with x_step=0.5 would work well.
I did this...
import pandas as pd
import matplotlib.pylab as plt

# Two curves sampled on different x grids.
df1 = pd.DataFrame(
    [[0.0, 0.0], [1.0, 1.0], [2.0, 2.0], [4.0, 3.0]],
    columns=['x', 'y1'],
)
df2 = pd.DataFrame(
    [[0.0, 0.0], [0.5, 1.9], [1.5, 2.0], [3.0, 3.0], [4.0, 4.0]],
    columns=['x', 'y2'],
)

# Dump the first frame, then each of its columns, with a rule after each one.
separator = "=" * 10
for shown in (df1, df1['x'], df1['y1']):
    print(shown)
    print(separator)

# Draw both curves on one axes so the mismatch in x sampling is visible.
fig = plt.figure()
fig.subplots_adjust(bottom=0.1)
ax = fig.add_subplot(111)
plt.title("{y} = f({x})".format(x='x', y='y'))
p1, = plt.plot(df1['x'], df1['y1'], color='b', marker='.')
p2, = plt.plot(df2['x'], df2['y2'], color='r', marker='.')
plt.legend([p1, p2], ["y1", "y2"])
plt.show()

import numpy as np
import pandas as pd
import pylab as pl

df1 = pd.DataFrame([[0.0,0.0],[1.0,1.0],[2.0,2.0],[4.0,3.0]],columns=['x','y1'])
df2 = pd.DataFrame([[0.0,0.0],[0.5,1.9],[1.5,2.0],[3.0,3.0],[4.0,4.0]],columns=['x','y2'])

# Common grid: every x value that occurs in either frame (sorted, unique).
x = np.union1d(df1.x, df2.x)
# Linearly interpolate each curve onto the common grid. np.interp expects
# the sample x values to be increasing, which holds for both frames above.
y1 = np.interp(x, df1.x, df1.y1)
y2 = np.interp(x, df2.x, df2.y2)
# The quantity asked for: y2 - y1 evaluated at identical x positions.
y_diff = y2 - y1

pl.plot(x, y1, "-o")
pl.plot(x, y2, "-o")

Related

matplotlib plot for probabilities

What is an adequate plot type for probabilities?
For example if I have the following data:
t p1 p2 p3 p4
1 0 0 0 1
2 0.1 0 0 0.9
3 0.2 0 0 0.8
4 0.2 0.3 0 0.5
You can see that the values (other than t) sum to 1.
I seem to remember there is some kind of bar graph whose bars sum to 1, with the divisions shown in different colors.
Something like
You can use pandas to load the data and then create the following plot with matplotlib:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# First CSV column becomes the index (one row per sample); the remaining
# columns are the per-class probabilities.
# NOTE(review): read_csv defaults to comma-separated input - confirm that
# test.csv is stored that way.
df = pd.read_csv('test.csv', index_col=0)
samples = df.index.astype(str)
# Running height of each stacked bar; the next class is drawn on top of it.
bottom = np.zeros(len(df))
width = 0.5

fig, ax = plt.subplots()
ax.set(xlabel='Samples', ylabel='Probability')
# Transposing makes each iterated row one class (one original column).
for label, probabilities in df.transpose().iterrows():
    ax.bar(samples, probabilities, width, bottom=bottom, label=label)
    # Accumulate as a plain array so `bottom` stays an ndarray.
    bottom += probabilities.to_numpy()
ax.legend(title='Classes')
plt.show()

Plot line on secondary axis with stacked bar chart - matplotlib

The following plots a stacked bar chart separated into 4 subplots. The four subplots are called from Area. The values are called from Result. This column contains 0's and 1's. I want to plot the total count of these values for each different combination in Group.
This works fine, but I'm hoping to use the secondary axis to show the normalised values as a line plot — specifically, the percentage of 1's compared to 0's. At the moment, I just have the total count of 0's and 1's as a bar chart. I'm hoping to plot the percentage of 1's using the secondary y-axis.
import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({
    'Result': [0,1,1,1,0,1,1,0,1,0,1,1,1,1,0,1],
    'Group': [-2,-1,1,0,0,-1,-1,0,1,-1,0,1,-1,1,0,1],
    'Area': ['North','East','South','West','North','East','South','West','North','East','South','West','North','East','South','West'],
})

# Number of 1's in the whole frame; denominator of the lambda aggregate below.
total = df['Result'].sum()


def custom_stacked_barplot(t, sub_df, ax):
    """Draw the stacked Result counts per Group of *sub_df* onto *ax*.

    Parameters:
        t: group key of the subset; not used inside the function.
        sub_df: frame with 'Group' and 'Result' columns.
        ax: axes that receives the stacked bar chart.

    Returns:
        Whatever ``DataFrame.plot`` returns for the bar chart.
    """
    plot_df = pd.crosstab(
        index=sub_df['Group'],
        columns=sub_df['Result'],
        values=sub_df['Result'],
        # Two aggregates per cell: the count, and the cell's sum expressed
        # as a percentage of the global `total`.
        aggfunc=['count', (lambda x: sum(x) / total * 100)],
    )
    p = plot_df.plot(kind="bar", y='count', stacked=True, ax=ax, rot=0, width=0.6, legend=False)
    # Secondary y-axis is created, but nothing is drawn on it yet.
    ax2 = ax.twinx()
    #plot norm line
    #r = plot_df.plot(y = '<lambda>', ax = ax2, legend = False, zorder = 2, color = 'black')
    return p


g_dfs = df.groupby(['Area'])
fig, axes = plt.subplots(nrows=4, ncols=1, figsize=(8,12))
# One subplot per Area, visited in sorted key order.
for ax, (i, g) in zip(axes.ravel(), sorted(g_dfs)):
    custom_stacked_barplot(i, g, ax)
plt.legend(bbox_to_anchor=(1.129, 2.56))
plt.show()
intended df output to plot:
count perc
Result 0 1 0
Group
-1 1.0 2.0 0.66
1 0.0 1.0 1.0
count perc
Result 0 1 0
Group
-2 1.0 0.0 0.0
-1 0.0 1.0 1.0
0 1.0 0.0 0.0
1 0.0 1.0 1.0
count perc
Result 0 1 0
Group
-1 0.0 1.0 1.0
0 1.0 1.0 0.5
1 0.0 1.0 1.0
count perc
Result 0 1 0
Group
0 1.0 1.0 0.5
1 0.0 2.0 1.0
try using twinx()
import matplotlib.pyplot as plt
import pandas as pd

df = pd.DataFrame({
    'Result': [0,1,1,1,0,1,1,0,1,0,1,1,1,1,0,1],
    'Group': [-2,-1,1,0,0,-1,-1,0,1,-1,0,1,-1,1,0,1],
    'Area': ['North','East','South','West','North','East','South','West','North','East','South','West','North','East','South','West'],
})

# Number of 1's in the whole frame; denominator of the lambda aggregate below.
total = df['Result'].sum()


def custom_stacked_barplot(t, sub_df, ax):
    """Plot counts on *ax* and the lambda aggregate on a twin y-axis.

    Parameters:
        t: group key of the subset; not used inside the function.
        sub_df: frame with 'Group' and 'Result' columns.
        ax: axes that receives the count bars; its twin receives the
            '<lambda>' bars.

    Returns:
        Tuple ``(p, r)`` of the two ``DataFrame.plot`` return values.
    """
    plot_df = pd.crosstab(
        index=sub_df['Group'],
        columns=sub_df['Result'],
        values=sub_df['Result'],
        aggfunc=['count', (lambda x: sum(x) / total * 100)],
    )
    print(plot_df)
    p = plot_df.plot(kind="bar", y='count', stacked=True, ax=ax, rot=0, width=0.6, legend=False)
    # Second y-axis sharing the same x-axis as `ax`.
    ax2 = ax.twinx()
    r = plot_df.plot(kind="bar", y='<lambda>', stacked=True, ax=ax2, rot=0, width=0.6, legend=False)
    return p, r


g_dfs = df.groupby(['Area'])
fig, axes = plt.subplots(nrows=4, ncols=1, figsize=(8,12))
for ax, (i, g) in zip(axes.ravel(), sorted(g_dfs)):
    custom_stacked_barplot(i, g, ax)
plt.legend(bbox_to_anchor=(1.129, 2.56))

# save the plot as a file (done before showing, so a single show() suffices)
fig.savefig('two_different_y_axis_for_single_python_plot_with_twinx.jpg',
            format='jpeg',
            dpi=100,
            bbox_inches='tight')
plt.show()
The output looks something like :
Ok, so I gave this a try, too:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.DataFrame({
    'Result': [0,1,1,1,0,1,1,0,1,0,1,1,1,1,0,1],
    'Group': [-2,-1,1,0,0,-1,-1,0,1,-1,0,1,-1,1,0,1],
    'Area': ['North','East','South','West','North','East','South','West','North','East','South','West','North','East','South','West'],
})

## iterate over unique areas
unique_areas = df['Area'].unique()
fig, axes = plt.subplots(nrows=len(unique_areas), ncols=1, figsize=(8,12))
twin_axes = []
for i, key in enumerate(unique_areas):
    # print(f"== {key} ==") #<- uncomment this line to debug
    ## first, filter the df by 'Area'
    area_df = df[(df['Area'] == key)]
    ## and do the crosstab:
    ct_df = pd.crosstab(index=area_df['Group'],
                        columns=area_df['Result'],
                        )
    ## to add the 'count' label you wanted to the dataframe multiindex:
    ct_df = pd.concat({'count': ct_df}, names=['type'], axis=1)
    ## now iterate over the unique 'Groups' in the index ...
    for ix in ct_df.index:
        sub_df = ct_df.loc[ix, 'count']
        ## ... and calculate the share of Result == 1 (sub_df.loc[1])
        #  in the total for this group (sub_df.sum())
        ct_df.loc[ix, 'perc'] = sub_df.loc[1] / sub_df.sum()
    # print(ct_df) #<- uncomment this line to debug
    ## add your stacked bar plot
    bar = ct_df.plot(kind="bar", y='count', stacked=True, ax=axes[i], rot=0, width=0.6, legend=False)
    ## keep the twin_axes in a separate list
    twin_axes.append(axes[i].twinx())
    ## generate the "correct" x values that match the bar plot locations
    #  (i.e. use [0,1,2,3] instead of [-2,-1,0,1] )
    xs = np.arange(0, len(ct_df), 1)
    ## and plot the percentages as a function of this new x range as a black line:
    twin_axes[i].plot(xs, ct_df['perc'], zorder=2, color='black')
    ## optional:
    #  using these 'xs' you could also e.g. add some labels for the contained groups:
    for x in xs:
        twin_axes[i].text(x, 1.15, ct_df.index[x], color="b")
    # fix the y-range of this subplot's secondary axis
    # twin_axes[i].set_xlim(-1,4)
    twin_axes[i].set_ylim(0, 1.1)
plt.show()
Mainly, instead of trying to use the pd.crosstab to do everything, I'd suggest to do some quick and easy for loops over the unique areas, in order to get the df structure you want.
Each group-dependent dataframe now looks like what you wanted:
type count perc
Result 0 1
Group
-2 1 0 0.0
-1 0 1 1.0
0 1 0 0.0
1 0 1 1.0
type count perc
Result 0 1
Group
-1 1 2 0.666667
1 0 1 1.000000
type count perc
Result 0 1
Group
-1 0 1 1.0
0 1 1 0.5
1 0 1 1.0
type count perc
Result 0 1
Group
0 1 1 0.5
1 0 2 1.0
And the plot now looks like this:
Edit:
def create_plot(ax, x, y1, y2, y3):
    """Draw two stacked bar series on *ax* and a line on its twin y-axis.

    Parameters:
        ax: axes that receives the bars.
        x: x positions shared by all three series.
        y1: heights of the lower bars.
        y2: heights of the upper bars, stacked on top of y1.
        y3: values of the line drawn on the secondary y-axis.
    """
    ax1 = ax
    ax2 = ax1.twinx()
    ax1.bar(x, y1)
    ax1.bar(x, y2, bottom=y1)
    ax2.plot(x, y3, c="C3")


fig, axes = plt.subplots(nrows=4, ncols=1, figsize=(8,12))
# Same demo data in each of the four stacked subplots.
for ax in axes:
    create_plot(ax, (1,2,3,4), (1,2,3,4), (7,5,3,1), (1,4,2,3))
plt.show()
(Old post below)
Does something like
def create_plot(x, y1, y2, y3):
    """Create a new figure with two stacked bar series and a twin-axis line.

    Parameters:
        x: x positions shared by all three series.
        y1: heights of the lower bars.
        y2: heights of the upper bars, stacked on top of y1.
        y3: values of the line drawn on the secondary y-axis.

    Returns:
        The newly created figure.
    """
    fig = plt.figure()
    ax1 = fig.gca()
    ax2 = ax1.twinx()
    ax1.bar(x, y1)
    ax1.bar(x, y2, bottom=y1)
    ax2.plot(x, y3, c="C3")
    return fig


fig = create_plot((1,2,3,4), (1,2,3,4), (7,5,3,1), (1,4,2,3))
plt.show()
meet what you need? This gives me:

Generating a histogram with Matplotlib using a dataframe for x and y

I'm generating a simple line chart with Matplotlib, here is my code:
fig = plt.figure(facecolor='#131722', dpi=155, figsize=(8, 4))
ax1 = plt.subplot2grid((1,2), (0,0), facecolor='#131722')
# NOTE(review): the nesting below is reconstructed - the original indentation
# was lost. Plotting once per entry of OrderedList is assumed; confirm.
for x in OrderedList:
    bids = x['data']['bids']
    # Each bid is indexed as [rate, total].
    rate_buy = [y[0] for y in bids]
    total_buy = [y[1] for y in bids]
    rBuys = pd.DataFrame({'buy': rate_buy})
    tBuys = pd.DataFrame({'total': total_buy})
    ax1.plot(rBuys.buy, tBuys.total, color='#0400ff', linewidth=0.5, alpha=1)
    ax1.fill_between(rBuys.buy, 0, tBuys.total, facecolor='#0400ff', alpha=1)
Which gives me the following output:
And here is the data i used in the dataframe:
buy
0 9611
1 9610
2 9609
3 9608
4 9607
5 9606
6 9605
7 9604
8 9603
9 9602
10 9601
11 9600
12 9599
total
0 3.033661
1 3.295753
2 3.599813
3 22.305765
4 22.987476
5 30.975145
6 39.492845
7 42.828580
8 46.677708
9 49.533740
10 50.925840
11 61.396243
12 61.921523
I want to get the same output as the image, but with a histogram chart or something similar, where the height of each column on the y axis is taken from the total dataframe and its position on the x axis is taken from the buy dataframe. So the first element will be at x=9611 with height y=3.033661.
Is it possible to do that with Matplotlib? I tried to use hist, but it doesn't allow me to set both the x and the y axis.
Pandas uses matplotlib as well, and the API is very easy once you have the dataframe.
Here is an example.
# Sample data: 13 consecutive rates from 9611 down to 9599, each paired
# with its total.
d = {
    'buy': list(range(9611, 9598, -1)),
    'total': [
        3.033661, 3.295753, 3.599813, 22.305765, 22.987476,
        30.975145, 39.492845, 42.828580, 46.677708, 49.533740,
        50.925840, 61.396243, 61.921523,
    ],
}

# Sort by the x column first so the bars appear in ascending rate order.
df = pd.DataFrame(d).sort_values(by='buy')
# width=1 makes neighbouring bars touch, giving a histogram-like look.
df.plot(kind='bar', x='buy', y='total', width=1)
plt.show()

Pandas reverse of diff()

I have calculated the differences between consecutive values in a series, but I cannot reverse / undifference them using diffinv():
# Square-root transform wrapped in a DataFrame, then first-order
# differences taken out as a plain array.
ds_sqrt = pd.DataFrame(np.sqrt(ds))
ds_diff = ds_sqrt.diff().to_numpy()
How can I undifference this?
You can do this via numpy. Algorithm courtesy of @Divakar.
Of course, you need to know the first item in your series for this to work.
# Ten random integers in [0, 10) and their first-order differences.
df = pd.DataFrame({'A': np.random.randint(0, 10, 10)})
df['B'] = df['A'].diff()

# Undo the diff: start from the first original value and accumulate the
# differences (row 0 of B is NaN, so it is skipped).
x = df['A'].iloc[0]
x_diff = df['B'].iloc[1:]
df['C'] = np.concatenate(([x], x_diff)).cumsum().astype(int)

# Example output (values differ between runs because the input is random):
#    A    B  C
# 0  8  NaN  8
# 1  5 -3.0  5
# 2  4 -1.0  4
# 3  3 -1.0  3
# 4  9  6.0  9
# 5  7 -2.0  7
# 6  4 -3.0  4
# 7  0 -4.0  0
# 8  8  8.0  8
# 9  1 -7.0  1
You can use diff_inv from pmdarima. Docs link
# Generate a small reproducible table of random integers in [1, 10).
np.random.seed(10)
vals = np.random.randint(1, 10, 6)
df_t = pd.DataFrame({"a": vals})
# Add the lag-1 and lag-2 differences of column `a` as two new columns.
for lag in (1, 2):
    df_t[f'dif_{lag}'] = df_t['a'].diff(periods=lag)
df_t
a dif_1 dif_2
0 5 NaN NaN
1 1 -4.0 NaN
2 2 1.0 -3.0
3 1 -1.0 0.0
4 2 1.0 0.0
5 9 7.0 8.0
Then create a function that will return an array with inverse values of diff.
from pmdarima.utils import diff_inv
def inv_diff(df_orig_column, df_diff_column, periods):
    """Rebuild the original values from a column differenced at lag *periods*.

    Parameters:
        df_orig_column: column holding the original values; only its first
            *periods* entries are used.
        df_diff_column: column holding the differenced values; its first
            *periods* entries are ignored.
        periods: the lag that was passed to ``diff()``.

    Returns:
        The output of ``diff_inv`` with its first *periods* entries removed.
    """
    # Seed sequence for diff_inv: the first `periods` original values
    # followed by the remaining differences.
    head = df_orig_column[:periods].tolist()
    tail = df_diff_column[periods:].tolist()
    seeded = np.array(head + tail)
    restored = diff_inv(seeded, periods, 1)
    return restored[periods:]
Example of Use:
# df_orig_column - column with original values
# df_diff_column - column with differenced values
# periods - periods that were used for pd.diff()
inv_diff(df_t.a, df_t.dif_2, 2)
Output:
array([5., 1., 2., 1., 2., 9.])
Reverse diff in one line with pandas
import pandas as pd

df = pd.DataFrame({'Age': [10, 15, 14, 18]})
df['Age_diff'] = df['Age'].diff()
# Previous original value plus the difference gives back the current value.
# This reads the original 'Age' column, and row 0 stays NaN.
previous_age = df['Age'].shift(1)
df['reverse_diff'] = previous_age + df['Age_diff']
print(df)
Age Age_diff reverse_diff
0 10 NaN NaN
1 15 5.0 15.0
2 14 -1.0 14.0
3 18 4.0 18.0
Here's a working example.
First, let's import needed packages
import numpy as np
import pandas as pd
import pmdarima as pm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
Then, let's create a simple discretized cosine wave
# Samples per cycle and number of full cycles of the wave.
period = 5
cycles = 7
# period * cycles intervals (+1 for the closing endpoint) over `cycles`
# full turns, so the signal repeats every `period` samples.
x = np.cos(np.linspace(0, 2*np.pi*cycles, period*cycles+1))
X = pd.DataFrame(x)
and plot
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(X, marker='.')
# Red dashed guides at the start and at one full period.
for guide in (0, period):
    ax.axvline(guide, color='r', ls='--')
# One tick per sample, plus the title.
ax.set(xticks=X.index, title='Original data')
plt.show()
Note that the period is 5. Let's now remove this "seasonality" by differencing with a lag of 5
# Difference each sample against the one `period` steps earlier.
X_diff = X.diff(period)
# NOTE: the leading `period` rows are overwritten with the original
# observations, which are needed for back transformation.
X_diff.iloc[:period] = X.iloc[:period]
Note that we have to keep the first period observations to allow back transformation. If you don't want them in the differenced data, you have to store them elsewhere and concatenate them back when you want to back transform.
fig, ax = plt.subplots(figsize=(12, 5))
# Red dashed guides around the first `period` samples (indices 0..period-1).
for guide in (0, period - 1):
    ax.axvline(guide, color='r', ls='--')
ax.plot(X_diff, marker='.')
# Arrow from the label at x=10 towards the end of the kept original samples.
ax.annotate(
    'Keep these original data\nto allow back transformation',
    xy=(period - 1, .5),
    xytext=(10, .5),
    arrowprops={'color': 'k'},
)
ax.set(title='Transformed data')
plt.show()
Let's now back transform data with pmdarima.utils.diff_inv
# Invert the lag-`period` differencing, then drop the first `period`
# entries of the result.
X_diff_inv = pm.utils.diff_inv(X_diff, lag=period)[period:]
Note that we discard the first period results that would be 0 and not needed.
fig, ax = plt.subplots(figsize=(12, 5))
# Same guides as on the transformed-data plot, for easy comparison.
for guide in (0, period - 1):
    ax.axvline(guide, color='r', ls='--')
ax.plot(X_diff_inv, marker='.')
ax.set(title='Back transformed data')
plt.show()
I think some of the examples may overcomplicate things. The inverse of differencing is simply integrating (a cumulative sum). But for this you need a starting value — the constant of integration, as in $\int f'(x)\,dx = f(x) + \text{const}$:
import pandas as pd
import matplotlib.pyplot as plt

# some example data (named `series` so the builtin `input` is not shadowed)
series = pd.Series([5., 1., 2., 1., 2., 9.])
# saving the offset ('const' part of integral) to reconstruct;
# taken by position so it does not depend on the index labels
offset = series.iloc[0]
# differencing
diff = series.diff()
# the first row after diff() will always be NaN, it is reasonable to set this to zero
diff.iloc[0] = 0
# => reconstruct (reverse diff / integrate) <=
reverse_diff_no_offset = diff.cumsum()
reverse_diff = reverse_diff_no_offset + offset
# plot: You can see why an offset is needed. Any other offset will shift the line up/down
# NOTE(review): linestyle=None selects matplotlib's default line style; use
# the string 'None' if markers without a connecting line were intended.
plt.plot(series, color='green', linestyle=None, marker="o")
plt.plot(reverse_diff_no_offset, color='grey')
plt.plot(reverse_diff, color='blue')
Also numpy has cumsum, so it will work there as well
# The first six perfect squares.
arb = pd.DataFrame({'a': [n * n for n in range(1, 7)]})
# Forward difference (next value minus current value): diff with a
# negative period yields current minus next, so the sign is flipped.
-arb['a'].diff(periods=-1)
Output:
3.0
5.0
7.0
9.0
11.0
NaN
Name: a, dtype: float64

Pandas : plot data frame object with marker?

This is my pandas DataFrame.
value action
0 1 0
1 2 1
2 3 1
3 4 1
4 3 0
5 2 1
6 5 0
What I want to do is mark value as o if action=0, x if action=1.
So, the plot marker should be like this:
o x x x o x o
But have no idea how to do this...
I need your help. Thanks.
Consider the following approach:
# Index labels of the rows belonging to each action value.
action_one_rows = df.index[df.action == 1].tolist()
action_zero_rows = df.index[df.action == 0].tolist()
# The same line is drawn twice; each pass only puts markers on its own rows.
plt.plot(df.index, df.value, '-X', markevery=action_one_rows)
plt.plot(df.index, df.value, '-o', markevery=action_zero_rows)
Result:
alternative solution:
# Plain line through all points, then one scatter layer per action value.
plt.plot(df.index, df.value, '-')
for action, marker, colour in ((0, 'o', 'green'), (1, 'x', 'red')):
    selected = df.action == action
    plt.scatter(df.index[selected], df.loc[selected, 'value'],
                marker=marker, s=100, c=colour)
Result:
You can plot the filtered dataframe. I.e., you can create two dataframes, one for action 0 and one for action 1. Then plot each individually.
import pandas as pd

df = pd.DataFrame({
    "value": [1, 2, 3, 4, 3, 2, 5],
    "action": [0, 1, 1, 1, 0, 1, 0],
})
# One sub-frame per action value.
df0 = df[df["action"] == 0]
df1 = df[df["action"] == 1]

# Grey line through every point, drawn below the markers (zorder=0).
ax = df.reset_index().plot(x="index", y="value", legend=False, color="gray", zorder=0)
# Overlay each subset as a scatter layer with its own marker.
for subset, marker in ((df0, "o"), (df1, "x")):
    subset.reset_index().plot(x="index", y="value", kind="scatter", marker=marker, ax=ax)

Categories

Resources