Pandas reverse of diff()

Pandas reverse of diff() - python

I have calculated the differences between consecutive values in a series, but I cannot reverse / undifference them using diffinv():
ds_sqrt = np.sqrt(ds)
ds_sqrt = pd.DataFrame(ds_sqrt)
ds_diff = ds_sqrt.diff().values
How can I undifference this?

You can do this via numpy. Algorithm courtesy of #Divakar.
Of course, you need to know the first item in your series for this to work.
df = pd.DataFrame({'A': np.random.randint(0, 10, 10)})
df['B'] = df['A'].diff()
x, x_diff = df['A'].iloc[0], df['B'].iloc[1:]
df['C'] = np.r_[x, x_diff].cumsum().astype(int)
# A B C
# 0 8 NaN 8
# 1 5 -3.0 5
# 2 4 -1.0 4
# 3 3 -1.0 3
# 4 9 6.0 9
# 5 7 -2.0 7
# 6 4 -3.0 4
# 7 0 -4.0 0
# 8 8 8.0 8
# 9 1 -7.0 1

You can use diff_inv from pmdarima.Docs link
# genarating random table
np.random.seed(10)
vals = np.random.randint(1, 10, 6)
df_t = pd.DataFrame({"a":vals})
#creating two columns with diff 1 and diff 2
df_t['dif_1'] = df_t.a.diff(1)
df_t['dif_2'] = df_t.a.diff(2)
df_t
a dif_1 dif_2
0 5 NaN NaN
1 1 -4.0 NaN
2 2 1.0 -3.0
3 1 -1.0 0.0
4 2 1.0 0.0
5 9 7.0 8.0
Then create a function that will return an array with inverse values of diff.
from pmdarima.utils import diff_inv
def inv_diff (df_orig_column,df_diff_column, periods):
# Generate np.array for the diff_inv function - it includes first n values(n =
# periods) of original data & further diff values of given periods
value = np.array(df_orig_column[:periods].tolist()+df_diff_column[periods:].tolist())
# Generate np.array with inverse diff
inv_diff_vals = diff_inv(value, periods,1 )[periods:]
return inv_diff_vals
Example of Use:
# df_orig_column - column with original values
# df_diff_column - column with differentiated values
# periods - preiods for pd.diff()
inv_diff(df_t.a, df_t.dif_2, 2)
Output:
array([5., 1., 2., 1., 2., 9.])

Reverse diff in one line with pandas
import pandas as pd
df = pd.DataFrame([10, 15, 14, 18], columns = ['Age'])
df['Age_diff'] = df.Age.diff()
df['reverse_diff'] = df['Age'].shift(1) + df['Age_diff']
print(df)
Age Age_diff reverse_diff
0 10 NaN NaN
1 15 5.0 15.0
2 14 -1.0 14.0
3 18 4.0 18.0

Here's a working example.
First, let's import needed packages
import numpy as np
import pandas as pd
import pmdarima as pm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
Then, let's create a simple discretized cosine wave
period = 5
cycles = 7
x = np.cos(np.linspace(0, 2*np.pi*cycles, periods*cycles+1))
X = pd.DataFrame(x)
and plot
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(X, marker='.')
ax.set(
xticks=X.index
)
ax.axvline(0, color='r', ls='--')
ax.axvline(period, color='r', ls='--')
ax.set(
title='Original data'
)
plt.show()
Note that the period is 5. Let's now remove this "seasonality" by differentiating with period 5
X_diff = X.diff(periods=period)
# NOTE: the first `period` observations
# are needed for back transformation
X_diff.iloc[:period] = X[:period]
Note that we have to keep the first period observations to allow back transformation. If you don't need them you have to keep them elsewhere and then concatenate when you want to back transform.
fig, ax = plt.subplots(figsize=(12, 5))
ax.axvline(0, color='r', ls='--')
ax.axvline(period-1, color='r', ls='--')
ax.plot(X_diff, marker='.')
ax.annotate(
'Keep these original data\nto allow back transformation',
xy=(period-1, .5), xytext=(10, .5),
arrowprops=dict(color='k')
)
ax.set(
title='Transformed data'
)
plt.show()
Let's now back transform data with pmdarima.utils.diff_inv
X_diff_inv = pm.utils.diff_inv(X_diff, lag=period)[period:]
Note that we discard the first period results that would be 0 and not needed.
fig, ax = plt.subplots(figsize=(12, 5))
ax.axvline(0, color='r', ls='--')
ax.axvline(period-1, color='r', ls='--')
ax.plot(X_diff_inv, marker='.')
ax.set(
title='Back transformed data'
)
plt.show()

I think some examples may overcomplicate? The inverse of differentiating is simple integrating. But for this you need a starting value, so to say the const part when integrating dx = f(x) + const:
import pandas as pd
import matplotlib.pyplot as plt
# some example data
input = pd.Series([5., 1., 2., 1., 2., 9.])
# saving the offset ('const' part of integral) to reconstruct
offset = input[0]
# differentiating
diff = input.diff()
# the first row after diff() will always be NaN, it is reasonable to set this to zero
diff[0] = 0
# => reconstruct (reverse diff / integrate) <=
reverse_diff_no_offset = diff.cumsum()
reverse_diff = reverse_diff_no_offset + offset
# plot: You can see why a offset is needed. Any other offset will shift the line up/down
plt.plot(input, color='green', linestyle=None, marker="o")
plt.plot(reverse_diff_no_offset, color='grey')
plt.plot(reverse_diff, color='blue')
Also numpy has cumsum, so it will work there as well

arb = pd.DataFrame({'a': [1, 4, 9, 16, 25, 36]})
(-1)*arb['a'].diff(periods=-1)
Output:
3.0
5.0
7.0
9.0
11.0
NaN
Name: a, dtype: float64

Related

Annotate a normalized barchart with original data

I have a dataframe consisting of;
home away type
0 0.0 0.0 reds
1 5.0 1.0 yellows
2 7.0 5.0 corners
3 4.0 10.0 PPDA
4 5.0 1.0 shots off
5 7.0 5.0 shots on
6 1.0 1.0 goals
7 66.0 34.0 possession
to get the stacked bar chart I wanted, I normalized the data using
stackeddf1 = df1.iloc[:,0:2].apply(lambda x: x*100/sum(x),axis=1)
and then I create my barchart using
ax = stackeddf1.iloc[1:, 0:2].plot.barh(align='center', stacked=True, figsize=(20, 20),legend=None)
for p in ax.patches:
width, height = p.get_width(), p.get_height()
x, y = p.get_xy()
ax.text(x+width/2,
y+height/2,
'{:.0f}'.format(width),
horizontalalignment='center',
verticalalignment='center')
This though, annotates the barchart with the new normalized data. If possible I'd like to find a way to use my original to annotate.

You can use matplotlib's new bar_label function together with the values of the original dataframe:
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import pandas as pd
import numpy as np
df = pd.DataFrame({'home': np.random.randint(1, 10, 10),
'away': np.random.randint(1, 10, 10),
'type': [*'abcdefghij']})
df_normed = df.set_index('type')
df_normed = df_normed.div(df_normed.sum(axis=1), axis=0).multiply(100)
ax = df_normed.plot.barh(stacked=True, width=0.9, cmap='turbo')
for bars, col in zip(ax.containers, df.columns):
ax.bar_label(bars, labels=df[col], label_type='center', fontsize=15, color='yellow')
ax.legend(loc='upper left', bbox_to_anchor=(1.01, 1))
for sp in ['top', 'right']:
ax.spines[sp].set_visible(False)
ax.xaxis.set_major_formatter(PercentFormatter())
ax.margins(x=0)
plt.tight_layout()
plt.show()

Plotting: qcut then groupby two variables

I have the following dataset:
df = pd.DataFrame({'cls': [1,2,2,1,2,1,2,1,2,1,2],
'x': [10,11,21,21,8,1,4,3,5,6,2],
'y': [10,1,2,2,5,2,4,3,8,6,5]})
df['bin'] = pd.qcut(np.array(df['x']), 4)
a = df.groupby(['bin', 'cls'])['y'].mean()
a
This gives me
bin cls
(0.999, 3.5] 1 2.5
2 5.0
(3.5, 6.0] 1 6.0
2 6.0
(6.0, 10.5] 1 10.0
2 5.0
(10.5, 21.0] 1 2.0
2 1.5
Name: y, dtype: float64
I want to plot the right-most column (that is, the average of y per cls per bin) per bin per class. That is, for each bin we have two values of y that I would like to plot as points/scatters. Is that possible using matplotlib or seaborn?

You can indeed use seaborn for what you're asking. Does this work?
# import libraries
import matplotlib.pyplot as plt
import seaborn as sns
# set up some plotting options
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(1,1,1)
# we reset index to avoid having to do multi-indexing
a = a.reset_index()
# use seaborn with argument 'hue' to do the grouping
sns.barplot(x="bin", y="y", hue="cls", data=a, ax=ax)
plt.show()
EDIT: I've just noticed that you wanted to plot "points". I wouldn't advise it for this dataset but you can do that if you replace barplot with catplot.

Plot line on secondary axis with stacked bar chart - matplotlib

The following plots a stacked bar chart separated into 4 subplots. The four subplots are called from Area. The values are called from Result. This column contains 0's and 1's. I want to plot the total count of these values for each different combination in Group.
This works fine but I'm hoping to use the secondary axis to show the normalised values as a line plot. Specifically, the percentage of 1's compared to 0's. At the moment, I just have to total count of 0's and 1's as a bar chart. I'm hoping to plot the percentage of 1's using the secondary y-axis.
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame({
'Result' :[0,1,1,1,0,1,1,0,1,0,1,1,1,1,0,1],
'Group' :[-2,-1,1,0,0,-1,-1,0,1,-1,0,1,-1,1,0,1],
'Area' :['North','East','South','West','North','East','South','West','North','East','South','West','North','East','South','West'],
})
total = df['Result'].sum()
def custom_stacked_barplot(t, sub_df, ax):
plot_df = pd.crosstab(index = sub_df['Group'],
columns = sub_df['Result'],
values = sub_df['Result'],
aggfunc = ['count',(lambda x: sum(x)/total*100)],
)
p = plot_df.plot(kind = "bar", y = 'count',stacked = True, ax = ax, rot = 0, width = 0.6, legend = False)
ax2=ax.twinx()
#plot norm line
#r = plot_df.plot(y = '<lambda>', ax = ax2, legend = False, zorder = 2, color = 'black')
return p
g_dfs = df.groupby(['Area'])
fig, axes = plt.subplots(nrows=4, ncols=1, figsize=(8,12))
for ax, (i,g) in zip(axes.ravel(), sorted(g_dfs)):
custom_stacked_barplot(i, g, ax)
plt.legend(bbox_to_anchor=(1.129, 2.56))
plt.show()
intended df output to plot:
count perc
Result 0 1 0
Group
-1 1.0 2.0 0.66
1 0.0 1.0 1.0
count perc
Result 0 1 0
Group
-2 1.0 0.0 0.0
-1 0.0 1.0 1.0
0 1.0 0.0 0.0
1 0.0 1.0 1.0
count perc
Result 0 1 0
Group
-1 0.0 1.0 1.0
0 1.0 1.0 0.5
1 0.0 1.0 1.0
count perc
Result 0 1 0
Group
0 1.0 1.0 0.5
1 0.0 2.0 1.0

try using twinx()
import matplotlib.pyplot as plt
df = pd.DataFrame({
'Result' :[0,1,1,1,0,1,1,0,1,0,1,1,1,1,0,1],
'Group' :[-2,-1,1,0,0,-1,-1,0,1,-1,0,1,-1,1,0,1],
'Area' :['North','East','South','West','North','East','South','West','North','East','South','West','North','East','South','West'],
})
total = df['Result'].sum()
def custom_stacked_barplot(t, sub_df, ax):
plot_df = pd.crosstab(index = sub_df['Group'],
columns=sub_df['Result'],
values=sub_df['Result'],
aggfunc = ['count',(lambda x: sum(x)/total*100)])
print(plot_df)
p = plot_df.plot(kind="bar",y='count',stacked=True, ax = ax, rot = 0, width = 0.6, legend = False)
ax2=ax.twinx()
r = plot_df.plot(kind="bar",y='<lambda>', stacked=True, ax = ax2, rot = 0, width = 0.6, legend = False)
return p,r
g_dfs = df.groupby(['Area'])
fig, axes = plt.subplots(nrows=4, ncols=1, figsize=(8,12))
for ax, (i,g) in zip(axes.ravel(), sorted(g_dfs)):
custom_stacked_barplot(i, g, ax)
plt.legend(bbox_to_anchor=(1.129, 2.56))
plt.show()
# save the plot as a file
fig.savefig('two_different_y_axis_for_single_python_plot_with_twinx.jpg',
format='jpeg',
dpi=100,
bbox_inches='tight')
plt.show()
The output looks something like :

Ok, so I gave this a try, too:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.DataFrame({
'Result' :[0,1,1,1,0,1,1,0,1,0,1,1,1,1,0,1],
'Group' :[-2,-1,1,0,0,-1,-1,0,1,-1,0,1,-1,1,0,1],
'Area' :['North','East','South','West','North','East','South','West','North','East','South','West','North','East','South','West'],
})
## iterate over unique areas
unique_areas = df['Area'].unique()
fig, axes = plt.subplots(nrows=len(unique_areas), ncols=1, figsize=(8,12))
twin_axes=[]
for i,key in enumerate(unique_areas):
# print(f"== {key} ==") #<- uncomment this line to debug
## first, filter the df by 'Area'
area_df = df[(df['Area']==key)]
## and do the crosstab:
ct_df = pd.crosstab(index=area_df['Group'],
columns=area_df['Result'],
)
## to add the 'count' label you wanted to the dataframe multiindex:
ct_df = pd.concat({'count': ct_df}, names=['type'],axis=1)
## now iterate over the unique 'Groups' in the index ...
for ix in ct_df.index:
sub_df = ct_df.loc[ix,'count']
## ... and calculate the contribution of each Result
# which is equal to '1' (ct_df.loc[ix,1])
# in the total for this group (ct_df.loc[ix].sum())
ct_df.loc[ix,'perc'] = sub_df.loc[1]/sub_df.sum()
# print(ct_df) #<- uncomment this line to debug
## add your stacked bar plot
bar = ct_df.plot(kind = "bar", y = 'count',stacked = True, ax = axes[i], rot = 0, width = 0.6, legend = False)
## keep the twin_axes in a separate list
twin_axes.append(axes[i].twinx())
## generate the "correct" x values that match the bar plot locations
# (i.e. use [0,1,2,3] instead of [-2,-1,0,1] )
xs=np.arange(0,len(ct_df),1)
## and plot the percentages as a function this new x range as a black line:
twin_axes[i].plot(xs,ct_df['perc'],zorder=2,color='black')
## optional:
# using these 'xs' you could also e.g. add some labels for the contained groups:
for x in xs:
twin_axes[i].text(x,1.15,ct_df.index[x],color="b")
# make some nice changes to the formatting of the plots
for a in [twin_axes]:
# a[i].set_xlim(-1,4)
a[i].set_ylim(0,1.1)
plt.show()
Mainly, instead of trying to use the pd.crosstab to do everything, I'd suggest to do some quick and easy for loops over the unique areas, in order to get the df structure you want.
Each group-dependent dataframe now looks like what you wanted:
type count perc
Result 0 1
Group
-2 1 0 0.0
-1 0 1 1.0
0 1 0 0.0
1 0 1 1.0
type count perc
Result 0 1
Group
-1 1 2 0.666667
1 0 1 1.000000
type count perc
Result 0 1
Group
-1 0 1 1.0
0 1 1 0.5
1 0 1 1.0
type count perc
Result 0 1
Group
0 1 1 0.5
1 0 2 1.0
And the plot now looks like this:

Edit:
def create_plot(ax, x, y1, y2, y3):
ax1 = ax
ax2 = ax1.twinx()
ax1.bar(x, y1)
ax1.bar(x, y2, bottom=y1)
ax2.plot(x, y3, c="C3")
fig, axes = plt.subplots(nrows=4, ncols=1, figsize=(8,12))
for ax in axes:
create_plot(ax, (1,2,3,4), (1,2,3,4), (7,5,3,1), (1,4,2,3))
plt.show()
(Old post below)
Does something like
def create_plot(x, y1, y2, y3):
fig = plt.figure()
ax1 = fig.gca()
ax2 = ax1.twinx()
ax1.bar(x, y1)
ax1.bar(x, y2, bottom=y1)
ax2.plot(x, y3, c="C3")
return fig
fig = create_plot((1,2,3,4), (1,2,3,4), (7,5,3,1), (1,4,2,3))
plt.show()
meet what you need? This gives me:

How can I create a boxplot on only positive values in Seaborn?

I want to create a boxplot on about 10 variables where only positive values are considered within each variable. This changes from variable to variable, So something that is 0 in one category might be positive in another.
To do it for one variable looks like this so far;
ax=sns.boxplot(data=[df['Category_1_value'][df['Category_1_value'] > 0]])
I could do the above 10 times but hoped there was an easier way.
Is there a simple option to just ignore the 0 values within each category?

Consider replacing all negative values with np.nan before plotting:
df[df < 0] = np.nan
fig, ax = plt.subplots(figsize=(10,4))
sns.boxplot(data=df, ax=ax)
plt.show()
plt.clf()
plt.close()
To demonstrate with random, seeded data.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
np.random.seed(102918)
df = pd.DataFrame(np.random.randn(100, 5))
df.columns = ['Category_'+ str(i) +'_value' for i in range(1, 6)]
print(df.head(5)
# Category_1_value Category_2_value Category_3_value Category_4_value Category_5_value
# 0 -0.911648 -0.453908 -0.495518 0.733304 0.569576
# 1 0.780117 -0.079954 0.134944 -1.764539 -0.267812
# 2 -0.256881 0.470838 0.437137 1.295758 0.385070
# 3 -1.665858 -1.001672 -0.444930 0.758346 0.132343
# 4 -0.167982 1.033756 1.636315 0.458918 0.022343
df[df < 0] = np.nan
print(df.head(5))
# Category_1_value Category_2_value Category_3_value Category_4_value Category_5_value
# 0 NaN NaN NaN 0.733304 0.569576
# 1 0.780117 NaN 0.134944 NaN NaN
# 2 NaN 0.470838 0.437137 1.295758 0.385070
# 3 NaN NaN NaN 0.758346 0.132343
# 4 NaN 1.033756 1.636315 0.458918 0.022343
Plot
fig, ax = plt.subplots(figsize=(10,4))
sns.boxplot(data=df, ax=ax)
plt.show()
plt.clf()
plt.close()

How to substract 2 columns of unaligned data with Python/Pandas?

I have two DataFrames
df1=
x y1
0 0 0
1 1 1
2 2 2
3 4 3
df2=
x y2
0 0.0 0
1 0.5 1
2 1.5 2
3 3.0 3
4 4.0 4
I need to calculate y2-y1 (for the same x value)
(in order to see the difference between 2 graphs)
As you can see, some values are in common between them... some are not
I think I will need to resample my data... but I don't know how !
I need to align data in order to have same 'x' column for the 2 dataframes.
between 2 points a linear interpolation should be done to get y value at a given x.
In this case resampling data with a x_step=0.5 will be good
I did this...
import pandas as pd
import matplotlib.pylab as plt
df1 = pd.DataFrame([[0.0,0.0],[1.0,1.0],[2.0,2.0],[4.0,3.0]],columns=['x','y1'])
df2 = pd.DataFrame([[0.0,0.0],[0.5,1.9],[1.5,2.0],[3.0,3.0],[4.0,4.0]],columns=['x','y2'])
print(df1)
print("="*10)
print(df1['x'])
print("="*10)
print(df1['y1'])
print("="*10)
fig = plt.figure()
fig.subplots_adjust(bottom=0.1)
ax = fig.add_subplot(111)
plt.title("{y} = f({x})".format(x='x', y='y'))
p1, = plt.plot(df1['x'], df1['y1'], color='b', marker='.')
p2, = plt.plot(df2['x'], df2['y2'], color='r', marker='.')
plt.legend([p1, p2], ["y1", "y2"])
plt.show()

import pandas as pd
import pylab as pl
df1 = pd.DataFrame([[0.0,0.0],[1.0,1.0],[2.0,2.0],[4.0,3.0]],columns=['x','y1'])
df2 = pd.DataFrame([[0.0,0.0],[0.5,1.9],[1.5,2.0],[3.0,3.0],[4.0,4.0]],columns=['x','y2'])
x = np.union1d(df1.x, df2.x)
y1 = np.interp(x, df1.x, df1.y1)
y2 = np.interp(x, df2.x, df2.y2)
pl.plot(x, y1, "-o")
pl.plot(x, y2, "-o")

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Pandas reverse of diff() - python

I have calculated the differences between consecutive values in a series, but I cannot reverse / undifference them using diffinv(): ds_sqrt = np.sqrt(ds) ds_sqrt = pd.DataFrame(ds_sqrt) ds_diff = ds_sqrt.diff().values How can I undifference this?

Reverse diff in one line with pandas import pandas as pd df = pd.DataFrame([10, 15, 14, 18], columns = ['Age']) df['Age_diff'] = df.Age.diff() df['reverse_diff'] = df['Age'].shift(1) + df['Age_diff'] print(df) Age Age_diff reverse_diff 0 10 NaN NaN 1 15 5.0 15.0 2 14 -1.0 14.0 3 18 4.0 18.0

arb = pd.DataFrame({'a': [1, 4, 9, 16, 25, 36]}) (-1)*arb['a'].diff(periods=-1) Output: 3.0 5.0 7.0 9.0 11.0 NaN Name: a, dtype: float64

Related

Annotate a normalized barchart with original data

Plotting: qcut then groupby two variables

Plot line on secondary axis with stacked bar chart - matplotlib

How can I create a boxplot on only positive values in Seaborn?

How to substract 2 columns of unaligned data with Python/Pandas?

Categories

Resources