Add probability of x and conversion % - python

This is what the data currently looks like:
id testers_time stage_1_to_2_time activated_time stage_2_to_3_time engaged_time
a 10 30 40 30 70
b 30
c 15 30 45
d
# Sample data matching the table above; None marks a blank cell.
# Named `data` rather than `dict` so the builtin dict() stays usable.
data = {
    'id': ['a', 'b', 'c', 'd'],
    'testers_time': [10, 30, 15, None],
    'stage_1_to_2_time': [30, None, 30, None],
    'activated_time': [40, None, 45, None],
    'stage_2_to_3_time': [30, None, None, None],
    'engaged_time': [70, None, None, None],
}
df = pd.DataFrame(
    data,
    columns=['id', 'testers_time', 'stage_1_to_2_time', 'activated_time',
             'stage_2_to_3_time', 'engaged_time'],
)
I have a plot of testers_time against its cumulative probability from a CDF:
def ecdf(df):
    """Return the empirical CDF of a one-dimensional sample.

    `df` is any sequence accepted by len() and np.sort(). The result is a
    pair (x, y): x holds the values sorted ascending and y holds k/n for
    k = 1..n, where n is the number of values.
    """
    sample_size = len(df)
    ordered = np.sort(df)
    # Step heights 1/n, 2/n, ..., 1 paired position-wise with the sorted values.
    cumulative = np.arange(1.0, sample_size + 1) / sample_size
    return ordered, cumulative
# Work on a separate Series so the DataFrame `df` stays intact and new
# columns can still be added to it afterwards.
testers = df['testers_time'].dropna().sort_values()
print(testers)

# Plot the empirical CDF as unconnected points.
x, y = ecdf(testers)
plt.plot(x, y, marker='.', linestyle='none')

# Mark the mean with a dashed vertical line and annotate it with its
# (value, percentile) pair.
mean_time = x.mean()
plt.axvline(mean_time, color='gray', linestyle='dashed', linewidth=2)
x_m = int(mean_time)
y_m = stats.percentileofscore(testers, mean_time) / 100.0
plt.annotate('(%s,%s)' % (x_m, int(y_m * 100)), xy=(x_m, y_m),
             xytext=(10, -5), textcoords='offset points')

# Overlay the quartiles as red diamonds.
percentiles = np.array([0, 25, 50, 75, 100])
x_p = np.percentile(testers, percentiles)
y_p = percentiles / 100.0
plt.plot(x_p, y_p, marker='D', color='red', linestyle='none')

# Separate loop names so the ECDF arrays x and y are not overwritten.
for q_x, q_y in zip(x_p, y_p):
    plt.annotate('%s' % int(q_x), xy=(q_x, q_y), xytext=(10, -5),
                 textcoords='offset points')
What I am trying to do is graph testers_time against:
1) Its non-cumulative probability; if graphed, it should look like a sort of PDF.
2) Its cumulative conversion %, where a conversion is any id that has a populated (not blank or null) testers_time. So id a (1 of 4 ids) converts, that's 25%; id b converts, that's 50% (since it is cumulative); id c converts, that's 75%; and id d doesn't convert, so 75% conversion is the maximum, reached at 30 days of testers_time.
Can you assist with adding the above into columns in the df, or graph them? Thank you.

A1: df['prob'] = df['testers_time'].map(df.testers_time.value_counts(normalize=True))
A2: df['conv'] = df['testers_time'].rank(ascending=1)/len(df)

Related

How to create a scatter plot with linear trend from multiple data sets in an array

I'd like to create a scatter plot with 'age' and 'income' as the x and y axis from this data, whilst also separating by gender (m or f):
vals = [[39, 50907.00500830538, 'm'], [71, 58137.09607273632, 'm'], [27, 44311.25956375814, 'f'], [50, 53194.40398297405, 'f'], [41, 48227.6226667045, 'f'], [38, 51081.77610221989, 'f'], [25, 49202.743772155154, 'f'], [45, 46958.227355122865, 'm'], [46, 54815.07514726054, 'm'], [25, 46734.0863416376, 'f'], [44, 52252.36769285552, 'm'], [70, 58453.80544624214, 'f']]
This is the code I currently have:
# Pull the first two fields (age, income) of every row in vals.
ages = [x[0] for x in vals]
incomes = [x[1] for x in vals]
fig, ax = plt.subplots()
# NOTE(review): vals stores the gender as 'm' / 'f', so comparing against
# 'male' selects no rows and the scatter below is empty.
male_data = [(a,i) for a,i,g in vals if g == 'male']
male_ages = [a for a,i in male_data]
male_incomes = [i for a,i in male_data]
ax.scatter(male_ages, male_incomes, color='blue', label='male')
# NOTE(review): same mismatch here -- 'female' never equals 'f'.
female_data = [(a,i) for a,i,g in vals if g == 'female']
female_ages = [a for a,i in female_data]
female_incomes = [i for a,i in female_data]
ax.scatter(female_ages, female_incomes, color='red', label='female')
# NOTE(review): x and y are not assigned anywhere in this snippet;
# presumably ages and incomes were intended -- confirm.
z = np.polyfit(x, y, 1)
ax.legend()
ax.set_xlabel('age')
ax.set_ylabel('income')
I'm also trying to use this code to create a linear trend however I was unsuccessful:
p = np.poly1d(z)
You were checking for the wrong string. The data labeled it as "f" or "m" and you were checking for the full word.
# The gender field holds 'f' / 'm', so filter on those exact labels.
female_data = [(a, i) for a, i, g in vals if g == 'f']
male_data = [(a, i) for a, i, g in vals if g == 'm']

# Fit a degree-1 polynomial to income as a function of age. polyfit returns
# the coefficients from the highest power down: z[0] is the slope and z[1]
# the intercept.
z = np.polyfit(ages, incomes, 1)

# Wrap the coefficients in a callable polynomial.
f = np.poly1d(z)

# Evaluate the fitted line on an evenly spaced range of ages.
x = np.linspace(20, 80, 10)
y = f(x)

# Plot the line.
ax.plot(x, y)
ax.set_xlabel('age')
ax.set_ylabel('income')
plt.show()

adjust_text: set label distance to a line

I have the following dataframe:
d = {'a': [2, 3, 4.5], 'b': [3, 2, 5]}
df = pd.DataFrame(data=d, index=["val1", "val2","val3"])
df.head()
a b
val1 2.0 3
val2 3.0 2
val3 4.5 5
I plotted this dataframe with the following code:
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(df["a"], df["b"], s=1)

# Red diagonal from the origin; it extends far past the axis limits set below.
x1 = [0, 2512]
y1 = [0, 2512]
ax.plot(x1, y1, 'r-')

# Set limits.
ax = plt.gca()
ax.set_xlim([0, 10])
ax.set_ylim([0, 10])

# One text label per row, placed at the row's (a, b) position and showing
# the row's index value.
TEXTS = [
    ax.text(df["a"].iloc[row], df["b"].iloc[row], label, fontsize=12)
    for row, label in enumerate(df.index.values)
]

# Adjust text position and add connector lines.
adjust_text(
    TEXTS,
    expand_points=(2.5, 2.5),
    expand_text=(2.5, 2),
    autoalign="xy",
    arrowprops=dict(arrowstyle="-", lw=1),
    ax=ax,
)
However, I cannot find a way to push the labels away from the red diagonal line in order to get this result:
You can use the regular matplotlib annotate function and change the direction of the offset depending on the position of the data point relative to the red line:
ax = df.plot.scatter('a', 'b')
ax.set_aspect(1)
ax.plot((0, 10), (0, 10), 'r-')

# Base displacement for a label: one unit left and one unit up.
offset = np.array([-1, 1])
for label, row in df.iterrows():
    point = row.to_numpy()
    # Points above the diagonal (b > a) keep the up-left offset; all other
    # points get the mirrored down-right offset.
    if point[1] > point[0]:
        sign = 1
    else:
        sign = -1
    text_position = point + sign * offset
    ax.annotate(label, point, text_position, ha='center', va='center',
                arrowprops=dict(arrowstyle='-', lw=1))

Plotting a quadrant chart to differentiate a population into 4 groups based on the mean values of X & Y, and finding the count in each group

I started learning how to plot data in Python and I need help achieving the following:
I have the following example df6:
df6 = pd.DataFrame({
'emails': [50, 60 ,30, 40, 90, 10, 0,85 ],
'delivered': [20, 16 ,6, 15, 66, 6, 0,55 ]
})
df6
Looks like:
emails delivered
0 50 20
1 60 16
2 30 6
3 40 15
4 90 66
5 10 6
6 0 0
7 85 55
I need to plot emails vs. delivered in a 4-quadrant chart. The X and Y ranges should extend slightly beyond the maximum values, and the quadrant lines should cross at the means of both columns.
What I did so far, used describe() to get the values of the df6 then:
# Create a 7x5 inch figure and hide the top and right spines.
fig, ax = plt.subplots()
fig.set_size_inches(7, 5)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
# Dashed quadrant lines at hard-coded values: y=45.6 on the emails axis and
# x=23 on the delivered axis.
plt.axhline(y=45.6, color="black", linestyle="--")
plt.axvline(x=23, color="black", linestyle="--")
# delivered goes on the x axis, emails on the y axis.
plt.plot(df6['delivered'],df6['emails'],"o")
# Each axis runs from 0 to 20 past its column maximum.
plt.xlim([0, df6['delivered'].max()+20])
plt.ylim([0, df6['emails'].max()+20])
plt.show()
I got the following output so far:
What I am looking for is a chart whose points are split into just 4 groups, with each group labelled with the total count of points in its quadrant:
I found it easier to normalize the data before plotting... UPDATE: Messed something up with counts, but the code is here to analyze my mistake.
from sklearn.preprocessing import StandardScaler

# Standardize both columns of df6 so the quadrant boundaries can be drawn
# at 0 on each axis.
scaler = StandardScaler()
scale = scaler.fit(df6)
norm_df = pd.DataFrame(scale.transform(df6), columns=df6.columns)

# Count the points in each quadrant. The plot puts delivered on x and
# emails on y. Strict comparisons mean a point lying exactly on a boundary
# is not counted anywhere.
emails = norm_df['emails']
delivered = norm_df['delivered']
quadrant_1 = int(np.logical_and(emails < 0, delivered < 0).sum())  # bottom-left
display(quadrant_1)
quadrant_2 = int(np.logical_and(emails > 0, delivered < 0).sum())  # top-left
display(quadrant_2)
quadrant_3 = int(np.logical_and(emails < 0, delivered > 0).sum())  # bottom-right
display(quadrant_3)
quadrant_4 = int(np.logical_and(emails > 0, delivered > 0).sum())  # top-right
display(quadrant_4)

fig, ax = plt.subplots()
fig.set_size_inches(7, 5)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.axhline(y=0, color="black", linestyle="--")
plt.axvline(x=0, color="black", linestyle="--")
plt.plot(norm_df['delivered'], norm_df['emails'], "o")

# Hide the remaining spines and both axes; the axis names are drawn as
# plain text instead.
plt.gca().spines['bottom'].set_visible(False)
plt.gca().spines['left'].set_visible(False)
plt.gca().axes.get_xaxis().set_visible(False)
plt.gca().axes.get_yaxis().set_visible(False)
plt.text(0, -2.1, 'Delivered', horizontalalignment='center', verticalalignment='center')
plt.text(-2.1, 0, 'Emails', horizontalalignment='center', verticalalignment='center', rotation=90)

# Draw every count inside the quadrant it was computed for
# (x = delivered sign, y = emails sign).
plt.text(-1, -1, 'Count: ' + str(quadrant_1), horizontalalignment='center', verticalalignment='center')
plt.text(-1, 1, 'Count: ' + str(quadrant_2), horizontalalignment='center', verticalalignment='center')
plt.text(1, -1, 'Count: ' + str(quadrant_3), horizontalalignment='center', verticalalignment='center')
plt.text(1, 1, 'Count: ' + str(quadrant_4), horizontalalignment='center', verticalalignment='center')
plt.xlim([-2, 2])
plt.ylim([-2, 2])
plt.show()
So to use the means in your plots you can start by simply modifying these 2 lines:
plt.axhline(y=df6['emails'].mean(), color="black", linestyle="--")
plt.axvline(x=df6['delivered'].mean(), color="black", linestyle="--")
We can then use DataFrame.value_counts to compute the counts:
counts = df6.transform(lambda s: s >= s.mean()).value_counts()
pos = df6.agg(['min', 'max'])
Here counts contains the values of each pair of above/below means:
emails delivered
False False 4
True False 2
True 2
and pos contains the x/y (or email/delivered) coordinates at which the boxes are placed:
emails delivered
min 0 0
max 90 66
So you can adjust pos to change the annotation placement.
Finally you want to do the annotation on the figure:
# counts is indexed by (emails >= mean, delivered >= mean) pairs.
# Series.iteritems() was removed in pandas 2.0; items() yields the same
# (index, value) pairs.
for (eml, dlv), num in counts.items():
    # Put each label in the corner of its quadrant: the 'max' corner when
    # the flag is True, the 'min' corner otherwise, aligned so the text
    # stays inside the data range.
    ax.text(
        s=f'count: {num}',
        x=pos.loc['max' if dlv else 'min', 'delivered'],
        y=pos.loc['max' if eml else 'min', 'emails'],
        ha='right' if dlv else 'left',
        va='top' if eml else 'bottom',
    )
You are just missing the code for setting the position of the left and bottom spines:
import pandas as pd, numpy as np
df6 = pd.DataFrame({'emails': [50, 60, 30, 40, 90, 10, 0, 85],
                    'delivered': [20, 16, 6, 15, 66, 6, 0, 55]})
plt.plot(df6['delivered'], df6['emails'], "o")

# Count the points in the upper-left quadrant, where the annotation below
# is drawn: emails above its own mean and delivered below its own mean.
# Each column must be compared against its own mean, not the other one's.
count = np.count_nonzero(
    (df6['emails'] > df6['emails'].mean())
    & (df6['delivered'] < df6['delivered'].mean())
)
plt.annotate('count: %s' % count, (5, 60))

# Hide the top/right spines and move the left/bottom spines to the column
# means so they act as the quadrant axes.
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['left'].set_position(('data', df6['delivered'].mean()))
plt.gca().spines['bottom'].set_position(('data', df6['emails'].mean()))
Here's another solution, with a more symmetric looking plot:
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(
{
"emails": [50, 60, 30, 40, 90, 10, 0, 85],
"delivered": [20, 16, 6, 15, 66, 6, 0, 55],
}
)
plt.plot(df["delivered"], df["emails"], "o")

# Hide the top/right spines and cross the remaining two at the column means.
axes = plt.gca()
for side in ("top", "right"):
    axes.spines[side].set_visible(False)
axes.spines["left"].set_position(("data", df["delivered"].mean()))
axes.spines["bottom"].set_position(("data", df["emails"].mean()))
def get_lims(df, column, w=0.1):
    """Return [low, high] axis limits centred on the mean of df[column].

    The limits reach as far as the value furthest from the mean, on both
    sides, plus an extra margin of `w` times that distance.
    """
    center = df[column].mean()
    highest = df[column].max()
    lowest = df[column].min()
    # Largest distance from the mean to either extreme of the column.
    reach = max(abs(highest - center), abs(lowest - center))
    margin = reach * w
    return [center - reach - margin, center + reach + margin]
plt.xlim(get_lims(df, "delivered"))
plt.ylim(get_lims(df, "emails"))
plt.show()

Heatmap with multiple colormaps by column

I have a dataframe where each column contains values considered "normal" if they fall within an interval, which is different for every column:
# The main df
df = pd.DataFrame({"A": [20, 10, 7, 39],
"B": [1, 8, 12, 9],
"C": [780, 800, 1200, 250]})
The df_info represents the intervals for each column of df.
So for example df_info["A"][0] is the min for the column df["A"] and df_info["A"][1] represents the max for the column df["A"] and so on.
df_info = pd.DataFrame({"A": [22, 35],
"B": [5, 10],
"C": [850, 900]})
Thanks to this SO answer I was able to create a custom heatmap that prints values below the range in blue, values above the range in red and values within the range in white. Just remember that each column has a different range, so I normalized according to this:
df_norm = pd.DataFrame()
for col in df:
col_min = df_info[col][0]
col_max = df_info[col][1]
df_norm[col] = (df[col] - col_min) / (col_max - col_min)
And finally printed my heatmap
# Global extremes of the normalized frame; 0 and 1 in df_norm mark the
# lower and upper bound of each column's interval.
vmin = df_norm.min().min()
vmax = df_norm.max().max()

# Positions of 0 and 1 on the colormap's 0..1 scale.
norm_zero = (0 - vmin) / (vmax - vmin)
norm_one = (1 - vmin) / (vmax - vmin)

# Blue below the interval, white inside it, red above it.
colors = [[0, 'darkblue'],
          [norm_zero, 'white'],
          [norm_one, 'white'],
          [1, 'darkred']]
cmap = LinearSegmentedColormap.from_list('', colors)

fig, ax = plt.subplots()
# Plot the normalized frame built above (the original passed an undefined
# name `data`). The heatmap call is kept separate from set_facecolor so
# `ax` still refers to the Axes afterwards instead of None.
sns.heatmap(data=df_norm,
            annot=True,
            annot_kws={'size': 'large'},
            mask=None,
            cmap=cmap,
            vmin=vmin,
            vmax=vmax,
            ax=ax)
ax.set_facecolor('white')
In the example you can see that the third column has values much higher/lower than the 0-1 interval (and than the first column), so they "absorb" all the shades of red and blue.
QUESTION:
What I want to obtain is use the entire shades of red/blue for each column or at least to reduce the perceptual difference between (for example) the first and third column.
I had thought of:
create a custom colormap where each colormap normalization is performed by column
use multiple colormaps, each one applied to a different column
applying a colormap mpl.colors.LogNorm but I'm not sure how to use it with my custom LinearSegmentedColormap
Using a mask per column, you could draw the heatmap column per column, each with its own colormap:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.cm import ScalarMappable
df = pd.DataFrame({"A": [20, 10, 7, 39],
"B": [1, 8, 12, 9],
"C": [780, 800, 1200, 250]})
df_info = pd.DataFrame({"A": [22, 35],
"B": [5, 10],
"C": [850, 900]})
# Rescale every column so its interval from df_info maps onto 0..1
# (row 0 of df_info is the lower bound, row 1 the upper bound).
df_norm = pd.DataFrame()
for name in df:
    lower = df_info[name][0]
    upper = df_info[name][1]
    df_norm[name] = (df[name] - lower) / (upper - lower)

fig, ax = plt.subplots()

# Draw the heatmap once per column, each time with a colormap scaled to
# that column and every other column masked out.
for name in df:
    normalized = df_norm[name]
    vmin = normalized.min()
    vmax = normalized.max()
    span = vmax - vmin
    # Positions of 0 and 1 on this column's 0..1 colormap scale.
    norm_zero = (0 - vmin) / span
    norm_one = (1 - vmin) / span
    colors = [[0, 'darkblue'],
              [norm_zero, 'white'],
              [norm_one, 'white'],
              [1, 'darkred']]
    cmap = LinearSegmentedColormap.from_list('', colors)
    # True hides a cell: mask every column except the current one.
    mask = pd.DataFrame({other: other != name for other in df}, index=df.index)
    sns.heatmap(data=df_norm,
                annot=df.to_numpy(), annot_kws={'size': 'large'}, fmt="g",
                mask=mask,
                cmap=cmap, vmin=vmin, vmax=vmax, cbar=False, ax=ax)
ax.set_facecolor('white')

# A single schematic colorbar stands in for all the per-column colormaps.
colors = [[0, 'darkblue'],
          [1 / 3, 'white'],
          [2 / 3, 'white'],
          [1, 'darkred']]
cmap = LinearSegmentedColormap.from_list('', colors)
cbar = plt.colorbar(ScalarMappable(cmap=cmap), ax=ax, ticks=[0, 1 / 3, 2 / 3, 1])
cbar.ax.yaxis.set_ticklabels(['min\nlimit', 'min', 'max', 'max\nlimit'])
plt.tight_layout()
plt.show()
You can re-scale your df_norm before plotting:
# Alternative way to normalize: map each column's interval from df_info
# (row 0 = lower bound, row 1 = upper bound) onto 0..1 in one expression.
df_norm = (df - df_info.iloc[0]) / (df_info.iloc[1] - df_info.iloc[0])

# Min-max scale every column of the normalized frame to 0..1 so that each
# column uses the full colormap.
df_plot = (df_norm - df_norm.min()) / (df_norm.max() - df_norm.min())

# Colour by df_plot but annotate with the values in df_norm. The colour bar
# is removed because the colour scale differs per column.
sns.heatmap(df_plot, annot=df_norm, cmap='RdBu_r', cbar=False)
Output:

How to generate a bar chart of occurrences per year in matplotlib python?

I have list of dates and I want to generate a bar chart with matplotlib in python.
2007-05-06
2007-05-11
2007-06-01
2007-06-04
2007-06-06
2007-09-01
2007-10-06
2007-11-06
2007-11-07
…
And I want to produce these two types of bar chart:
I can use this code but I'm searching for more efficient code because as you can see I have years between 2007 and 2012 and sometimes this range can be wider
def plot():
    """Draw a stacked bar chart of three hard-coded samples.

    Each sample is a list of integers in 1..N. Every value gets its own
    bin, and the three per-bin counts are stacked on top of each other.
    """
    # --- the three samples ---
    samples1 = [1, 1, 1, 3, 2, 5, 1, 10, 10, 8]
    samples2 = [6, 6, 6, 1, 2, 3, 9, 12]
    samples3 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 12]
    N = 12  # number of bins

    # --- count occurrences; value v goes in bin v-1 ---
    # bincount tallies value v at index v, so index 0 is dropped to leave
    # exactly N counts for the values 1..N.
    hist1 = np.bincount(samples1, minlength=N + 1)[1:N + 1]
    hist2 = np.bincount(samples2, minlength=N + 1)[1:N + 1]
    hist3 = np.bincount(samples3, minlength=N + 1)[1:N + 1]
    totals = hist1 + hist2 + hist3

    # --- display the bar-graph ---
    # Bars are centred on 1..N so they line up with the tick marks and fill
    # the axis range set below.
    width = 1
    centers = np.arange(1, N + 1)
    p1 = plt.bar(centers, hist1, width, color='#9932cc', align='center')
    p2 = plt.bar(centers, hist2, width, color='#ffa500', bottom=hist1,
                 align='center')
    p3 = plt.bar(centers, hist3, width, color='#d2691e',
                 bottom=hist1 + hist2, align='center')
    plt.legend((p1[0], p2[0], p3[0]), ('hist1', 'hist2', 'hist3'))
    plt.xlabel('Bins')
    plt.ylabel('Count')
    plt.xticks(centers)
    plt.axis([width / 2.0, N + width / 2.0, 0, totals.max()])
    plt.show()
Can you help me to generate this kind of chart !?
Thank you
You can use numpy histogram to get the data in bar format directly, which should be faster than looping in python. As a minimal example based on your data above,
import numpy as np
import matplotlib.pyplot as plt
import datetime

# --- the three samples (integers in 1..N) ---
samples1 = [1, 1, 1, 3, 2, 5, 1, 10, 10, 8]
samples2 = [6, 6, 6, 1, 2, 3, 9, 12]
samples3 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 12]
N = 12  # number of bins

# --- create the histograms; value v goes in bin v-1 ---
# An explicit range gives every integer 1..N its own unit-wide bin. Without
# it np.histogram spreads the N bins over each sample's own min..max, so
# the bins would no longer correspond to the values 1..N.
bin_range = (0.5, N + 0.5)
hist1, n = np.histogram(samples1, bins=N, range=bin_range)
hist2, n = np.histogram(samples2, bins=N, range=bin_range)
hist3, n = np.histogram(samples3, bins=N, range=bin_range)

# --- display the bar-graph ---
# Bars are centred on 1..N so they line up with the month tick labels and
# fill the axis range set below.
width = 1
centers = np.arange(1, N + 1)
p1 = plt.bar(centers, hist1, width, color='#9932cc', align='center')
p2 = plt.bar(centers, hist2, width, color='#ffa500', bottom=hist1,
             align='center')
p3 = plt.bar(centers, hist3, width, color='#d2691e', bottom=hist1 + hist2,
             align='center')
plt.legend((p1[0], p2[0], p3[0]), ('2010', '2011', '2012'))
plt.xlabel('Bins')
plt.ylabel('Count')

# Three-letter month names used as the tick labels for bins 1..12.
months = [((datetime.date(2010, i, 1).strftime('%B'))[:3]) for i in range(1, 13)]
plt.xticks(centers, months)
plt.axis([width / 2.0, N + width / 2.0, 0, max(hist1 + hist2 + hist3)])
plt.show()
The two plots are generated in a very similar way, so I'll do the first one only. You need to loop over the months, and to get a stacked bar-chart set the bottom of each month's bar to the cumulative sum of the previous months' values for each year:
import numpy as np
import matplotlib.pyplot as plt
months = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')

# Some random data for nyears from minyear: one row per year, one column
# per month.
nyears = 8
nmonths = len(months)
minyear = 2005
monthly_counts = np.random.randint(low=2, high=15, size=(nyears, nmonths))

fig, ax = plt.subplots()
ind = np.arange(nyears)
width = 0.45

# Random colors for the months: one RGB triple per row, so c[imonth] is a
# plain length-3 colour.
c = np.random.rand(nmonths, 3)

p = []
for imonth in range(nmonths):
    # Stack this month's bars on top of the sum of all earlier months.
    p.append(ax.bar(ind, monthly_counts[:, imonth], width,
                    bottom=np.sum(monthly_counts[:, :imonth], axis=1),
                    color=c[imonth], alpha=0.8, align='center'))

# Set x axis ticks and labels; the bars are centred on ind, so the ticks
# sit there as well.
ax.set_xticks(ind)
ax.set_xticklabels([str(minyear + i) for i in ind])

# Locate legend outside axes plot area: shrink the axes to 80% of their
# width and anchor the legend to the right of them.
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
ax.legend([pl[0] for pl in p], months, loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

Categories

Resources