Calculating slope for a series trendline in pandas - Python

Is there an idiomatic way of getting the slope for linear trend line fitting values in a DataFrame column? The data is indexed with DateTime index.

This should do it:
import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.rand(100, 5), pd.date_range('2012-01-01', periods=100))
def trend(df):
    df = df.copy().sort_index()
    dates = df.index.to_julian_date().values[:, None]
    # Design matrix: an intercept column plus the Julian dates
    x = np.concatenate([np.ones_like(dates), dates], axis=1)
    y = df.values
    # Ordinary least squares via the normal equations, one fit per column
    return pd.DataFrame(np.linalg.pinv(x.T.dot(x)).dot(x.T).dot(y).T,
                        df.columns, ['Constant', 'Trend'])
trend(df)
Using the same df above for its index:
df_sample = pd.DataFrame((df.index.to_julian_date() * 10 + 2) + np.random.rand(100) * 1e3,
                         df.index)
coef = trend(df_sample)
df_sample['trend'] = (coef.iloc[0, 1] * df_sample.index.to_julian_date() + coef.iloc[0, 0])
df_sample.plot(style=['.', '-'])
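For a single column, np.polyfit gives an equivalent slope and intercept and can serve as a sanity check (a minimal sketch, assuming the same Julian-date x-axis as above):
jd = df_sample.index.to_julian_date()
slope, intercept = np.polyfit(jd, df_sample[0].values, 1)  # degree-1 (linear) fit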

Related

Struggling to graph a Beta Distribution using Python

Given some measures, I am trying to create a beta distribution. Given a max, min, mean, and also an alpha and beta, how do I call beta.ppf or beta.pdf to generate a proper data set?
Working Sample
https://www.kaggle.com/iancoetzer/betaworking
Broken Sample
https://www.kaggle.com/iancoetzer/betaproblem
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import beta
#
# Set the shape parameters
#
a = 2.8754
b = 3.0300
minv = 82.292
maxv = 129.871
mean = 105.46
#
# Generate the value between
#
x = np.linspace(beta.ppf(minv, a, b),beta.ppf(maxv, a, b), 100)
#
# Plot the beta distribution
#
plt.figure(figsize=(7,7))
plt.xlim(0.7, 1)
plt.plot(x, beta.pdf(x, a, b), 'r-')
plt.title('Beta Distribution', fontsize='15')
plt.xlabel('Values of Random Variable X (0, 1)', fontsize='15')
plt.ylabel('Probability', fontsize='15')
plt.show()
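Note that beta.ppf expects a probability in [0, 1], so passing the raw bounds (e.g. minv = 82.292) yields nan. A hedged sketch of mapping the distribution onto [minv, maxv] via loc and scale (an assumption about the intent, not the asker's original code):
# ppf takes probabilities; loc/scale shift and stretch the standard beta onto [minv, maxv]
x = np.linspace(beta.ppf(0.01, a, b, loc=minv, scale=maxv - minv),
                beta.ppf(0.99, a, b, loc=minv, scale=maxv - minv), 100)
plt.plot(x, beta.pdf(x, a, b, loc=minv, scale=maxv - minv), 'r-')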
We managed to code a simple solution to compute and plot the beta distribution, as follows: see the red beta curve.
Now we are trying to plot a Weibull distribution ...
#import libraries
import pandas as pd, numpy as np, gc, time, os, uuid, math, datetime
from joblib import Parallel, delayed
from numpy.random import default_rng
from scipy.stats import beta
from scipy import special
from scipy.stats import exponweib
import matplotlib.pyplot as plt
#sample parameters
low, high, mean, a, b, trials = 82.292, 129.871, 105.46, 2.8754, 3.0300, 10000
scale = (high-low)/6
#normal
normal_arr = np.random.normal(loc=mean, scale=scale, size=trials)
#triangular
triangular_arr = np.random.triangular(left=low, mode=mean, right=high, size=trials)
#log normal
mu = math.log(math.pow(mean,2) / math.sqrt(math.pow(scale,2) + math.pow(mean,2)))
sigma = math.sqrt(math.log(math.pow(scale,2)/(math.pow(mean,2)) + 1))
lognorm_arr = np.random.lognormal(mean=mu, sigma=sigma, size=trials)
#beta
beta_x = np.linspace(beta.ppf(0.0, a, b),beta.ppf(1, a, b), trials)
#by = beta.pdf(bx, a, b)
beta_arr = beta.ppf(beta_x, a, b, loc=low, scale=high - low)
#define binning(arr) method:
def binning(arr):
    df = pd.DataFrame(arr)
    df["Trial"] = range(1, len(df) + 1)
    df[0] = df[0].astype(float)
    df.rename(columns = {0: "Result"}, inplace=True)
    minval = df["Result"].min()
    maxval = df["Result"].max()
    binCount = 100
    bins = np.linspace(minval, maxval, binCount + 1)
    labels = np.arange(1, binCount + 1)
    df["bins"] = pd.cut(df["Result"], bins = bins, labels = labels, include_lowest = True)
    dfBin = df.groupby(["bins"])["Result"].mean()
    dfCount = df.groupby(["bins"])["Result"].count()
    dfBin.replace(np.nan, 0.0, inplace=True)
    dfCount.replace(np.nan, 0, inplace=True)
    dfCount = pd.DataFrame(dfCount)
    dfBin = pd.DataFrame(dfBin)
    dfBin["bin"] = range(1, len(dfBin) + 1)
    dfBin["Result"] = dfBin["Result"].astype(float)
    df = pd.merge(dfBin, dfCount, left_index=True, right_index=True)
    # Rename the resulting columns
    df.rename(columns = {'Result_x':'Mean'}, inplace = True)
    df.rename(columns = {'Result_y':'Trials'}, inplace = True)
    return df
dfNormal = binning(normal_arr)
dfLog = binning(lognorm_arr)
dfTriangular = binning(triangular_arr)
dfBeta = binning(beta_arr)
dfWeibull = binning(wei_arr)
dfNormal.drop(dfNormal[dfNormal["Mean"] == 0].index, inplace=True)
dfLog.drop(dfLog[dfLog["Mean"] == 0].index, inplace=True)
dfTriangular.drop(dfTriangular[dfTriangular["Mean"] == 0].index, inplace=True)
dfBeta.drop(dfBeta[dfBeta["Mean"] == 0].index, inplace=True)
dfWeibull.drop(dfWeibull[dfWeibull["Mean"] == 0].index, inplace=True)
plt.plot(dfNormal["Mean"], dfNormal["Trials"], label="Normal")
plt.plot(dfLog["Mean"], dfLog["Trials"], label="Lognormal")
plt.plot(dfTriangular["Mean"], dfTriangular["Trials"], label="Triangular")
plt.plot(dfBeta["Mean"], dfBeta["Trials"], label="Beta")
plt.plot(dfWeibull["Mean"], dfWeibull["Trials"], label="Weibull")
plt.legend(loc='upper right')
plt.xlabel("R amount")
plt.ylabel("# Trials")
#plt.xlim(low, high)
plt.show()
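Note that wei_arr is never defined in the snippet above, which is what breaks the Weibull plot. A minimal sketch of one way to generate it on a comparable scale (the shape parameter 2.0 is a placeholder assumption, not fitted to the data):
shape = 2.0  # placeholder assumption; fit this to your data as needed
wei_arr = low + scale * np.random.weibull(shape, size=trials)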

Python's `.loc` is really slow on selecting subsets of Data

I have a large multi-indexed (y,t) single-valued DataFrame df. Currently, I'm selecting a subset via df.loc[(Y,T), :] and creating a dictionary out of it. The following MWE works, but the selection is very slow for large subsets.
import numpy as np
import pandas as pd
# Full DataFrame
y_max = 50
Y_max = range(1, y_max+1)
t_max = 100
T_max = range(1, t_max+1)
idx_max = tuple((y,t) for y in Y_max for t in T_max)
df = pd.DataFrame(np.random.sample(y_max*t_max), index=idx_max, columns=['Value'])
# Create Dictionary of Subset of Data
y1 = 4
yN = 10
Y = range(y1, yN+1)
t1 = 5
tN = 9
T = range(t1, tN+1)
idx_sub = tuple((y,t) for y in Y for t in T)
data_sub = df.loc[(Y,T), :] #This is really slow
dict_sub = dict(zip(idx_sub, data_sub['Value']))
# result, e.g. (y,t) = (5,7)
dict_sub[5,7] == df.loc[(5,7), 'Value']
I was thinking of using df.loc[(y1,t1):(yN,tN), :], but it does not work properly, as the second index is only bounded in the final year yN.
One idea is to use Index.isin with itertools.product in boolean indexing:
from itertools import product
idx_sub = tuple(product(Y, T))
dict_sub = df.loc[df.index.isin(idx_sub),'Value'].to_dict()
print (dict_sub)
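If the index is built as a proper MultiIndex (e.g. with pd.MultiIndex.from_tuples), label-based slicing with pd.IndexSlice is another option and avoids materializing the tuple list (a sketch under that assumption):
df_mi = pd.DataFrame(np.random.sample(y_max*t_max),
                     index=pd.MultiIndex.from_tuples(idx_max), columns=['Value'])
# Slice both levels at once; keys of the resulting dict are (y, t) tuples
dict_sub = df_mi.loc[pd.IndexSlice[y1:yN, t1:tN], 'Value'].to_dict()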

heatmap of values grouped by time - seaborn

I'm plotting the counts of a variable grouped by time as a heatmap. However, when including both hour and minute, the counts are quite low so the resulting heatmap doesn't really provide any real insight. Is it possible to group the counts in a bigger block of time? I'm hoping to test some different periods (5, 10 mins).
I'm also hoping to plot time on the x-axis. Similar to the output attached.
import seaborn as sns
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
start = datetime(1900,1,1,10,0,0)
end = datetime(1900,1,1,13,0,0)
seconds = (end - start).total_seconds()
step = timedelta(minutes = 1)
array = []
for i in range(0, int(seconds), int(step.total_seconds())):
    array.append(start + timedelta(seconds=i))
array = [i.strftime('%Y-%m-%d %H:%M:%S') for i in array]
df2 = pd.DataFrame(array).rename(columns = {0:'Time'})
df2['Count'] = np.random.uniform(0.0, 0.5, size = len(df2))
df2['Count'] = df2['Count'].round(1)
df2['Time'] = pd.to_datetime(df2['Time'])
df2['Hour'] = df2['Time'].dt.hour
df2['Min'] = df2['Time'].dt.minute
g = df2.groupby(['Hour','Min','Count'])
count_df = g['Count'].nunique().unstack()
count_df.fillna(0, inplace = True)
sns.heatmap(count_df)
To deal with such cases, I think the easiest approach is to downsample the data; it also makes it easy to change the time threshold. The axis labels in the output graph will need to be modified, but I recommend this method.
import seaborn as sns
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
start = datetime(1900,1,1,10,0,0)
end = datetime(1900,1,1,13,0,0)
seconds = (end - start).total_seconds()
step = timedelta(minutes = 1)
array = []
for i in range(0, int(seconds), int(step.total_seconds())):
    array.append(start + timedelta(seconds=i))
array = [i.strftime('%Y-%m-%d %H:%M:%S') for i in array]
df2 = pd.DataFrame(array).rename(columns = {0:'Time'})
df2['Count'] = np.random.uniform(0.0, 0.5, size = len(df2))
df2['Count'] = df2['Count'].round(1)
df2['Time'] = pd.to_datetime(df2['Time'])
df2['Hour'] = df2['Time'].dt.hour
df2['Min'] = df2['Time'].dt.minute
df2.set_index('Time', inplace=True)
count_df = df2.resample('10min')['Count'].value_counts().unstack()
count_df.fillna(0, inplace = True)
sns.heatmap(count_df.T)
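The resampled index is a DatetimeIndex, so one way to tidy the x-axis labels (a sketch, not part of the original answer) is to pass formatted HH:MM strings to the heatmap:
labels = [t.strftime('%H:%M') for t in count_df.index]
sns.heatmap(count_df.T, xticklabels=labels)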
Another way to achieve this is to create a grouping column of numbers whose elements repeat once per minute in the block.
For example:
minutes = 3
x = [0,1,2]
np.repeat(x, repeats=minutes, axis=0)
# output: array([0, 0, 0, 1, 1, 1, 2, 2, 2])
and then group your data using this column.
So your code would look like:
...
minutes = 5
x = [i for i in range(int(df2.shape[0] / minutes))]
df2['group'] = np.repeat(x, repeats=minutes, axis=0)
g = df2.groupby(['group', 'Count'])
count_df = g['Count'].nunique().unstack()
count_df.fillna(0, inplace = True)

Cluster groups by direction and magnitude - Python

I'm hoping to cluster vectors based on direction and magnitude using Python. I've found limited examples using R, but none for Python. Not to be confused with standard k-means on scatter points: I'm actually trying to cluster whole vectors.
The following takes two sets of xy points to generate a vector. I'm then hoping to cluster these vectors based on the length and direction.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
df = pd.DataFrame(np.random.randint(0,20,size=(100, 4)), columns=list('ABCD'))
plt.rcParams['image.cmap'] = 'Paired'
fig,ax = plt.subplots()
ax.set_xlim(-5, 25)
ax.set_ylim(-5, 25)
A = df['A']
B = df['B']
C = df['C']
D = df['D']
ax.quiver(A, B, (C-A), (D-B), angles = 'xy', scale_units = 'xy', scale = 1, alpha = 0.5)
X_1 = np.array(df[['A','B','C','D']])
model = KMeans(n_clusters = 20)
model.fit(X_1)
cluster_labels = model.predict(X_1)
df['n_cluster'] = cluster_labels
centroids_1 = pd.DataFrame(data = model.cluster_centers_, columns = ['start_x', 'start_y', 'end_x', 'end_y'])
cc = model.cluster_centers_
a = cc[:, 0]
b = cc[:, 1]
c = cc[:, 2]
d = cc[:, 3]
lc1 = ax.quiver(a, b, (c-a), (d-b), angles = 'xy', scale_units = 'xy', scale = 1, alpha = 0.8)
The following figure displays an example
What about this:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import hdbscan
df = pd.DataFrame(np.random.randint(0,20,size=(100, 4)), columns=list('ABCD'))
plt.rcParams['image.cmap'] = 'Paired'
A = df['A'] #X start
B = df['B'] #Y start
C = df['C'] #X arrive
D = df['D'] #Y arrive
clusterer = hdbscan.HDBSCAN()
df['LENGTH'] = np.sqrt(np.square(df.C-df.A) + np.square(df.D-df.B))
df['DIRECTION'] = np.degrees(np.arctan2(df.D-df.B, df.C-df.A))
coords = df[['LENGTH', 'DIRECTION']].values
clusterer.fit_predict(coords)
cluster_labels = clusterer.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.DataFrame(
    [(coords[cluster_labels==n], len(coords[cluster_labels==n])) for n in range(num_clusters)],
    columns=["points", "weight"]
)
colors = {0:"green", 1:"blue", 2:"red", 3:"yellow", 4:"pink"}
df['CLUSTER'] = np.nan
for x, (cluster, weight) in enumerate(clusters[clusters.weight>0].values.tolist()):
    df_this_cluster = pd.DataFrame(cluster, columns=['LENGTH', 'DIRECTION'])
    df_this_cluster['TEMP'] = x
    df = df.merge(df_this_cluster, on=['LENGTH', 'DIRECTION'], how='left')
    ix = df[df.TEMP.notnull()].index
    df.loc[ix, "CLUSTER"] = df.loc[ix, "TEMP"]
    df.drop("TEMP", axis=1, inplace=True)
df['COLOR'] = df['CLUSTER'].map(colors).fillna('black')
fig,ax = plt.subplots()
ax.set_xlim(-5, 25)
ax.set_ylim(-5, 25)
ax.quiver(df.A, df.B, (df.C-df.A), (df.D-df.B), angles='xy', scale_units='xy', scale=1, alpha=0.5, color=df.COLOR)
This clusters on length and direction (the direction transformed to degrees; the small range of radians didn't match the model well on my first try).
I don't think this is a very "cartesian" solution, since the two values being analysed are not on the same scale... But the visual results are not so bad...
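One way to put the two features on a comparable scale before clustering (an addition, not part of the original attempt) is to standardize them, for example with scikit-learn:
from sklearn.preprocessing import StandardScaler

# Zero-mean, unit-variance features so length and direction weigh equally
coords = StandardScaler().fit_transform(df[['LENGTH', 'DIRECTION']].values)
clusterer.fit_predict(coords)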
I also tried another match based on the 4 coordinates, which is more rigorous. But, as one might expect, it clusters the vectors by subareas of the space (when there are any):
coords = df[['A', 'B', 'C', 'D']].values
clusterer.fit_predict(coords)
cluster_labels = clusterer.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.DataFrame(
    [(coords[cluster_labels==n], len(coords[cluster_labels==n])) for n in range(num_clusters)],
    columns=["points", "weight"]
)
colors = {0:"green", 1:"blue", 2:"red", 3:"yellow", 4:"pink"}
df['CLUSTER'] = np.nan
for x, (cluster, weight) in enumerate(clusters[clusters.weight>0].values.tolist()):
    df_this_cluster = pd.DataFrame(cluster, columns=['A', 'B', 'C', 'D'])
    df_this_cluster['TEMP'] = x
    df = df.merge(df_this_cluster, on=['A', 'B', 'C', 'D'], how='left')
    ix = df[df.TEMP.notnull()].index
    df.loc[ix, "CLUSTER"] = df.loc[ix, "TEMP"]
    df.drop("TEMP", axis=1, inplace=True)
df['COLOR'] = df['CLUSTER'].map(colors).fillna('black')
EDIT
I gave it another try, based on the (very good) suggestion that angles are not a good variable given the discontinuity around 0/2π; so I chose to use sines and cosines instead. I also scaled the length (so the 3 variables have matching scales):
So the result would be :
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import robust_scale
import hdbscan
df = pd.DataFrame(np.random.randint(0,20,size=(100, 4)), columns=list('ABCD'))
plt.rcParams['image.cmap'] = 'Paired'
A = df['A'] #X start
B = df['B'] #Y start
C = df['C'] #X arrive
D = df['D'] #Y arrive
clusterer = hdbscan.HDBSCAN()
df['LENGTH'] = robust_scale(np.sqrt(np.square(df.C-df.A) + np.square(df.D-df.B)))
df['DIRECTION'] = np.arctan2(df.D-df.B, df.C-df.A)
df['COS'] = np.cos(df['DIRECTION'])
df['SIN'] = np.sin(df['DIRECTION'])
columns = ['LENGTH', 'COS', 'SIN']
clusterer = hdbscan.HDBSCAN()
values = df[columns].values
clusterer.fit_predict(values)
cluster_labels = clusterer.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.DataFrame(
    [(values[cluster_labels==n], len(values[cluster_labels==n])) for n in range(num_clusters)],
    columns=["points", "weight"]
)
def get_cmap(n, name='hsv'):
    '''
    Returns a function that maps each index in 0, 1, ..., n-1 to a distinct
    RGB color; the keyword argument name must be a standard mpl colormap name.
    Credits to @Ali
    https://stackoverflow.com/questions/14720331/how-to-generate-random-colors-in-matplotlib#answer-25628397
    '''
    return plt.cm.get_cmap(name, n)
cmap = get_cmap(num_clusters+1)
colors = {x:cmap(x) for x in range(num_clusters)}
df['CLUSTER'] = np.nan
for x, (cluster, weight) in enumerate(clusters[clusters.weight>0].values.tolist()):
    df_this_cluster = pd.DataFrame(cluster, columns=columns)
    df_this_cluster['TEMP'] = x
    df = df.merge(df_this_cluster, on=columns, how='left')
    df.reset_index(drop=True, inplace=True)
    ix = df[df.TEMP.notnull()].index
    df.loc[ix, "CLUSTER"] = df.loc[ix, "TEMP"]
    df.drop("TEMP", axis=1, inplace=True)
df['CLUSTER'] = df['CLUSTER'].fillna(num_clusters-1)
df['COLOR'] = df['CLUSTER'].map(colors)
print("Number of clusters : ", num_clusters-1)
nrows = num_clusters//2 if num_clusters%2==0 else num_clusters//2 + 1
fig,axes = plt.subplots(nrows=nrows, ncols=2)
axes = [y for row in axes for y in row]
for k,ax in enumerate(axes):
    ax.set_xlim(-5, 25)
    ax.set_ylim(-5, 25)
    ax.set_aspect('equal', adjustable='box')
    if k+1 < num_clusters:
        ax.set_title(f"CLUSTER #{k+1}", fontsize=10)
    this_df = df[df.CLUSTER==k]
    ax.quiver(
        this_df.A,              #X
        this_df.B,              #Y
        (this_df.C-this_df.A),  #X component of vector
        (this_df.D-this_df.B),  #Y component of vector
        angles = 'xy',
        scale_units = 'xy',
        scale = 1,
        color=this_df.COLOR
    )
The results are much better (though they depend a lot on the input dataset); the last subplot shows the vectors that were not assigned to any cluster:
Edit #2
If by "direction" you mean an angle in the [0, π) interval (i.e. undirected vectors), you will want to include the following code before computing the cosines/sines:
ix = df[df.DIRECTION<0].index
df.loc[ix, "DIRECTION"] += np.pi
Maybe you can also cluster the angles (besides the vector norms) using the projections of each normalized vector onto the two unit vectors (1,0) and (0,1), with the function below. By handling the projections directly (which essentially encode the angles), we won't run into the trouble caused by the periodicity of the cosine function.
def get_norm_and_angle(e1):
    e1_norm = np.linalg.norm(e1, axis=1)
    e1 = e1 / e1_norm[:, None]
    e2 = np.array([1, 0])
    e3 = np.array([0, 1])
    return np.stack((e1_norm, e1 @ e2, e1 @ e3), axis=1)
Based on this function, here is one possible solution where you are free to choose how many clusters to find. In the script below, five features are used for clustering:
the vector norm;
the vector's projections onto the x and y axes (two features);
the vector's starting point (two features).
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.cluster import KMeans
def get_norm_and_angle(e1):
    e1_norm = np.linalg.norm(e1, axis=1)
    e1 = e1 / e1_norm[:, None]
    e2 = np.array([1, 0])
    e3 = np.array([0, 1])
    return np.stack((e1_norm, e1 @ e2, e1 @ e3), axis=1)
data = np.cumsum(np.random.randint(0,10,size=(50, 4)),axis=0)
df = pd.DataFrame(data, columns=list('ABCD'))
A = df['A'];B = df['B']
C = df['C'];D = df['D']
starting_points = np.stack((A,B),axis=1)
vectors = np.stack((C,D),axis=1) - np.stack((A,B),axis=1)
different_view = get_norm_and_angle(vectors)
different_view = np.hstack((different_view,starting_points))
num_clusters = 8
model = KMeans(n_clusters=num_clusters)
model.fit(different_view)
cluster_labels = model.predict(different_view)
df['n_cluster'] = cluster_labels
cluster_centers = model.cluster_centers_
cluster_offsets = cluster_centers[:,0][:,None] * cluster_centers[:,1:3]
cluster_starts = np.vstack([np.mean(starting_points[cluster_labels==ind],axis=0) for ind in range(num_clusters)])
main_streams = np.hstack((cluster_starts,cluster_starts+cluster_offsets))
a,b,c,d = main_streams.T
fig,ax = plt.subplots(figsize=(8,8))
ax.set_xlim(-np.max(data)*.1,np.max(data)*1.1)
ax.set_ylim(-np.max(data)*.1,np.max(data)*1.1)
colors = sns.color_palette(n_colors=num_clusters)
lc1 = ax.quiver(a, b, (c-a), (d-b), angles = 'xy', scale_units = 'xy', color = colors, scale = 1, alpha = 0.8, zorder=100)
lc2 = ax.quiver(A, B, (C-A), (D-B), angles = 'xy', scale_units = 'xy', scale = .6, alpha = 0.2)
start_colors = [colors[ind] for ind in cluster_labels]
ax.scatter(starting_points[:,0],starting_points[:,1],c=start_colors)
plt.show()
A sample output is shown below; as you can see in the figure, vectors with close starting points are clustered into the same group.

Pandas subsetting returning different results to numpy

I am trying to subset a pandas dataframe using two conditions. However, I am not getting the same results as when done with numpy. What am I doing wrong?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
x = np.linspace(20,120,101)
y = np.linspace(-45,25,101)
xs,ys = np.meshgrid(x,y)
idx = (xs >=100) & (ys >= 0)
plt.scatter(xs,ys,s=2,c='b')
plt.scatter(xs[idx],ys[idx],s=2,c='r')
I need to remove the red block from my dataset, which I can do with numpy by using:
plt.scatter(xs[~idx],ys[~idx],s=2,c='b')
How do I replicate this with a pandas dataframe?
I've tried using the same logic as I used above:
data = {'x':x,'y':y}
df = pd.DataFrame(data)
mask = (df.x >=100) & (df.y >= 0)
df2 = df[~mask]
I've also tried using loc:
df.loc[(df.x >=100) & (df.y >= 0),['x','y']] = np.nan
Both of these methods give the following result:
How do I replicate the results from numpy?
Many thanks.
You don't obtain the same result because you didn't create all of the coordinate pairs before passing them to pandas. Here is a quick solution:
data = {'x':xs.flatten(),'y':ys.flatten()}
df = pd.DataFrame(data)
mask = (df.x >=100) & (df.y >= 0)
df2 = df[~mask]
plt.scatter(df2.x,df2.y,s=2,c='b')
flatten() reshapes your arrays to a single dimension so that they can be used to construct a DataFrame of coordinate pairs rather than of lists.
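For illustration, flattening turns each (101, 101) meshgrid into an aligned 1-D array of 10201 values, one per coordinate pair:
print(xs.shape)            # (101, 101)
print(xs.flatten().shape)  # (10201,)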
Output:
Edit: the same result, but using a dataframe that only contains the original x and y columns.
Split the df into chunks:
data_x = np.linspace(20,120,101)
data_y = np.linspace(-45,25,101)
dataframe = pd.DataFrame({'x':data_x,'y':data_y})
chunk_size = 25
dfs = [dataframe[i:i+chunk_size] for i in range(0,dataframe.shape[0],chunk_size)]
Define a generator that yields the points you are interested in. Two loops are needed to produce every combination of x and y values:
def generatorPoints(dfs):
    for i in range(len(dfs)):
        x = dfs[i].x
        for j in range(len(dfs)):
            y = dfs[j].y
            xs, ys = np.meshgrid(x, y)
            idx = (xs >= 100) & (ys >= 0)
            yield xs[~idx], ys[~idx]
x, y = [], []
for xs, ys in generatorPoints(dfs):
x.extend(xs), y.extend(ys)
plt.scatter(x,y,s=2,c='b')
This gives the same result as the previous code. There is certainly room for optimization, but this is a start :).
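A simpler alternative sketch (assuming pandas >= 1.2, which added how='cross' to merge) builds every (x, y) pair directly and filters it:
# Cartesian product of the x and y columns, one row per coordinate pair
pairs = dataframe[['x']].merge(dataframe[['y']], how='cross')
mask = (pairs.x >= 100) & (pairs.y >= 0)
keep = pairs[~mask]
plt.scatter(keep.x, keep.y, s=2, c='b')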
