Pandas dataframe best line fitting - python

Hi, I have the following dataset:
A = [1, 10, 23, 45, 24, 25, 55, 67, 73, 26, 13, 96, 53, 23, 24, 43, 90, 49],
B = [24, 23, 29, BW, 49, 59, 72, BW, 9, 183, 17, 12, 2, 49, BW, 479, 18, BW]
I want to fit straight lines to the segments between the 'BW' markers, with column A on the x axis, and store the slope values in a new column C.
That is slope 1 is obtained by using [1,10,23] as x values and [24,23,29] as y values. Slope 2 is obtained using [24,25,55] as x values and [49,59,72] as y values. This continues until the end of the data frame.
Expected output will be:
C = [slope1, np.nan, np.nan, BW, slope2, np.nan, np.nan, BW, slope3,np.nan, np.nan,np.nan,np.nan,np.nan, BW, slope4, np.nan, BW]
Also, is there a way I can show these lines in a graph? I am new to this and have no idea. Any help is much appreciated.

You can identify the 'BW' locations and then split your arrays at these indexes. Here is a sample of how you could do that:
from pprint import pprint
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
colors = list(mcolors.TABLEAU_COLORS.values())
A = [1, 10, 23, 45, 24, 25, 55, 67, 73, 26, 13, 96, 53, 23, 24, 43, 90, 49]
B = [24, 23, 29, 'BW', 49, 59, 72, 'BW', 9,
183, 17, 12, 2, 49, 'BW', 479, 18, 'BW']
index = [k for k, value in enumerate(B) if value == 'BW']
index = [-1] + index + [len(B)]
slopes = []
for k in range(len(index)-1):
    x = A[index[k]+1:index[k+1]]
    y = B[index[k]+1:index[k+1]]
    if len(x) == 0:
        continue
    [slope, offset] = np.polyfit(x, y, 1)
    slopes.append(slope)
    reg_x = np.linspace(min(x), max(x), 10)
    reg_y = slope*reg_x + offset
    plt.plot(x, y, 'o', color=colors[k], label=f'Group {k}')
    plt.plot(reg_x, reg_y, color=colors[k])
pprint(slopes)
plt.legend()
plt.show()
The resulting slopes vector is:
[0.24386920980926416,
0.5977443609022566,
-0.9183274470232099,
-9.808510638297868]
And the plot:
Maybe this is not the most elegant or pythonic way to solve this, but it gets the job done.
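If you also want the column C from your expected output (the slope stored at the first row of each segment, NaN elsewhere, and 'BW' kept in place), a minimal sketch building on the variables above could look like this (the placement of the slope values is my assumption based on your example):
import pandas as pd
# C holds each segment's slope at the segment's first row, NaN elsewhere, 'BW' at the markers
C = [np.nan] * len(B)
seg = 0
for k in range(len(index) - 1):
    start, stop = index[k] + 1, index[k + 1]
    if stop - start == 0:
        continue
    C[start] = slopes[seg]
    seg += 1
for i in index[1:-1]:  # the original 'BW' positions
    C[i] = 'BW'
df = pd.DataFrame({'A': A, 'B': B, 'C': C})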

Convert multidimensional array of indices clusters to a 1D categorical array

I have a function which returns a multidimensional array of k clusters. My algorithm works for the most part, but I need it to return a categorical array instead of a multidimensional array. Here is my code:
import numpy as np
import pandas as pd
import random
from bokeh.sampledata.iris import flowers
from typing import List, Tuple
def get_closest(data_point: np.ndarray, centroids: np.ndarray):
    """
    Takes a data_point and an ndarray of multiple centroids and returns the index of the centroid
    closest to data_point, computing the euclidean distance to each centroid and picking the closest.
    """
    N = centroids.shape[0]
    dist = np.empty(N)
    for i, c in enumerate(centroids):
        dist[i] = np.linalg.norm(c - data_point)
    index_min = np.argmin(dist)
    return index_min
# Use these centroids in the first iteration of your algorithm if "Random Centroids" is set to False in the Dashboard
DEFAULT_CENTROIDS = np.array([[5.664705882352942, 3.0352941176470587, 3.3352941176470585, 1.0176470588235293],
[5.446153846153847, 3.2538461538461543, 2.9538461538461536, 0.8846153846153846],
[5.906666666666667, 2.933333333333333, 4.1000000000000005, 1.3866666666666667],
[5.992307692307692, 3.0230769230769234, 4.076923076923077, 1.3461538461538463],
[5.747619047619048, 3.0714285714285716, 3.6238095238095243, 1.1380952380952383],
[6.161538461538462, 3.030769230769231, 4.484615384615385, 1.5307692307692309],
[6.294117647058823, 2.9764705882352938, 4.494117647058823, 1.4],
[5.853846153846154, 3.215384615384615, 3.730769230769231, 1.2076923076923078],
[5.52857142857143, 3.142857142857143, 3.107142857142857, 1.007142857142857],
[5.828571428571429, 2.9357142857142855, 3.664285714285714, 1.1]])
def k_means(data_np: np.ndarray, k: int = 3, n_iter: int = 500, random_initialization=False) -> Tuple[np.ndarray, int]:
    """
    :param data: your data, a numpy array with shape (n_entries, n_features)
    :param k: The number of clusters to compute
    :param n_iter: The maximal number of iterations
    :param random_initialization: If False, DEFAULT_CENTROIDS are used as the centroids of the first iteration.
    :return: A tuple (cluster_indices: A numpy array of cluster_indices,
             n_iterations: the number of iterations it took until the algorithm terminated)
    """
    # Initialize the algorithm by assigning random cluster labels to each entry in your dataset
    k = k + 1
    centroids = data_np[random.sample(range(len(data_np)), k)]
    labels = np.array([np.argmin([(el - c) ** 2 for c in centroids]) for el in data_np])
    clustering = []
    for k in range(k):
        clustering.append(data_np[labels == k])
    # Implement K-Means with a while loop, which terminates either if the centroids don't move anymore, or
    # if the number of iterations exceeds n_iter
    counter = 0
    while counter < n_iter:
        # Compute the new centroids, if random_initialization is false use DEFAULT_CENTROIDS in the first iteration
        # if you use DEFAULT_CENTROIDS, make sure to only pick the k first entries from them.
        if random_initialization is False and counter == 0:
            centroids = DEFAULT_CENTROIDS[random.sample(range(len(DEFAULT_CENTROIDS)), k)]
        # Update the cluster labels using get_closest
        labels = np.array([get_closest(el, centroids) for el in data_np])
        clustering = []
        for i in range(k):
            clustering.append(np.where(labels == i)[0])
        counter += 1
        new_centroids = np.zeros_like(centroids)
        for i in range(k):
            if len(clustering[i]) > 0:
                new_centroids[i] = data_np[clustering[i]].mean(axis=0)
            else:
                new_centroids[i] = centroids[i]
        # if the centroids didn't move, exit the while loop
        if clustering is not None and (centroids == new_centroids).sum() == 0:
            break
        else:
            centroids = new_centroids
        pass
    # return the final cluster labels and the number of iterations it took
    return clustering, counter
# read and store the dataset
data: pd.DataFrame = flowers.copy(deep=True)
data = data.drop(['species'], axis=1)
data_np = np.asarray(data)
clustering, counter = k_means(data_np,4,500,False)
So clustering looks like this:
clustering
[array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 57,
98], dtype=int64),
array([60, 93], dtype=int64),
array([ 50, 51, 52, 53, 54, 55, 56, 58, 61, 62, 63, 65, 66,
67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 86, 87, 89, 90, 91, 92, 94, 95, 96,
97, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136,
137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149],
dtype=int64),
array([59, 64, 84, 85, 88], dtype=int64)]
However, what I'm looking for is an array like
clustering
array([1, 3, 2, ..., 4, 1, 4], dtype=int64)
Also, the while loop is always terminating after 1 iteration which shouldn't be the case.
counter
1
EDIT1:
The code continues as follows.
def callback(attr, old, new):
    # recompute the clustering and update the colors of the data points based on the result
    k = slider_k.value_throttled
    init = select_init.value
    clustering_new, counter_new = k_means(data_np, k, 500, init)
    pass
# Create the dashboard
# 1. A Select widget to choose between random initialization or using the DEFAULT_CENTROIDS on top
select_init = Select(title='Random Centroids',value='False',options=['True','False'])
# 2. A Slider to choose a k between 2 and 10 (k being the number of clusters)
slider_k = Slider(start=2,end=10,value=3,step=1,title='k')
# 4. Connect both widgets to the callback
select_init.on_change('value',callback)
slider_k.on_change('value_throttled',callback)
# 3. A ColumnDataSource to hold the data and the color of each point you need
source = ColumnDataSource(dict(petal_length=data['petal_length'],sepal_length=data['sepal_length'],petal_width=data['petal_width'],clustering=clustering))
# 4. Two plots displaying the dataset based on the following table, have a look at the images
# in the handout if this confuses you.
#
# Axis/Plot Plot1 Plot2
# X Petal length Petal width
# Y Sepal length Petal length
#
# Use a categorical color mapping, such as Spectral10, have a look at this section of the bokeh docs:
# https://docs.bokeh.org/en/latest/docs/user_guide/categorical.html#filling
plot1 = figure(plot_width=100,plot_height=100,title='Scatterplot of flowers distribution by petal length and sepal length')
plot1.yaxis.axis_label = 'Sepal length'
plot1.xaxis.axis_label = 'Petal length'
scatter1 = plot1.scatter(x='petal_length',y='sepal_length',source=source,fill_color=factor_cmap('clustering', palette=Spectral10, factors=clustering))
plot2 = figure(plot_width=100,plot_height=100,title='Scatterplot of flowers distribution by petal width and petal length')
plot2.yaxis.axis_label = 'Petal length'
plot2.xaxis.axis_label = 'Petal width'
scatter2 = plot2.scatter(x='petal_width',y='petal_length',source=source,fill_color=factor_cmap('clustering', palette=Spectral10, factors=clustering))
# 5. A Div displaying the currently number of iterations it took the algorithm to update the plot.
div = Div(text='Number of iterations: ')
Thus the end result should look like so
I'm not sure I understand what you need.
If clustering contains a list of arrays, where each array represents a cluster and the ith array contains the indices of the samples that belong to the ith cluster, and what you need is to convert this to a single vector of size number_of_samples that represents the cluster each sample belongs to, you can do it like this:
def to_classes(clustering):
    # Get the number of samples (you can pass it directly to the function)
    num_samples = sum(x.shape[0] for x in clustering)
    indices = np.empty((num_samples,))  # An empty array of the correct size
    for ith, cluster in enumerate(clustering):
        # use the cluster's indices to assign it the correct cluster label
        indices[cluster] = ith
    return indices
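For example, with the clustering from your output above (a quick hypothetical check, assuming the 150-sample iris data):
labels = to_classes(clustering)
print(labels.shape)  # (150,)
print(labels[:3])    # e.g. [0. 0. 0.]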
The loop exits after a single iteration because the break condition is wrong. I think what you want is actually:
# note the !=
if clustering is not None and (centroids != new_centroids).sum() == 0:
    break
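A side note (my addition, not part of the fix above): comparing float centroids element-wise can be fragile once the updates become tiny; np.allclose is a more robust stopping test:
# stop when every centroid coordinate is numerically unchanged
if clustering is not None and np.allclose(centroids, new_centroids):
    break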

Creating a limit to find the sum of array values to a set number Numpy Python

I want to make a function where the sum of the Arrays and Arrays2 arrays equals val. The function should modify the values of Arrays and Arrays2 so that summing all values up to and including the last index gives val. How will I be able to get the expected output?
import numpy as np
Arrays = np.array([50, 30, 25, 87, 44, 68, 45])
Arrays2 = np.array([320])
val = 300
Expected output:
[50, 30, 25, 87, 44, 64]
[300]
Something like this?
import numpy as np
Arrays = np.array([50, 30, 25, 87, 44, 68, 45])
Arrays2 = np.array([320])
val = 300
def thisRareFunction(arr):
    outArrays = []
    acum = 0
    for x in arr:
        acum += x
        if acum <= val:
            outArrays.append(x)
        else:
            outArrays.append(x - (acum - val))
            break
    return outArrays
print(thisRareFunction(Arrays))
print(thisRareFunction(Arrays2))
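If you prefer a vectorized variant, here is a sketch of my own (cap_at is a hypothetical helper, not from the answer above) using np.cumsum and np.searchsorted:
def cap_at(arr, val):
    # keep values while the running sum stays under val, then shrink
    # the first value that overshoots so the total equals val
    csum = np.cumsum(arr)
    n_keep = int(np.searchsorted(csum, val, side='left'))
    out = [int(v) for v in arr[:n_keep]]
    if n_keep < arr.size:
        out.append(int(val - (csum[n_keep - 1] if n_keep > 0 else 0)))
    return out
print(cap_at(Arrays, 300))   # [50, 30, 25, 87, 44, 64]
print(cap_at(Arrays2, 300))  # [300]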

Using curve_fit to a function defined by indefinite integral in Python

I'm trying to write code to fit 2 curves with 5 parameters to real data. They are shown here:
The first curve only depends on a, b and gamma, so I decided to use curve_fit once to fit those 3 (which works), and then use it again on the second curve to adjust the last two, alpha and k_0.
The problem is that the second curve is defined by this indefinite integral, and I can't code it properly.
I have tried treating x as a symbol and integrating with sym.integrate, and also integrating numerically with quad. Neither worked. In the second case, I get "ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()" in the "mortes" function.
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import scipy.integrate as integrate
import numpy as np
import sympy as sym
#Experimental x and y data points
#Data for SP (São Paulo)
xData = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34])
ycasos = np.array([2, 13, 65, 459, 1406, 4466, 8419, 13894, 20004, 31174, 44411, 61183, 80558, 107142, 140549, 172875, 215793, 265581, 312530, 366890, 412027, 479481, 552318, 621731, 697530, 749244, 801422, 853085, 890690, 931673, 970888, 1003429, 1034816, 1062634, 1089255])
ymortes = np.array([0, 0, 15, 84, 260, 560, 991, 1667, 2586, 3608, 4688, 6045, 7532, 9058, 10581, 12494, 14263, 15996, 17702, 19647, 21517, 23236, 25016, 26780, 28392, 29944, 31313, 32567, 33927, 35063, 36136, 37223, 37992, 38726, 39311])
#Data for Brazil
#xData = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45])
#ycasos = np.array([2,9,121,1128,3912,10298,20818,36739,58973,96559,155939,233142, 347398, 498440, 672846, 850514, 1067579, 1313667, 1577004, 1839850, 2074860, 2394513, 2707877, 3012412, 3317096, 3582362, 3846153, 4123000, 4315687, 4528240, 4717991, 4906833, 5082637, 5224362, 5380635, 5535605, 5653561, 5848959, 6052786, 6290272, 6577177, 6880127, 7213155, 7465806, 7716405, 8013708])
#ymortes = np.array([])
u0 = ycasos[0]
v0 = ymortes[0]
#u(t)
def casos(x, a, b, gama):
    return u0 * (a ** (1/gama)) * np.exp(a*x) * ((a + b * (u0 ** gama) * (np.exp(a*gama*x)-1)) ** (-1/gama))
#Plot experimental data points
plt.plot(xData, ycasos, 'bo', label='reais')
# Initial guess for the parameters
#initialGuess = [3.0,1.5,0.05]
#First fit
copt, ccov = curve_fit(casos, xData, ycasos,bounds=(0, [1., 1., np.inf]),maxfev=100000)
a_opt = copt[0]
b_opt = copt[1]
gama_opt = copt[2]
print('Primeira etapa \n')
print('Parametros encontrados: a=%.9f, b=%.9f,gama=%.9f \n' % tuple(copt))
def integrand(t, alpha):
    return np.exp((a_opt - alpha)*t) * ((a_opt + b_opt * (u0 ** gama_opt) * (np.exp(a_opt*gama_opt*t)-1)) ** (-1/gama_opt))
def mortes(x, k0, alpha):
    return u0 * (a_opt ** (1/gama_opt)) * k0 * integrate.quad(integrand, 0, x, args=(alpha)) + v0
#Second fit
mopt, mcov = curve_fit(mortes, xData, ymortes, bounds=(0, [np.inf, a_opt]), maxfev=100000)
print('Segunda etapa \n')
print('Parametros encontrados: k0=%.9f, alpha=%.9f \n' % tuple(mopt))
#x values for the fitted function
xFit = np.arange(0.0, float(len(xData)), 0.01)
#Plot the fitted function
plt.plot(xFit, casos(xFit, *copt), 'r', label='estimados')
plt.xlabel('t')
plt.ylabel('casos')
plt.legend()
plt.show()
The upper bound of an integral in integrate.quad has to be a float, not an array like your x (the argument of mortes()). This way it should work:
def mortes(x, k0, alpha):
    integralRes = []
    for upBound in x:
        integralRes.append(integrate.quad(integrand, 0, upBound, args=(alpha))[0])
    return u0 * (a_opt ** (1/gama_opt)) * k0 * np.array(integralRes) + v0
p.s. Elegant edits to the code style are more than welcome (like allowing an array to be passed to the upper and lower bounds of integrate.quad).
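On that p.s.: one way to accept an array of upper bounds (a sketch of my own, not part of the answer above) is to wrap quad with np.vectorize, which still loops internally but keeps mortes tidy:
# quad_arr is a hypothetical helper; np.vectorize calls quad once per element of x
quad_arr = np.vectorize(lambda ub, alpha: integrate.quad(integrand, 0, ub, args=(alpha,))[0])
def mortes(x, k0, alpha):
    return u0 * (a_opt ** (1/gama_opt)) * k0 * quad_arr(x, alpha) + v0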

Gaussian fit to histogram on python seems off. What could I change to improve the fit?

I have created a Gaussian fit to data plotted as a bar chart. However, the fit does not look right, and I don't know what to change to improve the fit. My code is as follows:
import matplotlib.pyplot as plt
import math
import numpy as np
from collections import Counter
import collections
from scipy.optimize import curve_fit
from scipy.stats import norm
from scipy import stats
import matplotlib.mlab as mlab
k_list = [-40, -32, -30, -28, -26, -24, -22, -20, -18, -16, -14, -12, -10, -8, -6, -4, -3, -2, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34]
v_list = [1, 2, 11, 18, 65, 122, 291, 584, 1113, 2021, 3335, 5198, 7407, 10043, 12552, 14949, 1, 16599, 16770, 16728, 14772, 12475, 9932, 7186, 4987, 3286, 1950, 1080, 546, 285, 130, 54, 18, 11, 2, 2]
def func(x, A, beta, B, mu, sigma):
    return (A * np.exp(-x/beta) + B * np.exp(-100.0 * (x - mu)**2 / (2 * sigma**2)))  # Normal distribution
popt, pcov = curve_fit(func, xdata=k_list, ydata=v_list, p0=[10000, 5, 10000, 10, 10])
print(popt)
x = np.linspace(-50, 50, 1000)
plt.bar(k_list, v_list, label='myPLOT', color = 'b', width = 0.75)
plt.plot(x, func(x, *popt), color='darkorange', linewidth=2.5, label=r'Fitted function')
plt.xlim((-30, 45))
plt.legend()
plt.show()
The plot I obtain is as follows:
How can I adjust my fit?
You have a significant outlier here, possibly caused by a typo: (k, v) == (-3, 1) at index 16 in the data.
The representation of the data as a bar chart is not optimal here. The issue would be clearly visible if you showed the data in the same format as you show the fit. Either of the following would work better:
The outlier forces the peak down. Here is the fit if we remove the outlier manually:
You can remove the outlier automatically by checking its individual residual against the RMSE of the entire fit:
k_list = np.array(k_list)  # convert the lists to arrays so the boolean mask below works
v_list = np.array(v_list)
popt, pcov = curve_fit(func, xdata=k_list, ydata=v_list, p0=[10000, 5, 10000, 10, 10])
resid = np.abs(func(k_list, *popt) - v_list)
rmse = np.std(resid)
keep = resid < 3 * rmse
if keep.sum() < keep.size:
    popt, pcov = curve_fit(func, xdata=k_list[keep], ydata=v_list[keep], p0=popt)
Or even a repeated application:
popt = [10000, 5, 10000, 10, 10]
while True:
    popt, pcov = curve_fit(func, xdata=k_list, ydata=v_list, p0=popt)
    resid = np.abs(func(k_list, *popt) - v_list)
    rmse = np.std(resid)
    keep = resid < 5 * rmse
    if keep.sum() == keep.size:
        break
    k_list = k_list[keep]
    v_list = v_list[keep]
A 3-sigma threshold will trim everything off your data after a couple of iterations, so I used 5-sigma instead. Keep in mind that this is a very quick and dirty way to denoise data; it's basically manual, since you have to re-check the result to make sure that your choice of factor was correct.

numpy/scipy, loop over subarrays

Lately I've been doing a lot of processing on 8x8 blocks of image-data.
The standard approach has been to use nested for-loops to extract the blocks, e.g.
for y in xrange(0, height, 8):
    for x in xrange(0, width, 8):
        d = image_data[y:y+8, x:x+8]
        # further processing on the 8x8-block
I can't help but wonder if there is a way to vectorize this operation, or another approach using numpy/scipy that I could use instead? An iterator of some kind?
A MWE [1]:
#!/usr/bin/env python
import sys
import numpy as np
from scipy.fftpack import dct, idct
import scipy.misc
import matplotlib.pyplot as plt
def dctdemo(coeffs=1):
    unzig = np.array([
        0, 1, 8, 16, 9, 2, 3, 10,
        17, 24, 32, 25, 18, 11, 4, 5,
        12, 19, 26, 33, 40, 48, 41, 34,
        27, 20, 13, 6, 7, 14, 21, 28,
        35, 42, 49, 56, 57, 50, 43, 36,
        29, 22, 15, 23, 30, 37, 44, 51,
        58, 59, 52, 45, 38, 31, 39, 46,
        53, 60, 61, 54, 47, 55, 62, 63])
    lena = scipy.misc.lena()
    width, height = lena.shape
    # reconstructed
    rec = np.zeros(lena.shape, dtype=np.int64)
    # Can this part be vectorized?
    for y in xrange(0, height, 8):
        for x in xrange(0, width, 8):
            d = lena[y:y+8, x:x+8].astype(np.float)
            D = dct(dct(d.T, norm='ortho').T, norm='ortho').reshape(64)
            Q = np.zeros(64, dtype=np.float)
            Q[unzig[:coeffs]] = D[unzig[:coeffs]]
            Q = Q.reshape([8, 8])
            q = np.round(idct(idct(Q.T, norm='ortho').T, norm='ortho'))
            rec[y:y+8, x:x+8] = q.astype(np.int64)
    plt.imshow(rec, cmap='gray')
    plt.show()
if __name__ == '__main__':
    try:
        c = int(sys.argv[1])
    except ValueError:
        sys.exit()
    else:
        if 1 <= c <= 64:
            dctdemo(c)
Footnotes:
[1] Actual application: https://github.com/figgis/dctdemo
There's a function view_as_windows for this in Scikit Image
http://scikit-image.org/docs/dev/api/skimage.util.html#view-as-windows
Unfortunately I will have to finish this answer another time, but you can grab the windows in a form that you can pass to dct with:
from skimage.util import view_as_windows
# your code...
# step=8 gives non-overlapping 8x8 blocks (the default step of 1 would give overlapping windows)
d = view_as_windows(lena.astype(np.float), (8, 8), step=8).reshape(-1, 8, 8)
# apply the DCT along the two block axes (d stacks one 8x8 block per row)
D = dct(dct(d, axis=1, norm='ortho'), axis=2, norm='ortho')
There is a function called extract_patches in the scikit-learn feature extraction routines. You need to specify a patch_size and an extraction_step. The result will be a view on your image as patches, which may overlap. The resulting array is 4D: the first two dimensions index the patch, and the last two index the pixels of the patch. Try this:
from sklearn.feature_extraction.image import extract_patches
patches = extract_patches(image_data, patch_size=(8, 8), extraction_step=(4, 4))
This gives (8, 8) size patches that overlap by half.
Note that up until now this uses no extra memory, because it is implemented using stride tricks. You can force a copy by reshaping
patches = patches.reshape(-1, 8, 8)
which will basically yield a list of patches.
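For non-overlapping blocks specifically, a plain-NumPy sketch of my own (blockify is a hypothetical helper) needs nothing beyond reshape and swapaxes:
def blockify(image, bs=8):
    # split an (H, W) array into (H//bs * W//bs, bs, bs) non-overlapping blocks
    h, w = image.shape
    assert h % bs == 0 and w % bs == 0
    return image.reshape(h // bs, bs, w // bs, bs).swapaxes(1, 2).reshape(-1, bs, bs)
blocks = blockify(np.arange(64 * 64).reshape(64, 64))
print(blocks.shape)  # (64, 8, 8)
The first reshape and the swapaxes are views; only the final reshape copies, because the blocks are no longer contiguous.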
