I am trying to practice ML. Specifically, I am attempting to apply DBSCAN to a precomputed distance matrix (just to check how this works). Yes, I know I could use the Euclidean metric directly, but I wanted to test the precomputed option.
I am unsure why the labels all have the same value for a data set of random points in 3 different regions - I expected DBSCAN to separate them. Note: even if I use non-overlapping ranges for data1/2/3, I still get a single cluster as output.
Here is the code:
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cluster import DBSCAN
from scipy.spatial.distance import pdist, squareform
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# Three groups of 50 random 2-D points with integer coordinates, drawn from
# partially overlapping ranges, stacked into a single (150, 2) float array.
data1 = np.array([[random.randint(1, 400) for _ in range(2)] for _ in range(50)], dtype=np.float64)
data2 = np.array([[random.randint(300, 700) for _ in range(2)] for _ in range(50)], dtype=np.float64)
data3 = np.array([[random.randint(600, 900) for _ in range(2)] for _ in range(50)], dtype=np.float64)
data = np.vstack((data1, data2, data3))

# Condensed pairwise Euclidean distances, expanded to the square symmetric
# matrix that metric='precomputed' expects.
d = pdist(data, metric='euclidean')
distance_matrix = squareform(d)

# eps is a distance threshold in the same units as distance_matrix.
# The previous eps=0.3 could never be satisfied: two distinct points with
# integer coordinates are at least 1 apart, so (exact duplicates aside) no
# point had a neighbour and every sample got the same noise label -1.
# A value on the scale of the typical point spacing is needed.
# NOTE(review): 40 is a starting point for coordinates spanning ~1-900; because
# the three ranges overlap, neighbouring groups may still merge - tune eps and
# min_samples for the separation you want.
cluster = DBSCAN(eps=40, min_samples=2, metric='precomputed')
dbscan_model = cluster.fit_predict(distance_matrix)

# Colour each point by its cluster label.
plt.scatter(data[:, 0], data[:, 1], s=100, c=dbscan_model)
plt.show()
How can I calculate the value of this integral:
f_tu(t) is given as numpy.array. The graph looks like this:
How can I implement this?
Everything I could find looks something like this
from scipy.integrate import quad
from math import sin  # `sin` was referenced below without being imported


def f(x):
    """Return the integrand 1/sin(x) evaluated at the scalar ``x``."""
    return 1 / sin(x)


# Integrate f between 0 and 1.
# NOTE(review): 1/sin(x) behaves like 1/x near 0, so this particular integral
# is not finite; treat the snippet as a template for calling quad only.
I = quad(f, 0, 1)
but I have an array there, not a specific function like sin.
How about auc from sklearn.metrics?
import numpy as np
import numpy as np
from scipy.integrate import quad
from sklearn.metrics import auc
# Sample sin(x) on a uniform grid starting at 0, step 0.001, stopping before 100.
x = np.arange(0, 100, 0.001)
y = np.sin(x)

# Area computed from the sampled (x, y) pairs.
area_from_samples = auc(x, y)
print('auc:', area_from_samples)

# Same integral computed by quad directly from the callable np.sin over [0, 100].
area_from_quad = quad(np.sin, 0, 100)
print('quad:', area_from_quad)
auc: 0.13818791291277366
quad: (0.1376811277123232, 9.459751315610276e-09)
Okay, so you have one of those pesky improper integrals with an infinite limit. Here is how I would deal with it:
import numpy as np
from scipy.integrate import quad
def f(x):
    """Return 1/x**2 for the scalar ``x``; replace the body with your own integrand."""
    return 1 / (x ** 2)  # put your function to integrate here


# np.inf is the supported spelling of infinity; the np.Infinity alias was
# removed in NumPy 2.0 and raises AttributeError there.
# NOTE(review): 1/x**2 is not integrable at x = 0, so expect quad to warn for
# this sample integrand; the call pattern is what matters here.
print(quad(f, 0, np.inf))  # integrates from 0 to infinity
This returns two values. The first is the estimated value of the integral, and the second is the approximate absolute error of the integral which is useful to know.
If you want to integrate a numpy array here is a simple solution:
import numpy as np
# Placeholder: substitute your own 1-D array of samples for the text inside the call.
# NOTE(review): when only the y-values are passed, np.trapz assumes unit spacing
# between samples; pass x=<sample positions> or dx=<step> if that is not the case.
# NOTE(review): NumPy 2.0 renamed this function to np.trapezoid - confirm which
# name your installed version provides.
print(np.trapz(your numpy array here))
Here is my code to plot a stress-strain curve:
import matplotlib.pyplot as plt
import numpy as np
import math
from scipy.interpolate import interp1d
from matplotlib.offsetbox import AnchoredText
import pandas as pd
# Purpose: load a tensile-test CSV, plot it, and build linear and cubic
# interpolants of stress over strain.
# Strain is a column in the given dataframe; stress is computed manually from load.
# Skip the first 25 lines of the file and treat the next two rows as a two-level column header.
df_1 = pd.read_csv('1045.csv',skiprows=25,header=[0,1])
print(df_1.head())
# Cross-section area used to turn load into stress.
# NOTE(review): 10e-6 equals 1e-5, not 1e-6 - confirm the intended unit conversion.
A1 = 40.602*(10e-6)
stress1 = ((df_1.Load)/A1)
plt.figure(figsize=(12,9))
# NOTE(review): this plots Load against Strain1 although the y-axis is labelled as stress - confirm.
plt.plot(df_1.Strain1.values,df_1.Load.values,'g')
plt.ylabel('stress(Pa)',fontsize=13)
plt.xlabel('Strain(%)',fontsize=13)
plt.xticks(np.arange(-6e-5,0.15,step=0.005),rotation = 45)
plt.yticks(np.arange(0,42000,step=1000))
# This first assignment is overwritten two lines below.
strain = df_1.Strain1.values
stress = np.array(((df_1.Load.values)/A1))
strain = np.array((df_1.Strain1.values))
# NOTE(review): with the two-level header, df_1.Strain1 / df_1.Load presumably select
# sub-frames, so strain and stress would be 2-D (N, 1) rather than (N,) - verify with .shape.
# Index of the sample taken as the end of the linear region.
LinearLimit=1
# 50 evenly spaced strain values from the first sample to the linear limit ...
Strain_values_linear = np.linspace(strain[0], strain[LinearLimit], num=50, endpoint=True)
# ... and 50 more from the linear limit to the last sample.
Strain_values_eng = np.linspace(strain[LinearLimit], strain[-1], num=50, endpoint=True)
# Linear (default kind) and cubic (kind=3) interpolants, both allowed to extrapolate.
# NOTE(review): presumably these calls raise the reported ValueError when strain/stress are (N, 1).
f1 = interp1d(strain, stress, fill_value='extrapolate')
f2 = interp1d(strain, stress, kind=3, fill_value='extrapolate')
Now I keep getting a ValueError saying: "x and y arrays must be equal in length along interpolation axis." I don't understand this... I printed the shapes of strain and stress and they are the same.
Btw here is a screenshot of the csv file:
enter image description here
You probably are passing an array of shape (..., N) as the first argument (meaning strain has shape of the form (..., N)). SciPy doesn't allow that and throws a ValueError. See the documentation for details. You should run a for loop if you have multiple vectors in strain array. The following code should work, considering you want to interpolate 1 function for each row in strain (and that strain is a 2-d array. If it isn't, you can easily convert it using strain.reshape(-1, N)):
import matplotlib.pyplot as plt
import numpy as np
import math
from scipy.interpolate import interp1d
from matplotlib.offsetbox import AnchoredText
import pandas as pd
# Load the tensile-test data: strain comes straight from the dataframe,
# stress is derived below as load divided by the cross-section area.
df_1 = pd.read_csv('1045.csv', skiprows=25, header=[0, 1])
print(df_1.head())

A1 = 40.602 * 10e-6
stress1 = df_1.Load / A1

# Plot the raw curve with fixed tick positions.
plt.figure(figsize=(12, 9))
plt.plot(df_1.Strain1.values, df_1.Load.values, 'g')
plt.ylabel('stress(Pa)', fontsize=13)
plt.xlabel('Strain(%)', fontsize=13)
plt.xticks(np.arange(-6e-5, 0.15, step=0.005), rotation=45)
plt.yticks(np.arange(0, 42000, step=1000))

# Array copies of the strain column and of load / area.
strain = np.array(df_1.Strain1.values)
stress = np.array(df_1.Load.values / A1)

# Evenly spaced strain values below and above the linear limit.
LinearLimit = 1
Strain_values_linear = np.linspace(strain[0], strain[LinearLimit], num=50, endpoint=True)
Strain_values_eng = np.linspace(strain[LinearLimit], strain[-1], num=50, endpoint=True)

# One linear and one cubic (kind=3) interpolant per row of strain.
f1, f2 = [], []
for strain_row in strain:
    f1.append(interp1d(strain_row, stress, fill_value='extrapolate'))
    f2.append(interp1d(strain_row, stress, kind=3, fill_value='extrapolate'))
Edit: From the comment, your strain array has shape (222, 1). This means you already have a vector, but its shape is not compatible with what SciPy accepts. In this case, you will have to reshape the strain and stress arrays to shape (N,). The following code should work:
import matplotlib.pyplot as plt
import numpy as np
import math
from scipy.interpolate import interp1d
from matplotlib.offsetbox import AnchoredText
import pandas as pd
# Read the test data; strain is taken from the dataframe and stress is
# computed from the load column and the area A1.
df_1 = pd.read_csv('1045.csv', skiprows=25, header=[0, 1])
print(df_1.head())

A1 = 40.602 * 10e-6
stress1 = df_1.Load / A1

# Draw the curve and pin the tick locations on both axes.
plt.figure(figsize=(12, 9))
plt.plot(df_1.Strain1.values, df_1.Load.values, 'g')
plt.ylabel('stress(Pa)', fontsize=13)
plt.xlabel('Strain(%)', fontsize=13)
x_tick_positions = np.arange(-6e-5, 0.15, step=0.005)
plt.xticks(x_tick_positions, rotation=45)
y_tick_positions = np.arange(0, 42000, step=1000)
plt.yticks(y_tick_positions)

# Flatten both series to shape (N,) so interp1d receives plain 1-D vectors.
strain = np.array(df_1.Strain1.values).reshape(-1)
stress = np.array(df_1.Load.values / A1).reshape(-1)

# Sample 50 strain values on each side of the linear limit.
LinearLimit = 1
Strain_values_linear = np.linspace(strain[0], strain[LinearLimit], num=50, endpoint=True)
Strain_values_eng = np.linspace(strain[LinearLimit], strain[-1], num=50, endpoint=True)

# Linear (default) and cubic (kind=3) interpolants, both extrapolating outside the data.
f1 = interp1d(strain, stress, fill_value='extrapolate')
f2 = interp1d(strain, stress, kind=3, fill_value='extrapolate')
Hi I'm working on a code that calculates the median values for a given window size in my data set. I'm using medfilt from SciPy. I don't understand why the median array returned is all zeroes. I've changed the kernel size but that didn't affect anything, and I'm wondering if the shape of my array affects medfilt. Here is my code:
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
from scipy import signal
filename = "hbond.txt"

# Parse the file: every line is split on whitespace into an array of tokens.
with open(filename, 'r') as f:
    hbond_val = [np.array(line.split()) for line in f]

bond_array = np.asarray(hbond_val)
bond_array_float = bond_array.astype(float)

# Flatten to 1-D without hard-coding the number of samples (was 50001).
bond_1d = bond_array_float.reshape(-1)
#print bond_1d.shape

# Filter the 1-D series, not the 2-D array. A scalar kernel size is applied
# along every axis, so on the (N, 1) array the 101-wide window also spans the
# length-1 axis; medfilt zero-pads the input, the window is then mostly
# padding, and every median comes out as 0.
median = signal.medfilt(bond_1d, 101)
#plt.plot(range(len(bond_array)),bond_array,'b')
#plt.plot(range(len(median)),median,'r')
#plt.show()
print(median)
I think NumPy or SciPy can do this, but I couldn't find how. Thanks!
import numpy as np
import scipy.stats as stats
# Seed NumPy's global random state so the sample drawn below is repeatable.
np.random.seed(0)
gaussian = stats.norm

# Generating some random, normal data:
data = gaussian.rvs(loc=5, scale=22, size=1000)

# Computing descriptive statistics:
print(data.mean())
# 4.00435243522
print(data.std())
# 21.7147294907

# Fitting the data to a normal distribution:
# fit returns the estimated (loc, scale), unpacked here as mean and std.
mean, std = gaussian.fit(data)
print(mean, std)
# (4.0043524352157016, 21.714729490718568)