import numpy as np
import pandas as pd
import scipy as sp
import scipy.optimize
import matplotlib.pyplot as plt

def Data_to_array(file):
    r = 0  # index of the first data row
    x, y = [], []
    line_num = 0
    # read the raw file
    P = open(file, 'r')
    data = P.readlines()
    # scan the header: find the [data] marker and pull out the metadata
    for line in data:
        line_num += 1
        if line.find("[data]") >= 0:
            r = line_num + 1
        if "Sampling Rate" in line:
            SR = float(line[15:])
        if "temperature=" in line:
            T = float(line[12:18])
            print("Temperature =", T)
    Data = data[r:line_num]
    # assign the data rows to a dataframe
    df = pd.DataFrame(Data)
    # rename the single column
    df = df.rename(columns={0: 'volts'})
    # name the index
    df.index.name = 'Index'
    # parse the readings as numbers
    df = df.astype({'volts': float})
    # get the index to start at 1
    df.index += 1
    # pull the columns out into lists
    I = df.index.to_list()
    t = df['volts'].to_list()
    # invert the data
    y = [element * -1 for element in t]
    # divide by the sampling rate to convert sample number to time
    x = [element / SR for element in I]
    return x, y
# This is to create the exponential function
def Exponential_func(file):
    temp_array = Data_to_array(file)
    X = np.asarray(temp_array[0])
    a, b = 1.0, 0.0  # placeholder parameter values
    f = np.exp(a * X) + b
    return f
# This is to get the optimize function to work
def Exponential_model(file):
    temp_array = Data_to_array(file)
    X = np.asarray(temp_array[0])
    Y = np.asarray(temp_array[1])
    r = sp.optimize.curve_fit(X, Y, Exponential_func.f)  # this is the line that raises the error
    return r
# This is to plot the data
def Plot_Data(file):
    temp_array = Data_to_array(file)
    X = np.asarray(temp_array[0])
    Y = np.asarray(temp_array[1])
    plt.scatter(X, Y)
    plt.xlabel("Time (s)")
    plt.ylabel("Capacitance (μF)")
    plt.show()
Plot_Data("Cz-Si-T-82.txt")
Exponential_func("Cz-Si-T-82.txt")
Exponential_model("Cz-Si-T-82.txt")
When I try to use the sp.optimize.curve_fit function, I get the error "'function' object has no attribute 'f'", but when I look this problem up, it seems I have the function and variables in the correct order.
I need this piece of code to fit an exponential curve to my data, which does follow an exponential fit. Can anyone help? It would also be helpful for the code to print the function of the fitted curve, as I will be integrating under it later.
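For reference, scipy.optimize.curve_fit expects the model function as its first argument, followed by the x and y data, and the model itself must take the independent variable as its first parameter and the fit parameters after it. A minimal sketch of the intended fit (the model form, parameter names, and starting guess p0 are illustrative, not taken from the original post):

import numpy as np
import scipy.optimize

def exp_model(t, a, b, c):
    # model: c * exp(a*t) + b, with a, b, c the fitted parameters
    return c * np.exp(a * t) + b

x, y = Data_to_array("Cz-Si-T-82.txt")
popt, pcov = scipy.optimize.curve_fit(exp_model, np.asarray(x), np.asarray(y), p0=(-1.0, 0.0, 1.0))
a, b, c = popt
print("fitted curve: {:.4g} * exp({:.4g} * t) + {:.4g}".format(c, a, b))

Printing popt this way also covers the request to see the fitted function before integrating under it.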
Can someone help me? I'm trying to create a dataset and always get the same error, "only integer scalar arrays can be converted to a scalar index", for the same code snippet:
Y = pd.Series(dataset_df['bin'].values.flatten().tolist(), index=X.index).to_frame('bin')
I don't understand why this error occurs. If anyone can explain it to me, thank you in advance.
Note: the data variable is a DataFrame, the FEATURE_COLUMNS variable holds the data columns (like feat_mean_frac_close), and the daily_vol variable holds floats.
def create_dataset(data, daily_vol):
    TP = 0.01
    SL = -TP
    T, P, X, Y, Y2 = [], [], [], [], []
    TIME = []
    for i in daily_vol.index:
        window = data.loc[i][FEATURE_COLUMNS].values
        if np.isnan(window).any():
            continue
        now = data.close[i]
        future_window = data.loc[i:i + timedelta(days=HORIZON)].close
        Ti = daily_vol.loc[i]
        min_ret_situation, take_action, timings = get_meta_barier(future_window, now, Ti, TP, SL, False)
        X.append(window)
        Y.append(min_ret_situation)
        Y2.append(take_action)
        T.append(timings)
        P.append(data.loc[i].close)
        TIME.append(i)
    dataset_df = pd.DataFrame(np.array(X), columns=[FEATURE_COLUMNS], index=TIME)
    dataset_df['bin'] = np.argmax(Y, axis=1)
    dataset_df['t1'] = pd.Series(dataset_df.index, index=dataset_df.index)
    dataset_df['w'] = 1. / len(Y)
    X = dataset_df[FEATURE_COLUMNS]
    Y = pd.Series(dataset_df['bin'].values.flatten().tolist(), index=X.index).to_frame('bin')
    Y['w'] = 1. / Y.shape[0]
    Y['t1'] = pd.Series(Y.index, index=Y.index)
    X.columns = FEATURE_COLUMNS
    return X, Y, T, P
from datetime import datetime
t = datetime.strptime('2010-01-01', '%Y-%m-%d') # 2017
X, Y, T, P = create_dataset(data[:t], daily_vol[:t])
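In case it helps with the diagnosis: that TypeError is most often raised when a plain Python list (rather than a NumPy array) is indexed with an array. A minimal reproduction, unrelated to the variables above and purely illustrative:

import numpy as np

lst = [10, 20, 30]
idx = np.array([0, 2])
# lst[idx]  # TypeError: only integer scalar arrays can be converted to a scalar index
print(np.asarray(lst)[idx])  # works: array([10, 30])

So the usual fix is to convert whichever list-like object is being indexed into an array first; checking what each intermediate expression in the failing line actually returns should reveal which one it is.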
import MeCab
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD

def get_nouns(text):
    tagger = MeCab.Tagger()
    words = []
    for c in tagger.parse(text).splitlines()[:-1]:
        if len(c.split('\t')) < 2:
            continue
        surface, feature = c.split('\t')
        pos = feature.split(',')[0]
        if pos == '名詞':  # noun
            words.append(surface)
    return ' '.join(words)
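# For anyone unfamiliar with MeCab: get_nouns keeps only the tokens tagged
# 名詞 (noun). A hypothetical call -- the exact segmentation depends on the
# installed dictionary (e.g. IPADIC):
#   get_nouns("東京大学で経済学を学んだ")  # -> something like "東京 大学 経済 学"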
def bio():
    biolist = []
    howmany = 10  # number of clusters
    # `read` is assumed to be loaded elsewhere: an iterable of records
    # with an "education" field
    for giin in read:
        if len(giin["education"]) < 1:
            continue
        biolist.append(get_nouns(" ".join(giin["education"])))
    ######################################################
    nparray = np.array(biolist)
    cv = CountVectorizer()
    bags = cv.fit_transform(nparray)
    tfidf = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(bags)
    km_model = KMeans(n_clusters=howmany, init='k-means++')
    km_model.fit_transform(tfidf)
    lsa2 = TruncatedSVD(2)
    compressed_text_list = lsa2.fit_transform(tfidf)
    compressed_center_list = lsa2.fit_transform(km_model.cluster_centers_)
    X = []
    Y = []
    X_cent = []
    Y_cent = []
    for x, y in compressed_text_list:
        X.append(x)
        Y.append(y)
    for x, y in compressed_center_list:
        X_cent.append(x)
        Y_cent.append(y)
    clus_list = []
    for i in range(howmany):
        clus_list.append([])
        for a in biolist:
            if km_model.labels_[biolist.index(a)] == i:
                clus_list[i].append(a)
    for a in clus_list:
        print(a)
        print(" ")
    plt.scatter(X, Y, c=km_model.labels_)
    plt.scatter(X_cent, Y_cent, c="r", marker="+")
    plt.show()
I have this code that clusters people's educational histories into 10 groups. My scatter plot looks like this.
As you can see, the scatter plot is not really sorted into groups, and the different colors are mixed up with one another. How could this code be changed to make the grouping more precise?
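One detail that stands out, offered as a guess rather than a confirmed fix: the cluster centers are projected with a second fit_transform, which refits the SVD on just the 10 center vectors and places them in a different 2-D basis than the documents. Projecting the centers through the already-fitted SVD keeps both point sets in the same space:

lsa2 = TruncatedSVD(2)
compressed_text_list = lsa2.fit_transform(tfidf)
# transform, not fit_transform, so the centers land in the same projection as the texts
compressed_center_list = lsa2.transform(km_model.cluster_centers_)

It is also worth remembering that the scatter plot is a 2-D shadow of a high-dimensional TF-IDF space, so some color mixing is expected even when the clustering itself is reasonable.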
I have been trying to implement k-means clustering with a heatmap, but have been unsuccessful.
Here is the initial data set:
https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv
And here is my code:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import random
#%matplotlib inline

def truncate(f, n):
    return math.floor(f * 10 ** n) / 10 ** n

def chooseCenter(data, centers):
    length = data.shape
    cent = []
    while len(cent) < centers:
        x = random.randrange(0, length[0])
        y = random.randrange(0, length[1])
        if data.iloc[x][y] not in cent:
            d = truncate(data.iloc[x][y], 2)
            cent.append(d)
    return cent

def distance(val, center):
    return math.sqrt((val - center)**2)

def getDistances(centers, data):
    length = data.shape
    dist = []
    for i in range(length[0]):
        for j in range(length[1]):
            y = []
            for k in range(len(centers)):
                val = distance(data.iloc[i][j], centers[k])
                y.append(truncate(val, 3))
            dist.append(y)
    return dist

def findClosest(data, dist):
    # build a frame the same shape as the data, holding for each element
    # the index of its closest center; keep rerunning this until the
    # assignments stop changing
    indexes = []
    for i in range(len(dist)):
        pt = min(dist[i])
        idx = dist[i].index(pt)
        indexes.append(idx)
    length = data.shape
    n = np.array(indexes)
    n = pd.DataFrame(np.reshape(n, (length[0], length[1])))
    return n

def computeNewCenter(data, close):
    d = dict()
    for i in range(len(close)):
        for j in range(len(close[0])):
            d[close.iloc[i][j]] = []
    for i in range(len(data)):
        for j in range(len(data[0])):
            if close.iloc[i][j] in d:
                d[close.iloc[i][j]].append(data.iloc[i][j])
    newCenters = []
    for key, value in d.items():
        m = np.mean(value)
        newCenters.append(truncate(m, 3))
    return newCenters

def main():
    data = np.array(pd.read_csv('https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv', header=None))
    data = data.T
    df = pd.DataFrame(data[1:], columns=data[0], dtype=float).T
    df = df.iloc[::-1]
    c = chooseCenter(df, 3)
    print(c)
    dist = getDistances(c, df)
    y = findClosest(df, dist)
    j = computeNewCenter(df, y)
    length = df.shape
    oldFrame = pd.DataFrame(np.ndarray((length[0], length[1])))
    oldFrame = oldFrame.fillna(0)
    ct = 0
    # iterate until the assignment frame stops changing
    while y.equals(oldFrame) == False:
        ct += 1
        oldFrame = y.copy()
        c = computeNewCenter(df, oldFrame)
        dist = getDistances(c, df)
        y = findClosest(df, dist)
    # gather the values cluster by cluster
    l = []
    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 1:
                l.append(df.iloc[i][j])
    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 2:
                l.append(df.iloc[i][j])
    for i in range(len(y)):
        for j in range(len(y[0])):
            if y.iloc[i][j] == 0:
                l.append(df.iloc[i][j])
    # note: this allocates a new, uninitialized array and discards the
    # values gathered into l above
    l = np.ndarray((length[0], length[1]))
    l = pd.DataFrame(l)
    print(l)
    hm = plt.pcolor(l, cmap=plt.cm.bwr)
    plt.colorbar(hm)

if __name__ == '__main__':
    main()
My line of thinking was this:
First, randomly choose the centers.
Then create a list of lists holding each point's distance to each center.
Find the index of the minimum distance for each point.
Create a data frame of the same size as the data set and fill each element with the index of the center that point is closest to.
Recompute each center by taking the mean of the points assigned to it.
Repeat this process until the index data frame no longer changes.
Create a new data frame and place the points that share a center index close together in the frame.
Then create the heatmap.
This did not seem to work, though.
Just wondering: am I on the right track or completely off? If I am on the right track, which parts would I need to change to fix the issue? If not, could you please point me in the right direction?
Here is a comparison of the maps (images omitted): the first is the one my program generated, while the second is the way it is supposed to look.
I know my problem lies in some part of the k-means clustering algorithm. My guess is that it is either in the reassignment stage, where the points are reassigned to centroids and the new centroids are calculated, or in the stopping condition, in that the algorithm does not run long enough. In the back of my head, something also tells me that I am not doing this as efficiently as I could and that I am missing something key. I have watched several videos on k-means clustering and understand it conceptually; I'm just having a hard time implementing it.
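For comparison, here is a minimal row-clustering sketch, under the assumption that the intended behavior is to cluster whole rows of the CSV (each row as one observation vector) and reorder them by cluster before drawing the heatmap. It swaps in scikit-learn's KMeans for the hand-rolled loop, so it is a reference point rather than a drop-in fix:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

url = 'https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv'
raw = pd.read_csv(url, header=None)
vals = raw.iloc[:, 1:].astype(float).values  # assuming the first column holds labels

# cluster the rows, then sort the rows by cluster label before plotting
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(vals)
ordered = vals[np.argsort(labels)]

hm = plt.pcolor(ordered, cmap=plt.cm.bwr)
plt.colorbar(hm)
plt.show()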
I have the following code, which I am writing as part of a simple movie recommender in Python so I can mimic the results I get in Coursera's Machine Learning course taught by Andrew Ng.
I want to modify the numpy.ndarray that I get after calling as_matrix() on the pandas DataFrame, adding a column vector to it like we can in MATLAB:
Y = [ratings Y]
Following is my Python code:
import numpy as np
import pandas as pd

dataFile = '/filepath/'
userItemRatings = pd.read_csv(dataFile, sep="\t", names=['userId', 'movieId', 'rating','timestamp'])
movieInfoFile = '/filepath/'
movieInfo = pd.read_csv(movieInfoFile, sep="|", names=['movieId','Title','Release Date','Video Release Date','IMDb URL','Unknown','Action','Adventure','Animation','Childrens','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western'], encoding = "ISO-8859-1")
userMovieMatrix=pd.merge(userItemRatings, movieInfo, left_on='movieId', right_on='movieId')
userMovieSubMatrix = userMovieMatrix[['userId', 'movieId', 'rating','timestamp','Title']]
Y = pd.pivot_table(userMovieSubMatrix, values='rating', index=['movieId'], columns=['userId'])
Y.fillna(0,inplace=True)
movies = Y.shape[0]
users = Y.shape[1] +1
ratings = np.zeros((1682, 1))
ratings[0] = 4
ratings[6] = 3
ratings[11] = 5
ratings[53] = 4
ratings[63] = 5
ratings[65] = 3
ratings[68] = 5
ratings[97] = 2
ratings[182] = 4
ratings[225] = 5
ratings[354] = 5
features = 10
theta = pd.DataFrame(np.random.rand(users, features))  # users: 943 x 10
X = pd.DataFrame(np.random.rand(movies, features))  # movies: 1682 x 10
X = X.as_matrix()  # note: as_matrix() was removed in pandas 1.0; newer code uses .to_numpy()
theta = theta.as_matrix()
Y = Y.as_matrix()
"""want to insert a column vector into this Y to get a new Y of dimension
1682*944, but only seeing 1682*943 after the following statement
"""
np.insert(Y, 0, ratings, axis=1)
R = Y.copy()
R[R!=0] = 1
Ymean = np.zeros((movies, 1))
Ynorm = np.zeros((movies, users))
for i in range(movies):
    idx = np.where(R[i,:] == 1)[0]
    Ymean[i] = Y[i,idx].mean()
    Ynorm[i,idx] = Y[i,idx] - Ymean[i]
print(type(Ymean), type(Ynorm), type(Y), Y.shape)
Ynorm[np.isnan(Ynorm)] = 0.
Ymean[np.isnan(Ymean)] = 0.
There is an inline comment marking the spot, but my problem is this: when I create a fresh numpy array and call insert on it, it works just fine; however, it doesn't work on the numpy array I get from calling as_matrix() on the DataFrame produced by pivot_table(). Is there any alternative?
insert does not operate in place; you need to assign the output to a variable. Try:
Y = np.insert(Y, 0, ratings, axis=1)
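A quick shape check of the fix (sizes as in the question; the index is passed as [0] so the (1682, 1) column vector is inserted as a single column, since with a scalar index recent NumPy versions can interpret a 2-D values argument differently):

import numpy as np

Y = np.zeros((1682, 943))
ratings = np.zeros((1682, 1))
Y = np.insert(Y, [0], ratings, axis=1)
print(Y.shape)  # (1682, 944)
# equivalently: Y = np.hstack((ratings, Y))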
I'm trying to write some Bayesian probit code using data augmentation. I can get it to work if I loop over the rows of the output matrix, but I'd like to vectorize it and do it all in one shot (presumably that would be faster).
import numpy as np
from numpy import random
import statsmodels.api as sm
from scipy import stats
from scipy.stats import norm, truncnorm
##################################
### Create some simulated data ###
num_leg = 50
num_bills = 20
a = np.random.uniform(-1,1,num_bills).reshape(num_bills, 1)
b = np.random.uniform(-2,2,num_bills).reshape(num_bills, 1)
x = np.random.standard_normal(num_leg).reshape(num_leg, 1)
ystar_base = a + np.dot(b,x.T)
epsilon = np.random.standard_normal(num_leg * num_bills).reshape(num_bills, num_leg)
ystar = ystar_base + epsilon
y = 1*(ystar >0)
### Initialize some stuff I need ###
avec = [0]*num_bills # These are bill parameters
bvec = [0]*num_bills
betavec = [np.matrix(list(zip(avec, bvec)))]
xvec = [0]*num_leg # these are legislator parameters
x_ones = np.ones(num_leg)

def init_y(mat):  # initialize a latent y matrix
    if mat == 1:
        return truncnorm.rvs(0, 10000)
    else:
        return truncnorm.rvs(-10000, 0)
vectorize_y = np.vectorize(init_y)
latent_y = np.matrix(vectorize_y(y))
burn = 500 # How long to run the MCMC
runs = 500
### define the functions ###
def sample_params(xnow, ynow):  # This is the function I'd like to vectorize
    if type(xnow) == list:
        xnow = np.array(xnow)
    if type(ynow) == list:
        ynow = np.array(ynow)
    ynow = ynow.T
    sigma = np.linalg.inv(np.dot(xnow.T, xnow))  ### This is the line that produces an error ###
    xy = np.dot(xnow.T, ynow)
    mu = np.dot(sigma, xy)  # this is just (x'x)inv x'y
    return np.random.multivariate_normal(np.array(mu).flatten(), sigma)
vecparams = np.vectorize(sample_params)
def get_mu(xnow, bnow):  # getting the updated mean to draw the latent ys
    if type(xnow) == list:
        xnow = np.array(xnow)
    if type(bnow) == list:
        bnow = np.array(bnow)
    mu = np.dot(xnow, bnow.T)
    mu = np.matrix(mu)
    return mu
def sample_y(mu, ynow):  # generate latent y matrix
    if ynow == 1:
        a, b = (0 - mu), (10000 - mu)
    else:
        a, b = (-10000 - mu), (0 - mu)
    return truncnorm.rvs(a, b)

vector_sample = np.vectorize(sample_y)  # I'd like to be able to do something like this
### Here's the MCMC loop with the internal loop over rows (bills) ###
for i in range(burn + runs):
    this_beta = []
    this_x = []
    this_y = []
    for j in range(num_bills):  # I'd like to get rid of this loop
        ex = list(zip(x_ones, x))
        newbeta = sample_params(ex, latent_y[j])
        this_beta.append(newbeta)
        # ex = np.array(list(zip(x_ones, x)))
        # this_beta = vecparams(ex, latent_y[:,])  # and call the vectorized function here
    betavec.append(this_beta)
    # Note, I can vectorize the latent outputs easily enough here
    mean = get_mu(ex, betavec[-1])
    latent_y = np.matrix(vector_sample(mean, np.matrix(y).T).T.reshape(latent_y.shape[0], latent_y.shape[1]))
### Now a bit of code to check to see if I've recovered what I want ###
test_beta = [list(zip(*z)) for z in betavec[burn:]]
test_a = np.array([z[0] for z in test_beta])
test_b = np.array([z[1] for z in test_beta])
amean = test_a.sum(axis=0) / float(runs)
bmean = test_b.sum(axis=0) / float(runs)
print('a mean')
print(np.corrcoef([amean, np.array(a)]))
print()
print('b mean')
print(np.corrcoef([bmean, np.array(b)]))
If I comment out the inner loop and use the commented-out lines just above it instead, I get the following error at the line I flagged earlier (the one that defines sigma):
LinAlgError: 0-dimensional array given. Array must be at least two-dimensional
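A note on the likely cause, offered as a diagnosis rather than something from the original post: np.vectorize is essentially an elementwise loop, so it broadcasts its inputs and calls sample_params once per scalar element; inside each call, xnow is then 0-dimensional, which is exactly what np.linalg.inv rejects. Since the design matrix here is the same for every bill, the update can instead be batched directly: (X'X)^-1 is shared, all the posterior means come from one matrix product, and all the betas can be drawn at once via a Cholesky factor (variable names below are illustrative):

X_design = np.column_stack((np.ones(num_leg), np.asarray(x).ravel()))  # num_leg x 2
sigma = np.linalg.inv(X_design.T @ X_design)       # 2 x 2, shared across bills
MU = sigma @ X_design.T @ np.asarray(latent_y).T   # 2 x num_bills posterior means
# beta_j = mu_j + L z_j draws from N(mu_j, sigma) for every bill at once
L_chol = np.linalg.cholesky(sigma)
this_beta = (MU + L_chol @ np.random.standard_normal(MU.shape)).T  # num_bills x 2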