Seeking advice for classifying 3D mobile data - Python

Project
I'm working on a project whose end goal is to classify user input/behaviour on a smartphone, based on readings of 3D movements (displacement, velocity, acceleration and jerk from the accelerometer and gyroscope) as well as 2D interactions (velocity, acceleration, number of "touches", etc.). My classifier will have to output whether the user is engaged, frustrated, or perhaps other emotions, subject to this paper. These aren't needed at this stage, however.
Data
The smartphone produces JSON files with the position of the phone in terms of x, y and z, as well as the timestamp. There's a new timestamp every 20 milliseconds.
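Judging by the parsing code at the bottom, each database row presumably holds a JSON document shaped roughly like this (the field names come from that code; the example values and the 250-sample batch size are assumptions inferred from the loops):

    {
        "sensor": "accelerometer",
        "data": [
            {"time": 1457000000000, "x": 0.01, "y": -0.02, "z": 9.81},
            ...
        ]
    }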
Idea
I've done some Python tutorials and produced the code at the bottom so far. I've been advised to use NumPy and SciPy to make my life easier. Aside from the obvious kinematics, I need to include a filtering process. Google suggested Kalman filters.
Question
If anyone has experience in this area, could you recommend an approach? Perhaps you've encountered a similar project with a sound methodology.
import json
import sqlalchemy
# The original line `import ReferenceFrame, get_motion_params, dynamicsymbols, symbols`
# is not valid Python; these names appear to come from SymPy (they are unused below):
from sympy import symbols
from sympy.physics.vector import ReferenceFrame, dynamicsymbols, get_motion_params

con = sqlalchemy.create_engine('postgresql+psycopg2://postgres:@localhost/airlib')
meta = sqlalchemy.MetaData(bind=con, reflect=True)

dataObjects = []
dataAccObjects = []
dataGyroObjects = []
displacementChangeInX = []
displacementChangeInY = []
displacementChangeInZ = []
t = []  # time deltas
v = []  # velocity
a = []  # acceleration
j = []  # jerk

results = meta.tables['rawmobiledata']
class DataEntry(object):
    def __init__(self, item, time, x, y, z):
        self.item = item
        self.time = time
        self.x = x
        self.y = y
        self.z = z

def make_accelerometerEntry(item, time, x, y, z):
    return DataEntry(item, time, x, y, z)

def make_gyroEntry(item, time, x, y, z):
    return DataEntry(item, time, x, y, z)
for row in con.execute(results.select()):
    # row[1] holds the JSON document; dumps/loads round-trips it into a dict
    loaded_r = json.loads(json.dumps(row[1]))
    if loaded_r['sensor'] == 'accelerometer':
        for item in range(0, 250):
            time = loaded_r['data'][item]['time']
            x = loaded_r['data'][item]['x']
            y = loaded_r['data'][item]['y']
            z = loaded_r['data'][item]['z']
            dataAccObjects.append(make_accelerometerEntry(item, time, x, y, z))
    elif loaded_r['sensor'] == 'gyroscope':
        for item in range(0, 250):
            time = loaded_r['data'][item]['time']
            x = loaded_r['data'][item]['x']
            y = loaded_r['data'][item]['y']
            z = loaded_r['data'][item]['z']
            dataGyroObjects.append(make_gyroEntry(item, time, x, y, z))
for row in con.execute(results.select()):
    loaded_r = json.loads(json.dumps(row[1]))
    if loaded_r['sensor'] == 'gyroscope':
        # 249 deltas from 250 samples, so item + 1 stays in range
        for item in range(0, 249):
            # rate of change: ti = |t2 - t1|
            t.append(abs(int(dataGyroObjects[item].time) - int(dataGyroObjects[item + 1].time)))
            # calculate change in displacement over the 3 axes
            displacementChangeInX.append(abs(int(dataGyroObjects[item].x) - int(dataGyroObjects[item + 1].x)))
            displacementChangeInY.append(abs(int(dataGyroObjects[item].y) - int(dataGyroObjects[item + 1].y)))
            displacementChangeInZ.append(abs(int(dataGyroObjects[item].z) - int(dataGyroObjects[item + 1].z)))
            # v = dx/dt (x axis only so far; the original assigned a bare time delta here)
            v.append(displacementChangeInX[item] / t[item] if t[item] else 0)
            # calculate acceleration: a = dv/dt (not implemented yet)
            # calculate jerk: j = da/dt (not implemented yet)
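Since the question asks for an approach: below is a minimal sketch of the NumPy/SciPy route, assuming positions arrive as float arrays sampled every 20 ms. It uses np.gradient for the derivatives and a Savitzky-Golay filter as a simpler stand-in for the Kalman filter Google suggested; the function and variable names are mine, not from any established pipeline.

    import numpy as np
    from scipy.signal import savgol_filter

    def kinematics(pos, dt=0.02):
        # Smooth the raw position series first, then differentiate numerically.
        pos = savgol_filter(pos, window_length=11, polyorder=3)  # crude noise filter
        v = np.gradient(pos, dt)  # velocity: dx/dt
        a = np.gradient(v, dt)    # acceleration: dv/dt
        j = np.gradient(a, dt)    # jerk: da/dt
        return v, a, j

    # usage with the objects collected above (x axis shown; y and z work the same)
    x = np.array([float(e.x) for e in dataGyroObjects])
    vx, ax, jx = kinematics(x)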

Related

JAX Tridiagonal Jacobians

What is the most efficient implementation of a scalable autonomous tridiagonal system using JAX?
import functools as ft
import jax as jx
import jax.numpy as jnp
import jax.random as jrn
import jax.lax as jlx
def make_T(m):
    # Create a pseudo-random tridiagonal Jacobian and store the band
    T = jnp.zeros((3, m), dtype='f8')
    T = T.at[0, 1: ].set(jrn.normal(jrn.PRNGKey(0), shape=(m-1,)))
    T = T.at[1, :  ].set(jrn.normal(jrn.PRNGKey(1), shape=(m  ,)))
    T = T.at[2, :-1].set(jrn.normal(jrn.PRNGKey(2), shape=(m-1,)))
    return T

def make_y(m):
    # Create a pseudo-random state array
    y = jrn.normal(jrn.PRNGKey(3), shape=(m,))
    return y

def calc_f_base(y, T):
    # Calculate the rate given the current state
    f = T[1, :] * y
    f = f.at[1: ].set(f[1: ] + T[0, 1: ] * y[:-1])
    f = f.at[:-1].set(f[:-1] + T[2, :-1] * y[1: ])
    return f

m = 2**22  # potentially exhausts resources
T = make_T(m)
y = make_y(m)
calc_f = ft.partial(calc_f_base, T=T)
Using jax.jacrev or jax.jacfwd will generate a full Jacobian, which limits the size of the system.
One attempt to overcome this limitation is as follows
@ft.partial(jx.jit, static_argnums=(0,))
def calc_jacfwd_trid(calc_f, y):
    # Determine the Jacobian (forward-mode) tridiagonal band
    def scan_body(carry, i):
        t, T = carry
        t = t.at[i-1].set(0.0)  # clear the previous unit direction before setting the new one
        t = t.at[i  ].set(1.0)
        f, dfy = jx.jvp(calc_f, (y,), (t,))
        T = T.at[2, i-1].set(dfy[i-1])
        T = T.at[1, i  ].set(dfy[i  ])
        T = T.at[0, i+1].set(dfy[i+1])
        return (t, T), None
    # Initialise
    m = y.size
    t = jnp.zeros_like(y)
    T = jnp.zeros((3, m), dtype=y.dtype)
    # Differentiate wrt y[0]
    t = t.at[0].set(1.0)
    f, dfy = jx.jvp(calc_f, (y,), (t,))
    idxs = jnp.array([1, 0]), jnp.array([0, 1])
    T = T.at[idxs].set(dfy[0:2])
    # Differentiate wrt y[1:-1]
    (t, T), empty = jlx.scan(scan_body, (t, T), jnp.arange(1, m-1))
    # Differentiate wrt y[-1]
    t = t.at[m-2:].set(jnp.array([0.0, 1.0]))
    f, dfy = jx.jvp(calc_f, (y,), (t,))
    idxs = jnp.array([2, 1]), jnp.array([m-2, m-1])
    T = T.at[idxs].set(dfy[-2:])
    return T
which permits
T = calc_jacfwd_trid(calc_f, y)
df = jrn.normal(jrn.PRNGKey(4), shape=y.shape)
dx = jlx.linalg.tridiagonal_solve(*T, df[:, None]).flatten()
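Not part of the original question, but a quick way to sanity-check the band against the dense Jacobian for a small system (the layout checks follow calc_f_base: row 0 is the subdiagonal, row 1 the main diagonal, row 2 the superdiagonal):

    m_small = 8
    T_small = make_T(m_small)
    f_small = ft.partial(calc_f_base, T=T_small)
    y_small = make_y(m_small)
    J_dense = jx.jacfwd(f_small)(y_small)  # fine at this size
    band = calc_jacfwd_trid(f_small, y_small)
    assert jnp.allclose(jnp.diag(J_dense),     band[1])
    assert jnp.allclose(jnp.diag(J_dense, -1), band[0][1:])
    assert jnp.allclose(jnp.diag(J_dense, +1), band[2][:-1])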
Is there a better approach and/or can the time complexity of calc_jacfwd_trid be reduced further?
EDIT
The following implementation is more compact, but run times are slightly slower
@ft.partial(jx.jit, static_argnums=(0,))
def calc_jacfwd_trid_map(calc_f, y):
    # Determine the Jacobian (forward-mode) tridiagonal band with lax map
    def map_body(i, t):
        # t arrives zeroed on every call, so only the current unit direction is set,
        # and it must be set before the jvp (the original listing had these lines reordered)
        t = t.at[i].set(1.0)
        f, dfy = jx.jvp(calc_f, (y,), (t,))
        im1 = jnp.where(i > 0, i-1, 0)
        Ti = jlx.dynamic_slice(dfy, (im1,), (3,))
        Ti = jnp.where(i > 0, Ti, jnp.roll(Ti, shift=+1))
        Ti = jnp.where(i < m-1, Ti, jnp.roll(Ti, shift=-1))
        return Ti
    # Initialise
    m = y.size
    t = jnp.zeros_like(y)
    # Differentiate wrt y[:]
    T = jlx.map(lambda i: map_body(i, t=t), jnp.arange(m))
    # Correct the orientation of T
    T = T.transpose()
    T = jnp.flip(T, axis=0)
    T = T.at[0, :].set(jnp.roll(T[0, :], shift=+1))
    T = T.at[2, :].set(jnp.roll(T[2, :], shift=-1))
    return T
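For rough comparisons between the two variants, something like the following works (timings are hardware-dependent; block_until_ready is needed because JAX dispatches asynchronously, and the warm-up call excludes compilation time):

    import time
    for fn in (calc_jacfwd_trid, calc_jacfwd_trid_map):
        fn(calc_f, y).block_until_ready()  # warm-up/compile
        tic = time.perf_counter()
        fn(calc_f, y).block_until_ready()
        print(fn.__name__, time.perf_counter() - tic)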

Logistic regression - strange behaviour of the decision boundary when additional parameters are added

I am trying to build a logistic regression model for a dataset consisting of two parameters $x_1$ and $x_2$, but instead of analyzing just the two of them, I have added their squares and product as well: $x_1^2$, $x_2^2$ and $x_1 x_2$.
At first glance everything looks fine and the error function is decreasing, but while drawing the plot of the decision boundary I noticed that after circa 500 iterations something strange happens to it.
Here is an animation of the error function as a function of iterations and a respective plot of the decision boundary:
Now, I interpret the decision boundary as a quadratic function $x_2 = f(x_1)$, where the relation between both parameters is given by:
$$0.5 = \theta_0 + \theta_1 x_1 + \theta_2 x_2 + \theta_3 x_1^2 + \theta_4 x_1 x_2 + \theta_5 x_2^2$$
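(Not spelled out in the original, but this is the algebra the animation code below implements: treating the boundary as a quadratic in $x_2$,
$$\theta_5 x_2^2 + (\theta_4 x_1 + \theta_2)\,x_2 + (\theta_3 x_1^2 + \theta_1 x_1 + \theta_0 - 0.5) = 0,$$
so the two branches plotted as res_1 and res_2 are
$$x_2 = \frac{-(\theta_4 x_1 + \theta_2) \pm \sqrt{(\theta_4 x_1 + \theta_2)^2 - 4\,\theta_5\,(\theta_3 x_1^2 + \theta_1 x_1 + \theta_0 - 0.5)}}{2\,\theta_5}.)$$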
Here is the Python code I use to do everything:
#!/usr/bin/python3
import numpy as np
import matplotlib.pyplot as plt
from math import log
from matplotlib.animation import FuncAnimation

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def loadData(filepath):
    source = ""
    try:
        f = open(filepath, "r")
        source = f.read()
        f.close()
    except IOError:
        print("Error while reading file (" + filepath + ")")
        return ""
    raw_data = source.split("\n")
    raw_data = [x.split(",") for x in raw_data if x != ""]
    raw_data = np.matrix(raw_data).astype(float)
    return (raw_data[:, :np.size(raw_data, 1)-1], raw_data[:, np.size(raw_data, 1)-1:])

def standardize(dataset, skipfirst=True):
    means = np.amin(dataset, 0)  # note: np.amin gives the column minimum, not the mean, despite the name
    deviation = np.std(dataset, 0)
    if skipfirst:
        dataset[:, 1:] -= means[:, 1:]
        dataset[:, 1:] /= deviation[:, 1:]
        return dataset
    else:
        dataset -= means
        dataset /= deviation
        return dataset

def error(X, Y, Theta):
    "Calculates error values"
    v_sigm = np.vectorize(sigmoid)
    h_x = X @ Theta
    sigmo = v_sigm(h_x)
    partial_vect = (Y - 1).T @ np.log(1 - sigmo) - Y.T @ np.log(sigmo)
    return 1 / (2 * np.size(Y, axis=0)) * np.sum(partial_vect)

def gradientStep(X, Y, Theta, LR):
    "Returns new theta values"
    v_sigm = np.vectorize(sigmoid)
    h_x = X @ Theta
    modif = -1 * LR / np.size(Y, 0) * (h_x - Y)
    sums = np.sum(modif.T @ X, axis=0)
    return Theta + sums.T
X, Y = loadData("ex2data1.txt")
# add bias to X
X = np.append(np.ones((np.size(X, 0), 1)), X, axis=1)
added_params = [[x[1]**2, x[1]*x[2], x[2]**2] for x in np.array(X)]
X = np.append(X, np.matrix(added_params), axis=1)
# standardize X
X = standardize(X)
# create vector of parameters
Theta = np.zeros((np.size(X, 1), 1))
iterations = 3000
Theta_vals = []
Error_vals = []
for i in range(0, iterations):
    Theta_vals.append(np.asarray(Theta).flatten())
    Error_vals.append(error(X, Y, Theta))
    Theta = gradientStep(X, Y, Theta, 0.07)
#CALCULATING FINISHES HERE
#plot data:
fig = plt.figure()
def_ax = fig.add_subplot(211)
def_ax.set_xlim(np.amin(X[:,1:2]), np.amax(X[:,1:2]))
def_ax.set_ylim(np.amin(X[:,2:3]), np.amax(X[:,2:3]))
err_ax = fig.add_subplot(212)
err_ax.set_ylim(0, error(X, Y, Theta))
err_ax.set_xlim(0, iterations)
positive_X1 = []
positive_X2 = []
negative_X1 = []
negative_X2 = []
for i in range(0, np.size(Y, 0)):
    if Y[i, 0] == 1:
        positive_X1.append(X[i, 1])
        positive_X2.append(X[i, 2])
    else:
        negative_X1.append(X[i, 1])
        negative_X2.append(X[i, 2])
err_ax.set_ylim(np.amin(Error_vals), np.amax(Error_vals))
def animation(frame):
    global Theta_vals, Error_vals, def_ax, err_ax, positive_X1, positive_X2, negative_X1, negative_X2
    def_limX = def_ax.get_xlim()
    def_limY = def_ax.get_ylim()
    err_limX = err_ax.get_xlim()
    err_limY = err_ax.get_ylim()
    def_ax.clear()
    err_ax.clear()
    def_ax.set_xlim(def_limX)
    def_ax.set_ylim(def_limY)
    err_ax.set_xlim(err_limX)
    err_ax.set_ylim(err_limY)
    def_ax.scatter(positive_X1, positive_X2, marker="^")
    def_ax.scatter(negative_X1, negative_X2, marker="o")
    Theta = Theta_vals[frame]
    res_x = np.linspace(*def_ax.get_xlim(), num=5)
    delta_x = [(Theta[4]*x + Theta[2])**2 - 4*Theta[5]*(Theta[3]*x**2 + Theta[1]*x + Theta[0] - 0.5) for x in res_x]
    delta_x = [np.sqrt(x) if x >= 0 else 0 for x in delta_x]
    minb = [-(Theta[4]*x + Theta[2]) for x in res_x]
    res_1 = []
    res_2 = []
    for i in range(0, len(res_x)):
        if Theta[5] == 0:
            res_1.append(0)
            res_2.append(0)
        else:
            res_1.append((minb[i] + delta_x[i]) / (2*Theta[5]))
            res_2.append((minb[i] - delta_x[i]) / (2*Theta[5]))  # the original had a stray "-+" here
    def_ax.plot(res_x, res_1)
    def_ax.plot(res_x, res_2)
    err_x = np.linspace(0, frame, frame)
    err_y = Error_vals[0:frame]
    err_ax.plot(err_x, err_y)
anim = FuncAnimation(fig, animation, frames=iterations, interval=3, repeat_delay=2000)
print(error(X, Y, Theta))
anim.save("anim.mp4")
What could be the reason for such strange behaviour?

FCBF Python feature selection technique

I want to use the FCBF technique from GitHub: https://github.com/shiralkarprashant/FCBF
The problem I faced is that I am working on Python 3 and the module is implemented for Python 2 users. I got an error saying that the name 'xrange' is not defined, because xrange no longer exists in Python 3.
I thought I could solve the issue just by changing xrange to range:
from FCBF_module import FCBF, FCBFK, FCBFiP, get_i
from sklearn.datasets import load_digits
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import time
from sklearn.grid_search import GridSearchCV

classifiers = [('DecisionTree', DecisionTreeClassifier(), {'max_depth': [5, 10, 15]}),
               ('LogisticRegression', LogisticRegression(), {'C': [0.1, 1, 10]})]

n_features = dataCAD.shape[1]
npieces = get_i(n_features)
The module code contains just one xrange occurrence; I tried changing it to range, but that did not solve the problem:
# -*- coding: utf-8 -*-
import numpy as np

def count_vals(x):
    vals = np.unique(x)
    occ = np.zeros(shape=vals.shape)
    for i in range(vals.size):
        occ[i] = np.sum(x == vals[i])
    return occ

def entropy(x):
    n = float(x.shape[0])
    ocurrence = count_vals(x)
    px = ocurrence / n
    return -1 * np.sum(px * np.log2(px))

def symmetricalUncertain(x, y):
    n = float(y.shape[0])
    vals = np.unique(y)
    # Computing Entropy for the feature x.
    Hx = entropy(x)
    # Computing Entropy for the feature y.
    Hy = entropy(y)
    # Computing Joint entropy between x and y.
    partial = np.zeros(shape=(vals.shape[0]))
    for i in range(vals.shape[0]):
        partial[i] = entropy(x[y == vals[i]])
    partial[np.isnan(partial) == 1] = 0
    py = count_vals(y).astype(dtype='float64') / n
    Hxy = np.sum(py[py > 0] * partial)
    IG = Hx - Hxy
    return 2 * IG / (Hx + Hy)

def suGroup(x, n):
    m = x.shape[0]
    x = np.reshape(x, (n, m // n)).T  # integer division: m / n is a float on Python 3
    m = x.shape[1]
    SU_matrix = np.zeros(shape=(m, m))
    for j in range(m - 1):
        x2 = x[:, j + 1::]
        y = x[:, j]
        temp = np.apply_along_axis(symmetricalUncertain, 0, x2, y)
        for k in range(temp.shape[0]):
            SU_matrix[j, j + 1::] = temp
            SU_matrix[j + 1::, j] = temp
    return 1 / float(m - 1) * np.sum(SU_matrix, axis=1)

def isprime(a):
    return all(a % i for i in range(2, a))  # was xrange: the one occurrence mentioned above

"""
get
"""
def get_i(a):
    if isprime(a):
        a -= 1
    # list(...) because filter returns a lazy iterator on Python 3
    return list(filter(lambda x: a % x == 0, range(2, a)))

"""
FCBF - Fast Correlation Based Filter
L. Yu and H. Liu. Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution.
In Proceedings of The Twentieth International Conference on Machine Learning (ICML-03), 856-863.
Washington, D.C., August 21-24, 2003.
"""
class FCBF:
    idx_sel = []

    def __init__(self, th=0.01):
        '''
        Parameters
        ---------------
        th = The initial threshold
        '''
        self.th = th

    def fit(self, x, y):
        '''
        This function executes the FCBF algorithm and saves the indexes
        of the selected features in self.idx_sel
        Parameters
        ---------------
        x = dataset [NxM]
        y = label [Nx1]
        '''
        self.idx_sel = []
        """
        First Stage: Computing the SU for each feature with the response.
        """
        SU_vec = np.apply_along_axis(symmetricalUncertain, 0, x, y)
        SU_list = SU_vec[SU_vec > self.th]
        SU_list[::-1].sort()
        m = x[:, SU_vec > self.th].shape
        x_sorted = np.zeros(shape=m)
        for i in range(m[1]):
            ind = np.argmax(SU_vec)
            SU_vec[ind] = 0
            x_sorted[:, i] = x[:, ind].copy()
            self.idx_sel.append(ind)
        """
        Second Stage: Identify relationships between features to remove redundancy.
        """
        j = 0
        while True:
            """
            Stopping Criteria: the search finishes
            """
            if j >= x_sorted.shape[1]:
                break
            y = x_sorted[:, j].copy()
            x_list = x_sorted[:, j + 1:].copy()
            if x_list.shape[1] == 0:
                break
            SU_list_2 = SU_list[j + 1:]
            SU_x = np.apply_along_axis(symmetricalUncertain, 0,
                                       x_list, y)
            comp_SU = SU_x >= SU_list_2
            to_remove = np.where(comp_SU)[0] + j + 1
            if to_remove.size > 0:
                x_sorted = np.delete(x_sorted, to_remove, axis=1)
                SU_list = np.delete(SU_list, to_remove, axis=0)
                to_remove.sort()
                for r in reversed(to_remove):
                    self.idx_sel.remove(self.idx_sel[r])
            j = j + 1

    def fit_transform(self, x, y):
        '''
        This function fits the feature selection
        algorithm and returns the resulting subset.
        Parameters
        ---------------
        x = dataset [NxM]
        y = label [Nx1]
        '''
        self.fit(x, y)
        return x[:, self.idx_sel]

    def transform(self, x):
        '''
        This function applies the selection
        to the vector x.
        Parameters
        ---------------
        x = dataset [NxM]
        '''
        return x[:, self.idx_sel]

"""
FCBF# - Fast Correlation Based Filter
B. Senliol, G. Gulgezen, et al. Fast Correlation Based Filter (FCBF) with a Different Search Strategy.
In Computer and Information Sciences (ISCIS '08) 23rd International Symposium on, pages 1-4.
Istanbul, October 27-29, 2008.
"""
class FCBFK(FCBF):
    idx_sel = []

    def __init__(self, k=10):
        '''
        Parameters
        ---------------
        k = Number of features to include in the subset.
        '''
        self.k = k

    def fit(self, x, y):
        '''
        This function executes the FCBFK algorithm and saves the indexes
        of the selected features in self.idx_sel
        Parameters
        ---------------
        x = dataset [NxM]
        y = label [Nx1]
        '''
        self.idx_sel = []
        """
        First Stage: Computing the SU for each feature with the response.
        """
        SU_vec = np.apply_along_axis(symmetricalUncertain, 0, x, y)
        SU_list = SU_vec[SU_vec > 0]
        SU_list[::-1].sort()
        m = x[:, SU_vec > 0].shape
        x_sorted = np.zeros(shape=m)
        for i in range(m[1]):
            ind = np.argmax(SU_vec)
            SU_vec[ind] = 0
            x_sorted[:, i] = x[:, ind].copy()
            self.idx_sel.append(ind)
        """
        Second Stage: Identify relationships between features to remove redundancy,
        with stopping criteria (features in x_best == k).
        """
        j = 0
        while True:
            y = x_sorted[:, j].copy()
            SU_list_2 = SU_list[j + 1:]
            x_list = x_sorted[:, j + 1:].copy()
            """
            Stopping Criteria: the search finishes
            """
            if x_list.shape[1] == 0:
                break
            SU_x = np.apply_along_axis(symmetricalUncertain, 0,
                                       x_list, y)
            comp_SU = SU_x >= SU_list_2
            to_remove = np.where(comp_SU)[0] + j + 1
            if to_remove.size > 0 and x.shape[1] > self.k:
                for i in reversed(to_remove):
                    x_sorted = np.delete(x_sorted, i, axis=1)
                    SU_list = np.delete(SU_list, i, axis=0)
                    self.idx_sel.remove(self.idx_sel[i])
                    if x_sorted.shape[1] == self.k:
                        break
            if x_list.shape[1] == 1 or x_sorted.shape[1] == self.k:
                break
            j = j + 1
        if len(self.idx_sel) > self.k:
            self.idx_sel = self.idx_sel[:self.k]

"""
FCBFiP - Fast Correlation Based Filter in Pieces
"""
class FCBFiP(FCBF):
    idx_sel = []

    def __init__(self, k=10, npieces=2):
        '''
        Parameters
        ---------------
        k = Number of features to include in the subset.
        npieces = Number of pieces to divide the feature space into.
        '''
        self.k = k
        self.npieces = npieces

    def fit(self, x, y):
        '''
        This function executes the FCBFiP algorithm and saves the indexes
        of the selected features in self.idx_sel
        Parameters
        ---------------
        x = dataset [NxM]
        y = label [Nx1]
        '''
        """
        First Stage: Computing the SU for each feature with the response. We sort the
        features. When we have a prime number of features we remove the last one from the
        sorted features list.
        """
        m = x.shape
        nfeaturesPieces = int(m[1] / float(self.npieces))
        SU_vec = np.apply_along_axis(symmetricalUncertain, 0, x, y)
        x_sorted = np.zeros(shape=m, dtype='float64')
        idx_sorted = np.zeros(shape=m[1], dtype='int64')
        for i in range(m[1]):
            ind = np.argmax(SU_vec)
            SU_vec[ind] = -1
            idx_sorted[i] = ind
            x_sorted[:, i] = x[:, ind].copy()
        if isprime(m[1]):
            x_sorted = np.delete(x_sorted, m[1] - 1, axis=1)
            ind_prime = idx_sorted[m[1] - 1]
            idx_sorted = np.delete(idx_sorted, m[1] - 1)
            # m = x_sorted.shape
        """
        Second Stage: Identify relationships between features in their vicinity
        to remove redundancy, with stopping criteria (features in x_best == k).
        """
        x_2d = np.reshape(x_sorted.T, (self.npieces, nfeaturesPieces * m[0])).T
        SU_x = np.apply_along_axis(suGroup, 0, x_2d, nfeaturesPieces)
        SU_x = np.reshape(SU_x.T, (self.npieces * nfeaturesPieces,))
        idx_sorted2 = np.zeros(shape=idx_sorted.shape, dtype='int64')
        SU_x[np.isnan(SU_x)] = 1
        for i in range(idx_sorted.shape[0]):
            ind = np.argmin(SU_x)
            idx_sorted2[i] = idx_sorted[ind]
            SU_x[ind] = 10
        """
        Scoring step
        """
        self.scores = np.zeros(shape=m[1], dtype='int64')
        for i in range(m[1]):
            if i in idx_sorted:
                self.scores[i] = np.argwhere(i == idx_sorted) + np.argwhere(i == idx_sorted2)
        if isprime(m[1]):
            self.scores[ind_prime] = 2 * m[1]
        self.set_k(self.k)

    def set_k(self, k):
        self.k = k
        scores_temp = -1 * self.scores
        self.idx_sel = np.zeros(shape=self.k, dtype='int64')
        for i in range(self.k):
            ind = np.argmax(scores_temp)
            scores_temp[ind] = -100000000
            self.idx_sel[i] = ind
Try using the 2to3 tool to convert the files to Python 3 automatically. Worked for me!
https://docs.python.org/2/library/2to3.html
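For example, assuming the module file is called FCBF_module.py, from a shell:

    2to3 -w FCBF_module.py

The -w flag rewrites the file in place with the Python 3 conversions applied (a backup copy is kept by default).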

Python- name not defined [duplicate]

This question already has answers here:
Why doesn't calling a string method (such as .replace or .strip) modify (mutate) the string?
(3 answers)
Closed 7 years ago.
I am having trouble getting my code to run. I keep getting the error that an x variable such as 'hsGPA' is not defined. Below is my code. I've tried the solutions posted in the other thread and none have helped, so please don't mark this as a duplicate. THANKS!
from matplotlib import pyplot  # import assumed; the snippet uses pyplot throughout

def readData(fileName):
    hsGPA = []       # High School GPA
    mathSAT = []     # Math SAT scores
    crSAT = []       # Verbal SAT scores
    collegeGPA = []  # College GPA
    FullList = []
    inputFile = open(fileName, 'r', encoding='utf-8')
    for line in inputFile:
        FullList = line.split(',')
        hsGPA.append(float(FullList[0]))
        mathSAT.append(int(FullList[1]))
        crSAT.append(int(FullList[2]))
        collegeGPA.append(float(FullList[3]))
    return hsGPA, mathSAT, crSAT, collegeGPA

def plotData(hsGPA, mathSAT, crSAT, collegeGPA):
    GPA1 = []    # High School GPA
    Score1 = []  # Math SAT scores
    Score2 = []  # Verbal SAT scores
    GPA2 = []    # College GPA
    hsGPA, mathGPA, crSAT, collegeGPA = readData('SAT.txt')
    pyplot.figure(1)
    pyplot.subplot(4,1,1)
    for line in range(len(hsGPA)):
        GPA1.append(line)
    pyplot.plot(GPA1, hsGPA)
    pyplot.subplot(4,1,2)
    for line in range(len(mathSAT)):
        Score1.append(line)
    pyplot.plot(Score1, mathSAT)
    pyplot.subplot(4,1,3)
    for line in range(len(crSAT)):
        Score2.append(line)
    pyplot.plot(Score2, crSAT)
    pyplot.subplot(4,1,4)
    for line in range(len(collegeGPA)):
        GPA2.append(line)
    pyplot.plot(GPA2, collegeGPA)
    pyplot.show()

def LinearRegression(xList, yList):
    '''
    This function finds the constants in the y = mx+b, or linear regression,
    formula
    xList - a list of the x values
    yList - a list of the y values
    m - the slope of the line
    b - where the line intercepts the y axis
    '''
    n = len(xList)
    sumX = 0
    sumXX = 0
    sumXY = 0
    sumY = 0
    for index in range(n):
        sumX += xList[index]
        sumXY += xList[index] * yList[index]
        sumXX += xList[index]**2
        sumY += yList[index]
    # the components needed to find m and b
    m = (n*(sumXY - (sumX*sumY)))/(n*(sumXX - (sumX**2)))
    b = (sumY - (m*sumX))/n
    # actually implements the formula
    return m, b

def plotRegression(x, y, xLabel, yLabel):
    ScoreT = []
    pyplot.scatter(x, y)
    m, b = linearRegression(xList, yList)
    minX = min(x)
    maxX = max(x)
    pyplot.plot([minX, maxX], [m * minX + b, m * maxX + b], color='red')
    pyplot.xlabel(xLabel)
    pyplot.ylabel(yLabel)
    pyplot.show()
    for index in range(len(mathSAT)):
        sumscore = mathSAT[index] + crSAT[index]
        ScoreT.append(sumscore)
    return ScoreT

def rSquared(x, y, m, b):
    n = len(x)
    R = 0
    sumS = 0
    sumT = 0
    sumY = 0
    for index in range(n):
        a = (y[index]-((m*x[index])+b))**2
        sumS = sumS + a
    for index in range(len(y)):
        sumY = sumY + y[index]
        MeanY = sumY/(len(y))
        e = (y[index]-MeanY)**2
        sumT = sumT + e
    m, b = LinearRegression(xList, yList)
    RG = 1-(sumS/sumT)

def main():
    print(readData('SAT.txt'))
    plotData(*readData('SAT.txt'))
    plotRegression(hsGPA, collegeGPA, 'highGPA', 'collegeGPA')
    plotRegression(mathSAT, collegeGPA, 'highGPA', 'collegeGPA')
    plotRegression(crSAT, collegeGPA, 'highGPA', 'collegeGPA')
    plotRegression(ScoreT, collegeGPA, 'highGPA', 'collegeGPA')

main()
It's giving the error in main, after plotRegression, for each of the x variables. Please help! Thanks!
Try this:
def plotRegression(x, y, xLabel, yLabel):
    # I deleted ScoreT = [] here
    pyplot.scatter(x, y)
    m, b = LinearRegression(x, y)  # note: matches the defined name LinearRegression
    minX = min(x)
    maxX = max(x)
    pyplot.plot([minX, maxX], [m * minX + b, m * maxX + b], color='red')
    pyplot.xlabel(xLabel)
    pyplot.ylabel(yLabel)
    pyplot.show()
    # I deleted the loop and return statement here

# ....

def main():
    data = readData('SAT.txt')
    print(data)
    plotData(*data)
    hsGPA, mathSAT, crSAT, collegeGPA = data
    # added the ScoreT calculation here
    ScoreT = [sum(x) for x in zip(mathSAT, crSAT)]
    plotRegression(hsGPA, collegeGPA, 'highGPA', 'collegeGPA')
    plotRegression(mathSAT, collegeGPA, 'highGPA', 'collegeGPA')
    plotRegression(crSAT, collegeGPA, 'highGPA', 'collegeGPA')
    plotRegression(ScoreT, collegeGPA, 'highGPA', 'collegeGPA')
In your main(), hsGPA is never defined. It's defined inside another function and is not shared in the global context, so main cannot access it.
You need to get it from readData()'s return value, as shown above.

Python Scatter plot

Got this question from the 'How to Think Like a Computer Scientist' course:
Interpret the data file labdata.txt such that each line contains an x,y coordinate pair. Write a function called plotRegression that reads the data from this file and uses a turtle to plot those points and a best-fit line according to the following formulas:
$$y = \bar{y} + m\,(x - \bar{x})$$
$$m = \frac{\sum x_i y_i - n\,\bar{x}\,\bar{y}}{\sum x_i^2 - n\,\bar{x}^2}$$
http://interactivepython.org/runestone/static/thinkcspy/Files/Exercises.html?lastPosition=1308
My code doesn't seem to be working and I can't figure out why. It looks like Python is interpreting the data as str as opposed to float.
def plotregression(t):
    labfile = open('labdata.txt','r')
    sumx = 0
    sumy = 0
    count = 0
    sumprod = 0
    sumsqrx = 0
    sumsqrnx = 0
    for i in labfile:
        points = i.split()
        print (points)
        t.up()
        t.setpos(points[0],points[1])
        t.stamp()
        sumx = sumx + int(points[0])
        sumy = sumy + int(points[1])
        prod = points[0]*int(points[1])
        sumprod = sumprod + prod
        count += 1
        sqrx = int(points[0])**2
        sumsqrx = sumsqrx + sqrx
        sqrnx = int(points[0])**(-2)
        sumsqrnx = sumsqrnx + sqrnx
    avgx = sumx/count
    avgy = sumy/count
    m = (sumprod - count(avgx*avgy))/sumsqrx- (count(avgx**2))
    print(m)
    for bestline in labfile:
        line = bestline.split()
        y = avgy + m(int(line[0])-avgx)
        t.down()
        t.setpos(0,0)
        t.setpos(line[0],y)

plotregression(kj)
Appreciate your help. Thanks!
I actually worked out the problem myself, and it finally seems to be doing what I'm telling it to. But I would love to know if I can cut out any unnecessary lines of code. I'm thinking it's a bit too long and I'm missing something that would make this simpler to do.
import turtle

wn = turtle.Screen()
kj = turtle.Turtle()
kj.shape('circle')
kj.turtlesize(0.2)
kj.color('blue')
kj.speed(1)

def plotregression(t):
    sumx = 0
    sumy = 0
    count = 0
    sumprod = 0
    sumsqrx = 0
    labfile = open('labdata.txt','r')
    for i in labfile:
        points = i.split()
        print (points)
        t.up()
        t.setpos(int(points[0]),int(points[1]))
        t.stamp()
        sumx = sumx + int(points[0])
        sumy = sumy + int(points[1])
        prod = int(points[0])*int(points[1])
        sumprod = sumprod + prod
        count += 1
        sqrx = int(points[0])**2
        sumsqrx = sumsqrx + sqrx
    avgx = sumx/count
    avgy = sumy/count
    m = (sumprod - count*(avgx*avgy))/(sumsqrx - (count*(avgx**2)))
    print('M is: ', m)
    labfile.close()
    labfile = open('labdata.txt','r')
    besttfit = open('bestfit.txt','w')
    for bestline in labfile:
        line = bestline.split()
        y = avgy + m*(int(line[0])-avgx)
        print('y is:', y)
        besttfit.write((line[0])+'\t'+str(y)+'\n')
    labfile.close()
    besttfit.close()
    bestfitline = open('bestfit.txt','r')
    for regline in bestfitline:
        reg = regline.split()
        t.goto(float(reg[0]),float(reg[1]))
        t.down()
    t.write('Best fit line')
    bestfitline.close()

wn.setworldcoordinates(-10,-10,120,120)
figure = plotregression(kj)
wn.exitonclick()
Please let me know if I can cut down anywhere.
I was solving the same problem from the interactive Python course. Here is how I did it.
import turtle

def plotRegression(data):
    win = turtle.Screen()
    win.bgcolor('pink')
    t = turtle.Turtle()
    t.shape('circle')
    t.turtlesize(0.2)
    # use the data parameter (the original read the global plot_data here)
    x_list, y_list = [i[0] for i in data], [i[1] for i in data]
    x_list, y_list = [float(i) for i in x_list], [float(i) for i in y_list]
    x_sum, y_sum = sum(x_list), sum(y_list)
    x_bar, y_bar = x_sum / len(x_list), y_sum / len(y_list)
    x_list_square = [i ** 2 for i in x_list]
    x_list_square_sum = sum(x_list_square)
    xy_list = [x_list[i] * y_list[i] for i in range(len(x_list))]
    xy_list_sum = sum(xy_list)
    m = (xy_list_sum - len(x_list) * x_bar * y_bar) / (x_list_square_sum - len(x_list) * x_bar ** 2)
    # best y
    y_best = [(y_bar + m * (x_list[i] - x_bar)) for i in range(len(x_list))]
    # plot points
    max_x = max(x_list)
    max_y = max(y_list)
    win.setworldcoordinates(0, 0, max_x, max_y)
    for i in range(len(x_list)):
        t.penup()
        t.setposition(x_list[i], y_list[i])
        t.stamp()
    # plot best y
    t.penup()
    t.setposition(0, 0)
    t.color('blue')
    for i in range(len(x_list)):
        t.setposition(x_list[i], y_best[i])
        t.pendown()
    win.exitonclick()

with open('files/labdata.txt', 'r') as f:
    plot_data = [aline.split() for aline in f]

plotRegression(plot_data)
I am about 5 years too late, but here are my two cents.
The problem might be in the line:
t.setpos(points[0],points[1])
This is telling the turtle to go to the string values of points[0] and points[1]. For example, if points[0] holds "50" and points[1] holds "60", the coordinates are passed as text rather than numbers, and turtle's position arithmetic fails on strings.
This line might have problems as well:
prod = points[0]*int(points[1])
This multiplies the string in points[0] by the integer in points[1]. With the previous values, points[0] would be "50" and int(points[1]) would be 60, that is, 60 and not "60". In Python, "50" * 60 is string repetition, so prod ends up a long string, and the following sumprod + prod then fails because you can't add an integer and a string.
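A minimal patch for both spots (not in the original answer, just the direct fix) is to convert the fields to numbers right after splitting:

    x, y = float(points[0]), float(points[1])
    t.setpos(x, y)
    prod = x * y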
Here is how I worked out the problem:
import turtle
import math
import statistics as stats

def get_line(means, slope, xlist):
    """Return a list of best y values."""
    # note: the formula is y = ybar + m*(x - xbar); the original listing had a "+" before means[0]
    line = [(means[1] + slope * (xlist[x] - means[0]))
            for x in range(len(xlist))]
    return line

def get_mtop(xlist, ylist, n, means):
    """Return top half of m expression."""
    xbyy_list = [xlist[x] * ylist[x] for x in range(len(xlist))]
    xbyy_sum = sum(xbyy_list)
    nby_means = n * (means[0] * means[1])
    top = xbyy_sum - nby_means
    return top

def get_mbot(xlist, n, means):
    """Return bottom half of m expression."""
    sqr_comprehension = [x**2 for x in xlist]
    sqr_sum = sum(sqr_comprehension)
    nbymean_sqr = n * means[0]**2
    bot = sqr_sum - nbymean_sqr
    return bot

def get_mean(xlist, ylist):
    """Return a tuple that contains the means of xlist and ylist
    in the form (xmean, ymean)."""
    xmean = stats.mean(xlist)
    ymean = stats.mean(ylist)
    return xmean, ymean

def plotRegression(input_file, input_turtle):
    """Draw the plot regression."""
    infile = open(input_file, 'r')
    input_turtle.shape("circle")
    input_turtle.penup()
    # Get a list of xcoor and a list of ycoor
    xcoor = []
    ycoor = []
    for line in infile:
        coor = line.split()
        xcoor.append(int(coor[0]))
        ycoor.append(int(coor[1]))
    # Plot and count the points
    num_points = 0
    for count in range(len(xcoor)):
        input_turtle.goto(xcoor[count], ycoor[count])
        input_turtle.stamp()
        num_points += 1
    # Get the mean values of the xcoor and ycoor lists
    means_tup = get_mean(xcoor, ycoor)
    print(means_tup)
    # Get the value for M
    mtop = get_mtop(xcoor, ycoor, num_points, means_tup)
    mbot = get_mbot(xcoor, num_points, means_tup)
    m = mtop / mbot
    print(m)
    # Draw the line
    yline = get_line(means_tup, m, xcoor)
    input_turtle.color("green")
    input_turtle.goto(xcoor[0], yline[0])
    input_turtle.pendown()
    for x in range(len(xcoor)):
        print(xcoor[x], yline[x])
        input_turtle.goto(xcoor[x], yline[x])
    input_turtle.hideturtle()

def main():
    """Create the canvas and the turtle. Call the function(s)."""
    # Set up the screen
    sc = turtle.Screen()
    sc.setworldcoordinates(0, 0, 100, 100)
    sc.bgcolor("black")
    # Create the turtle
    Donatello = turtle.Turtle()
    Donatello.color("purple")
    # Run plot Regression
    labdata = """C:\\Users\\user\\pathtofile\\labdata.txt"""
    plotRegression(labdata, Donatello)
    sc.exitonclick()

if __name__ == "__main__":
    main()
I don't know if this is the correct slope, but it seems to be in the right direction. Hopefully this helps someone who has the same problem.
