Making linear regression more compact (python) - python

I'm trying to fit a linear expression to a dataset. I have plotted the data and the regression, but my code is not very efficient. Is there any way to make it more compact?
import numpy as np
import matplotlib.pyplot as plt
temp1, tid0 = np.genfromtxt("forsok1.txt", dtype=float, skip_header=41, usecols = (1,2)).T
tid1 = tid0 - 200
temp2, tid2 = np.genfromtxt("forsok2.txt", dtype=float, skip_header=1, usecols = (1,2)).T
temp3, tid3 = np.genfromtxt("forsok3.txt", dtype=float, skip_header=1, usecols = (1,2)).T
tempreg1_1 = np.zeros(88)
tidreg1_1 = np.zeros(88)
for i in range(0, 88):
tempreg1_1[i] = temp1[i]
tidreg1_1[i] = tid1[i]
tempreg2_1 = np.zeros(65)
tidreg2_1 = np.zeros(65)
tempreg3_1 = np.zeros(65)
tidreg3_1 = np.zeros(65)
for i in range(0, 65):
tempreg2_1[i] = temp2[i]
tidreg2_1[i] = tid2[i]
tempreg3_1[i] = temp3[i]
tidreg3_1[i] = tid3[i]
tempreg1_2 = np.zeros(59)
tidreg1_2 = np.zeros(59)
for i in range(0, 59):
tempreg1_2[i] = temp1[i+112]
tidreg1_2[i] = tid1[i+112]
tempreg2_2 = np.zeros(76)
tidreg2_2 = np.zeros(76)
for i in range(0, 76):
tempreg2_2[i] = temp2[i+93]
tidreg2_2[i] = tid2[i+93]
tempreg3_2 = np.zeros(55)
tidreg3_2 = np.zeros(55)
for i in range(0,55):
tempreg3_2[i] = temp3[i+100]
tidreg3_2[i] = tid3[i+100]
tempreg1_3 = np.zeros(76)
tidreg1_3 = np.zeros(76)
for i in range(0, 76):
tempreg1_3[i] = temp1[i+210]
tidreg1_3[i] = tid1[i+210]
tempreg2_3 = np.zeros(80)
tidreg2_3 = np.zeros(80)
for i in range(0, 80):
tempreg2_3[i] = temp2[i+207]
tidreg2_3[i] = tid2[i+207]
tempreg3_3 = np.zeros(91)
tidreg3_3 = np.zeros(91)
for i in range(0,91):
tempreg3_3[i] = temp3[i+181]
tidreg3_3[i] = tid3[i+181]
R1_1, b1_1 = np.polyfit(tidreg1_1, tempreg1_1, 1)
R2_1, b2_1 = np.polyfit(tidreg2_1, tempreg2_1, 1)
R3_1, b3_1 = np.polyfit(tidreg3_1, tempreg3_1, 1)
R1_2, b1_2 = np.polyfit(tidreg1_2, tempreg1_2, 1)
R2_2, b2_2 = np.polyfit(tidreg2_2, tempreg2_2, 1)
R3_2, b3_2 = np.polyfit(tidreg3_2, tempreg3_2, 1)
R1_3, b1_3 = np.polyfit(tidreg1_3, tempreg1_3, 1)
R2_3, b2_3 = np.polyfit(tidreg2_3, tempreg2_3, 1)
R3_3, b3_3 = np.polyfit(tidreg3_3, tempreg3_3, 1)
tempreg1_1[0] = b1_1
tempreg2_1[0] = b2_1
tempreg3_1[0] = b3_1
for j in range(1, 88):
tempreg1_1[j] = tempreg1_1[j-1] + 5*R1_1
for j in range(1, 65):
tempreg2_1[j] = tempreg2_1[j-1] + 5*R2_1
tempreg3_1[j] = tempreg3_1[j-1] + 5*R3_1
tempreg1_2[0] = b1_2 + 560*R1_2
tempreg2_2[0] = b2_2 + 465*R2_2
tempreg3_2[0] = b3_2 + 500*R3_2
for j in range(1, 59):
tempreg1_2[j] = tempreg1_2[j-1] + 5*R1_2
for j in range(1, 76):
tempreg2_2[j] = tempreg2_2[j-1] + 5*R2_2
for j in range(1, 55):
tempreg3_2[j] = tempreg3_2[j-1] + 5*R3_2
tempreg1_3[0] = b1_3 + 1050*R1_3
tempreg2_3[0] = b2_3 + 1035*R2_3
tempreg3_3[0] = b3_3 + 905*R3_3
for j in range(1, 76):
tempreg1_3[j] = tempreg1_3[j-1] + 5*R1_3
for j in range(1, 80):
tempreg2_3[j] = tempreg2_3[j-1] + 5*R2_3
for j in range(1, 91):
tempreg3_3[j] = tempreg3_3[j-1] + 5*R3_3
plt.figure()
ax1 = plt.subplot(311)
ax2 = plt.subplot(312)
ax3 = plt.subplot(313)
ax1.plot(tid1, temp1, ':', color="g")
ax1.plot(tidreg1_1, tempreg1_1, '-.',color="b")
ax1.plot(tidreg1_2, tempreg1_2, '-.',color="b")
ax1.plot(tidreg1_3, tempreg1_3, '-.',color="b")
ax2.plot(tid2, temp2, ':', color="g")
ax2.plot(tidreg2_1, tempreg2_1, '-.',color="b")
ax2.plot(tidreg2_2, tempreg2_2, '-.',color="b")
ax2.plot(tidreg2_3, tempreg2_3, '-.',color="b")
ax3.plot(tid3, temp3, ':', color="g")
ax3.plot(tidreg3_1, tempreg3_1, '-.',color="b")
ax3.plot(tidreg3_2, tempreg3_2, '-.',color="b")
ax3.plot(tidreg3_3, tempreg3_3, '-.',color="b")
The code I have used makes arrays from small parts of the dataset, then fits a linear regression to each of those arrays. Each regression is then written into another array, which is plotted in the subplots. This is done for three different data plots.
I have tried to make it more compact but haven't found a function to use. Thanks for the help, and sorry for my bad English.

This:
tempreg1_1 = np.zeros(88)
tidreg1_1 = np.zeros(88)
for i in range(0, 88):
tempreg1_1[i] = temp1[i]
tidreg1_1[i] = tid1[i]
Is the same as this:
tempreg1_1 = temp1[:88]
tidreg1_1 = tid1[:88]
So you may not even need to make those arrays, since you can potentially just use the 'slices' directly.
In general, you rarely need to pre-create an empty array then fill it with a loop. If you find yourself doing this in NumPy, there's almost certainly a better way.

You don't have to do all of this explicitly; you can loop over these nearly identical operations. Here's a simplified case. Your code has a lot of variables, so I use some simpler names:
# Read the data first (omitted here).
plt.figure()
ax1 = plt.subplot(311)
ax2 = plt.subplot(312)
ax3 = plt.subplot(313)
plots = [ax1, ax2, ax3]
for subplot in plots:
    # Select the tidreg/tempreg data that belongs to this subplot here.
    # tidreg, tempreg1 and tempreg2 are placeholders for your own arrays.
    xCordinate = tidreg  # should be your tidreg
    y1 = tempreg1
    y2 = tempreg2
    # np.polyfit returns the coefficients; np.poly1d turns them into a
    # callable so the fitted line can be evaluated directly on the x values.
    regression1 = np.poly1d(np.polyfit(xCordinate, y1, 1))
    regression2 = np.poly1d(np.polyfit(xCordinate, y2, 1))
    subplot.plot(xCordinate, regression1(xCordinate), 'b-')
    subplot.plot(xCordinate, regression2(xCordinate), 'b-')
plt.show()
Each iteration of the for loop corresponds to one subplot, so you only need to operate on the data used in that subplot. The variables are reassigned on every iteration, so you also don't have to create so many of them. In theory, that could cut roughly two thirds of the work and save a lot of memory.
For indexing or slicing arrays, you can refer to this question and this NumPy manual.

Related

Scipy Curve_fit gives a rather weird fit

Dear Python programmers,
I am currently working with curve_fit from scipy in order to find out how the x and y data correlate with each other. However, the fit looks really weird even when I fit a simple linear formula to it. I've tried converting the input to a NumPy array in the fit function: def func(x, a, b, c): "Fit functie" return a * np.asarray(x) + b, but it still gives me a graph that looks as if a 3-year-old scribbled on it with a red pencil.
One thing I did do is sort the values of massflows and rms_smote from low to high, which you can see just above the def func(x, a, b, c) part, because curve_fit was giving me an equally scribbled fit, as if someone were sketching, when the values were unsorted. I don't know whether curve_fit treats data differently depending on whether it is sorted.
If you need any more information, let me know :) Any suggestion is welcome!
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.optimize import curve_fit
from scipy.stats import linregress
data_15 = pd.read_csv(r"C:\Users\Thomas\Documents\Pythondata\2022-01-15_SMOTERapport.csv", header= 0, sep=';', decimal=',')
data_06 = pd.read_csv(r"C:\Users\Thomas\Documents\Pythondata\2022-02-06_SMOTERapport.csv", header= 0, sep=';', decimal=',')
data_10 = pd.read_csv(r"C:\Users\Thomas\Documents\Pythondata\2022-02-10_SMOTERapport.csv", header= 0, sep=';', decimal=',')
speed_15 = data_15['SPEED_ACT']
speed_06 = data_06['SPEED_ACT']
speed_10 = data_10['SPEED_ACT']
"Data filter 01_15"
filter = [i for i, e in enumerate(speed_15) if e >= 80]
s_15 = pd.DataFrame(data_15)
speed15 = s_15.filter(items = filter, axis=0)
speed15.reset_index(drop=True, inplace=True)
temp15 = speed15['TP_SMOTE']
foutmetingen2 = [i for i, e in enumerate(temp15) if e < 180]
speed15 = speed15.drop(foutmetingen2)
tp_strip15 = speed15['TP_AMBIENT']
tp_target15 = speed15['TP_TARGET']
tp_smote15 = speed15['TP_SMOTE']
v_15 = speed15['SPEED_ACT']
width15 = speed15['STRIP_WIDTH']
thickness15 = speed15['STRIP_THICKNESS']
power15 = speed15['POWER_INVERTER_PRE']
voltage15 = speed15['VOLTAGE_INVERTER_PRE']
"Data filter 02_06"
filter = [i for i, e in enumerate(speed_06) if e >= 80]
s_06 = pd.DataFrame(data_06)
speed06 = s_06.filter(items = filter, axis=0)
speed06.reset_index(drop=True, inplace=True)
temp06 = speed06['TP_SMOTE']
foutmetingen2 = [i for i, e in enumerate(temp06) if e < 180]
speed06 = speed06.drop(foutmetingen2)
tp_strip06 = speed06['TP_AMBIENT']
tp_target06 = speed06['TP_TARGET']
tp_smote06 = speed06['TP_SMOTE']
v_06 = speed06['SPEED_ACT']
width06 = speed06['STRIP_WIDTH']
thickness06 = speed06['STRIP_THICKNESS']
power06 = speed06['POWER_INVERTER_PRE']
voltage06 = speed06['VOLTAGE_INVERTER_PRE']
"Data filter 02_10"
filter = [i for i, e in enumerate(speed_10) if e >= 80]
s_10 = pd.DataFrame(data_10)
speed10 = s_10.filter(items = filter, axis=0)
speed10.reset_index(drop=True, inplace=True)
temp_01 = speed10['TP_SMOTE']
foutmetingen2 = [i for i, e in enumerate(temp_01) if e < 180]
speed10 = speed10.drop(foutmetingen2)
tp_strip10 = speed10['TP_AMBIENT']
tp_target10 = speed10['TP_TARGET']
tp_smote10 = speed10['TP_SMOTE']
v_10 = speed10['SPEED_ACT']
width10 = speed10['STRIP_WIDTH']
thickness10 = speed10['STRIP_THICKNESS']
power10 = speed10['POWER_INVERTER_PRE']
voltage10 = speed10['VOLTAGE_INVERTER_PRE']
"Constanten"
widthmax = 1253
Kra = 0.002033636
Kosc = 0.073086272
Pnominal = 2200
meting_15 = np.arange(0, len(speed15), 1)
meting_06 = np.arange(0, len(speed06), 1)
meting_10 = np.arange(0, len(speed10), 1)
cp = 480
rho = 7850
"---------------------------------------------------------------------"
def temp(power, speed, width, thickness, tp_strip, tp_target, tp_smote,
         voltage):
    """Compute the calculated temperature and compare it with two references.

    Reads the module-level constants ``rho``, ``Kra``, ``Kosc``, ``Pnominal``,
    ``widthmax`` and ``cp``.

    Returns a tuple ``(temp_c, verschil_t, verschil_smote, massflow)``:
    the calculated temperature, its percentage deviation from ``tp_target``,
    its percentage deviation from ``tp_smote``, and the mass flow.
    """
    # NOTE(review): the /60 and 10**-3 factors look like unit conversions
    # (per-minute to per-second, mm to m) -- confirm against the data source.
    massflow = (speed/60)*width*10**-3*thickness*10**-3*rho

    # Two loss terms: one scaled by relative width, one by voltage squared.
    LossesRA = Kra*Pnominal*(width/widthmax)
    LossesOSC = Kosc*Pnominal*(voltage/100)**2
    Plosses = (LossesRA + LossesOSC)

    # power is treated as a percentage of Pnominal; losses are subtracted.
    power_nl = (power/100)*Pnominal - Plosses

    # Temperature rise from net power, added to the starting temperature.
    temp_c = ((power_nl*1000)/(massflow*cp)) + tp_strip

    # Percentage deviations of the calculated temperature.
    verschil_t = (temp_c/tp_target)*100-100
    verschil_smote = (temp_c/tp_smote)*100-100
    return temp_c, verschil_t, verschil_smote, massflow
temp_15 = temp(power15, v_15, width15, thickness15, tp_strip15, tp_target15,
tp_smote15, voltage15)
temp_06 = temp(power06, v_06, width06, thickness06, tp_strip06, tp_target06,
tp_smote06, voltage06)
temp_10 = temp(power10, v_10, width10, thickness10, tp_strip10, tp_target10,
tp_smote10, voltage10)
"---------------------------------------------------------------------"
def rms(Temperatuurberekend, TemperatuurGemeten):
    """Return the deviation between measured and calculated temperatures.

    Despite the name, no squaring or square root is applied: the function
    works with the signed differences ``TemperatuurGemeten -
    Temperatuurberekend``.

    Returns a tuple ``(differences, total, mean)``: the element-wise
    differences, their sum, and that sum divided by the number of elements.
    """
    rootmeansquare = (TemperatuurGemeten - Temperatuurberekend)
    rootmeansquare_totaal = np.sum(rootmeansquare)
    rootmeansquare_gem = rootmeansquare_totaal/len(rootmeansquare)
    return rootmeansquare, rootmeansquare_totaal, rootmeansquare_gem
rms_tp_smote15 = (rms(temp_15[0], tp_smote15))
rms_tp_smote06 = (rms(temp_06[0], tp_smote06))
rms_tp_smote10 = (rms(temp_10[0], tp_smote10))
"----------------------------------------------------------------------"
# Per-dataset averages. Both lists must use the same dataset order
# (02-06, 02-10, 01-15); otherwise linregress pairs the mass flow of one
# dataset with the temperature deviation of another.
massflows = [np.sum(result[3])/len(result[3])
             for result in (temp_06, temp_10, temp_15)]
rms_smote = [rms_tp_smote06[2], rms_tp_smote10[2], rms_tp_smote15[2]]

# Concatenate the per-sample values of all three datasets (15, 06, 10).
rms_tp_smote_pre = np.append(rms_tp_smote15[0].tolist(),
                             rms_tp_smote06[0].tolist())
rms_tp_smote = np.append(rms_tp_smote_pre, rms_tp_smote10[0].tolist())
massflow_pre = np.append(temp_15[3].tolist(), temp_06[3].tolist())
massflow = np.append(massflow_pre, temp_10[3].tolist())

# Sort the samples by mass flow, keeping each deviation with its mass flow.
massflow_sort = np.sort(massflow)
rms_tp_smote_sort = [x for _, x in sorted(zip(massflow, rms_tp_smote))]

# Linear trend through the three per-dataset averages.
a, b, r, p, s_a = linregress(massflows, rms_smote)
print('RC: ' ,a ,'\n','std: ', s_a , '\n', 'Offset: ', b)
def func(x, a, b, c):
    """Linear fit function: return ``a * x + b`` element-wise.

    ``x`` is converted with ``np.asarray`` so lists are accepted.
    ``c`` is accepted but does not appear in the expression, so it has no
    effect on the result and a fit cannot constrain its value.
    """
    return a * np.asarray(x) + b
# Fit the linear model to the samples sorted by mass flow.
popt, pcov = curve_fit(func, massflow_sort, rms_tp_smote_sort)
popt
functie = func(massflow_sort, *popt)

sns.set_theme(style='whitegrid')
fig, axs = plt.subplots(2, figsize=(10, 10))

# Top panel: per-dataset averages and the linregress trend line.
axs[0].plot(massflows, rms_smote, label='Temp afwijking als f(massflow)')
axs[0].plot([massflows[0], massflows[-1]],
            [a*massflows[0]+b, a*massflows[-1]+b],
            label='trendlijn')
axs[0].set(xlabel='Mass flow ($kg/s$)',
           ylabel='Temperatuur afwijking gem ($\u00b0C$)', title='Met Verliezen')
axs[0].legend(loc='upper right')

# Bottom panel: all samples and the fitted line.
axs[1].plot(massflow_sort, rms_tp_smote_sort, 'o', label='Temp/Massflow 01-15')
#axs[1].plot(temp_06[3], rms_tp_smote06[0], 'o', label='Temp/Massflow 02-06')
#axs[1].plot(temp_10[3], rms_tp_smote10[0], 'o', label='Temp/Massflow 02-10')
# The fitted y values were computed from massflow_sort, so they must be
# plotted against massflow_sort. Using the unsorted massflow as x pairs each
# y with the wrong x and draws a zig-zag instead of a straight line.
axs[1].plot(massflow_sort, functie, 'r-',
            label='fit: a=%5.3f, b=%5.3f, c=%5.3f' % tuple(popt))
axs[1].set(xlabel='Mass flow ($kg/s$)',
           ylabel='Temperatuur afwijking gem ($\u00b0C$)')
axs[1].legend(loc='upper right')

print("Gemiddelde verschil temperatuur smote: ", rms_tp_smote15[1])
print("Gemiddelde uitwijking temperatuur smote: ", rms_tp_smote15[2])

Splitting data in a Python dataframe and getting the array values automatically

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_csv('D:\ history/segment.csv')
data = pd.DataFrame(data)
data = data.sort_values(['Prob_score'], ascending=[False])
one = len(data)
actualpaid_overall = len(data.loc[data['paidstatus'] == 1])
data_split = np.array_split(data, 10)
data1 = data_split[0]
actualpaid_ten = len(data1.loc[data1['paidstatus'] == 1])
percent_ten = actualpaid_ten/actualpaid_overall
data2 = data_split[1]
actualpaid_twenty = len(data2.loc[data2['paidstatus'] == 1])
percent_twenty = (actualpaid_twenty/actualpaid_overall) + percent_ten
data3 = data_split[2]
actualpaid_thirty = len(data3.loc[data3['paidstatus'] == 1])
percent_thirty = (actualpaid_thirty/actualpaid_overall) + percent_twenty
data4 = data_split[3]
actualpaid_forty = len(data4.loc[data4['paidstatus'] == 1])
percent_forty = (actualpaid_forty/actualpaid_overall) + percent_thirty
data5 = data_split[4]
actualpaid_fifty = len(data5.loc[data5['paidstatus'] == 1])
percent_fifty = (actualpaid_fifty/actualpaid_overall) + percent_forty
data6 = data_split[5]
actualpaid_sixty = len(data6.loc[data6['paidstatus'] == 1])
percent_sixty = (actualpaid_sixty/actualpaid_overall) + percent_fifty
data7 = data_split[6]
actualpaid_seventy = len(data7.loc[data7['paidstatus'] == 1])
percent_seventy = (actualpaid_seventy/actualpaid_overall) + percent_sixty
data8 = data_split[7]
actualpaid_eighty = len(data8.loc[data8['paidstatus'] == 1])
percent_eighty = (actualpaid_eighty/actualpaid_overall) + percent_seventy
data9 = data_split[8]
actualpaid_ninenty = len(data9.loc[data9['paidstatus'] == 1])
percent_ninenty = (actualpaid_ninenty/actualpaid_overall) + percent_eighty
data10 = data_split[9]
actualpaid_hundred = len(data10.loc[data10['paidstatus'] == 1])
percent_hundred = (actualpaid_hundred/actualpaid_overall) + percent_ninenty
array_x = [10,20,30,40,50,60,70,80,90,100]
array_y = [ percent_ten, percent_twenty, percent_thirty, percent_forty,percent_fifty, percent_sixty, percent_seventy, percent_eighty, percent_ninenty, percent_hundred]
plt.xlabel(' Base')
plt.ylabel(' percent')
ax = plt.plot(array_x,array_y)
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth=0.5, color='0.1')
plt.grid( which='both', axis = 'both', linewidth=0.5,color='0.75')
The above is my Python code. I have split my dataframe into 10 equal sections and plotted the graph, but I'm not satisfied with it. I have two concerns:
array_x = [10,20,30,40,50,60,70,80,90,100]: in this line I have entered the x values manually. Is there a way to generate them automatically? Since I used array_split(data, 10), it should produce 10 values.
As you can see, the same code for data1, data2, ..., data10 is repeated again and again. Is there a way to write this as a function or a loop?
Any help with code will be appreciated. Thanks.
I believe you need a list comprehension. For the count you can use a simpler way, the sum of a boolean mask (True values are counted as 1); then convert the lists to NumPy arrays and use numpy.cumsum:
data = pd.read_csv('D:\ history/segment.csv')
data = data.sort_values('Prob_score', ascending=False)
one = len(data)
actualpaid_overall = (data['paidstatus'] == 1).sum()
data_split = np.array_split(data, 10)
x = [len(x) for x in data_split]
y = [(x['paidstatus'] == 1).sum()/actualpaid_overall for x in data_split]
array_x = np.cumsum(np.array(x))
array_y = np.cumsum(np.array(y))
plt.xlabel(' Base')
plt.ylabel(' percent')
ax = plt.plot(array_x,array_y)
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth=0.5, color='0.1')
plt.grid( which='both', axis = 'both', linewidth=0.5,color='0.75')
Sample:
np.random.seed(2019)
N = 1000
data = pd.DataFrame({'paidstatus':np.random.randint(3, size=N),
'Prob_score':np.random.randint(100, size=N)})
#print (data)
data = data.sort_values(['Prob_score'], ascending=[False])
actualpaid_overall = (data['paidstatus'] == 1).sum()
data_split = np.array_split(data, 10)
x = [len(x) for x in data_split]
y = [(x['paidstatus'] == 1).sum()/actualpaid_overall for x in data_split]
array_x = np.cumsum(np.array(x))
array_y = np.cumsum(np.array(y))
print (array_x)
[ 100 200 300 400 500 600 700 800 900 1000]
print (array_y)
[0.09118541 0.18844985 0.27963526 0.38601824 0.49848024 0.61702128
0.72036474 0.81155015 0.9331307 1. ]

Why is a 1D k-means clustering slower than a k-means initialized mixture model fit?

My timing shows that k-means consistently loses out on timing, compared to a mixture model, initialized using k-means.
What's the explanation for this? Is the GMM using a different k-means algorithm? Am I misunderstanding how it works? Does it use a differently sized dataset (smaller than I'm drawing from?).
import sklearn.cluster
import sklearn.mixture
import numpy as np
import time
import matplotlib.pyplot as plt
k = 3
N = 100
def clust():
    """Fit a KMeans model with ``k`` clusters and return its cluster centers.

    Reads the module-level ``k`` and the current sample ``X``.
    """
    model = sklearn.cluster.KMeans(n_clusters=k)
    # reshape(-1, 1) presents the samples as a single-feature column.
    model.fit(X.reshape(-1, 1))
    return model.cluster_centers_
def fit():
    """Fit a Gaussian mixture with ``k`` components and return its means.

    The mixture is initialised with ``init_params="kmeans"``. Reads the
    module-level ``k`` and the current sample ``X``.
    """
    model = sklearn.mixture.GaussianMixture(n_components=k,
                                            init_params="kmeans")
    # reshape(-1, 1) presents the samples as a single-feature column.
    model.fit(X.reshape(-1, 1))
    return model.means_
duration_clust = []
duration_fit = []
ctrs_clust = []
ctrs_fit = []
# Repeat the experiment N times: draw a fresh sample, then time both models
# on the same data and record their durations and centers.
for i in range(N):
    _1 = np.random.normal(0.25, 0.15, 50)
    _2 = np.random.normal(0.50, 0.15, 50)
    _3 = np.random.normal(0.75, 0.15, 50)
    # X is read as a global by clust() and fit().
    X = np.concatenate((_1, _2, _3)).reshape(-1, 1)

    # perf_counter is the clock intended for measuring short intervals;
    # time.time() can have coarse resolution and may jump if the system
    # clock is adjusted.
    ts = time.perf_counter()
    c = clust()
    te = time.perf_counter()
    time_clust = (te - ts) * 1e3  # milliseconds

    ts = time.perf_counter()
    f = fit()
    te = time.perf_counter()
    time_fit = (te - ts) * 1e3  # milliseconds

    duration_clust.append(time_clust)
    duration_fit.append(time_fit)
    ctrs_clust.append(c)
    ctrs_fit.append(f)
bins0 = np.arange(0, 20, 1)
bins1 = np.linspace(0,1,30)
fig, ax = plt.subplots(nrows = 2)
ax[0].hist(duration_clust, label = "Kmeans", bins = bins0, alpha = 0.5)
ax[0].hist(duration_fit, label = "GMM with Kmeans", bins = bins0, alpha = 0.5)
ax[0].set_xlabel("duration (ms)")
ax[0].legend(loc = "upper right")
ax[1].hist(np.ravel(ctrs_clust), label = "Kmeans centers", bins = bins1, alpha = 0.5)
ax[1].hist(np.ravel(ctrs_fit), label = "GMM centers", bins = bins1, alpha = 0.5)
ax[1].set_xlabel("Center location")
ax[1].axvline([0.25], label = "Truth", color = "black")
ax[1].axvline([0.50], color = "black")
ax[1].axvline([0.75], color = "black")
ax[1].legend(loc = "upper right")
plt.tight_layout()
plt.show()

ValueError: x and y must have the same first dimension, but have different shapes

import urllib.request
from math import sqrt, fabs, exp
import matplotlib.pyplot as plot
from sklearn.linear_model import enet_path
from sklearn.metrics import roc_auc_score, roc_curve
import numpy
target_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data'
data = urllib.request.urlopen(target_url)
xList = []
for line in data:
#split on comma
row = line.strip().split(",".encode(encoding='utf-8'))
xList.append(row)
xNum = []
labels = []
for row in xList:
lastCol = row.pop()
if lastCol == b'M':
labels.append(1.0)
else:
labels.append(0.0)
attrRow = [float(elt) for elt in row]
xNum.append(attrRow)
nrow = len(xNum)
ncol = len(xNum[1])
alpha = 1.0
xMeans = []
xSD = []
for i in range(ncol):
col = [xNum[j][i] for j in range(nrow)]
mean = sum(col)/nrow
xMeans.append(mean)
colDiff = [(xNum[j][i] - mean) for j in range(nrow)]
sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrow)])
stdDev = sqrt(sumSq/nrow)
xSD.append(stdDev)
xNormalized = []
for i in range(nrow):
rowNormalized = [(xNum[i][j] - xMeans[j])/xSD[j] for j in range(ncol)]
xNormalized.append(rowNormalized)
meanLabel = sum(labels)/nrow
sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range (nrow)])/nrow)
labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrow)]
nxval = 10
for ixval in range(nxval):
idxTest = [a for a in range (nrow) if a%nxval == ixval]
idxTrain = [a for a in range(nrow) if a%nxval != ixval]
xTrain = numpy.array([xNormalized[r] for r in idxTrain])
xTest = numpy.array([xNormalized[r] for r in idxTest])
labelTrain = numpy.array([labelNormalized[r] for r in idxTrain])
labelTest = numpy.array([labelNormalized[r] for r in idxTest])
alphas, coefs, _ = enet_path(xTrain, labelTrain, l1_ratio = 0.8, fit_intercept=False, return_models=False)
if ixval == 0:
pred = numpy.dot(xTest, coefs)
yOut = labelTest
else:
yTemp = numpy.array(yOut)
yOut = numpy.concatenate((yTemp, labelTest), axis = 0)
predTemp = numpy.array(pred)
pred = numpy.concatenate((predTemp, numpy.dot(xTest, coefs)), axis = 0)
# Misclassification rate for every column of pred except the first.
misClassRate = []
_, nPred = pred.shape
for iPred in range(1, nPred):
    predList = list(pred[:, iPred])
    errCnt = 0.0
    # A row is an error when prediction and label fall on opposite sides
    # of zero.
    for irow in range(nrow):
        if (predList[irow] < 0.0) and (yOut[irow] >= 0.0):
            errCnt += 1.0
        elif (predList[irow] >= 0.0) and (yOut[irow] < 0.0):
            errCnt += 1.0
    # Append once per column, inside the outer loop. If this runs only after
    # the loop, misClassRate holds a single value and plotting it against
    # alphas[1:] fails with "x and y must have same first dimension".
    misClassRate.append(errCnt/nrow)
minError = min(misClassRate)
idxMin = misClassRate.index(minError)
plotAlphas = numpy.array(alphas[1:len(alphas)])
misClassRate_np = numpy.array(misClassRate)
plot.figure()
plot.plot(plotAlphas, misClassRate_np, label='Misclassification Error Across Folds', linewidth=2)
plot.axvline(plotAlphas[idxMin], linestyle='--', label='CV Estimate of Best alpha')
plot.legend()
plot.semilogx()
ax = plot.gca()
ax.invert_xaxis()
plot.xlabel('alpha')
plot.ylabel('Misclassification Error')
plot.axis('tight')
plot.show()
When I execute the code above, it raises: ValueError: x and y must have same first dimension, but have shapes (99,) and (1,).
The problem seems to be that x and y have unequal lengths.
I then checked plotAlphas and misClassRate_np, and they appear to have the same length. Both have also been converted to arrays, but that did not fix the problem. I can't figure out what is happening.

Looping same program for different data files

For the following program, I am trying to save time copying and pasting tons of code. I would like this program to plot using the data file 19_6.txt and aux.19_6, and then continue by plotting the files with 11,12,20,28,27, and 18 in 19's place with the same code and onto the same plot. Any help would be appreciated. Thanks!
from numpy import *
import matplotlib.pyplot as plt
datasim19 = loadtxt("/home/19_6.txt")
data19 = loadtxt("/home/aux.19_6")
no1=1
no2=2
no3=3
no4=4
no5=5
no7=7
no8=8
no9=9
no10=10
simrecno1inds19 = nonzero(datasim19[:,1]==no1)[0]
simrecno2inds19 = nonzero(datasim19[:,1]==no2)[0]
simrecno3inds19 = nonzero(datasim19[:,1]==no3)[0]
simrecno4inds19 = nonzero(datasim19[:,1]==no4)[0]
simrecno5inds19 = nonzero(datasim19[:,1]==no5)[0]
simrecno7inds19 = nonzero(datasim19[:,1]==no7)[0]
simrecno8inds19 = nonzero(datasim19[:,1]==no8)[0]
simrecno9inds19 = nonzero(datasim19[:,1]==no9)[0]
simrecno10inds19 = nonzero(datasim19[:,1]==no10)[0]
recno1inds19 = nonzero(data19[:,1]==no1)[0]
recno2inds19 = nonzero(data19[:,1]==no2)[0]
recno3inds19 = nonzero(data19[:,1]==no3)[0]
recno4inds19 = nonzero(data19[:,1]==no4)[0]
recno5inds19 = nonzero(data19[:,1]==no5)[0]
recno7inds19 = nonzero(data19[:,1]==no7)[0]
recno8inds19 = nonzero(data19[:,1]==no8)[0]
recno9inds19 = nonzero(data19[:,1]==no9)[0]
recno10inds19 = nonzero(data19[:,1]==no10)[0]
q1sim19 = qsim19[simrecno1inds19]
q2sim19 = qsim19[simrecno2inds19]
q3sim19 = qsim19[simrecno3inds19]
q4sim19 = qsim19[simrecno4inds19]
q5sim19 = qsim19[simrecno5inds19]
q7sim19 = qsim19[simrecno7inds19]
q8sim19 = qsim19[simrecno8inds19]
q9sim19 = qsim19[simrecno9inds19]
q10sim19 = qsim19[simrecno10inds19]
q1_19 = q19[recno1inds19]
q2_19 = q19[recno2inds19]
q3_19 = q19[recno3inds19]
q4_19 = q19[recno4inds19]
q5_19 = q19[recno5inds19]
q7_19 = q19[recno7inds19]
q8_19 = q19[recno8inds19]
q9_19 = q19[recno9inds19]
q10_19 = q19[recno10inds19]
sumq1sim19 = sum(q1sim19)
sumq2sim19 = sum(q2sim19)
sumq3sim19 = sum(q3sim19)
sumq4sim19 = sum(q4sim19)
sumq5sim19 = sum(q5sim19)
sumq7sim19 = sum(q7sim19)
sumq8sim19 = sum(q8sim19)
sumq9sim19 = sum(q9sim19)
sumq10sim19 = sum(q10sim19)
sumq1_19 = sum(q1_19)
sumq2_19 = sum(q2_19)
sumq3_19 = sum(q3_19)
sumq4_19 = sum(q4_19)
sumq5_19 = sum(q5_19)
sumq7_19 = sum(q7_19)
sumq8_19 = sum(q8_19)
sumq9_19 = sum(q9_19)
sumq10_19 = sum(q10_19)
xsim = [no1, no2, no3, no4, no5, no7, no8, no9, no10]
ysim = [sumq1sim_19, sumq2sim_19, sumq3sim_19, sumq4sim_19, sumq5sim_19, sumq7sim_19, sumq8sim_19, sumq9sim_19, sumq10sim_19]
x = [no1, no2, no3, no4, no5,no7, no8, no9, no10]
y = [sumq1_19, sumq2_19, sumq3_19, sumq4_19, sumq5_19, sumq7_19, sumq8_19, sumq9_19, sumq10_19]
plt.plot(x,log(y),'b',label='Data')
plt.plot(xsim,log(ysim),'r',label='Simulation')
plt.legend()
plt.title('Data vs. Simulation')
plt.show()
Tip: when you find yourself using lots of variables called n1, n2, n3 etc. you should probably use lists, dictionaries or other such containers, and loops instead.
For example, try replacing the following code:
simrecno1inds19 = nonzero(datasim19[:,1]==no1)[0]
simrecno2inds19 = nonzero(datasim19[:,1]==no2)[0]
simrecno3inds19 = nonzero(datasim19[:,1]==no3)[0]
simrecno4inds19 = nonzero(datasim19[:,1]==no4)[0]
simrecno5inds19 = nonzero(datasim19[:,1]==no5)[0]
simrecno7inds19 = nonzero(datasim19[:,1]==no7)[0]
simrecno8inds19 = nonzero(datasim19[:,1]==no8)[0]
simrecno9inds19 = nonzero(datasim19[:,1]==no9)[0]
simrecno10inds19 = nonzero(datasim19[:,1]==no10)[0]
With this:
simrecinds19 = [nonzero(datasim19[:,1] == i)[0] for i in range(1, 11)]
Then you can use simrecinds19[0] instead of simrecno1inds19.
You can do something like this:
nList = [19, 11, 12, 20, 28, 27, 18]
for n in nList:
    # Build the two file names for this dataset number.
    file1 = "/home/" + str(n) + "_6.txt"
    file2 = "/home/aux." + str(n) + "_6"
    datasim19 = loadtxt(file1)
    data19 = loadtxt(file2)
    # do the rest of the plotting
You can greatly reduce the size of this script. I'm not quite sure where qsim19 and qsim come from, but take a look:
import numpy as np
import matplotlib.pyplot as plt
# Record numbers to sum over (6 is skipped, as in the question). The ranges
# are wrapped in list() because range objects cannot be added in Python 3.
nos = list(range(1, 6)) + list(range(7, 11))

for position, index in enumerate([19, 11, 12, 20, 28, 27, 18]):
    datasim = np.loadtxt("/home/%i_6.txt" % index)
    data = np.loadtxt("/home/aux.%i_6" % index)

    # Row indices whose second column matches each record number.
    simrecno = [np.nonzero(datasim[:, 1] == n)[0] for n in nos]
    recno = [np.nonzero(data[:, 1] == n)[0] for n in nos]

    # NOTE(review): qsim and q are not defined in this snippet or in the
    # question; they must be set from the loaded data before this point.
    # The selections get their own names so qsim and q are not overwritten
    # with lists, which would break the indexing on the next iteration.
    qsim_parts = [qsim[simrecno_i] for simrecno_i in simrecno]
    q_parts = [q[recno_i] for recno_i in recno]
    sumqsim = [sum(qsim_i) for qsim_i in qsim_parts]
    sumq = [sum(q_i) for q_i in q_parts]

    xsim = nos
    ysim = sumqsim
    x = nos
    y = sumq

    # Label only the first pair of lines so the legend has one entry each.
    first = position == 0
    plt.plot(x, np.log(y), 'b', label='Data' if first else None)
    plt.plot(xsim, np.log(ysim), 'r', label='Simulation' if first else None)

plt.legend()
plt.title('Data vs. Simulation')
plt.show()

Categories

Resources