Splitting data in a python dataframe and getting the array values automatically - python

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_csv('D:\ history/segment.csv')
data = pd.DataFrame(data)
data = data.sort_values(['Prob_score'], ascending=[False])
one = len(data)
actualpaid_overall = len(data.loc[data['paidstatus'] == 1])
data_split = np.array_split(data, 10)
data1 = data_split[0]
actualpaid_ten = len(data1.loc[data1['paidstatus'] == 1])
percent_ten = actualpaid_ten/actualpaid_overall
data2 = data_split[1]
actualpaid_twenty = len(data2.loc[data2['paidstatus'] == 1])
percent_twenty = (actualpaid_twenty/actualpaid_overall) + percent_ten
data3 = data_split[2]
actualpaid_thirty = len(data3.loc[data3['paidstatus'] == 1])
percent_thirty = (actualpaid_thirty/actualpaid_overall) + percent_twenty
data4 = data_split[3]
actualpaid_forty = len(data4.loc[data4['paidstatus'] == 1])
percent_forty = (actualpaid_forty/actualpaid_overall) + percent_thirty
data5 = data_split[4]
actualpaid_fifty = len(data5.loc[data5['paidstatus'] == 1])
percent_fifty = (actualpaid_fifty/actualpaid_overall) + percent_forty
data6 = data_split[5]
actualpaid_sixty = len(data6.loc[data6['paidstatus'] == 1])
percent_sixty = (actualpaid_sixty/actualpaid_overall) + percent_fifty
data7 = data_split[6]
actualpaid_seventy = len(data7.loc[data7['paidstatus'] == 1])
percent_seventy = (actualpaid_seventy/actualpaid_overall) + percent_sixty
data8 = data_split[7]
actualpaid_eighty = len(data8.loc[data8['paidstatus'] == 1])
percent_eighty = (actualpaid_eighty/actualpaid_overall) + percent_seventy
data9 = data_split[8]
actualpaid_ninenty = len(data9.loc[data9['paidstatus'] == 1])
percent_ninenty = (actualpaid_ninenty/actualpaid_overall) + percent_eighty
data10 = data_split[9]
actualpaid_hundred = len(data10.loc[data10['paidstatus'] == 1])
percent_hundred = (actualpaid_hundred/actualpaid_overall) + percent_ninenty
array_x = [10,20,30,40,50,60,70,80,90,100]
array_y = [ percent_ten, percent_twenty, percent_thirty, percent_forty,percent_fifty, percent_sixty, percent_seventy, percent_eighty, percent_ninenty, percent_hundred]
plt.xlabel(' Base')
plt.ylabel(' percent')
ax = plt.plot(array_x,array_y)
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth=0.5, color='0.1')
plt.grid( which='both', axis = 'both', linewidth=0.5,color='0.75')
The above is my Python code. I have split my dataframe into 10 equal sections and plotted the graph, but I'm not satisfied with this; I have two concerns:
array_x = [10,20,30,40,50,60,70,80,90,100] - in this line of code I have entered the x values manually. Is there any way to generate them automatically? Since I used split(data, 10), it should yield 10 values.
As you can see, the whole data1, 2, 3, ... 10 block is repeated again and again. Is there a way to write this as a function or a loop?
Any help with code will be appreciated. Thanks.

I believe you need a list comprehension, and for the count you can use a simpler way: sum a boolean mask, where True values are counted as 1. Then convert the list to a NumPy array and use numpy.cumsum:
data = pd.read_csv('D:\ history/segment.csv')
data = data.sort_values('Prob_score', ascending=False)
one = len(data)
actualpaid_overall = (data['paidstatus'] == 1).sum()
data_split = np.array_split(data, 10)
# per-split row counts and per-split share of paid rows
x = [len(part) for part in data_split]
y = [(part['paidstatus'] == 1).sum()/actualpaid_overall for part in data_split]
# cumulative sums give the running totals for the x and y axes
array_x = np.cumsum(np.array(x))
array_y = np.cumsum(np.array(y))
plt.xlabel(' Base')
plt.ylabel(' percent')
ax = plt.plot(array_x,array_y)
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth=0.5, color='0.1')
plt.grid( which='both', axis = 'both', linewidth=0.5,color='0.75')
Sample:
np.random.seed(2019)
N = 1000
data = pd.DataFrame({'paidstatus': np.random.randint(3, size=N),
                     'Prob_score': np.random.randint(100, size=N)})
#print (data)
data = data.sort_values(['Prob_score'], ascending=[False])
actualpaid_overall = (data['paidstatus'] == 1).sum()
data_split = np.array_split(data, 10)
x = [len(part) for part in data_split]
y = [(part['paidstatus'] == 1).sum()/actualpaid_overall for part in data_split]
array_x = np.cumsum(np.array(x))
array_y = np.cumsum(np.array(y))
print (array_x)
[ 100 200 300 400 500 600 700 800 900 1000]
print (array_y)
[0.09118541 0.18844985 0.27963526 0.38601824 0.49848024 0.61702128
0.72036474 0.81155015 0.9331307 1. ]
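If you want array_x as cumulative percentages of the base (10, 20, ..., 100) rather than cumulative row counts, one option is to divide the cumulative counts by the total number of rows; a small sketch building on the arrays above:
# cumulative percentage of rows covered by each split
array_x_pct = np.cumsum(np.array(x)) / len(data) * 100
print(array_x_pct)  # [ 10. 20. ... 100.] for the 1000-row sample above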

Related

Scipy Curve_fit gives a rather weird fit

Dear Python programmers,
I am currently working with curve_fit from scipy in order to find out what correlation the x and y data have with each other. However, the curve fit comes out really weird, even when I fit a simple linear formula to it. I've tried converting the array to a numpy array in the def func(x, a, b, c) part (return a * np.asarray(x) + b), but it still gives me a graph that looks like a three-year-old scribbled with a red pencil.
One thing I do remember is sorting the values of massflows and rms_smote from low to high, which you can see above the def func(x, a, b, c) bit, because the fit curve_fit gave me with unsorted values was also somewhat scratched out, as if sketched. I don't know whether curve_fit treats data differently depending on whether it is sorted.
If you need any more information, let me know :) Any suggestion is welcome!
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.stats import linregress
from scipy.optimize import curve_fit
data_15 = pd.read_csv(r"C:\Users\Thomas\Documents\Pythondata\2022-01-15_SMOTERapport.csv", header= 0, sep=';', decimal=',')
data_06 = pd.read_csv(r"C:\Users\Thomas\Documents\Pythondata\2022-02-06_SMOTERapport.csv", header= 0, sep=';', decimal=',')
data_10 = pd.read_csv(r"C:\Users\Thomas\Documents\Pythondata\2022-02-10_SMOTERapport.csv", header= 0, sep=';', decimal=',')
speed_15 = data_15['SPEED_ACT']
speed_06 = data_06['SPEED_ACT']
speed_10 = data_10['SPEED_ACT']
"Data filter 01_15"
filter = [i for i, e in enumerate(speed_15) if e >= 80]
s_15 = pd.DataFrame(data_15)
speed15 = s_15.filter(items = filter, axis=0)
speed15.reset_index(drop=True, inplace=True)
temp15 = speed15['TP_SMOTE']
foutmetingen2 = [i for i, e in enumerate(temp15) if e < 180]
speed15 = speed15.drop(foutmetingen2)
tp_strip15 = speed15['TP_AMBIENT']
tp_target15 = speed15['TP_TARGET']
tp_smote15 = speed15['TP_SMOTE']
v_15 = speed15['SPEED_ACT']
width15 = speed15['STRIP_WIDTH']
thickness15 = speed15['STRIP_THICKNESS']
power15 = speed15['POWER_INVERTER_PRE']
voltage15 = speed15['VOLTAGE_INVERTER_PRE']
"Data filter 02_06"
filter = [i for i, e in enumerate(speed_06) if e >= 80]
s_06 = pd.DataFrame(data_06)
speed06 = s_06.filter(items = filter, axis=0)
speed06.reset_index(drop=True, inplace=True)
temp06 = speed06['TP_SMOTE']
foutmetingen2 = [i for i, e in enumerate(temp06) if e < 180]
speed06 = speed06.drop(foutmetingen2)
tp_strip06 = speed06['TP_AMBIENT']
tp_target06 = speed06['TP_TARGET']
tp_smote06 = speed06['TP_SMOTE']
v_06 = speed06['SPEED_ACT']
width06 = speed06['STRIP_WIDTH']
thickness06 = speed06['STRIP_THICKNESS']
power06 = speed06['POWER_INVERTER_PRE']
voltage06 = speed06['VOLTAGE_INVERTER_PRE']
"Data filter 02_10"
filter = [i for i, e in enumerate(speed_10) if e >= 80]
s_10 = pd.DataFrame(data_10)
speed10 = s_10.filter(items = filter, axis=0)
speed10.reset_index(drop=True, inplace=True)
temp_01 = speed10['TP_SMOTE']
foutmetingen2 = [i for i, e in enumerate(temp_01) if e < 180]
speed10 = speed10.drop(foutmetingen2)
tp_strip10 = speed10['TP_AMBIENT']
tp_target10 = speed10['TP_TARGET']
tp_smote10 = speed10['TP_SMOTE']
v_10 = speed10['SPEED_ACT']
width10 = speed10['STRIP_WIDTH']
thickness10 = speed10['STRIP_THICKNESS']
power10 = speed10['POWER_INVERTER_PRE']
voltage10 = speed10['VOLTAGE_INVERTER_PRE']
"Constanten"
widthmax = 1253
Kra = 0.002033636
Kosc = 0.073086272
Pnominal = 2200
meting_15 = np.arange(0, len(speed15), 1)
meting_06 = np.arange(0, len(speed06), 1)
meting_10 = np.arange(0, len(speed10), 1)
cp = 480
rho = 7850
"---------------------------------------------------------------------"
def temp(power, speed, width, thickness, tp_strip, tp_target, tp_smote,
         voltage):
    "Compare the calculated temperature with the target temperature"
    massflow = (speed/60)*width*10**-3*thickness*10**-3*rho
    LossesRA = Kra*Pnominal*(width/widthmax)
    LossesOSC = Kosc*Pnominal*(voltage/100)**2
    Plosses = (LossesRA + LossesOSC)
    power_nl = (power/100)*Pnominal - Plosses
    temp_c = ((power_nl*1000)/(massflow*cp)) + tp_strip
    verschil_t = (temp_c/tp_target)*100-100
    verschil_smote = (temp_c/tp_smote)*100-100
    return temp_c, verschil_t, verschil_smote, massflow
temp_15 = temp(power15, v_15, width15, thickness15, tp_strip15, tp_target15,
tp_smote15, voltage15)
temp_06 = temp(power06, v_06, width06, thickness06, tp_strip06, tp_target06,
tp_smote06, voltage06)
temp_10 = temp(power10, v_10, width10, thickness10, tp_strip10, tp_target10,
tp_smote10, voltage10)
"---------------------------------------------------------------------"
def rms(Temperatuurberekend, TemperatuurGemeten):
    "Calculate the root mean square between calculated and measured data"
    rootmeansquare = (TemperatuurGemeten - Temperatuurberekend)
    rootmeansquare_totaal = np.sum(rootmeansquare)
    rootmeansquare_gem = rootmeansquare_totaal/len(rootmeansquare)
    return rootmeansquare, rootmeansquare_totaal, rootmeansquare_gem
rms_tp_smote15 = (rms(temp_15[0], tp_smote15))
rms_tp_smote06 = (rms(temp_06[0], tp_smote06))
rms_tp_smote10 = (rms(temp_10[0], tp_smote10))
"----------------------------------------------------------------------"
massflows = [np.sum(temp_06[3])/len(temp_06[3]), np.sum(temp_15[3])/
len(temp_15[3]), np.sum(temp_10[3])/len(temp_10[3])]
rms_smote = [rms_tp_smote06[2], rms_tp_smote10[2], rms_tp_smote15[2]]
rms_tp_smote_pre = np.append(rms_tp_smote15[0].tolist(),
rms_tp_smote06[0].tolist())
rms_tp_smote = np.append(rms_tp_smote_pre, rms_tp_smote10[0].tolist())
massflow_pre = np.append(temp_15[3].tolist(), temp_06[3].tolist())
massflow = np.append(massflow_pre, temp_10[3].tolist())
massflow_sort = np.sort(massflow)
rms_tp_smote_sort = [x for _, x in sorted(zip(massflow, rms_tp_smote))]
a, b, r, p, s_a = linregress(massflows, rms_smote)
print('RC: ', a, '\n', 'std: ', s_a, '\n', 'Offset: ', b)
def func(x, a, b, c):
    "Fit function"
    return a * np.asarray(x) + b
popt, pcov = curve_fit(func, massflow_sort, rms_tp_smote_sort)
popt
functie = func(massflow_sort, *popt)
sns.set_theme(style='whitegrid')
fig, axs = plt.subplots(2, figsize=(10, 10))
axs[0].plot(massflows, rms_smote, label='Temp deviation as f(massflow)')
axs[0].plot([massflows[0], massflows[len(massflows)-1]],
            [a*massflows[0]+b, a*massflows[len(massflows)-1]+b],
            label='trend line')
axs[0].set(xlabel='Mass flow ($kg/s$)',
           ylabel='Mean temperature deviation ($\u00b0C$)', title='With losses')
axs[0].legend(loc='upper right')
axs[1].plot(massflow_sort, rms_tp_smote_sort, 'o', label='Temp/Massflow 01-15')
#axs[1].plot(temp_06[3], rms_tp_smote06[0], 'o', label='Temp/Massflow 02-06')
#axs[1].plot(temp_10[3], rms_tp_smote10[0], 'o', label='Temp/Massflow 02-10')
# plot the fit against the same sorted x values that func was evaluated on
axs[1].plot(massflow_sort, func(massflow_sort, *popt), 'r-',
            label='fit: a=%5.3f, b=%5.3f, c=%5.3f' % tuple(popt))
axs[1].set(xlabel='Mass flow ($kg/s$)',
           ylabel='Mean temperature deviation ($\u00b0C$)')
axs[1].legend(loc='upper right')
print("Gemiddelde verschil temperatuur smote: ", rms_tp_smote15[1])
print("Gemiddelde uitwijking temperatuur smote: ", rms_tp_smote15[2])

How to increase planes' size in Plotly

Got the following code
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
df = pd.read_csv('https://raw.githubusercontent.com/tiago-peres/immersion/master/Platforms_dataset.csv')
fig = px.scatter_3d(df, x='Functionality ', y='Accessibility', z='Immersion', color='Platforms')
grey = [[0,'#C0C0C0'],[1,'#C0C0C0']]
zero_pt = pd.Series([0])
z = zero_pt.append(df['Immersion'], ignore_index = True).reset_index(drop = True)
y = zero_pt.append(df['Accessibility'], ignore_index = True).reset_index(drop = True)
x = zero_pt.append(df['Functionality '], ignore_index = True).reset_index(drop = True)
length_data = len(z)
z_plane_pos = 66.5*np.ones((length_data,length_data))
fig.add_trace(go.Surface(x=x, y=y, z=z_plane_pos, colorscale=grey, showscale=False))
fig.add_trace(go.Surface(x=x.apply(lambda x: 15.69), y=y, z = np.array([z]*length_data), colorscale= grey, showscale=False))
fig.add_trace(go.Surface(x=x, y= y.apply(lambda x: 55), z = np.array([z]*length_data).transpose(), colorscale=grey, showscale=False))
fig.update_layout(scene = dict(
xaxis = dict(nticks=4, range=[0,31.38],),
yaxis = dict(nticks=4, range=[0,110],),
zaxis = dict(nticks=4, range=[0,133],),),
legend_orientation="h",margin=dict(l=0, r=0, b=0, t=0))
that can be opened in Google Colab.
As you can see from the output, the planes are not filling the entire axis space; they should respect the axis ranges. In other words, the planes
z=66.5 - should exist between [0, 31.38] in x and [0, 110] in y
x=15.69 - should exist between [0, 110] in y and [0, 133] in z
y=55 - should exist between [0, 31.38] in x and [0, 133] in z
How can that be done?
With this new adjustment,
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
import numpy as np
df = pd.read_csv('https://raw.githubusercontent.com/tiago-peres/immersion/master/Platforms_dataset.csv')
fig = px.scatter_3d(df, x='Functionality ', y='Accessibility', z='Immersion', color='Platforms')
grey = [[0,'#C0C0C0'],[1,'#C0C0C0']]
zero_pt = pd.Series([0])
z1 = np.arange(0, 134, 1)
print(z1)
y1 = np.arange(0, 111, 1)
print(y1)
x1 = np.arange(0, 32.38, 1)
print(x1)
z = zero_pt.append(df['Immersion'], ignore_index = True).reset_index(drop = True)
y = zero_pt.append(df['Accessibility'], ignore_index = True).reset_index(drop = True)
x = zero_pt.append(df['Functionality '], ignore_index = True).reset_index(drop = True)
print(zero_pt)
print(z)
test1 = pd.Series([133])
test = z.append(test1)
length_data1 = len(z1)
z_plane_pos = 66.5*np.ones((length_data1,length_data1))
length_data2 = len(y1)
y_plane_pos = 55*np.ones((length_data2,length_data2))
length_data3 = len(x1)
x_plane_pos = 15.69*np.ones((length_data3,length_data3))
fig.add_trace(go.Surface(x=x1, y=y1, z=z_plane_pos, colorscale=grey, showscale=False))
fig.add_trace(go.Surface(x=x.apply(lambda x: 15.69), y=y1, z = np.array([test]*length_data1), colorscale= grey, showscale=False))
fig.add_trace(go.Surface(x=x1, y= y.apply(lambda x: 55), z = np.array([test]*length_data1).transpose(), colorscale=grey, showscale=False))
fig.update_layout(scene = dict(
xaxis = dict(nticks=4, range=[0,31.38],),
yaxis = dict(nticks=4, range=[0,110],),
zaxis = dict(nticks=4, range=[0,133],),),
legend_orientation="h",margin=dict(l=0, r=0, b=0, t=0))
this nearly gets the job done, but the planes x=15.69 and y=55 still don't extend to the maximum of 133 in Immersion.
The issue was that the arrays I was plotting weren't the right shapes.
By properly separating the creation of the input arrays from the plotting, I was able to discover this, and consequently built input arrays (for plotting) of the right size and appropriate content.
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
import numpy as np
df = pd.read_csv('https://raw.githubusercontent.com/tiago-peres/immersion/master/Platforms_dataset.csv')
zero_pt = pd.Series([0])
z1 = np.arange(0, 134, 1)
y1 = np.arange(0, 111, 1)
x1 = np.arange(0, 32.38, 1)
z = zero_pt.append(df['Immersion'], ignore_index = True).reset_index(drop = True)
y = zero_pt.append(df['Accessibility'], ignore_index = True).reset_index(drop = True)
x = zero_pt.append(df['Functionality '], ignore_index = True).reset_index(drop = True)
test1 = pd.Series([133])
test = z.append(test1)
length_data1 = len(z1)
z_plane_pos = 66.5*np.ones((length_data1,length_data1))
length_data2 = len(y1)
y_plane_pos = 55*np.ones((length_data2,length_data2))
length_data3 = len(x1)
x_plane_pos = 15.69*np.ones((length_data3,length_data3))
xvals = x.apply(lambda x: 15.69)
xvals2 = x1
yvals = y1
yvals2 = y.apply(lambda x: 55)
zvals = np.zeros((len(yvals), len(xvals)))
zvals[:, -1] = 133 # np.array([test]*length_data2)
zvals2 = np.zeros((len(yvals2), len(xvals2)))
zvals2[-1, :] = 133
fig = px.scatter_3d(df, x='Functionality ', y='Accessibility', z='Immersion', color='Platforms')
grey = [[0,'#C0C0C0'],[1,'#C0C0C0']]
fig.add_trace(go.Surface(x=x1, y=y1, z=z_plane_pos, colorscale=grey, showscale=False))
fig.add_trace(go.Surface(x=xvals, y=yvals, z = zvals, colorscale= grey, showscale=False))
fig.add_trace(go.Surface(x=xvals2, y=yvals2, z = zvals2, colorscale=grey, showscale=False))
fig.update_layout(scene = dict(
xaxis = dict(nticks=4, range=[0,31.38],),
yaxis = dict(nticks=4, range=[0,110],),
zaxis = dict(nticks=4, range=[0,133],),),
legend_orientation="h",margin=dict(l=0, r=0, b=0, t=0))

Generate synthetic time series data from existing sample data

Are there any good libraries/tools in Python for generating synthetic time series data from existing sample data? For example, I have sales data from January-June and would like to generate synthetic time series samples for July-December (keeping time series factors intact, like trend, seasonality, etc.).
Leaving the question of the quality of such data aside, here is a simple approach: you can use a Gaussian distribution to generate synthetic data based off a sample. Below is the critical part.
import numpy as np
# x is the original sample, a NumPy array of features
feature_means = np.mean(x, axis=1)
feature_std = np.std(x, axis=1)
random_normal_feature_values = np.random.normal(feature_means, feature_std)
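For illustration, here is a self-contained toy version of that snippet (the numbers are made up): each row of x is one feature over time, and one synthetic value is drawn per feature.
import numpy as np

x = np.array([[10., 12., 11., 13.],     # feature 1 over time
              [100., 98., 103., 99.]])  # feature 2 over time
feature_means = np.mean(x, axis=1)
feature_std = np.std(x, axis=1)
synthetic = np.random.normal(feature_means, feature_std)
print(synthetic)  # one synthetic value per feature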
Here is the fully functioning code I used:
def generate_synthetic_data(sample_dataset, window_mean, window_std, fixed_window=None, variance_range=1, sythesize_ratio=2, forced_reverse=False):
    synthetic_data = pd.DataFrame(columns=sample_dataset.columns)
    synthetic_data.insert(len(sample_dataset.columns), "synthesis_seq", [], True)
    for k in range(sythesize_ratio):
        if len(synthetic_data) >= len(sample_dataset) * sythesize_ratio:
            break
        # this loop generates a set that resembles the entire dataset
        country_synthetic = pd.DataFrame(columns=synthetic_data.columns)
        if fixed_window is not None:
            input_sequence_len = fixed_window
        else:
            input_sequence_len = int(np.random.normal(window_mean, window_std))
        # population data change
        country_data_i = sample_dataset
        if len(country_data_i) < input_sequence_len:
            continue
        feature_length = configuration['feature_length']  # number of features to be randomized
        country_data_array = country_data_i.to_numpy()
        country_data_array = country_data_array.T[:feature_length]
        country_data_array = country_data_array.reshape(feature_length, len(country_data_i))
        x = country_data_array[:feature_length].T
        reversed = np.random.normal(0, 1) > 0
        if reversed:
            x = x[::-1]
        sets = 0
        x_list = []
        dict_x = dict()
        for i in range(input_sequence_len):
            array_len = ((len(x) - i) - ((len(x) - i) % input_sequence_len)) + i
            if array_len <= 0:
                continue
            sets = int(array_len / input_sequence_len)
            if sets <= 0:
                continue
            x_temp = x[i:array_len].T.reshape(sets, feature_length, input_sequence_len)
            uniq_keys = np.array([i + (input_sequence_len * k) for k in range(sets)])
            x_temp = x_temp.reshape(feature_length, sets, input_sequence_len)
            arrays_split = np.hsplit(x_temp, sets)
            dict_x.update(dict(zip(uniq_keys, arrays_split)))
        temp_x_list = [dict_x[i].T for i in sorted(dict_x.keys())]
        temp_x_list = np.array(temp_x_list).squeeze()
        feature_means = np.mean(temp_x_list, axis=1)
        feature_std = np.std(temp_x_list, axis=1) / variance_range
        random_normal_feature_values = np.random.normal(feature_means, feature_std).T
        random_normal_feature_values = np.round(random_normal_feature_values, 0)
        random_normal_feature_values[random_normal_feature_values < 0] = 0
        if reversed:
            random_normal_feature_values = random_normal_feature_values.T[::-1]
            random_normal_feature_values = random_normal_feature_values.T
        for i in range(len(random_normal_feature_values)):
            country_synthetic[country_synthetic.columns[i]] = random_normal_feature_values[i]
        country_synthetic['synthesis_seq'] = k
        synthetic_data = synthetic_data.append(country_synthetic, ignore_index=True)
    return synthetic_data
for i in range(1):
    directory_name = '/synthetic_' + str(i)
    mypath = source_path + '/cleaned' + directory_name
    if os.path.exists(mypath) == False:
        os.mkdir(mypath)
    data = generate_synthetic_data(original_data, window_mean=0, window_std=0, fixed_window=2, variance_range=10**i, sythesize_ratio=1)
    synthetic_data.append(data)
    #data.to_csv(mypath+'/synthetic_'+str(i)+'_dt31_05_.csv', index=False )
    print('synth step : ', i, ' len : ', len(synthetic_data))
Good luck!

Making linear regression more compact (python)

I'm trying to fit a linear regression to a dataset. I have plotted the data and the regression, but my code is not very efficient. Is there any way to make it more compact?
import numpy as np
import matplotlib.pyplot as plt
temp1, tid0 = np.genfromtxt("forsok1.txt", dtype=float, skip_header=41, usecols = (1,2)).T
tid1 = tid0 - 200
temp2, tid2 = np.genfromtxt("forsok2.txt", dtype=float, skip_header=1, usecols = (1,2)).T
temp3, tid3 = np.genfromtxt("forsok3.txt", dtype=float, skip_header=1, usecols = (1,2)).T
tempreg1_1 = np.zeros(88)
tidreg1_1 = np.zeros(88)
for i in range(0, 88):
    tempreg1_1[i] = temp1[i]
    tidreg1_1[i] = tid1[i]
tempreg2_1 = np.zeros(65)
tidreg2_1 = np.zeros(65)
tempreg3_1 = np.zeros(65)
tidreg3_1 = np.zeros(65)
for i in range(0, 65):
    tempreg2_1[i] = temp2[i]
    tidreg2_1[i] = tid2[i]
    tempreg3_1[i] = temp3[i]
    tidreg3_1[i] = tid3[i]
tempreg1_2 = np.zeros(59)
tidreg1_2 = np.zeros(59)
for i in range(0, 59):
    tempreg1_2[i] = temp1[i+112]
    tidreg1_2[i] = tid1[i+112]
tempreg2_2 = np.zeros(76)
tidreg2_2 = np.zeros(76)
for i in range(0, 76):
    tempreg2_2[i] = temp2[i+93]
    tidreg2_2[i] = tid2[i+93]
tempreg3_2 = np.zeros(55)
tidreg3_2 = np.zeros(55)
for i in range(0, 55):
    tempreg3_2[i] = temp3[i+100]
    tidreg3_2[i] = tid3[i+100]
tempreg1_3 = np.zeros(76)
tidreg1_3 = np.zeros(76)
for i in range(0, 76):
    tempreg1_3[i] = temp1[i+210]
    tidreg1_3[i] = tid1[i+210]
tempreg2_3 = np.zeros(80)
tidreg2_3 = np.zeros(80)
for i in range(0, 80):
    tempreg2_3[i] = temp2[i+207]
    tidreg2_3[i] = tid2[i+207]
tempreg3_3 = np.zeros(91)
tidreg3_3 = np.zeros(91)
for i in range(0, 91):
    tempreg3_3[i] = temp3[i+181]
    tidreg3_3[i] = tid3[i+181]
R1_1, b1_1 = np.polyfit(tidreg1_1, tempreg1_1, 1)
R2_1, b2_1 = np.polyfit(tidreg2_1, tempreg2_1, 1)
R3_1, b3_1 = np.polyfit(tidreg3_1, tempreg3_1, 1)
R1_2, b1_2 = np.polyfit(tidreg1_2, tempreg1_2, 1)
R2_2, b2_2 = np.polyfit(tidreg2_2, tempreg2_2, 1)
R3_2, b3_2 = np.polyfit(tidreg3_2, tempreg3_2, 1)
R1_3, b1_3 = np.polyfit(tidreg1_3, tempreg1_3, 1)
R2_3, b2_3 = np.polyfit(tidreg2_3, tempreg2_3, 1)
R3_3, b3_3 = np.polyfit(tidreg3_3, tempreg3_3, 1)
tempreg1_1[0] = b1_1
tempreg2_1[0] = b2_1
tempreg3_1[0] = b3_1
for j in range(1, 88):
    tempreg1_1[j] = tempreg1_1[j-1] + 5*R1_1
for j in range(1, 65):
    tempreg2_1[j] = tempreg2_1[j-1] + 5*R2_1
    tempreg3_1[j] = tempreg3_1[j-1] + 5*R3_1
tempreg1_2[0] = b1_2 + 560*R1_2
tempreg2_2[0] = b2_2 + 465*R2_2
tempreg3_2[0] = b3_2 + 500*R3_2
for j in range(1, 59):
    tempreg1_2[j] = tempreg1_2[j-1] + 5*R1_2
for j in range(1, 76):
    tempreg2_2[j] = tempreg2_2[j-1] + 5*R2_2
for j in range(1, 55):
    tempreg3_2[j] = tempreg3_2[j-1] + 5*R3_2
tempreg1_3[0] = b1_3 + 1050*R1_3
tempreg2_3[0] = b2_3 + 1035*R2_3
tempreg3_3[0] = b3_3 + 905*R3_3
for j in range(1, 76):
    tempreg1_3[j] = tempreg1_3[j-1] + 5*R1_3
for j in range(1, 80):
    tempreg2_3[j] = tempreg2_3[j-1] + 5*R2_3
for j in range(1, 91):
    tempreg3_3[j] = tempreg3_3[j-1] + 5*R3_3
plt.figure()
ax1 = plt.subplot(311)
ax2 = plt.subplot(312)
ax3 = plt.subplot(313)
ax1.plot(tid1, temp1, ':', color="g")
ax1.plot(tidreg1_1, tempreg1_1, '-.',color="b")
ax1.plot(tidreg1_2, tempreg1_2, '-.',color="b")
ax1.plot(tidreg1_3, tempreg1_3, '-.',color="b")
ax2.plot(tid2, temp2, ':', color="g")
ax2.plot(tidreg2_1, tempreg2_1, '-.',color="b")
ax2.plot(tidreg2_2, tempreg2_2, '-.',color="b")
ax2.plot(tidreg2_3, tempreg2_3, '-.',color="b")
ax3.plot(tid3, temp3, ':', color="g")
ax3.plot(tidreg3_1, tempreg3_1, '-.',color="b")
ax3.plot(tidreg3_2, tempreg3_2, '-.',color="b")
ax3.plot(tidreg3_3, tempreg3_3, '-.',color="b")
My code builds arrays from small parts of the dataset, then makes a linear regression from those arrays. The regression is then turned into another array, which is plotted in the subplots. This is done for three different datasets.
I have tried to make it more compact but haven't found a function to use. Thanks for the help, and sorry for my bad English.
This:
tempreg1_1 = np.zeros(88)
tidreg1_1 = np.zeros(88)
for i in range(0, 88):
    tempreg1_1[i] = temp1[i]
    tidreg1_1[i] = tid1[i]
Is the same as this:
tempreg1_1 = temp1[:88]
tidreg1_1 = tid1[:88]
So you may not even need to make those arrays, since you can potentially just use the 'slices' directly.
In general, you rarely need to pre-create an empty array and then fill it with a loop. If you find yourself doing this in NumPy, there's almost certainly a better way.
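Putting that together, here is a sketch of the compact pattern for one of the datasets (the segment bounds are taken from the question, and tid1/temp1 are as defined there):
import numpy as np

# (start, stop) index ranges of the three regions fitted on temp1/tid1
segments = [(0, 88), (112, 171), (210, 286)]
fits = []
for start, stop in segments:
    t = tid1[start:stop]  # slice instead of filling an array in a loop
    slope, intercept = np.polyfit(t, temp1[start:stop], 1)
    fits.append((t, slope * t + intercept))  # x values and fitted line
# each (t, line) pair can then be plotted, e.g. ax1.plot(t, line, '-.', color="b")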
You don't have to do all of this explicitly; you can loop over these almost-identical blocks of work. Here's a simplified case (your variable count is a bit much, so I use some easy names):
# read data
plt.figure()
ax1 = plt.subplot(311)
ax2 = plt.subplot(312)
ax3 = plt.subplot(313)
plots = [ax1, ax2, ax3]
for subplot in plots:
    # build the tidreg and tempreg arrays for this subplot here
    xCordinate = ...  # should be your tidreg
    y1 = tempreg1
    y2 = tempreg2
    regression1 = np.poly1d(np.polyfit(xCordinate, y1, 1))
    regression2 = np.poly1d(np.polyfit(xCordinate, y2, 1))
    subplot.plot(xCordinate, regression1(xCordinate), 'b-')
    subplot.plot(xCordinate, regression2(xCordinate), 'b-')
plt.show()
Each for loop corresponds to one subplot, so you only need to handle the data used in that subplot. On each pass through the loop the variables are reassigned, so you also don't have to create so many of them; in theory that cuts out two thirds of the work and saves a lot of memory.
For indexing or slicing arrays, you can refer to this question and the NumPy manual.

ValueError: x and y must have the same first dimension, but have different shapes

import urllib.request
from math import sqrt, fabs, exp
import matplotlib.pyplot as plot
from sklearn.linear_model import enet_path
from sklearn.metrics import roc_auc_score, roc_curve
import numpy
target_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data'
data = urllib.request.urlopen(target_url)
xList = []
for line in data:
    # split on comma
    row = line.strip().split(",".encode(encoding='utf-8'))
    xList.append(row)
xNum = []
labels = []
for row in xList:
    lastCol = row.pop()
    if lastCol == b'M':
        labels.append(1.0)
    else:
        labels.append(0.0)
    attrRow = [float(elt) for elt in row]
    xNum.append(attrRow)
nrow = len(xNum)
ncol = len(xNum[1])
alpha = 1.0
xMeans = []
xSD = []
for i in range(ncol):
    col = [xNum[j][i] for j in range(nrow)]
    mean = sum(col)/nrow
    xMeans.append(mean)
    colDiff = [(xNum[j][i] - mean) for j in range(nrow)]
    sumSq = sum([colDiff[i] * colDiff[i] for i in range(nrow)])
    stdDev = sqrt(sumSq/nrow)
    xSD.append(stdDev)
xNormalized = []
for i in range(nrow):
    rowNormalized = [(xNum[i][j] - xMeans[j])/xSD[j] for j in range(ncol)]
    xNormalized.append(rowNormalized)
meanLabel = sum(labels)/nrow
sdLabel = sqrt(sum([(labels[i] - meanLabel) * (labels[i] - meanLabel) for i in range (nrow)])/nrow)
labelNormalized = [(labels[i] - meanLabel)/sdLabel for i in range(nrow)]
nxval = 10
for ixval in range(nxval):
    idxTest = [a for a in range(nrow) if a%nxval == ixval]
    idxTrain = [a for a in range(nrow) if a%nxval != ixval]
    xTrain = numpy.array([xNormalized[r] for r in idxTrain])
    xTest = numpy.array([xNormalized[r] for r in idxTest])
    labelTrain = numpy.array([labelNormalized[r] for r in idxTrain])
    labelTest = numpy.array([labelNormalized[r] for r in idxTest])
    alphas, coefs, _ = enet_path(xTrain, labelTrain, l1_ratio = 0.8, fit_intercept=False, return_models=False)
    if ixval == 0:
        pred = numpy.dot(xTest, coefs)
        yOut = labelTest
    else:
        yTemp = numpy.array(yOut)
        yOut = numpy.concatenate((yTemp, labelTest), axis = 0)
        predTemp = numpy.array(pred)
        pred = numpy.concatenate((predTemp, numpy.dot(xTest, coefs)), axis = 0)
misClassRate = []
_,nPred = pred.shape
for iPred in range(1, nPred):
    predList = list(pred[:, iPred])
    errCnt = 0.0
    for irow in range(nrow):
        if (predList[irow] < 0.0) and (yOut[irow] >= 0.0):
            errCnt += 1.0
        elif (predList[irow] >= 0.0) and (yOut[irow] < 0.0):
            errCnt += 1.0
    misClassRate.append(errCnt/nrow)
minError = min(misClassRate)
idxMin = misClassRate.index(minError)
plotAlphas = numpy.array(alphas[1:len(alphas)])
misClassRate_np = numpy.array(misClassRate)
plot.figure()
plot.plot(plotAlphas, misClassRate_np, label='Misclassification Error Across Folds', linewidth=2)
plot.axvline(plotAlphas[idxMin], linestyle='--', label='CV Estimate of Best alpha')
plot.legend()
plot.semilogx()
ax = plot.gca()
ax.invert_xaxis()
plot.xlabel('alpha')
plot.ylabel('Misclassification Error')
plot.axis('tight')
plot.show()
When I execute the code above, it raises: ValueError: x and y must have same first dimension, but have shapes (99,) and (1,).
It seems the problem is due to unequal lengths of x and y.
But when I check plotAlphas and misClassRate_np, they show the same length, and both have been converted to arrays, which still doesn't fix the problem. I can't figure out what's happening.
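For reference, here is a minimal reproduction of this error with toy arrays (not the original data); matplotlib raises it whenever the x and y passed to plot() differ in their first dimension:
import numpy as np
import matplotlib.pyplot as plot

x = np.arange(99)
y = np.array([1.0])
print(x.shape, y.shape)  # (99,) (1,)
plot.plot(x, y)  # ValueError: x and y must have same first dimension
So it is worth printing the .shape (not just len()) of the exact objects passed to plot.plot() immediately before the call, since len() of a 2-D array only reports its first axis.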
