Using LMC in GPR with multiple outputs and multiple inputs - Python

My dataset has 9900 samples with 4 inputs and 3 outputs.
import numpy as np
import matplotlib.pyplot as plt
import gpflow as gpf
import tensorflow as tf
from gpflow.utilities import print_summary
from gpflow.ci_utils import ci_niter
import pandas as pd
MAXITER = ci_niter(1000)
### Using LMC in GPR
# kernel = gpf.kernels.Matern32(lengthscales=[1, 1, 1, 1])
kern_list = [gpf.kernels.Matern32(lengthscales=[1, 1, 1, 1]) for _ in range(3)]
kernel = gpf.kernels.LinearCoregionalization(
    kern_list, W=np.random.randn(3, 3)
)
# X has shape (9900, 4), Y has shape (9900, 3)
m = gpf.models.GPR(data=(X, Y), kernel=kernel, mean_function=None)
When I try to optimise it, I get an error:
[error screenshot]
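For what it's worth, gpf.models.GPR expects a single-output kernel, while LinearCoregionalization is a multi-output kernel; GPflow's multi-output examples pair it with SVGP and shared inducing variables instead. A minimal sketch under that pattern, with the placeholder data and inducing-point choice as assumptions:

import numpy as np
import gpflow as gpf

# placeholder data with the question's shapes (9900 x 4 inputs, 9900 x 3 outputs)
X = np.random.rand(9900, 4)
Y = np.random.rand(9900, 3)

output_dim, rank = 3, 3
kern_list = [gpf.kernels.Matern32(lengthscales=[1.0] * 4) for _ in range(rank)]
kernel = gpf.kernels.LinearCoregionalization(kern_list, W=np.random.randn(output_dim, rank))

Z = X[::100].copy()  # hypothetical inducing points subsampled from the inputs
iv = gpf.inducing_variables.SharedIndependentInducingVariables(
    gpf.inducing_variables.InducingPoints(Z)
)
m = gpf.models.SVGP(kernel, gpf.likelihoods.Gaussian(),
                    inducing_variable=iv, num_latent_gps=rank)

opt = gpf.optimizers.Scipy()
opt.minimize(m.training_loss_closure((X, Y)), m.trainable_variables,
             options=dict(maxiter=1000))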

Related

Encoding target column

I'm constructing an ANN in Python and I'm having trouble encoding column [-1] (y) into binary numbers.
There are 6 different categories in this column, and I want to encode each one into a separate column, as done for the columns of X with OneHotEncoder.
Thanks,
Ido
[dataframe screenshot]
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder,StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.metrics import confusion_matrix,accuracy_score,mean_squared_error,r2_score
import tensorflow as tf
from tensorflow import keras
Political_opinions = pd.read_csv("data.csv")
Political_opinions.drop(columns=['Timestamp', 'Yas', 'Bolge', 'Egitim'], inplace=True)
print(Political_opinions)
one_hot_color = pd.get_dummies(Political_opinions.parti).values  # computed but never used
print(Political_opinions.head(10))
# "Cinsiyet" = gender ("Erkek" = male); the soru ("question") columns are yes/no ("Hayır" = no)
Political_opinions["Cinsiyet"] = (Political_opinions["Cinsiyet"] == "Erkek").astype(int)
for col in ["soru%d" % i for i in range(1, 11)]:
    Political_opinions[col] = (Political_opinions[col] == "Hayır").astype(int)
# Bug: each line below overwrites the previous one; after the first assignment
# "parti" is 0/1, so the later string comparisons are always False and the
# column ends up all zeros.
Political_opinions["parti"] = (Political_opinions["parti"] == "AKP").astype(int)
Political_opinions["parti"] = (Political_opinions["parti"] == "MHP").astype(int)
Political_opinions["parti"] = (Political_opinions["parti"] == "CHP").astype(int)
Political_opinions["parti"] = (Political_opinions["parti"] == "DIĞER").astype(int)
Political_opinions["parti"] = (Political_opinions["parti"] == "HDP").astype(int)

Why does NegativeBinomialP give different coefficients compared to R?

I am having some difficulty reproducing the following R exercise in Python to achieve the same results. What am I missing?
R exercise: https://stats.idre.ucla.edu/r/dae/negative-binomial-regression/
Data link: https://www.dropbox.com/s/mz4stp72eco3rfq/sampleNBdata2.dat?dl=0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.distributions.discrete as distr
from statsmodels.discrete.discrete_model import NegativeBinomialP, NegativeBinomial, Poisson, GeneralizedPoisson
from statsmodels.discrete.count_model import (ZeroInflatedNegativeBinomialP, ZeroInflatedPoisson,
                                              ZeroInflatedGeneralizedPoisson)
import statsmodels.discrete._diagnostics_count as dia
import statsmodels.api as sm
f = open('sampleNBdata2.dat')
id = []
gender = []
math = []
daysabs = []
prog = []
x = []
f.readline()  # skip the header line
d = {}
d['Academic'] = 1
d['Vocational'] = 2
d['General'] = 3
for line in f:
    l = line.split(',')
    id.append(l[1])
    gender.append(l[2])
    math.append(l[3])          # independent
    daysabs.append(int(l[4]))  # dependent y
    prog.append(l[5])          # independent
    # x.append([int(l[3]), d[l[5]], ])
    x.append([int(l[3]), int(l[5]), ])
print(x, daysabs)
endog = np.array(daysabs)
exog = np.array(x)
print("endog", endog.shape)
print("exog", exog.shape)
# model_nb = NegativeBinomial(endog, exog, loglike_method='nb2')
model_nb = NegativeBinomialP(endog, exog, p=2)
res_nb = model_nb.fit(method='bfgs', maxiter=5000, maxfun=5000)
print(endog)
print(exog)
print(res_nb.summary())
[Python output and R output screenshots]
The following code reproduces the R results with nearly identical coefficients; the mismatch above comes from fitting without an intercept and treating prog as a single numeric column instead of dummy-coded categories:
df = pd.read_csv('sampleNBdata.dat')
# dummy-code the prog categories, as R does for factors
data = pd.concat((df, pd.get_dummies(df['prog'], drop_first=False)), axis=1)
endog = data['daysabs']
data['intercept'] = 1  # R includes an intercept by default
exog = data.drop(['prog', 'daysabs', 'id', 'gender', 'Unnamed: 0', 'General'], axis=1)
model_nb = NegativeBinomialP(endog, exog, p=2)
res_nb = model_nb.fit(method='bfgs', maxiter=5000, maxfun=5000)
print(res_nb.summary())
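For comparison, a minimal sketch of the same model via the statsmodels formula API, which adds the intercept and dummy-codes prog automatically (assuming df holds the columns named above):

import statsmodels.formula.api as smf

res_formula = smf.negativebinomial("daysabs ~ math + C(prog)", data=df).fit()
print(res_formula.summary())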

ValueError: could not convert string to float in univariate_selection

Hello, I'm using the univariate selection method to pick the best features from the following dataset:
https://i.stack.imgur.com/J31T0.png
But I get an error:
ValueError: could not convert string to float: 'SUDMyYggegA'
Below is my code:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
data = pd.read_csv("C://Users/Shahnawaz Irfan/Desktop/demo.csv")
X = data.iloc[:, 0:15]  # note: these columns still contain string values
y = data.iloc[:, -13]
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['features','Score']
print(featureScores.nlargest(15, 'Score'))
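A minimal sketch of one way past the error, assuming the string columns (like the one in the message) are not meant to be scored: restrict X to numeric columns before fitting (chi2 also requires non-negative values):

# hypothetical fix: keep only numeric columns so chi2 can score them
X_numeric = X.select_dtypes(include=[np.number])
fit = SelectKBest(score_func=chi2, k=10).fit(X_numeric, y)
scores = pd.Series(fit.scores_, index=X_numeric.columns)
print(scores.nlargest(10))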

Getting the variance of each column in pandas

I want to calculate the variance of features saved in a Train and a Test file, laid out as follows:
col1  Feature0  Feature1  Feature2     Feature3   Feature4  Feature5  Feature6  Feature7  Feature8     Feature9
col2  26658     40253.5   3.22115e+09  0.0277727  5.95939   266.56    734.248   307.364   0.000566779  0.000520574
col3  2658      4053.5    3.25e+09     0.0277     5.95939   266.56    734.248   307.364   0.000566779  0.000520574
....
For that I've written the following:
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
#from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold  # this import was missing
from matplotlib import pyplot as plt
# Reading the csv files
training_file = 'Training.csv'
testing_file = 'Test.csv'
Training_Frame = pd.read_csv(training_file)
Testing_Frame = pd.read_csv(testing_file)
Training_Frame.shape
# Now that the feature values are loaded, start
# with the standardisation of those values
stdsc = preprocessing.MinMaxScaler()
np_scaled_train = stdsc.fit_transform(Training_Frame.iloc[:, :-2])
sel = VarianceThreshold(threshold=(.2 * (1 - .2)))
sel.fit_transform(np_scaled_train)
pd_scaled_train = pd.DataFrame(data=np_scaled_train)
pd_scaled_train.to_csv('variance_result.csv', header=False, index=False)
This obviously doesn't work: the result in variance_result.csv is just the normalized training matrix, because the return value of fit_transform is never used.
So my question: how can I get the indices of the columns (features) whose variance is below 20%?
Thanks in advance!
Update
I've solved the variance issue this way :
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
#from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from matplotlib import pyplot as plt
from sklearn.feature_selection import VarianceThreshold
# Reading the csv files
training_file = 'Training.csv'
testing_file = 'Test.csv'
Training_Frame = pd.read_csv(training_file)
Testing_Frame = pd.read_csv(testing_file)
Training_Frame.shape
# Now that the feature values are loaded, start
# with the standardisation of those values
stdsc = preprocessing.MinMaxScaler()
np_scaled_train = stdsc.fit_transform(Training_Frame.iloc[:, :-2])
pd_scaled_train = pd.DataFrame(data=np_scaled_train)
variance = pd_scaled_train.apply(np.var, axis=0)
pd_scaled_train.to_csv('variance_result.csv', header=False, index=False)
temp_df = pd.DataFrame(variance.values, Training_Frame.columns.values[:-2])
temp_df.T.to_csv('Training_features_variance.csv', index=False)
Now I still don't know how to get the indices of features whose variance is, say, bigger than 0.2 from variance other than by running a loop!
Just set the threshold to 0.0 and then use the variances_ attribute of the VarianceThreshold object to get the variances of all your features, then you can identify which of them have lower variance.
from sklearn.feature_selection import VarianceThreshold
X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
selector = VarianceThreshold()
selector.fit_transform(X)
selector.variances_
# Output: array([0.        , 0.22222222, 2.88888889, 0.        ])
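Following up on the asker's loop concern, a short sketch: boolean masking over variances_ (or over the variance Series from the update) yields the indices without an explicit loop; 0.2 is just the cut-off from the question:

import numpy as np
low_var_idx = np.where(selector.variances_ < 0.2)[0]    # columns below the cut-off
high_var_idx = np.where(selector.variances_ >= 0.2)[0]  # columns at or above it
# with the pandas Series from the update: variance[variance > 0.2].index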

denoise in scikit-learn for color image is not working

I was following this example, trying to denoise an image with dictionary learning. The code works for a grayscale image but not for a 3-channel (color) image.
Here is my code:
from time import time
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from sklearn.decomposition import MiniBatchDictionaryLearning
from sklearn.feature_extraction.image import extract_patches_2d
from sklearn.feature_extraction.image import reconstruct_from_patches_2d
from skimage.restoration import denoise_nl_means
from skimage import feature

c = np.asarray(Image.open('starfish.jpg'))
n0 = np.asarray(Image.open('starfish.jpg'))
c = c / 255
n0 = n0 / 255
height, width, chanel = n0.shape
n0 = n0 + 0.075 * np.random.randn(height, width, chanel)  # add Gaussian noise
patchsize = (7, 14)
t0 = time()
data = extract_patches_2d(c, (7, 14))  # for a color image each patch is (7, 14, 3)
data = data.reshape(data.shape[0], -1)
data = data - np.mean(data, axis=0)
data = data / np.std(data, axis=0)
t1 = time()
print('Total time : ', round((t1 - t0), 2), ' sec')
print('Learning the dictionary ....')
t2 = time()
n_iter = 1000
dico = MiniBatchDictionaryLearning(n_components=100, alpha=2, n_iter=n_iter)
V = dico.fit(data).components_
print(V.shape)
t3 = time()
print('No of iteration : ', n_iter)
print('Total time taken for Dictionary learning : ', round((t3 - t2), 2), ' sec')
t4 = time()
n0_data = extract_patches_2d(n0, (7, 14))
print(n0_data.shape)
print(n0_data)
n0_data = n0_data.reshape(n0_data.shape[0], -1)
intercept = np.mean(n0_data, axis=0)
n0_data = n0_data - intercept
dico.set_params(transform_algorithm='omp', transform_n_nonzero_coefs=1)
code = dico.transform(n0_data)
patches = np.dot(code, V)
patches = patches + intercept
print(patches)
print(patches.shape)
print(patches[0].shape)
patches = patches.reshape(len(n0_data), *patchsize)  # fails for color: the channel axis is dropped
result = reconstruct_from_patches_2d(patches, (height, width))
denoise = denoise_nl_means(result, 1, 9, 0.08, multichannel=True)
edge = feature.canny(denoise, sigma=3)
print(edge)
plt.imshow(result, cmap='gray')
plt.show()
print('Total time taken for sparse modeling : ', round((time() - t4), 2))
The same code works perfectly if I convert to grayscale using these lines:
c = np.asarray(Image.open('starfish.jpg').convert('L'))
n0 = np.asarray(Image.open('starfish.jpg').convert('L'))
The error is:
File "color.py", line 103, in <module>
    patches = patches.reshape(len(n0_data), *patchsize)
ValueError: total size of new array must be unchanged
How do I solve this problem?
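A minimal sketch of a fix, assuming the three color channels should be kept: each flattened patch holds 7 * 14 * 3 = 294 values, so the channel axis must reappear both in the reshape and in the reconstruction target shape:

# hypothetical fix: carry the channel axis through reshape and reconstruction
patchsize = (7, 14, chanel)  # (7, 14, 3) for an RGB image
patches = patches.reshape(len(n0_data), *patchsize)
result = reconstruct_from_patches_2d(patches, (height, width, chanel))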
