svm CUSTOM KERNEL PYTHON ERROR - python

I am trying to make a custom kernel in python. This is my code :
from sklearn import datasets
from sklearn.svm import SVC
from sklearn import svm
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
import PIL
from PIL import Image
import pylab as pl
import math
digits = datasets.load_digits()
X = digits.data[:-200]
Y = digits.target[:-200]
def kernal6(x, y):
    """Compute a norm-based similarity score between ``x`` and ``y``.

    The score is ``(||x|| + ||y|| - ||x - y||) / sqrt(||x|| * ||y||)``.

    Every norm is taken with ``np.linalg.norm`` without an ``axis``
    argument, so each one collapses its whole input to one number. The
    result is therefore a single scalar, even when ``x`` and ``y`` are 2-D
    arrays holding many samples.

    Args:
        x: Array-like of numbers; must be broadcastable against ``y``.
        y: Array-like of numbers; must be broadcastable against ``x``.

    Returns:
        A NumPy float scalar. The value is not finite when either input
        has zero norm, because the denominator is then zero.
    """
    d = np.linalg.norm(x - y)
    Xn = np.linalg.norm(x)
    Yn = np.linalg.norm(y)
    return (Xn + Yn - d) / np.sqrt(Xn * Yn)
clf5 = svm.SVC(kernel=kernal6)
clf5.fit(X,Y)
but I keep getting this error :
IndexError: tuple index out of range

You are returning the wrong value. A callable kernel passed to SVC must return a matrix with one entry per pair of samples (shape (n_samples_x, n_samples_y)), not a single number. Have a look at this example of a proper kernel function:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
digits = datasets.load_digits()
X = digits.data[:-200, :2] #You were doing this wrong too
Y = digits.target[:-200]
def my_kernel(x, y):
    """Return the Gram matrix of the weighted linear kernel ``x . M . y^T``.

    ``M`` is the fixed 2x2 diagonal matrix ``[[2, 0], [0, 1]]``, so the
    first feature is weighted twice as heavily as the second. Because
    ``M`` is 2x2, both inputs must have exactly two columns.

    Args:
        x: Array of shape ``(n_x, 2)``.
        y: Array of shape ``(n_y, 2)``.

    Returns:
        Array of shape ``(n_x, n_y)`` whose entry ``[i, j]`` is the kernel
        value between row ``i`` of ``x`` and row ``j`` of ``y``.
    """
    M = np.array([[2, 0], [0, 1.0]])
    return np.dot(np.dot(x, M), y.T)  # returns a matrix
def kernal6(x, y):
    """Compute ``(||x|| + ||y|| - ||x - y||) / sqrt(||x|| * ||y||)``.

    Each ``np.linalg.norm`` call has no ``axis`` argument, so it reduces
    its entire input to one number. The function therefore yields a single
    scalar even for 2-D inputs, instead of one value per pair of rows.

    Args:
        x: Array-like of numbers; must be broadcastable against ``y``.
        y: Array-like of numbers; must be broadcastable against ``x``.

    Returns:
        A NumPy float scalar. The value is not finite when either input
        has zero norm, because the denominator is then zero.
    """
    d = np.linalg.norm(x - y)
    Xn = np.linalg.norm(x)
    Yn = np.linalg.norm(y)
    return (Xn + Yn - d) / np.sqrt(Xn * Yn)  # returns a float
print "Testing SVC with my_kernel"
clf = svm.SVC(kernel=my_kernel)
clf.fit(X, Y) #works fine
print "Success!"
print "Testing kernal6"
print "kernal6 direct call:",kernal6(X, X) #will return a result
clf = svm.SVC(kernel=kernal6)
try:
clf.fit(X, Y)#fails
except:
print "Failed to fit with kernal6"

An IndexError means that you are trying to access an index of an array or tuple that does not exist. The only place where your code appears to index a tuple (the message says "tuple index out of range") is where you declare X and Y, so I suspect the problem is in the slicing notation. My guess is that the data has fewer than 200 elements: array[:-200] keeps only the first len(array) - 200 elements, and that count could be negative. However, I cannot run your code because my interpreter throws an error, so I apologize if I am wrong.

Related

Python logit regression matrix shape error "ValueError: endog and exog matrices are different sizes"

Basic setup: I'm trying to run a logit regression in Python on the probability of founding a business (the founder variable). The exogenous variables are year, age, edu_cat (education category), and sex.
The X matrix is (4, 650), and the y matrix(1, 650). All of the variables within the x matrix have 650 non-NaN observations.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
x=np.array ([ df_all['Year'], df_all['Age'], df_all['Edu_cat'], df_all['sex']])
y= np.array([df_all['founder']])
logit_model = sm.Logit(y, x)
result = logit_model.fit()
print(result)
So I'm tracking that the shape is good, but python is telling me otherwise. Am I missing something basic?
I believe the issue is with the y array having shape (1, 650) when it should be (650,), which is what it defaults to without the extra brackets. Additionally, I needed to transpose the x array so that it has shape (650, 4).

I can't run a linear regression and cross-validation. Can someone enlighten me? I get errors such as "could not convert string to float"

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
import statsmodels.api as sm
import scipy
import scipy.stats
import seaborn as sns
import numpy.random as npr
import math
from scipy.stats import norm
import sqlite3 as sql
import seaborn
from numba import jit, prange
df = pd.read_csv('ODI-2021.edited.csv')
df.info()
sr_targets = pd.Series(df['What is your stress level (0-100)?'])
sr_targets.describe()
df_features = df.drop('What is your stress level (0-100)?', axis=1)
print (df_features)
df_features.describe()
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
df.isnull().sum()
df_missing = df.dropna()
df_missing.shape
X = df["What is your stress level (0-100)?"]
y = df["Time you went to be Yesterday"]
est = sm.OLS(y, X.astype(float)).fit()
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
-ValueError: could not convert string to float: 'over 9000'
model.summary()
-AttributeError: 'LinearRegression' object has no attribute 'summary'
from sklearn import preprocessing
def convert(df):
    """Return a copy of ``df`` with ``Date`` label-encoded and NaNs filled.

    The ``Date`` column is replaced by the integer codes produced by
    ``preprocessing.LabelEncoder``; afterwards every remaining missing
    value is replaced with the sentinel ``-999``.

    Args:
        df: DataFrame that contains a ``Date`` column.

    Returns:
        A new DataFrame; the caller's ``df`` is left unmodified.
    """
    number = preprocessing.LabelEncoder()
    # Work on a copy: the original wrote into `data` before it was ever
    # assigned, which raised UnboundLocalError on every call.
    data = df.copy()
    data['Date'] = number.fit_transform(df['Date'])
    data = data.fillna(-999)
    return data
model = LinearRegression(fit_intercept=True)
result = model.fit(df_features, sr_targets)
-ValueError: could not convert string to float: '3/16/2021'
sr_coef = pd.Series(result.coef_, index=df_features.columns)
sr_coef
-NameError: name 'result' is not defined
sr_endog = sr_targets.copy()
df_exog = sm.add_constant(df_features)
model = sm.OLS(sr_endog, df_exog)
result = model.fit()
-ValueError: Pandas data cast to numpy dtype of object. Check input data with
np.asarray(data)
result.summary()
-NameError: name 'result' is not defined
cross validation
from sklearn.linear_model import LassoCV
from sklearn.model_selection import Fold
nb_folds = 10
cv = KFold(n_splits=nb_folds)
model = LassoCV(fit_intercept=True, cv=cv, n_alphas=200, max_iter=2000)
result = model.fit(df_features_rescaled, sr_targets)
-NameError: name 'df_features_rescaled' is not defined
fig = plt.figure(figsize=[16,15])
xvalues = np.log10(result.alphas_)
rmse_path = np.sqrt(result.mse_path_)
for k in range (nb_folds):
yvalues = rmse_path[:,k]
plt.plot(xvalues, yvalues)
pos_ymin = yvalues.argmin()
plt.plot(xvalues[pos_ymin], yvalues[pos_ymin], marker='o')
plt.axvline(np.log10(result.alpha_))
plt.title('RMSE for differebt alpha', fontsize=20)
plt.grid()
-NameError: name 'result' is not defined
sr_coef = pd.Series(result.coef_, index=df_features.columns)
sr_coef
This is my code and these are the errors I'm getting. Could someone help me with what I'm doing wrong? I have looked up the errors and I have no clue how to fix them. My data set has numbers, but also dates and answers such as yes/no and education-level responses (e.g. university), which I have no clue how to convert to float. I have been trying to run a regression with two columns that consist of numbers and I get these errors. For the cross-validation I am dropping one column and using the rest, and I'm getting the error that I haven't defined the variable result, which I have. I'm clueless.
Thanks in advance!
Each of your errors means something. Learning to read the errors is extremely important in understanding what is going on. For example,
est = sm.OLS(y, X.astype(float)).fit()
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
-ValueError: could not convert string to float: 'over 9000'
This suggests that someone placed the phrase "over 9000" in a cell of the CSV file you are opening. Hence, Python is having trouble figuring out how to convert that to a float. The same thing would happen if you tried to run
float("over 9000")
It appears the data needs to be cleaned up a bit before it can be used by statsmodels (sm). Python is trying to tell you the same thing here too:
-ValueError: could not convert string to float: '3/16/2021'
The string "3/16/2021" contains characters that cannot be part of a float, namely the "/" symbol.
I think it would be helpful if you broke up your errors and concerns into separate questions, then people could tackle them one at a time for you.

scRNA-seq: How to use TSNE python implementation using precalculated PCA score/load?

Python t-sne implementation from this resource: https://lvdmaaten.github.io/tsne/
Btw I'm a beginner to scRNA-seq.
What I am trying to do: Use a scRNA-seq data set and run t-SNE on it but with using previously calculated PCAs (I have PCA.score and PCA.load files)
Q1: I should be able to use my selected calculated PCAs in the tSNE, but which file do I use the pca.score or pca.load when running Y = tsne.tsne(X)?
Q2: I've tried removing/replacing parts of the PCA calculating code to attempt to remove PCA preprocessing but it always seems to give an error. What should I change for it to properly use my already PCA data and not calculate PCA from it again?
The piece of PCA processing code is this in its raw form:
def pca(X=np.array([]), no_dims=50):
    """
    Runs PCA on the NxD array X in order to reduce its dimensionality to
    no_dims dimensions.

    X is centered column-wise, the eigenvectors of X^T X are computed, and
    the centered data is projected onto the first no_dims eigenvector
    columns as returned by np.linalg.eig (they are not re-sorted here).
    If no_dims exceeds D, all D columns are used.

    Returns the Nx(min(no_dims, D)) projected array. Raises ValueError if
    X is not two-dimensional (including the empty default).
    """
    print("Preprocessing the data using PCA...")
    (n, d) = X.shape
    X = X - np.tile(np.mean(X, 0), (n, 1))
    # Eigendecomposition of the DxD scatter matrix; the previous
    # `(l, M) = X` tried to unpack the data matrix itself and failed.
    (l, M) = np.linalg.eig(np.dot(X.T, X))
    Y = np.dot(X, M[:, 0:no_dims])
    return Y
You should use the PCA score.
As for not running pca, you can just comment out this line:
X = pca(X, initial_dims).real
What I did was add a parameter do_pca and edit the function as follows:
def tsne(X=np.array([]), no_dims=2, initial_dims=50, perplexity=30.0,do_pca=True):
"""
Runs t-SNE on the dataset in the NxD array X to reduce its
dimensionality to no_dims dimensions. The syntaxis of the function is
`Y = tsne.tsne(X, no_dims, perplexity), where X is an NxD NumPy array.
"""
# Check inputs
if isinstance(no_dims, float):
print("Error: array X should have type float.")
return -1
if round(no_dims) != no_dims:
print("Error: number of dimensions should be an integer.")
return -1
# Initialize variables
if do_pca:
X = pca(X, initial_dims).real
(n, d) = X.shape
max_iter = 50
[.. rest stays the same..]
Using an example dataset, without commenting out that line:
import numpy as np
from sklearn.manifold import TSNE
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
import sys
import os
from tsne import *
X,y = load_digits(return_X_y=True,n_class=3)
If we run the default:
res = tsne(X=X,initial_dims=20,do_pca=True)
plt.scatter(res[:,0],res[:,1],c=y)
If we pass it a pca :
pc = pca(X)[:,:20]
res = tsne(X=pc,initial_dims=20,do_pca=False)
plt.scatter(res[:,0],res[:,1],c=y)

Sklearn.linear_model : ValueError: Found input variables with inconsistent numbers of samples: [1, 20]

I am trying to implement linear regression, but when I run the code I get the error ValueError: Found input variables with inconsistent numbers of samples: [1, 20] on the line linear.fit(x_train1,y_train1) (x_train1 and x are Series, and y is a Series).
When I change it to x=dataset.iloc[:,:-1], the type of x and x_train1 becomes DataFrame (y is still a Series) and it works correctly.
So why does it only work when x is a DataFrame, even though y is still a Series?
import pandas as pd
import numpy as np
import matplotlib.pyplot
dataset=pd.read_csv('Salary_Data.csv')
x=dataset.iloc[:,0]
y=dataset.iloc[:,1]
from sklearn.model_selection import train_test_split
x_train1,x_test1,y_train1,y_test1=
train_test_split(x,y,test_size=1/3,random_state=0)
#implementing simple linear regression
from sklearn.linear_model import LinearRegression
linear=LinearRegression()
linear.fit(x_train1,y_train1)
y_pred=linear.predict(x_test1)
Scikit-Learn does not accept a rank-1 (one-dimensional) array as the feature matrix. If you check the shape attribute of your x:
x.shape
it will return something like (23,), 23 being the number of rows, whereas scikit-learn expects (23, 1).
In order for it to work, try using reshape:
x = dataset.iloc[:, 0]
# A pandas Series has no reshape() method, so reshape the underlying NumPy
# array instead; -1 lets NumPy infer the row count, giving (n_samples, 1).
x = x.values.reshape(-1, 1)
...

Providing data to sklearn.svm.SVC()

I am trying to give those training data to sklearn.svm.SVC() but it returns the error ValueError: setting an array element with a sequence. when I try to clf.fit(v,v2). How do we process this data before giving it to SVC()?
from PIL import Image
from sklearn import svm
for i in xrange(1,55):
t = list(Image.open("train/"+str(i)+".png").getdata())
v.append(t)
v = np.asarray(v)
v2 = np.array(["1","F","9","D","E","E","E","9","0","D","0","3","C","B","F","9","A","E","B","8","A","8","7",
"9","9","3","C","6","1","E","6","6","C","C","F","A","8","0","1","F","F","E","9","4","6","0",
"7","2","D","9","A","C","7","E"])
clf = svm.SVC()
I think you are looking for something like this:
from scipy import misc
import glob
from sklearn import svm
filenames = glob.glob('train/*.png')
X = [misc.imread(each).flatten() for each in filenames]
y = ["1","F","9","D","E","E", ...]
model = svm.SVC().fit(X, y)
Notes:
X has the form (n_images, n_pixels) where n_pixels=width*height
y has length n_images (54 in your example)
This is just a start; you should try to feed the classifier with more meaningful features than single pixels.

Categories

Resources