I need to visualize this dataset. I first got a warning saying that my columns had mixed dtypes, so I am trying to set low_memory to False. However, I cannot find the right syntax.
import numpy as np
import pandas as pd
import sklearn
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import io
from google.colab import files
uploaded = files.upload()
train_data = pd.read_csv(io.BytesIO(uploaded['train.csv'],
low_memory=False))
num_rows = train_data.shape[0]
counter_nan = train_data.isnull().sum()
counter_without_nan = counter_nan[counter_nan == 0]
train_data = train_data[counter_without_nan.keys()]
train_data = train_data.drop({"Team", "DisplayName" , "GameClock" ,
"PossessionTeam" ,"OffensePersonnel" , "DefensePersonnel" ,
"PlayDirection" , "TimeHandoff" , "TimeSnap" , "PlayerHeight" ,
"PlayerBirthDate" , "PlayerCollegeName" , "Position" , "HomeTeamAbbr" ,
"VisitorTeamAbbr" , "Stadium" , "Location", "Turf"},axis = 1)
c = train_data.iloc[:,:-1].values
standard_scalar = StandardScaler()
c_std = standard_scalar.fit_transform(c)
tsne = TSNE(n_components=2, random_state = 0)
c_test_2d = tsne.fit_transform(c_std)
markers = ('s', 'd', 'o', '^', 'v')
color_map = {0:'red', 1:'blue' ,2:'lightgreen',3:'purple', 4:'cyan'}
plt.figure()
for idx, cl in enumerate(np.unique(c_test_2d)):
    plt.scatter(x=c_test_2d[cl, 0], y=c_test_2d[cl, 1], c=color_map[idx],
                marker=markers[idx], label=cl)
plt.show()
I expected:
train_data = pd.read_csv(io.BytesIO(uploaded['train.csv'], low_memory=False))
to set low_memory to False.
Welcome to Stack Overflow!
Try changing the line below:
train_data = pd.read_csv(io.BytesIO(uploaded['train.csv'], low_memory=False))
to
train_data = pd.read_csv(io.BytesIO(uploaded['train.csv']), low_memory=False)
You were passing the low_memory argument to io.BytesIO instead of pd.read_csv; the closing parenthesis was in the wrong place.
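For completeness, a minimal sketch of the corrected upload-and-read flow in Colab (assuming the uploaded file is named train.csv):

import io
import pandas as pd
from google.colab import files

uploaded = files.upload()
# BytesIO wraps only the raw upload bytes; low_memory is a pd.read_csv
# keyword, so it must sit outside the io.BytesIO(...) call
train_data = pd.read_csv(io.BytesIO(uploaded['train.csv']), low_memory=False)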
I'm constructing an ANN in Python and I'm having trouble encoding the last column (column[-1], i.e. y) into binary values.
There are six distinct values in this column, and I want to encode each one into a separate column, as was done for the columns of X with OneHotEncoder.
Thanks,
Ido
(dataframe screenshot)
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder,StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.metrics import confusion_matrix,accuracy_score,mean_squared_error,r2_score
import tensorflow as tf
from tensorflow import keras
Political_opinions = pd.read_csv("data.csv")
Political_opinions.drop(columns=['Timestamp','Yas','Bolge','Egitim'],axis=1,inplace=True)
print(Political_opinions)
one_hot_color = pd.get_dummies(Political_opinions.parti).values
print(Political_opinions.head(10))
Political_opinions["Cinsiyet"] = (Political_opinions["Cinsiyet"]=="Erkek").astype(int)
Political_opinions["soru1"] = (Political_opinions["soru1"]=="Hayır").astype(int)
Political_opinions["soru2"] = (Political_opinions["soru2"]=="Hayır").astype(int)
Political_opinions["soru3"] = (Political_opinions["soru3"]=="Hayır").astype(int)
Political_opinions["soru4"] = (Political_opinions["soru4"]=="Hayır").astype(int)
Political_opinions["soru5"] = (Political_opinions["soru5"]=="Hayır").astype(int)
Political_opinions["soru6"] = (Political_opinions["soru6"]=="Hayır").astype(int)
Political_opinions["soru7"] = (Political_opinions["soru7"]=="Hayır").astype(int)
Political_opinions["soru8"] = (Political_opinions["soru8"]=="Hayır").astype(int)
Political_opinions["soru9"] = (Political_opinions["soru9"]=="Hayır").astype(int)
Political_opinions["soru10"] = (Political_opinions["soru10"]=="Hayır").astype(int)
Political_opinions["parti"] = (Political_opinions["parti"]=="AKP").astype(int)
Political_opinions["parti"] = (Political_opinions["parti"]=="MHP").astype(int)
Political_opinions["parti"] = (Political_opinions["parti"]=="CHP").astype(int)
Political_opinions["parti"] = (Political_opinions["parti"]=="DIĞER").astype(int)
Political_opinions["parti"] = (Political_opinions["parti"]=="HDP").astype(int)
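Note that the successive reassignments of Political_opinions["parti"] overwrite one another: after the first line the column holds 0/1 integers, so every later comparison (=="MHP", =="CHP", ...) is False and the column ends up all zeros. A minimal sketch of the separate-column encoding with pd.get_dummies, assuming a 'parti' column holding the party labels seen above (the toy frame here is hypothetical):

import pandas as pd

# hypothetical stand-in for Political_opinions
df = pd.DataFrame({"parti": ["AKP", "MHP", "CHP", "HDP", "DIĞER"]})

# one 0/1 column per party, joined back onto the frame
one_hot = pd.get_dummies(df["parti"], prefix="parti")
df = pd.concat([df.drop(columns="parti"), one_hot], axis=1)
print(df)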
I'm dealing with a multicollinearity problem using the variance_inflation_factor() function.
But after running the function, I found that it returned all the scores as infinite values.
Here's my code:
from rdkit import Chem
import pandas as pd
import numpy as np
from numpy import array
data = pd.read_csv('Descriptors_raw.csv')
class_ = pd.read_csv('class_file.csv')
class_tot = pd.read_csv('class_total.csv')
mols_A1 = Chem.SDMolSupplier('finaldata_A1.sdf')
mols_A2 = Chem.SDMolSupplier('finaldata_A2.sdf')
mols_B = Chem.SDMolSupplier('finaldata_B.sdf')
mols_C = Chem.SDMolSupplier('finaldata_C.sdf')
mols = []
mols.extend(mols_A1)
mols.extend(mols_A2)
mols.extend(mols_B)
mols.extend(mols_C)
mols_df = pd.DataFrame(mols)
mols = pd.concat([mols_df, class_tot, data], axis=1)
mols = mols.dropna(axis=0, thresh=1400)
mols.groupby('target_name_quarter').mean()
fill_mean_func = lambda g: g.fillna(g.mean())
mols = mols.groupby('target_name_quarter').apply(fill_mean_func)
molfiles = mols.loc[:, :'target_quarter']
descriptors = mols.loc[:, 'nAcid':'Zagreb']
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
fitted = scaler.fit(descriptors)
descriptors_scaled = scaler.transform(descriptors)
descriptors_scaled = pd.DataFrame(descriptors_scaled, columns=descriptors.columns, index = list(descriptors.index.values))
from sklearn.feature_selection import VarianceThreshold
def variance_threshold_selector(data, threshold):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]
descriptors_del_lowvar = variance_threshold_selector(descriptors_scaled, 0.01)
mols = pd.concat([molfiles, descriptors_del_lowvar.loc[:, 'nAcid':'Zagreb']], axis=1)
mols.loc[:, 'nAcid':'Zagreb'].corr()
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
%matplotlib inline
sns.pairplot(mols[['apol', 'nAtom', 'nHeavyAtom', 'nH', 'nAcid']])
vif = pd.DataFrame()
des = mols.loc[:, 'nAcid':'Zagreb']
vif["VIF factor"] = [variance_inflation_factor(des.values, i) for i in range(des.shape[1])]
vif["features"] = des.columns
print(vif)
I used MinMaxScaler() when eliminating low-variance features so that all the variables are on the same scale.
print(vif) returns a dataframe of all infinite values and I cannot figure out why.
Thank you in advance :)
This shows a perfect correlation between two independent variables. With a perfect correlation we get R^2 = 1, and VIF = 1/(1 - R^2) becomes infinite. To solve this problem, drop one of the variables causing the perfect multicollinearity from the dataset.
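As an illustration, a minimal sketch of one common way to do that automatically, assuming a dataframe of scaled descriptors such as mols.loc[:, 'nAcid':'Zagreb']: repeatedly drop the feature with the highest VIF until every remaining VIF is finite and below a chosen cutoff.

import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

def drop_high_vif(df, cutoff=10.0):
    # 10 is a common rule-of-thumb cutoff; tighten it if needed
    X = df.copy()
    while X.shape[1] > 1:
        vifs = pd.Series(
            [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
            index=X.columns)
        worst = vifs.idxmax()
        if np.isfinite(vifs[worst]) and vifs[worst] < cutoff:
            break
        # drop the single worst feature and recompute on the survivors
        X = X.drop(columns=worst)
    return X

Running des = drop_high_vif(des) before building the VIF table should leave only finite values.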
We were given some code for a support vector machine where we are supposed to implement leave-one-out cross validation. If I understand it correctly, leave-one-out creates as many test sets as there are samples, which means that for a big dataset the process will be costly and will most likely take quite a while to produce results.
I have tried to implement leave-one-out in the given SVM code with only one iteration and with 773 data points in total. I expected it to take some time, but two hours later the code is still running without any result, which makes me believe it might be stuck in a loop of some kind...
Does anyone have a suggestion as to what might be wrong? I'm not getting any error message either.
The entire code is as follows, with the leave-one-out part in the last function at the bottom (executed in a Jupyter notebook on Binder):
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gseapy as gp
from gseapy.plot import gseaplot
import qvalue
from ipywidgets import interact, interact_manual
from ipywidgets import IntSlider, FloatSlider, Dropdown, Text
import sklearn as skl
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import LeaveOneOut
from sklearn import svm
interact_enrich=interact_manual.options(manual_name="Enrichment analysis")
interact_plot=interact_manual.options(manual_name="Plot")
interact_calc=interact_manual.options(manual_name="Calculate tests")
interact_gen=interact_manual.options(manual_name="Initialize data")
interact_SVM=interact_manual.options(manual_name="Train SVM")
clinical_data = pd.read_csv('../data/brca_clin.tsv.gz', sep ='\t', index_col=2)
clinical_data = clinical_data.iloc[4:,1:]
expression_data = pd.read_csv('../data/brca.tsv.gz', sep ='\t', index_col=1)
expression_data = expression_data.iloc[:,2:].T
def split_data(clinical_df, expression_df, separator, cond1, cond2):
    try:
        group1 = clinical_df[separator] == cond1
        index1 = clinical_df[group1].index
        group2 = clinical_df[separator] == cond2
        index2 = clinical_df[group2].index
    except:
        print('Clinical condition wrong')
    expression1 = expression_df.loc[index1].dropna()
    expression2 = expression_df.loc[index2].dropna()
    expression = pd.concat([expression1, expression2])
    X = expression.values
    y = np.append(np.repeat(0, len(expression1)), np.repeat(1, len(expression2)))
    display(pd.DataFrame([len(index1), len(index2)], columns=['Number of points'], index=['Group 1', 'Group 2']))
    return X, y
def plot_pca_variance(X, scale=False, ncomp=1):
    if scale:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    pca = PCA()
    pca.fit(X)
    plt.rcParams["figure.figsize"] = (20, 10)
    sns.set(style='darkgrid', context='talk')
    plt.plot(np.arange(1, len(pca.explained_variance_ratio_) + 1), np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('Number of components')
    plt.ylabel('Cumulative explained variance')
    plt.vlines(ncomp, 0, plt.gca().get_ylim()[1], color='r', linestyles='dashed')
    h = np.cumsum(pca.explained_variance_ratio_)[ncomp - 1]
    plt.hlines(h, 0, plt.gca().get_xlim()[1], color='r', linestyles='dashed')
    plt.title(str(ncomp) + ' components, ' + str(round(h, 3)) + ' variance explained')
    plt.show()
def reduce_data(X, n, scale=True):
    if scale:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    pca = PCA(n_components=n)
    Xr = pca.fit_transform(X)
    return Xr
def interact_split_data(Criteria, Group_1, Group_2):
    global BRCA_X, BRCA_y
    BRCA_X, BRCA_y = split_data(clinical_data, expression_data, Criteria, Group_1, Group_2)
def interact_SVM_1(Rescale, Max_iterations):
    max_iter = int(Max_iterations)
    loo = LeaveOneOut()
    ac_matrix_train, ac_matrix_test = np.array([]), np.array([])
    for train_id, test_id in loo.split(BRCA_X, BRCA_y):
        X_train, X_test, y_train, y_test = BRCA_X[train_id, :], BRCA_X[test_id, :], BRCA_y[train_id], BRCA_y[test_id]
        clf = svm.LinearSVC(C=0.1, max_iter=100000).fit(X_train, y_train)  # Train an SVM
        y_train_pred = clf.predict(X_train)
        ac_matrix_train = confusion_matrix(y_train, y_train_pred)
        y_test_pred = clf.predict(X_test)
        ac_matrix_test = confusion_matrix(y_test, y_test_pred)
    display(pd.DataFrame(np.concatenate((ac_matrix_train, ac_matrix_test), axis=1), columns=["predicted G1 (training)", "predicted G2 (training)", "predicted G1 (test)", "predicted G2 (test)"], index=["actual G1", "actual G2"]))
interact_gen(interact_split_data, Criteria=Text('PR status by ihc'), Group_1 = Text('Positive'), Group_2=Text('Negative'))
interact_SVM(interact_SVM_1, Rescale=False, Max_iterations=Text('1'))
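One likely cost driver: the loop fits a LinearSVC with max_iter=100000 once per left-out sample, so 773 fits on high-dimensional expression data can easily run for hours while still making progress rather than being stuck. Note also that ac_matrix_train and ac_matrix_test are overwritten on every iteration rather than accumulated. A minimal sketch of a cheaper formulation, with X and y standing in for BRCA_X and BRCA_y:

import numpy as np
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import LeaveOneOut, cross_val_predict

def loo_confusion(X, y):
    # one fit per left-out sample, run in parallel; the held-out
    # predictions are pooled into a single confusion matrix
    clf = svm.LinearSVC(C=0.1, max_iter=10000)
    y_pred = cross_val_predict(clf, X, y, cv=LeaveOneOut(), n_jobs=-1)
    return confusion_matrix(y, y_pred)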
It seems that my code fails when I try to set which headers/columns of the data I want to use: I get an index error when trying to parse the headers.
import pandas as pd
import quandl
import math, datetime
import numpy as np
from sklearn import preprocessing , cross_validation, svm
from sklearn.linear_model import LinearRegression
import scipy
import matplotlib.pyplot as plt
from matplotlib import style
import pickle
style.use('ggplot')
df = pd.read_csv('convertcsv.csv',sep='\t')
df = np.array(df)
print(df)
df = df[['Open','High','Low','Close','Volume (BTC)']]
print("ok")
df['HL_PCT'] = (df['High'] - df['Close']) / df['Close'] * 100.0
df['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0
df = df[['Close','HL_PCT','PCT_change','Volume (BTC)']]
forecast_col = 'Close'
df.fillna(-999999, inplace=True)
forecast_out = int(math.ceil(0.01*len(df)))
df['label'] = df[forecast_col].shift(-forecast_out)
X = np.array(df.drop(['label'],1))
X = preprocessing.scale(X)
X_lately = X[-forecast_out:]
X = X[:-forecast_out:]
df.dropna(inplace=True)
y = np.array(df['label'])
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y,
test_size=0.2)
clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)
with open('linearregression.pickle','wb') as f:
pickle.dump(clf, f)
pickle_in = open('linearregression.pickle','rb')
clf =pickle.load(pickle_in)
accuracy = clf.score(X_test,y_test)
print(accuracy)
forecast_set = clf.predict(X_lately)
df['Forecast'] = np.nan
last_date = df.iloc[-1].name
last_unix = last_date.timestamp()
one_day = 86400
next_unix = last_unix + one_day
for i in forecast_set:
    next_date = datetime.datetime.fromtimestamp(next_unix)
    next_unix += one_day
    df.loc[next_date] = [np.nan for _ in range(len(df.columns) - 1)] + [i]
df['Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.pause(1)
plt.show()
print("we done?")`
...
I can't seem to figure out what I am doing wrong; it worked with the previous dataset I was using. If it helps, here is the format of the CSV file I was pulling from:
Timestamp,Open,High,Low,Close,Volume (BTC),Volume (Currency),Weighted Price
2017-09-30 00:00:00,4162.04,4177.63,4154.28,4176.08,114.81,478389.12,4166.96
2017-09-30 01:00:00,4170.84,4224.6,4170.84,4208.14,348.45,1463989.18,4201.4
I am not too experienced with this sort of thing, and I tried to find other people with the same error, but everyone was having a different sort of problem. I can include more data if needed.
You're converting your dataframe to a NumPy array with df = np.array(df).
Don't expect a NumPy array to behave like a pandas DataFrame.
Remove
df = np.array(df)
and you should then be able to select columns by name with
df = df[['Open','High','Low','Close','Volume (BTC)']]
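For instance, a minimal sketch of the start of the script with the conversion removed (note the sample shown above is comma-separated, so the default separator likely applies rather than sep='\t'; treat that as an assumption about your actual file):

import pandas as pd

# keep df a DataFrame so label-based column selection works
df = pd.read_csv('convertcsv.csv')
df = df[['Open', 'High', 'Low', 'Close', 'Volume (BTC)']]
df['HL_PCT'] = (df['High'] - df['Close']) / df['Close'] * 100.0
df['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0
print(df.head())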
I want to calculate the variance of features saved in a Train and a Test file, as follows:
col1 Feature0 Feature1 Feature2 Feature3 Feature4 Feature5 Feature6 Feature7 Feature8 Feature9
col2 26658 40253.5 3.22115e+09 0.0277727 5.95939 266.56 734.248 307.364 0.000566779 0.000520574
col3 2658 4053.5 3.25e+09 0.0277 5.95939 266.56 734.248 307.364 0.000566779 0.000520574
....
for that I've wrote the following :
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
#from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from matplotlib import pyplot as plt
from sklearn.feature_selection import VarianceThreshold
# Reading csv file
training_file = 'Training.csv'
testing_file = 'Test.csv'
Training_Frame = pd.read_csv(training_file)
Testing_Frame = pd.read_csv(testing_file)
Training_Frame.shape
# Now that the feature values are saved, we start
# with the standardisation of those values
stdsc = preprocessing.MinMaxScaler()
np_scaled_train = stdsc.fit_transform(Training_Frame.iloc[:,:-2])
sel = VarianceThreshold(threshold=(.2 * (1 - .2)))
sel.fit_transform(np_scaled_train)
pd_scaled_train = pd.DataFrame(data=np_scaled_train)
pd_scaled_train.to_csv('variance_result.csv',header=False, index=False)
This obviously doesn't work: the result in variance_result.csv is just the normalized training matrix.
So my question is: how can I get the indices of the columns (features) that have a variance below 20%?
Thanks in advance!
Update
I've solved the variance issue this way:
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
#from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from matplotlib import pyplot as plt
from sklearn.feature_selection import VarianceThreshold
# Reading csv file
training_file = 'Training.csv'
testing_file = 'Test.csv'
Training_Frame = pd.read_csv(training_file)
Testing_Frame = pd.read_csv(testing_file)
Training_Frame.shape
# Now that the feature values are saved, we start
# with the standardisation of those values
stdsc = preprocessing.MinMaxScaler()
np_scaled_train = stdsc.fit_transform(Training_Frame.iloc[:,:-2])
pd_scaled_train = pd.DataFrame(data=np_scaled_train)
variance = pd_scaled_train.apply(np.var, axis=0)
pd_scaled_train.to_csv('variance_result.csv',header=False, index=False)
temp_df = pd.DataFrame(variance.values,Training_Frame.columns.values[:-2])
temp_df.T.to_csv('Training_features_variance.csv',index=False)
But I still don't know how to get the indices of the features whose variance is, say, bigger than 0.2 from variance, other than by running a loop!
Just set the threshold to 0.0 and then use the variances_ attribute of the VarianceThreshold object to get the variances of all your features; from those you can identify which have low variance.
from sklearn.feature_selection import VarianceThreshold
X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
selector = VarianceThreshold()
selector.fit_transform(X)
selector.variances_
#Output: array([ 0. , 0.22222222, 2.88888889, 0. ])
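To get the indices without a Python loop, a boolean mask over variances_ does it directly (a sketch reusing the selector and pd_scaled_train names from above):

import numpy as np

mask = selector.variances_ > 0.2      # one boolean per feature
idx = np.where(mask)[0]               # integer column indices
# with the DataFrame from the question, the matching column names would be:
# cols = pd_scaled_train.columns[mask]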