Hierarchical Clustering on pandas giving Value error - python

I am using Python to cluster text documents that I have in a pandas DataFrame. This is what I am doing:
from __future__ import division
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.cluster.hierarchy import dendrogram, linkage
import numpy as np
import pandas as pd
data_lst = data_rd['text'].values.tolist()
tfidf_vectorizer = TfidfVectorizer( max_features=200000, stop_words='english',use_idf=True, tokenizer=lambda x: x.split(' '), ngram_range=(1,3))
tfidf_matrix = tfidf_vectorizer.fit_transform(data_lst)
print(tfidf_matrix.shape)
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)
#(10193, 32757)
linkage_dist=ward(dist)
linkage_matrix = linkage(tfidf_matrix.todense(), 'ward')
dendrogram(linkage_matrix,truncate_mode="lastp",p=40,
show_leaf_counts=True,leaf_rotation=60.,leaf_font_size=8.,
show_contracted=True, )
is_valid_linkage(linkage_matrix)
is_valid_linkage(linkage_dist)
#False
#False
I get this error:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/lib64/python2.6/site-packages/scipy/cluster/hierarchy.py", line
2227, in dendrogram
is_valid_linkage(Z, throw=True, name='Z')
File "/usr/lib64/python2.6/site-packages/scipy/cluster/hierarchy.py", line
1421, in is_valid_linkage
% name_str)
ValueError: Linkage 'Z' uses the same cluster more than once.
Is there any way other than fastcluster to solve this, and why is it happening?
There is one row in the column that is blank and has no text.

Related

TypeError:'DataFrame' object is not callable

I have been trying to split the dataset into train and test data for deployment using Streamlit.
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold,cross_val_score
from sklearn.cluster import KMeans
import xgboost as xgb
from xgboost import XGBClassifier
def load_dataset():
df = pd.read_csv('txn.csv')
return df
df = load_dataset()
#create X and y, X will be feature set and y is the label - LTV
X = df.drop(['LTVCluster','m1_Revenue'],axis=1)
y = df(['LTVCluster'])
But I'm getting this error while executing the file:
TypeError: 'DataFrame' object is not callable
Traceback:
File "c:\users\anish\anaconda3\lib\site-packages\streamlit\script_runner.py", line 333, in _run_script
exec(code, module.__dict__)
File "C:\Users\Anish\Desktop\myenv\P52 - Retail Ecommerce\new1.py", line 25, in <module>
y = df(['LTVCluster'],axis=1)
What could be causing this error?
You have an extra set of parentheses in your last line, so Python thinks you're calling df. To select columns in pandas, you use square brackets, so remove the parentheses.
y = df['LTVCluster']
To select a column, remove the () from df(['LTVCluster']):
y = df['LTVCluster']

2D array error in python using scikitlearn package

I have used the following code in PyCharm, but I keep getting the error shown below:
import numpy as np
import seaborn as sns
from sklearn import linear_model
import matplotlib.pyplot as plt
df=pd.read_csv(r"C:\Users\gmcks\Downloads\Data samples\homeprices.csv")
df
https://docs.google.com/spreadsheets/d/1wxaadKAHTZtECv6gW6Mpreq3tFb2PWgVOhqANbWlIAk/edit?usp=sharing
x=df[["area"]]
y=df.price
reg=linear_model.LinearRegression()
reg.fit(x,y)
LinearRegression()
m=reg.coef_
c=reg.intercept_
print(m,c)
reg.predict(2000)
ERROR :
Traceback (most recent call last):
File "C:\Users\gmcks\PycharmProjects\using jupyter.py\venv\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-30-b5b06b1b028e>", line 1, in <module>
reg.predict(2000)
File "C:\Users\gmcks\PycharmProjects\using jupyter.py\venv\lib\site-packages\sklearn\linear_model\_base.py", line 236, in predict
return self._decision_function(X)
File "C:\Users\gmcks\PycharmProjects\using jupyter.py\venv\lib\site-packages\sklearn\linear_model\_base.py", line 218, in _decision_function
X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
File "C:\Users\gmcks\PycharmProjects\using jupyter.py\venv\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
return f(**kwargs)
File "C:\Users\gmcks\PycharmProjects\using jupyter.py\venv\lib\site-packages\sklearn\utils\validation.py", line 616, in check_array
"if it contains a single sample.".format(array))
ValueError: Expected 2D array, got scalar array instead:
array=2000.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
Why do I have to reshape my data again when I have already written df[["area"]]? That expression gives the training data the shape (5,1), so a 2D array has already been created.
You need to pass predict() an input with the same layout as the data the model was fitted on, i.e. a 2D array of shape (n_samples, n_features), not a bare scalar:
from sklearn import linear_model
import numpy as np
import pandas as pd
np.random.seed(111)
df = pd.DataFrame({'x' : np.random.uniform(0,1,100),
'y' : np.random.uniform(0,1,100)})
reg=linear_model.LinearRegression()
reg.fit(df[["x"]],df['y'])
You can do:
reg.predict([[2000]])

How to read a csv file and plot confusion matrix in python

I have a CSV file with 2 columns as
actual,predicted
1,0
1,0
1,1
0,1
.,.
.,.
How do I read this file and plot a confusion matrix in Python?
I tried the following code from a program.
import pandas as pd
from sklearn.metrics import confusion_matrix
import numpy
CSVFILE='./mappings.csv'
test_df=pd.read_csv[CSVFILE]
actualValue=test_df['actual']
predictedValue=test_df['predicted']
actualValue=actualValue.values
predictedValue=predictedValue.values
cmt=confusion_matrix(actualValue,predictedValue)
print cmt
but it gives me this error.
Traceback (most recent call last):
File "confusionMatrixCSV.py", line 7, in <module>
test_df=pd.read_csv[CSVFILE]
TypeError: 'function' object has no attribute '__getitem__'
pd.read_csv is a function. You call a function in Python by using parentheses.
You should use pd.read_csv(CSVFILE) instead of pd.read_csv[CSVFILE].
import pandas as pd
from sklearn.metrics import confusion_matrix
import numpy as np
CSVFILE = './mappings.csv'
test_df = pd.read_csv(CSVFILE)
actualValue = test_df['actual']
predictedValue = test_df['predicted']
actualValue = actualValue.values.argmax(axis=1)
predictedValue =predictedValue.values.argmax(axis=1)
cmt = confusion_matrix(actualValue, predictedValue)
print cmt
Here's a simple solution to calculate the accuracy and plot confusion matrix for the input in the format mentioned in the question.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
file=open("results.txt","r")
result=[]
actual=[]
i = 0
for line in file:
i+=1
sent=line.split("\t")
sent[0]=int(sent[0])
sent[1]=int(sent[1])
result.append(sent[1])
actual.append(sent[0])
cnf_mat=confusion_matrix(actual,result)
print cnf_mat
print('Test Accuracy:', accuracy_score(actual,result))

Linear Regression issues

I'm trying to run a linear regression for 2 columns of data (IMF_VALUES, BBG_FV)
I have this code:
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
import pandas as pd
raw_data = pd.read_csv("IMF and BBG Fair Values.csv")
ISO_TH = raw_data[["IMF_VALUE","BBG_FV"]]
filtered_TH = ISO_TH[np.isfinite(raw_data['BBG_FV'])]
npMatrix = np.matrix(filtered_TH)
IMF_VALUE, BBG_FV = npMatrix[:,0], npMatrix[:,1]
regression = linear_model.LinearRegression
regression.fit(IMF_VALUE, BBG_FV)
When I run this as a test, I get this error and I really have no idea why:
TypeError Traceback (most recent call last)
<ipython-input-28-1ee2fa0bbed1> in <module>()
1 regression = linear_model.LinearRegression
----> 2 regression.fit(IMF_VALUE, BBG_FV)
TypeError: fit() missing 1 required positional argument: 'y'
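The error occurs because regression is bound to the class itself rather than an instance: write linear_model.LinearRegression() with parentheses, otherwise fit() treats its first argument as self and reports that 'y' is missing. Also make sure that both inputs are two-dimensional column arrays: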
regression.fit(np.array(IMF_VALUE).reshape(-1,1), np.array(BBG_FV).reshape(-1,1))

python scikit-learn cosine similarity value error: could not convert integer scalar

I am trying to produce a cosine similarity matrix using text descriptions of apps. The script below first reads in a csv data file (I can provide the data file if needed) which contains two columns, one with two app categories and the other with tokenized, stemmed descriptions for a number of apps in each of these two categories. The script then creates a tfidf matrix and attempts to produce a cosine similarity matrix.
I updated Anaconda 64 bit for Windows yesterday to make sure I have the latest versions of Python, numpy, scipy, and scikit-learn.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
print ('reading file into pandas')
data = pd.read_csv(os.path.join('inputfile.csv'))
cats = np.unique(data['category'])
for i in cats:
print ()
print ('prepping', i)
d2 = data[data.category == i]
descStem = d2.descStem.tolist()
print ('vectorizing', i)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1), min_df=2, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(descStem)
print (tfidf_matrix.shape)
print ('calculating cosine sim', i)
cosOrig = cosine_similarity(tfidf_matrix, tfidf_matrix)
The script works just fine for the smaller category of comics, with a tfidf_matrix.shape = (3119, 8217). However, I receive the error message below for the larger category of education, with a tfidf_matrix.shape = (90327, 62863). This matrix is larger than 2^32.
Traceback (most recent call last):
File "<ipython-input-1-4b2586ddeca4>", line 1, in <module>
runfile('Z:/rangus/gplay/marcello/data/similarity/error/cosSimByCatScrapeError.py', wdir='Z:/rangus/gplay/marcello/data/similarity/error')
File "F:\u0137777\Continuum\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 866, in runfile
execfile(filename, namespace)
File "F:\u0137777\Continuum\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "Z:/rangus/gplay/marcello/data/similarity/error/cosSimByCatScrapeError.py", line 23, in <module>
cosOrig = cosine_similarity(tfidf_matrix, tfidf_matrix)
File "F:\u0137777\Continuum\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py", line 918, in cosine_similarity
K = safe_sparse_dot(X_normalized, Y_normalized.T, dense_output=dense_output)
File "F:\u0137777\Continuum\Anaconda3\lib\site-packages\sklearn\utils\extmath.py", line 186, in safe_sparse_dot
ret = ret.toarray()
File "F:\u0137777\Continuum\Anaconda3\lib\site-packages\scipy\sparse\compressed.py", line 920, in toarray
return self.tocoo(copy=False).toarray(order=order, out=out)
File "F:\u0137777\Continuum\Anaconda3\lib\site-packages\scipy\sparse\coo.py", line 258, in toarray
B.ravel('A'), fortran)
ValueError: could not convert integer scalar
I can overcome this error by running the code below, but using a dense matrix is a massive memory hog and I need to run this script on 40+ categories.
print ('vectorizing', i)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1), min_df=2, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(descStem)
tfidf_matrixD = tfidf_matrix.toarray()
print ('calculating cosine sim', i)
cosOrig = cosine_similarity(tfidf_matrixD, tfidf_matrixD)
This is the closest similar issue I could find on StackOverflow, but I couldn't figure out how it would help my situation...

Categories

Resources