I'm a beginner in Python and I'm trying to plot the cluster centers, but I can't get it to work. Here is my code:
import pandas as pd
import numpy as np
df = pd.read_csv("InputClusterModel.txt")
df.columns = ["Major","Quantity","rating","rating_2","RightWindoWeek","Ranking","CopiesQuant","Content","Trump","Movies","Carton","Serial","Before1014","categor","Purchase","Revenue"]
df.head()
from sklearn.cluster import KMeans
cluster = KMeans(n_clusters=2)
df['cluster'] = cluster.fit_predict(df[df.columns[:15]])
from sklearn.decomposition import PCA
x_cols = df.columns[1:]
pca = PCA()
df['x'] = pca.fit_transform(df[x_cols])[:,0]
df['y'] = pca.fit_transform(df[x_cols])[:,1]
df = df.reset_index()
clusters = df[['Purchase', 'cluster', 'x', 'y']]
clusters.head()
%matplotlib inline
from ggplot import *
ggplot(df, aes(x='x', y='y', color='cluster')) + \
geom_point(size=75) + \
ggtitle("Grouped by Cluster")
df.cluster.value_counts()
# the part below is where I see the mistake:
cluster_centers = pca.transform(cluster.cluster_centers_)
cluster_centers = pd.DataFrame(cluster_centers, columns=['x', 'y'])
cluster_centers['cluster'] = range(0, len(cluster_centers))
ggplot(cluster, aes(x='x', y='y', color='cluster')) + \
geom_point(size=100) + \
geom_point(cluster_centers, size=500) +\
ggtitle("Customers Grouped by Cluster")
print(pca.explained_variance_ratio_)
This is the error I get:
ValueError Traceback (most recent call last)
<ipython-input-18-c2ac22e32b75> in <module>()
----> 1 cluster_centers = pca.transform(cluster.cluster_centers_)
2 cluster_centers = pd.DataFrame(cluster_centers, columns=['x', 'y'])
3 cluster_centers['cluster'] = range(0, len(cluster_centers))
4
5 ggplot(cluster, aes(x='x', y='y', color='cluster')) + geom_point(size=100) + geom_point(cluster_centers, size=500) + ggtitle("Customers Grouped by Cluster")
/home/belotelov/anaconda2/lib/python2.7/site-packages/sklearn/decomposition/base.pyc
in transform(self, X, y)
130 X = check_array(X)
131 if self.mean_ is not None:
--> 132 X = X - self.mean_
133 X_transformed = fast_dot(X, self.components_.T)
134 if self.whiten:
ValueError: operands could not be broadcast together with shapes (2,15) (16,)
The structure of my data looks like this (first rows shown):
0,122,7,8,6,8,105.704,1,0,1,0,0,0,0,37426,11831762
1,278,8,8,12,2,2246,1,1,1,0,0,0,0,29316,7371029
1,275,6,6,14,1,1268,1,1,1,0,0,0,0,30693,7368787
0,125,5,5,5,1,105.704,1,0,1,0,0,0,0,20661,7337545
1,193,8,8,11,2,1063,1,1,1,0,0,0,0,29141,7279077
1,1,6,6,11,0,1236,1,1,0,1,0,0,0,879,325151
1,116,8,8,14,0,1209,1,1,0,1,0,0,0,17751,5529657
0,39,7,7,11,1,1128,1,1,1,0,0,0,0,15044,5643468
1,65,6,6,11,0,1209,1,1,0,1,0,0,0,9902,2612669
0,170,6,7,2,0,105.704,1,1,1,0,0,0,0,19167,5195321
p.s. Python 2.7.12 :: Anaconda custom (64-bit) on Debian Jessie
I have not reviewed your code line by line, but here's a comment on the error:
ValueError: operands could not be broadcast together with shapes (2,15) (16,)
As the error implies, you're trying to evaluate X = X - self.mean_ with two arrays that cannot be broadcast together. The rule for broadcasting is that the operands' trailing dimensions must either be equal or one of them must be 1; here they are 15 and 16, which are incompatible.
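A tiny illustration of that rule, with shapes mirroring the traceback (hypothetical arrays, not your data):
import numpy as np
np.zeros((2, 15)) - np.zeros(15)   # fine: trailing dimensions are 15 and 15
np.zeros((2, 15)) - np.zeros(16)   # ValueError: operands could not be broadcast together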
I recommend searching for the error message and having a look at NumPy's broadcasting rules.
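For what it's worth, in the code above the PCA is fit on df[x_cols], which has 16 columns (everything after "Major", including the freshly added 'cluster' column), while cluster.cluster_centers_ only has the 15 feature columns KMeans was fit on; that mismatch would produce exactly this (2,15) vs (16,) error. Below is a minimal sketch of one way to keep the shapes consistent, assuming the intent is to run both KMeans and PCA on the same 15 columns (untested against your data):
feature_cols = df.columns[:15]                  # the same columns KMeans is fit on
cluster = KMeans(n_clusters=2)
df['cluster'] = cluster.fit_predict(df[feature_cols])
pca = PCA(n_components=2)
coords = pca.fit_transform(df[feature_cols])    # fit PCA once, on the same features
df['x'], df['y'] = coords[:, 0], coords[:, 1]
cluster_centers = pd.DataFrame(pca.transform(cluster.cluster_centers_), columns=['x', 'y'])  # shapes now agree: (2, 15) vs (15,)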
Can someone enlighten me as to why the following code to compute the Jacobian of a kernel matrix doesn't work:
import autograd.numpy as np
# import numpy as np
from autograd import grad
from autograd import jacobian
from numpy import linalg as LA
def kernel(x1,x2,l):
    return np.exp(-((x1-x2)**2).sum()/(2*(l**2)))
def kernel_matrixx(top_k_history):
    k_t_X_list = []
    for i in range(k-1):
        # print(kernel(top_k_history[i],observation,l))
        k_t_X_list.append(np.expand_dims(np.expand_dims((kernel(top_k_history[0],top_k_history[i+1],l)), axis=0), axis=0))
        # print(k_t_X_list[0].item())
    # k_t_X = np.expand_dims(np.asarray(k_t_X_list), axis=0)
    k_t_X = np.expand_dims(np.expand_dims((kernel(top_k_history[0],top_k_history[0],l)), axis=0), axis=0)
    for i in range(k-1):
        # temp = np.expand_dims(np.expand_dims(np.asarray(kernel(observation,top_k_history[i+1],l)), axis=0), axis=0)
        k_t_X = np.concatenate([k_t_X, k_t_X_list[i]], axis=1)
    k_t_X_first = k_t_X
    k_t_X_list_list = []
    for j in range(k-1):
        k_t_X_list = []
        for i in range(k-1):
            # print(kernel(top_k_history[i],observation,l))
            k_t_X_list.append(np.expand_dims(np.expand_dims((kernel(top_k_history[j+1],top_k_history[i+1],l)), axis=0), axis=0))
            # print(k_t_X_list[0].item())
        # k_t_X = np.expand_dims(np.asarray(k_t_X_list), axis=0)
        k_t_X = np.expand_dims(np.expand_dims((kernel(top_k_history[j+1],top_k_history[0],l)), axis=0), axis=0)
        for i in range(k-1):
            # temp = np.expand_dims(np.expand_dims(np.asarray(kernel(observation,top_k_history[i+1],l)), axis=0), axis=0)
            k_t_X = np.concatenate([k_t_X, k_t_X_list[i]], axis=1)
        k_t_X_list_list.append(k_t_X)
    for i in range(k-1):
        k_t_X_first = np.concatenate([k_t_X_first, k_t_X_list_list[i]], axis=0)
    return k_t_X_first
k=10
l=19
top_k_history = []
for i in range(10):
    top_k_history.append(np.random.rand(10))
jac = jacobian(kernel_matrixx)
jac(top_k_history)
The error I got is:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_15016/2419460232.py in <module>
1 jac = jacobian(kernel_matrixx)
----> 2 jac(top_k_history)
~\Anaconda3\envs\unlearning\lib\site-packages\autograd\wrap_util.py in nary_f(*args, **kwargs)
18 else:
19 x = tuple(args[i] for i in argnum)
---> 20 return unary_operator(unary_f, x, *nary_op_args, **nary_op_kwargs)
21 return nary_f
22 return nary_operator
~\Anaconda3\envs\unlearning\lib\site-packages\autograd\differential_operators.py in jacobian(fun, x)
57 vjp, ans = _make_vjp(fun, x)
58 ans_vspace = vspace(ans)
---> 59 jacobian_shape = ans_vspace.shape + vspace(x).shape
60 grads = map(vjp, ans_vspace.standard_basis())
61 return np.reshape(np.stack(grads), jacobian_shape)
TypeError: can only concatenate tuple (not "list") to tuple
I am already aware that I cannot create a zero matrix (or identity matrix) and then fill in the values with a nested for loop. Therefore I create np.arrays and then concatenate them. I used the same approach to compute the grad of some other output of the same kernel matrix and it did work, so I'm not sure why it doesn't work for the Jacobian.
Edit: the error now should be reproducible
There is a datatype problem. In your code, top_k_history is of type list and contains 10 1D-arrays, each of length 10. If you convert this into one 2D-array of shape (10, 10), then the error should vanish:
# <original code except the last line>
top_k_history = np.array(top_k_history) # new
jac(top_k_history) # original last line
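As a quick sanity check (assuming the rest of the snippet is unchanged): kernel_matrixx returns a (10, 10) array and the input is now a (10, 10) array, so the Jacobian that autograd builds should come out with shape (10, 10, 10, 10):
top_k_history = np.array(top_k_history)  # (10, 10) instead of a list of ten (10,) arrays
result = jac(top_k_history)
print(result.shape)                      # expected: (10, 10, 10, 10)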
I'm hoping to cluster vectors based on direction and magnitude using Python. I've found limited examples using R, but none for Python. Not to be confused with standard k-means on scatter points, I'm actually trying to cluster whole vectors.
The following takes two sets of xy points to generate a vector. I'm then hoping to cluster these vectors based on the length and direction.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
df = pd.DataFrame(np.random.randint(0,20,size=(100, 4)), columns=list('ABCD'))
plt.rcParams['image.cmap'] = 'Paired'
fig,ax = plt.subplots()
ax.set_xlim(-5, 25)
ax.set_ylim(-5, 25)
A = df['A']
B = df['B']
C = df['C']
D = df['D']
ax.quiver(A, B, (C-A), (D-B), angles = 'xy', scale_units = 'xy', scale = 1, alpha = 0.5)
X_1 = np.array(df[['A','B','C','D']])
model = KMeans(n_clusters = 20)
model.fit(X_1)
cluster_labels = model.predict(X_1)
df['n_cluster'] = cluster_labels
centroids_1 = pd.DataFrame(data = model.cluster_centers_, columns = ['start_x', 'start_y', 'end_x', 'end_y'])
cc = model.cluster_centers_
a = cc[:, 0]
b = cc[:, 1]
c = cc[:, 2]
d = cc[:, 3]
lc1 = ax.quiver(a, b, (c-a), (d-b), angles = 'xy', scale_units = 'xy', scale = 1, alpha = 0.8)
The following figure displays an example
What about this:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import hdbscan
df = pd.DataFrame(np.random.randint(0,20,size=(100, 4)), columns=list('ABCD'))
plt.rcParams['image.cmap'] = 'Paired'
A = df['A'] #X start
B = df['B'] #Y start
C = df['C'] #X arrive
D = df['D'] #Y arrive
clusterer = hdbscan.HDBSCAN()
df['LENGTH'] = np.sqrt(np.square(df.C-df.A) + np.square(df.D-df.B))
df['DIRECTION'] = np.degrees(np.arctan2(df.D-df.B, df.C-df.A))
coords = df[['LENGTH', 'DIRECTION']].values
clusterer.fit_predict(coords)
cluster_labels = clusterer.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.DataFrame(
[(coords[cluster_labels==n], len(coords[cluster_labels==n])) for n in range(num_clusters)],
columns=["points", "weight"]
)
colors = {0:"green", 1:"blue", 2:"red", 3:"yellow", 4:"pink"}
df['CLUSTER'] = np.nan
for x, (cluster, weight) in enumerate(clusters[clusters.weight>0].values.tolist()):
    df_this_cluster = pd.DataFrame(cluster, columns=['LENGTH', 'DIRECTION'])
    df_this_cluster['TEMP'] = x
    df = df.merge(df_this_cluster, on=['LENGTH', 'DIRECTION'], how='left')
    ix = df[df.TEMP.notnull()].index
    df.loc[ix, "CLUSTER"] = df.loc[ix, "TEMP"]
    df.drop("TEMP", axis=1, inplace=True)
df['COLOR'] = df['CLUSTER'].map(colors).fillna('black')
fig,ax = plt.subplots()
ax.set_xlim(-5, 25)
ax.set_ylim(-5, 25)
ax.quiver(df.A, df.B, (df.C-df.A), (df.D-df.B), angles='xy', scale_units='xy', scale=1, alpha=0.5, color=df.COLOR)
This clusters based on length and direction (with direction converted to degrees; on my first try, the small range of radians did not match well with the model).
I don't think this is a very "cartesian" solution, as the two values being analysed in the model are not in the same units... But the visual results are not so bad...
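If the unit mismatch is a concern, one option (just a sketch reusing the LENGTH and DIRECTION columns defined above; the EDIT below does something similar with sines and cosines) is to rescale both features before fitting HDBSCAN:
from sklearn.preprocessing import robust_scale
scaled = robust_scale(df[['LENGTH', 'DIRECTION']].values)  # put both features on comparable scales
clusterer = hdbscan.HDBSCAN()
cluster_labels = clusterer.fit_predict(scaled)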
I did try another approach based on the 4 coordinates, which is more rigorous. But, quite expectedly, it clusters the vectors by subareas of the space (when there are any):
coords = df[['A', 'B', 'C', 'D']].values
clusterer.fit_predict(coords)
cluster_labels = clusterer.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.DataFrame(
[(coords[cluster_labels==n], len(coords[cluster_labels==n])) for n in range(num_clusters)],
columns=["points", "weight"]
)
colors = {0:"green", 1:"blue", 2:"red", 3:"yellow", 4:"pink"}
df['CLUSTER'] = np.nan
for x, (cluster, weight) in enumerate(clusters[clusters.weight>0].values.tolist()):
    df_this_cluster = pd.DataFrame(cluster, columns=['A', 'B', 'C', 'D'])
    df_this_cluster['TEMP'] = x
    df = df.merge(df_this_cluster, on=['A', 'B', 'C', 'D'], how='left')
    ix = df[df.TEMP.notnull()].index
    df.loc[ix, "CLUSTER"] = df.loc[ix, "TEMP"]
    df.drop("TEMP", axis=1, inplace=True)
df['COLOR'] = df['CLUSTER'].map(colors).fillna('black')
EDIT
I gave it another try, based on the (very good) suggestion that angles are not a good variable, given the discontinuity around 0/2pi; so I chose to use both sines and cosines instead. I also scaled the length (to have matching scales for the 3 variables):
So the result would be:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import robust_scale
import hdbscan
df = pd.DataFrame(np.random.randint(0,20,size=(100, 4)), columns=list('ABCD'))
plt.rcParams['image.cmap'] = 'Paired'
A = df['A'] #X start
B = df['B'] #Y start
C = df['C'] #X arrive
D = df['D'] #Y arrive
clusterer = hdbscan.HDBSCAN()
df['LENGTH'] = robust_scale(np.sqrt(np.square(df.C-df.A) + np.square(df.D-df.B)))
df['DIRECTION'] = np.arctan2(df.D-df.B, df.C-df.A)
df['COS'] = np.cos(df['DIRECTION'])
df['SIN'] = np.sin(df['DIRECTION'])
columns = ['LENGTH', 'COS', 'SIN']
clusterer = hdbscan.HDBSCAN()
values = df[columns].values
clusterer.fit_predict(values)
cluster_labels = clusterer.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.DataFrame(
[(values[cluster_labels==n], len(values[cluster_labels==n])) for n in range(num_clusters)],
columns=["points", "weight"]
)
def get_cmap(n, name='hsv'):
    '''
    Returns a function that maps each index in 0, 1, ..., n-1 to a distinct
    RGB color; the keyword argument name must be a standard mpl colormap name.
    Credits to @Ali
    https://stackoverflow.com/questions/14720331/how-to-generate-random-colors-in-matplotlib#answer-25628397
    '''
    return plt.cm.get_cmap(name, n)
cmap = get_cmap(num_clusters+1)
colors = {x:cmap(x) for x in range(num_clusters)}
df['CLUSTER'] = np.nan
for x, (cluster, weight) in enumerate(clusters[clusters.weight>0].values.tolist()):
    df_this_cluster = pd.DataFrame(cluster, columns=columns)
    df_this_cluster['TEMP'] = x
    df = df.merge(df_this_cluster, on=columns, how='left')
    df.reset_index(drop=True, inplace=True)
    ix = df[df.TEMP.notnull()].index
    df.loc[ix, "CLUSTER"] = df.loc[ix, "TEMP"]
    df.drop("TEMP", axis=1, inplace=True)
df['CLUSTER'] = df['CLUSTER'].fillna(num_clusters-1)
df['COLOR'] = df['CLUSTER'].map(colors)
print("Number of clusters : ", num_clusters-1)
nrows = num_clusters//2 if num_clusters%2==0 else num_clusters//2 + 1
fig,axes = plt.subplots(nrows=nrows, ncols=2)
axes = [y for row in axes for y in row]
for k,ax in enumerate(axes):
    ax.set_xlim(-5, 25)
    ax.set_ylim(-5, 25)
    ax.set_aspect('equal', adjustable='box')
    if k+1 < num_clusters:
        ax.set_title(f"CLUSTER #{k+1}", fontsize=10)
    this_df = df[df.CLUSTER==k]
    ax.quiver(
        this_df.A, #X
        this_df.B, #Y
        (this_df.C-this_df.A), #X component of vector
        (this_df.D-this_df.B), #Y component of vector
        angles = 'xy',
        scale_units = 'xy',
        scale = 1,
        color=this_df.COLOR
    )
The results are way better (though they depend a lot on the input dataset); the last subplot shows the vectors that were not assigned to any cluster:
Edit #2
If by "direction" you mean the angle in the [0, pi) interval (i.e. undirected vectors), you will want to include the following code before computing the cosines/sines:
ix = df[df.DIRECTION<0].index
df.loc[ix, "DIRECTION"] += np.pi
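A quick check of that adjustment on a few hypothetical angles (in radians):
angles = np.array([-np.pi/2, np.pi/2, -3*np.pi/4])
angles[angles < 0] += np.pi
print(np.degrees(angles))   # [90. 90. 45.] -- opposite vectors now share the same undirected angle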
Maybe you can also cluster the angles (besides the vector norms) by the projections of a normalized vector onto the two unit vectors (1,0) and (0,1), using the function below. Handling the projections directly (which essentially encode the angles), we won't run into trouble caused by the periodicity of the cosine function.
def get_norm_and_angle(e1):
    e1_norm = np.linalg.norm(e1,axis=1)
    e1 = e1 / e1_norm[:,None]
    e2 = np.array([1,0])
    e3 = np.array([0,1])
    return np.stack((e1_norm,e1@e2,e1@e3),axis=1)
Based on this function, here is one possible solution where there is no constraint on how many clusters we want to find. In the script below, five features are used for clustering:
Vector norm
Vector projections on x and y axis
Vector starting points
With these five features, the clustering goes as follows:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.cluster import KMeans
def get_norm_and_angle(e1):
    e1_norm = np.linalg.norm(e1,axis=1)
    e1 = e1 / e1_norm[:,None]
    e2 = np.array([1,0])
    e3 = np.array([0,1])
    return np.stack((e1_norm,e1@e2,e1@e3),axis=1)
data = np.cumsum(np.random.randint(0,10,size=(50, 4)),axis=0)
df = pd.DataFrame(data, columns=list('ABCD'))
A = df['A'];B = df['B']
C = df['C'];D = df['D']
starting_points = np.stack((A,B),axis=1)
vectors = np.stack((C,D),axis=1) - np.stack((A,B),axis=1)
different_view = get_norm_and_angle(vectors)
different_view = np.hstack((different_view,starting_points))
num_clusters = 8
model = KMeans(n_clusters=num_clusters)
model.fit(different_view)
cluster_labels = model.predict(different_view)
df['n_cluster'] = cluster_labels
cluster_centers = model.cluster_centers_
cluster_offsets = cluster_centers[:,0][:,None] * cluster_centers[:,1:3]
cluster_starts = np.vstack([np.mean(starting_points[cluster_labels==ind],axis=0) for ind in range(num_clusters)])
main_streams = np.hstack((cluster_starts,cluster_starts+cluster_offsets))
a,b,c,d = main_streams.T
fig,ax = plt.subplots(figsize=(8,8))
ax.set_xlim(-np.max(data)*.1,np.max(data)*1.1)
ax.set_ylim(-np.max(data)*.1,np.max(data)*1.1)
colors = sns.color_palette(n_colors=num_clusters)
lc1 = ax.quiver(a, b, (c-a), (d-b), angles = 'xy', scale_units = 'xy', color = colors, scale = 1, alpha = 0.8, zorder=100)
lc2 = ax.quiver(A, B, (C-A), (D-B), angles = 'xy', scale_units = 'xy', scale = .6, alpha = 0.2)
start_colors = [colors[ind] for ind in cluster_labels]
ax.scatter(starting_points[:,0],starting_points[:,1],c=start_colors)
plt.show()
A sample output is:
As you can see in the figure, vectors with close starting points are clustered into the same group.
I'm building an OLS model but can't make any predictions.
Can you explain what I'm doing wrong?
Building the model:
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
d = {'City': ['Tokyo','Tokyo','Lisbon','Tokyo','Madrid','New York','Madrid','London','Tokyo','London','Tokyo'],
'Card': ['Visa','Visa','Visa','Master Card','Bitcoin','Master Card','Bitcoin','Visa','Master Card','Visa','Bitcoin'],
'Colateral':['Yes','Yes','No','No','Yes','No','No','Yes','Yes','No','Yes'],
'Client Number':[1,2,3,4,5,6,7,8,9,10,11],
'Total':[100,100,200,300,10,20,40,50,60,100,500]}
d = pd.DataFrame(data=d).set_index('Client Number')
df = pd.get_dummies(d,prefix='', prefix_sep='')
X = df[['Lisbon','London','Madrid','New York','Tokyo','Bitcoin','Master Card','Visa','No','Yes']]
Y = df['Total']
X1 = sm.add_constant(X)
reg = sm.OLS(Y, X1).fit()
reg.summary()
Prediction:
d1 = {'City': ['Tokyo','Tokyo','Lisbon'],
'Card': ['Visa','Visa','Visa'],
'Colateral':['Yes','Yes','No'],
'Client Number':[11,12,13],
'Total':[0,0,0]}
df1 = pd.DataFrame(data=d1).set_index('Client Number')
df1 = pd.get_dummies(df1,prefix='', prefix_sep='')
y_new = df1[['Lisbon','Tokyo','Visa','No','Yes']]
x_new = df1['Total']
mod = sm.OLS(y_new, x_new)
mod.predict(reg.params)
Then it shows: ValueError: shapes (3,1) and (11,) not aligned: 1 (dim 1) != 11 (dim 0)
What Am I doing wrong?
Here is the fixed prediction part of the code, with my comments:
d1 = {'City': ['Tokyo','Tokyo','Lisbon'],
'Card': ['Visa','Visa','Visa'],
'Colateral':['Yes','Yes','No'],
'Client Number':[11,12,13],
'Total':[0,0,0]}
df1 = pd.DataFrame(data=d1).set_index('Client Number')
df1 = pd.get_dummies(df1,prefix='', prefix_sep='')
x_new = df1.drop(columns='Total')
The main problem is the different number of dummy columns in the training dataset X1 and in the new dataset x_new.
Below I add the missing dummy columns and fill them with zeros:
x_new = x_new.reindex(columns = X1.columns, fill_value=0)
Now x_new has the proper number of columns, equal to the training dataset X1:
const Lisbon London Madrid ... Master Card Visa No Yes
Client Number ...
11 0 0 0 0 ... 0 1 0 1
12 0 0 0 0 ... 0 1 0 1
13 0 1 0 0 ... 0 1 1 0
[3 rows x 11 columns]
Finally, predict on the new dataset x_new using the previously trained model reg:
reg.predict(x_new)
result:
Client Number
11 35.956284
12 35.956284
13 135.956284
dtype: float64
APPENDIX
As requested, I enclose below fully reproducible code to test both the training and prediction tasks:
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
d = {'City': ['Tokyo','Tokyo','Lisbon','Tokyo','Madrid','New York','Madrid','London','Tokyo','London','Tokyo'],
'Card': ['Visa','Visa','Visa','Master Card','Bitcoin','Master Card','Bitcoin','Visa','Master Card','Visa','Bitcoin'],
'Colateral':['Yes','Yes','No','No','Yes','No','No','Yes','Yes','No','Yes'],
'Client Number':[1,2,3,4,5,6,7,8,9,10,11],
'Total':[100,100,200,300,10,20,40,50,60,100,500]}
d = pd.DataFrame(data=d).set_index('Client Number')
df = pd.get_dummies(d,prefix='', prefix_sep='')
X = df[['Lisbon','London','Madrid','New York','Tokyo','Bitcoin','Master Card','Visa','No','Yes']]
Y = df['Total']
X1 = sm.add_constant(X)
reg = sm.OLS(Y, X1).fit()
reg.summary()
###
d1 = {'City': ['Tokyo','Tokyo','Lisbon'],
'Card': ['Visa','Visa','Visa'],
'Colateral':['Yes','Yes','No'],
'Client Number':[11,12,13],
'Total':[0,0,0]}
df1 = pd.DataFrame(data=d1).set_index('Client Number')
df1 = pd.get_dummies(df1,prefix='', prefix_sep='')
x_new = df1.drop(columns='Total')
x_new = x_new.reindex(columns = X1.columns, fill_value=0)
reg.predict(x_new)
The biggest issue is that you are not using the same dummy transformation. That is, some dummy columns from the training data are absent in df1. You can add the missing columns with the following code (from here):
d1 = {'City': ['Tokyo','Tokyo','Lisbon'],
'Card': ['Visa','Visa','Visa'],
'Colateral':['Yes','Yes','No'],
'Client Number':[11,12,13],
'Total':[0,0,0]}
df1 = pd.DataFrame(data=d1).set_index('Client Number')
df1 = pd.get_dummies(df1,prefix='', prefix_sep='')
print(df1.shape) # Shape is 3x6 but it has to be 3x11
# Get missing columns in the training test
missing_cols = set( df.columns ) - set( df1.columns )
# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    df1[c] = 0
# Ensure the order of column in the test set is in the same order than in train set
df1 = df1[df.columns]
print(df1.shape) # Shape is 3x11
Further, you mixed up x_new and y_new. So it should be:
x_new = df1.drop(['Total'], axis=1).values
y_new = df1['Total'].values
mod = sm.OLS(y_new, x_new)
mod.predict(reg.params)
Note that I used x_new = df1.drop(['Total'], axis=1).values instead of df1[['Lisbon','Tokyo','Visa','No','Yes']] as it is more convenient: 1) it is less prone to typing errors and 2) it is less code.
First, you need to either string-index all the words or one-hot encode the values. ML models don't accept words, only numbers. Next, you want your X and y to be:
X = d.iloc[:,:-1]
y = d.iloc[:,-1]
This way X has a shape of [11, 3] and y has a shape of [11,], which are the proper shapes needed.
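Putting those two steps together, here is a short sketch using pd.get_dummies (as in the question) for the one-hot encoding; after encoding, the 3 raw feature columns become 10 dummy columns:
encoded = pd.get_dummies(d, columns=['City', 'Card', 'Colateral'])  # one-hot encode the string columns
X = encoded.drop(columns='Total')   # all encoded features
y = encoded['Total']                # target
print(X.shape, y.shape)             # (11, 10) and (11,)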
Can someone help me figure out why I'm getting this error: ValueError: n_components must be < n_features; got 10 >= 0
import pandas as pd
from scipy.sparse import csr_matrix
users = pd.read_table(open('ml-1m/users.dat', encoding = "ISO-8859-1"), sep=':', header=None, names=['user_id', 'gender', 'age', 'occupation', 'zip'])
ratings = pd.read_table(open('ml-1m/ratings.dat', encoding = "ISO-8859-1"), sep=':', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'])
movies = pd.read_table(open('ml-1m/movies.dat', encoding = "ISO-8859-1"), sep=':', header=None, names=['movie_id', 'title', 'genres'])
MovieLens = pd.merge(pd.merge(ratings, users), movies)
ratings_mtx_df = MovieLens.pivot_table(values='rating', index='user_id', columns='title', fill_value=0)
movie_index = ratings_mtx_df.columns
from sklearn.decomposition import TruncatedSVD
recom = TruncatedSVD(n_components=10, random_state=101)
R = recom.fit_transform(ratings_mtx_df.values.T)
ValueError Traceback (most recent call last)
<ipython-input-8-0bd6c9bda95a> in <module>()
1 from sklearn.decomposition import TruncatedSVD
2 recom = TruncatedSVD(n_components=10, random_state=101)
----> 3 R = recom.fit_transform(ratings_mtx_df.values.T)
C:\Users\renau\Anaconda3\lib\site-packages\sklearn\decomposition\truncated_svd.py in fit_transform(self, X, y)
168 if k >= n_features:
169 raise ValueError("n_components must be < n_features;"
--> 170 " got %d >= %d" % (k, n_features))
171 U, Sigma, VT = randomized_svd(X, self.n_components,
172 n_iter=self.n_iter,
ValueError: n_components must be < n_features; got 10 >= 0
You're trying to reduce your data to 10 dimensions, but as per the documentation for TruncatedSVD, the number of features (columns) in the data you pass in needs to be greater than the number of dimensions/components you're looking to extract. Try n_components=3 (assuming you've got more than 3 features in your data) and see if that's any better.
Also, you're turning your input data sideways, with the .T in:
R = recom.fit_transform(ratings_mtx_df.values.T)
That swaps features (columns) for observations (rows), which might explain why the fit_transform method isn't working.
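Either way, it helps to print the shape of what fit_transform actually receives; the "got 10 >= 0" part of the error means the array TruncatedSVD saw had zero features, which suggests something upstream (the merge or the pivot) produced an empty table. A minimal check, as a sketch:
print(MovieLens.shape)                 # rows surviving the merges
print(ratings_mtx_df.shape)            # (n_users, n_titles) pivot table
print(ratings_mtx_df.values.T.shape)   # what TruncatedSVD sees after the transpose
# TruncatedSVD needs n_components < n_features, i.e. 10 < the last dimension printed above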
I'm trying to do a simple matched filtering operation on a data set in Python (so I tried conjugation followed by convolution). However, the convolution function raises an error saying object too deep for desired array. Below is the code I'm using:
import numpy as np
import cPickle
import matplotlib.pyplot as plt
with open('meteor2.pkl', 'rb') as f:
    data = cPickle.load(f)
vlt = data['vlt']
mfilt=np.conjugate(vlt)
mfilt1=np.convolve(vlt,mfilt,mode='full')
#mfilt=np.conjugate(vlt)
#mfilt1=np.convolve(vlt,mfilt,'same')
r = data['r']
t = data['t']
codes = data['codes']
freqs = data['freqs']
ch0_db = 10*np.log10(np.abs(mfilt1[:, 0, :])**2)
plt.figure()
plt.imshow(ch0_db.T, vmin=0, origin='lower', cmap=plt.cm.coolwarm,aspect='auto')
plt.title('All pulses')
plt.figure()
plt.imshow(ch0_db[3::5, :].T, vmin=0, origin='lower', cmap=plt.cm.coolwarm,aspect='auto')
plt.title('Minimum sidelobe coded-pulses')
plt.show()
np.convolve does one-dimensional convolution, so in this line:
mfilt1=np.convolve(vlt,mfilt,mode='full')
you'll get that error if either vlt or mfilt is not 1-D. For example,
In [12]: x = np.array([[1,2,3]]) # x is 2-D
In [13]: y = np.array([1,2,3])
In [14]: np.convolve(x, y, mode='full')
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-14-9bf37a14877a> in <module>()
----> 1 np.convolve(x, y, mode='full')
/home/warren/anaconda/lib/python2.7/site-packages/numpy/core/numeric.pyc in convolve(a, v, mode)
822 raise ValueError('v cannot be empty')
823 mode = _mode_from_name(mode)
--> 824 return multiarray.correlate(a, v[::-1], mode)
825
826 def outer(a,b):
ValueError: object too deep for desired array
It looks like you want 2-D (or higher) convolution. scipy has a few options (a short sketch follows this list):
scipy.ndimage.convolve
scipy.signal.convolve
scipy.signal.convolve2d
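For instance, if vlt were 2-D (hypothetical shape, since the contents of the pickle aren't shown here), a sketch with scipy.signal.convolve would look like this:
import numpy as np
from scipy import signal
vlt = np.random.randn(4, 256) + 1j * np.random.randn(4, 256)  # stand-in for data['vlt']
mfilt = np.conjugate(vlt)
mfilt1 = signal.convolve(vlt, mfilt, mode='full')  # N-D convolution, unlike np.convolve
print(mfilt1.shape)                                # (7, 511) for this hypothetical input
If the intent was instead to match-filter each pulse separately along one axis, looping over the rows with np.convolve is another option.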