ValueError: Found input variables with inconsistent numbers of samples: [3058, 3777] - python

here is the code:
import numpy as np
import os
import cv2
import matplotlib.pyplot as plt
from os import listdir
from pathlib import Path
# Collect every .jpg file found anywhere below the training directory.
jpeg_images = list(Path(r'D:/ncfm/train').glob('**/*.jpg'))
# Read each image and flatten it to a 1-D vector.
# NOTE(review): the resulting array is not assigned to a name, so it is discarded.
np.array([np.array(cv2.imread(str(file))).flatten() for file in
jpeg_images])
# Names of the nine sub-folders to load. The labels built further down only
# cover eight classes (1-8), so one of these folders has no labels.
folder = ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT','test
images']
# NOTE(review): this rebinds `Path` to a string, shadowing the pathlib.Path
# class imported above.
Path = r'D:\ncfm\train'
# Replace each folder name in `folder` with an array holding that folder's
# images, one flattened image per entry.
for i in range(9):
listing = os.listdir(Path+'/'+folder[i])
# NOTE(review): images of different sizes flatten to vectors of different
# lengths - presumably why the images were later resized; confirm.
folder[i] = np.array([np.array(cv2.imread(Path+'/'+folder[i]+'/'+file)).flatten()for file in listing])
# NOTE(review): `L` is never defined in the code shown.
L.append(len(listing))
Next, I tried to concatenate the arrays:
# Stack the eight labelled class folders (indices 0-7) row-wise into one matrix.
# The previous call used indices 1-8, which dropped the first class folder and
# appended the ninth folder ('test images'), for which no labels are assigned;
# the row count of M then cannot match the 3777-entry label vector.
# NOTE(review): the loading loop stores its arrays in `folder`, not `folders` -
# confirm which name actually holds the data.
M = np.concatenate(folders[:8])
Next, I created the labels:
# Row offsets at which each class starts; the final entry is the total row count.
class_edges = [0, 1720, 1920, 2038, 2104, 2568, 2868, 3044, 3777]
# One integer class id (1-8) per row, assigned in contiguous runs.
label = np.ones(class_edges[-1], dtype=int)
for class_id, start in enumerate(class_edges[:-1], start=1):
    stop = class_edges[class_id]
    label[start:stop] = class_id
from sklearn.utils import shuffle
# Shuffle the rows of M and the entries of label with the same permutation.
# Both inputs must have the same number of samples (first-axis length),
# otherwise sklearn raises the ValueError shown in the traceback below.
# random_state fixes the permutation so the result is reproducible.
data,Label = shuffle(M, label, random_state = 2)
Here is the error:
ValueError Traceback (most recent call last)
<ipython-input-148-f7cec68b48c6> in <module>
1 from sklearn.utils import shuffle
2
----> 3 data,Label = shuffle(M, label, random_state = 2)
~\Anaconda3\lib\site-packages\sklearn\utils\__init__.py in
shuffle(*arrays, **options)
447 """
448 options['replace'] = False
--> 449 return resample(*arrays, **options)
450
451
~\Anaconda3\lib\site-packages\sklearn\utils\__init__.py in
resample(*arrays, **options)
330 n_samples))
331
--> 332 check_consistent_length(*arrays)
333
334 if stratify is None:
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in
check_consistent_length(*arrays)
203 if len(uniques) > 1:
204 raise ValueError("Found input variables with inconsistent
numbers of"
--> 205 " samples: %r" % [int(l) for l in
lengths])
206
207
ValueError: Found input variables with inconsistent numbers of samples:
[3058, 3777]
At first I got the lengths [8, 3777]. After converting the images from RGB to grayscale and resizing them, I got [3058, 3777]. I want to shuffle the rows of the matrix M and the entries of label simultaneously.

Related

gap statistics LinAlgError: Last 2 dimensions of the array must be square

I am trying to find the gap statistics of my clusters like this:
from gap_statistic import OptimalK
# Clustering callback handed to OptimalK.
def KMeans_clustering_func(X, k):
    """Cluster ``X`` into ``k`` groups with K-Means.

    Any clustering algorithm that can return cluster centers could be
    substituted here.

    NOTE(review): ``KMeans`` must already be in scope (presumably
    ``sklearn.cluster.KMeans``); its import is not shown in this snippet.

    Args:
        X: The data to cluster; passed straight to ``KMeans.fit``.
        k: Number of clusters to fit.

    Returns:
        A ``(cluster_centers, labels)`` tuple: the location of each cluster
        center and the predicted cluster label for each point in ``X``.
    """
    # A fixed random_state keeps the clustering reproducible across calls.
    m = KMeans(random_state=11, n_clusters=k)
    m.fit(X)
    return m.cluster_centers_, m.predict(X)
# create an OptimalK instance that uses the custom clusterer to obtain cluster centers and cluster labels
optimalK = OptimalK(clusterer=KMeans_clustering_func)
# run OptimalK on the input data X, trying every cluster count from 2 to 20
n_clusters = optimalK(X, cluster_array=np.arange(2, 21))
I encounter this error:
LinAlgError Traceback (most recent call last)
Input In [211], in <cell line: 19>()
15 optimalK = OptimalK(clusterer=KMeans_clustering_func)
17 # run optimal K on the input data (subset_scaled_interim) and number of clusters
---> 19 n_clusters = optimalK(X, cluster_array=np.arange(2,25))
20 print('Optimal clusters: ', n_clusters)
22 # Gap Statistics data frame
File ~/.local/lib/python3.8/site-packages/gap_statistic/optimalK.py:134, in OptimalK.__call__(self, X, n_refs, cluster_array)
131 engine = self._process_non_parallel
133 # Calculate the gaps for each cluster count.
--> 134 for gap_calc_result in engine(X, n_refs, cluster_array):
135
136 # Assign this loop's gap statistic to gaps
137 gap_df = gap_df.append(
138 {
139 "n_clusters": gap_calc_result.n_clusters,
(...)
147 ignore_index=True,
148 )
149 gap_df["gap_k+1"] = gap_df["gap_value"].shift(-1)
File ~/.local/lib/python3.8/site-packages/gap_statistic/optimalK.py:361, in OptimalK._process_with_joblib(self, X, n_refs, cluster_array)
356 raise EnvironmentError(
357 "joblib is not installed; cannot use joblib as the parallel backend!"
358 )
360 with Parallel(n_jobs=self.n_jobs) as parallel:
--> 361 for gap_calc_result in parallel(
362 delayed(self._calculate_gap)(X, n_refs, n_clusters)
363 for n_clusters in cluster_array
364 ):
365 yield gap_calc_result
LinAlgError: Last 2 dimensions of the array must be square
How to fix it?

How to plot a windrose when the wind direction is a categorical value

From Dataset Australia Rainfall, I'm trying to predict RainTomorrow. Here is my code given below :
Downloading dataset directly from Kaggle using opendatasets library
import opendatasets as od
dataset_url = 'https://www.kaggle.com/jsphyg/weather-dataset-rattle-package'
od.download(dataset_url)
Importing necessary libraries
import os
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10,6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'
Loading Dataset
# Directory the dataset is expected to be in.
data_dir = './weather-dataset-rattle-package'
# List the directory contents (the return value is not stored).
os.listdir(data_dir)
# Path of the CSV file inside that directory.
train_csv = data_dir + '/weatherAUS.csv'
# Load the raw data into a DataFrame.
raw_df = pd.read_csv(train_csv)
Explore WindGustDir variable
# Number of distinct values in WindGustDir; unique() includes NaN as a value.
print('WindGustDir contains', len(raw_df['WindGustDir'].unique()), 'labels')
raw_df['WindGustDir'].unique()
# Frequency of each direction (value_counts drops NaN by default).
raw_df.WindGustDir.value_counts()
# One-hot encode the directions: drop_first omits the first category and
# dummy_na adds an indicator column for missing values.
pd.get_dummies(raw_df.WindGustDir, drop_first=True, dummy_na=True).head()
# Column-wise totals of the one-hot matrix.
pd.get_dummies(raw_df.WindGustDir, drop_first=True, dummy_na=True).sum(axis=0)
Plotting Windrose
from windrose import WindroseAxes
ax = WindroseAxes.from_ax()
# NOTE: WindGustDir holds compass-point strings, but windrose bins `direction`
# numerically; passing strings triggers the TypeError in the traceback below.
ax.bar(raw_df.WindGustDir, raw_df.Rainfall, normed=True, opening=0.8,
edgecolor='white')
ax.set_legend()
I am unable to figure out which column should be used together with WindGustDir, or whether there is any other way to compare RainTomorrow and WindGustDir.
Error Message
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
e:\Anaconda3\lib\site-packages\numpy\core\fromnumeric.py in _wrapfunc(obj, method, *args, **kwds)
57 try:
---> 58 return bound(*args, **kwds)
59 except TypeError:
TypeError: '<' not supported between instances of 'float' and 'str'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-253-1a1f0fa6bf7a> in <module>
1 ax = WindroseAxes.from_ax()
----> 2 ax.bar(direction=df.WindGustDir, var=df.Rainfall, normed=True, opening=0.8, edgecolor='white')
3 ax.set_legend()
e:\Anaconda3\lib\site-packages\windrose\windrose.py in bar(self, direction, var, **kwargs)
547 """
548
--> 549 bins, nbins, nsector, colors, angles, kwargs = self._init_plot(
550 direction, var, **kwargs
551 )
e:\Anaconda3\lib\site-packages\windrose\windrose.py in _init_plot(self, direction, var, **kwargs)
359
360 # Set the global information dictionnary
--> 361 self._info["dir"], self._info["bins"], self._info["table"] = histogram(
362 direction, var, bins, nsector, normed, blowto
363 )
e:\Anaconda3\lib\site-packages\windrose\windrose.py in histogram(direction, var, bins, nsector, normed, blowto)
746 direction[direction >= 360.] = direction[direction >= 360.] - 360
747
--> 748 table = histogram2d(x=var, y=direction, bins=[var_bins, dir_bins], normed=False)[0]
749 # add the last value to the first to have the table of North winds
750 table[:, 0] = table[:, 0] + table[:, -1]
<__array_function__ internals> in histogram2d(*args, **kwargs)
e:\Anaconda3\lib\site-packages\numpy\lib\twodim_base.py in histogram2d(x, y, bins, range, normed, weights, density)
742 xedges = yedges = asarray(bins)
743 bins = [xedges, yedges]
--> 744 hist, edges = histogramdd([x, y], bins, range, normed, weights, density)
745 return hist, edges[0], edges[1]
746
<__array_function__ internals> in histogramdd(*args, **kwargs)
e:\Anaconda3\lib\site-packages\numpy\lib\histograms.py in histogramdd(sample, bins, range, normed, weights, density)
1071
1072 # Compute the bin number each sample falls into.
-> 1073 Ncount = tuple(
1074 # avoid np.digitize to work around gh-11022
1075 np.searchsorted(edges[i], sample[:, i], side='right')
e:\Anaconda3\lib\site-packages\numpy\lib\histograms.py in <genexpr>(.0)
1073 Ncount = tuple(
1074 # avoid np.digitize to work around gh-11022
-> 1075 np.searchsorted(edges[i], sample[:, i], side='right')
1076 for i in _range(D)
1077 )
<__array_function__ internals> in searchsorted(*args, **kwargs)
e:\Anaconda3\lib\site-packages\numpy\core\fromnumeric.py in searchsorted(a, v, side, sorter)
1346
1347 """
-> 1348 return _wrapfunc(a, 'searchsorted', v, side=side, sorter=sorter)
1349
1350
e:\Anaconda3\lib\site-packages\numpy\core\fromnumeric.py in _wrapfunc(obj, method, *args, **kwds)
65 # Call _wrapit from within the except clause to ensure a potential
66 # exception has a traceback chain.
---> 67 return _wrapit(obj, method, *args, **kwds)
68
69
e:\Anaconda3\lib\site-packages\numpy\core\fromnumeric.py in _wrapit(obj, method, *args, **kwds)
42 except AttributeError:
43 wrap = None
---> 44 result = getattr(asarray(obj), method)(*args, **kwds)
45 if wrap:
46 if not isinstance(result, mu.ndarray):
TypeError: '<' not supported between instances of 'float' and 'str'
It seems that the direction parameter must be numeric.
Create a dict where each key is a direction in 'WindGustDir' and the corresponding value is a float in degrees.
Use .map to apply the dict to df.WindGustDir, then plot.
Alternatively, create and plot a new column
df.insert(loc=8, column='WindGustDirDeg', value=df.WindGustDir.map(wind_dir_deg))
import pandas as pd
from windrose import WindroseAxes
import numpy as np
# load the downloaded data and dropna
df = pd.read_csv('weatherAUS/weatherAUS.csv').dropna(subset=['WindGustDir'])
# Map each of the 16 compass points in WindGustDir to a bearing in degrees.
# The list must contain 16 distinct points: a duplicated entry (e.g. 'NNE'
# twice) silently drops a key, leaving 'NNW' unmapped (NaN after .map).
# windrose treats 0 degrees as North (its histogram folds the wrap-around bin
# into "the table of North winds") and bearings increase clockwise, so the
# list starts at N and proceeds N -> E -> S -> W in 22.5-degree steps.
wind_dir = ['N', 'NNE', 'NE', 'ENE', 'E', 'ESE', 'SE', 'SSE', 'S', 'SSW', 'SW', 'WSW', 'W', 'WNW', 'NW', 'NNW']
degrees = np.arange(0, 360, 22.5)
wind_dir_deg = dict(zip(wind_dir, degrees))
# plot and map WindGustDir to the dict
ax = WindroseAxes.from_ax()
ax.bar(direction=df.WindGustDir.map(wind_dir_deg), var=df.Rainfall, normed=True, opening=0.8, edgecolor='white')
ax.set_legend()

statsmodels.tsa.api-0.9.0 ZeroDivisionError: division by zero

Code:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels as sm
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
print('statsmodels.__version__', sm.__version__)
# A single-row frame: df.loc['000001.XSHE'] yields a series of 5 observations.
df = pd.DataFrame([
[547.184518, 256.990247, 237.709566, 465.214791, 1479.401737],
], columns=['point_4', 'point_5', 'point_6', 'point_7', 'point_8'], index=['000001.XSHE'])
# NOTE(review): per the traceback below, statsmodels 0.9.0 computes
#   aicc = aic + (2 * (k + 2) * (k + 3)) / (self.nobs - k - 3)
# with k = m * seasoning + 2 * trending + 2 + 1 * damped. Presumably k = 2 for
# simple exponential smoothing (no trend/seasonal/damping terms - confirm), so
# with 5 observations the denominator is 5 - 2 - 3 = 0, hence the
# ZeroDivisionError. With 4 observations it is -1, so the error disappears.
fit2 = SimpleExpSmoothing(df.loc['000001.XSHE']).fit(smoothing_level=0.6, optimized=False)
fcast1 = fit2.forecast(1)
Error:
statsmodels.__version__ 0.9.0
/opt/conda/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:221: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
' ignored when e.g. forecasting.', ValueWarning)
---------------------------------------------------------------------------
ZeroDivisionError Traceback (most recent call last)
<ipython-input-4-a742c2be4f46> in <module>
12 ], columns=['point_4', 'point_5', 'point_6', 'point_7', 'point_8'], index=['000001.XSHE'])
13
---> 14 fit2 = SimpleExpSmoothing(df.loc['000001.XSHE']).fit(smoothing_level=0.6, optimized=False)
15 fcast1 = fit2.forecast(1)
/opt/conda/lib/python3.6/site-packages/statsmodels/tsa/holtwinters.py in fit(self, smoothing_level, optimized)
814 [1] Hyndman, Rob J., and George Athanasopoulos. Forecasting: principles and practice. OTexts, 2014.
815 """
--> 816 return super(SimpleExpSmoothing, self).fit(smoothing_level=smoothing_level, optimized=optimized)
817
818
/opt/conda/lib/python3.6/site-packages/statsmodels/tsa/holtwinters.py in fit(self, smoothing_level, smoothing_slope, smoothing_seasonal, damping_slope, optimized, use_boxcox, remove_bias, use_basinhopping)
592 smoothing_seasonal=gamma, damping_slope=phi,
593 initial_level=l0, initial_slope=b0, initial_seasons=s0,
--> 594 use_boxcox=use_boxcox, lamda=lamda, remove_bias=remove_bias)
595 hwfit._results.mle_retvals = opt
596 return hwfit
/opt/conda/lib/python3.6/site-packages/statsmodels/tsa/holtwinters.py in _predict(self, h, smoothing_level, smoothing_slope, smoothing_seasonal, initial_level, initial_slope, damping_slope, initial_seasons, use_boxcox, lamda, remove_bias)
733 k = m * seasoning + 2 * trending + 2 + 1 * damped
734 aic = self.nobs * np.log(sse / self.nobs) + (k) * 2
--> 735 aicc = aic + (2 * (k + 2) * (k + 3)) / (self.nobs - k - 3)
736 bic = self.nobs * np.log(sse / self.nobs) + (k) * np.log(self.nobs)
737 resid = data - fitted[:-h - 1]
ZeroDivisionError: division by zero
SimpleExpSmoothing is used for forecasting time-series data; my input data is valid, so it should output a forecast without error.
If I remove point_8 column from the DataFrame, then the error disappears.
Do you know why it throws ZeroDivisionError?

Convert pandas to numpy.ndarray for sparse.hstack

I am trying to solve the following problem:
import numpy as np
import pandas as pd
from scipy import sparse
X1 = sparse.rand(10, 10000)
df = pd.DataFrame({ 'a': range(10)})
In fact, I get X1 from TfidfVectorizer, but I have omitted that code for the sake of brevity.
I want to apply sparse.hstack to use both variables in a regression.
I convert pandas to numpy.ndarray as below
X2 = df['a'].as_matrix()
type(X2)
numpy.ndarray
X = sparse.hstack((X1,X2))
ValueError Traceback (most recent call last)
<ipython-input-38-9493e3833c5d> in <module>()
----> 1 X = sparse.hstack((X1,X2))
C:\Program Files\Anaconda3\lib\site-packages\scipy\sparse\construct.py in hstack(blocks, format, dtype)
462
463 """
--> 464 return bmat([blocks], format=format, dtype=dtype)
465
466
C:\Program Files\Anaconda3\lib\site-packages\scipy\sparse\construct.py in bmat(blocks, format, dtype)
579 elif brow_lengths[i] != A.shape[0]:
580 raise ValueError('blocks[%d,:] has incompatible '
--> 581 'row dimensions' % i)
582
583 if bcol_lengths[j] == 0:
ValueError: blocks[0,:] has incompatible row dimensions
What's wrong?
I did it as shown below, and it works:
import numpy as np
import pandas as pd
from scipy import sparse
# Sparse 10 x 10000 block standing in for the TF-IDF features.
X1 = sparse.rand(10, 10000)
df = pd.DataFrame({'a': range(10)})
# Selecting with a list of column names keeps the data two-dimensional, so
# .values is a (10, 1) column rather than a flat length-10 vector.
X2 = df[['a']].values
# Both blocks now have 10 rows, so they can be joined side by side.
X = sparse.hstack([X1, X2])
Your arrays must have the same first-dimension size and must contain at least one row each.
You can check that with X1.shape and X2.shape (shape is an attribute, not a method).

Why is statsmodels throwing an IndexError when I try to fit a linear mixed-effect model?

Given the code:
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Replace the existing index with a fresh default one, discarding the old index.
df.reset_index(drop=True, inplace=True)
display(df.describe())
# NOTE: rows with missing values in the formula variables must be dropped from
# df before this call; the method does not handle them correctly (see the
# docstring quoted further down), which is what produces the IndexError.
md = smf.mixedlm("c ~ iscorr", df, groups=df.subnum)
mdf = md.fit()
Where df is a pandas.DataFrame, I get the following error out of smf.mixedlm:
IndexError Traceback (most recent call last)
<ipython-input-34-5373fe9b774a> in <module>()
4 df.reset_index(drop=True, inplace=True)
5 display(df.describe())
----> 6 md = smf.mixedlm("c ~ iscorr", df, groups=df.subnum)
7 # mdf = md.fit()
/home/lthibault/.pyenv/versions/3.5.0/lib/python3.5/site-packages/statsmodels/regression/mixed_linear_model.py in from_formula(cls, formula, data, re_formula, subset, *args, **kwargs)
651 subset=None,
652 exog_re=exog_re,
--> 653 *args, **kwargs)
654
655 # expand re names to account for pairs of RE
/home/lthibault/.pyenv/versions/3.5.0/lib/python3.5/site-packages/statsmodels/base/model.py in from_formula(cls, formula, data, subset, *args, **kwargs)
148 kwargs.update({'missing_idx': missing_idx,
149 'missing': missing})
--> 150 mod = cls(endog, exog, *args, **kwargs)
151 mod.formula = formula
152
/home/lthibault/.pyenv/versions/3.5.0/lib/python3.5/site-packages/statsmodels/regression/mixed_linear_model.py in __init__(self, endog, exog, groups, exog_re, use_sqrt, missing, **kwargs)
537
538 # Split the data by groups
--> 539 self.endog_li = self.group_list(self.endog)
540 self.exog_li = self.group_list(self.exog)
541 self.exog_re_li = self.group_list(self.exog_re)
/home/lthibault/.pyenv/versions/3.5.0/lib/python3.5/site-packages/statsmodels/regression/mixed_linear_model.py in group_list(self, array)
671 if array.ndim == 1:
672 return [np.array(array[self.row_indices[k]])
--> 673 for k in self.group_labels]
674 else:
675 return [np.array(array[self.row_indices[k], :])
/home/lthibault/.pyenv/versions/3.5.0/lib/python3.5/site-packages/statsmodels/regression/mixed_linear_model.py in <listcomp>(.0)
671 if array.ndim == 1:
672 return [np.array(array[self.row_indices[k]])
--> 673 for k in self.group_labels]
674 else:
675 return [np.array(array[self.row_indices[k], :])
IndexError: index 7214 is out of bounds for axis 1 with size 7214
Why is this error occurring? len(df) reports that there are 7296 rows, so there should be no issue indexing the 7214th, and the explicit re-indexing ensures that the indices span from zero to 7295.
You may download df here to fiddle around with it if you'd like.
You have 82 null values in iscorr:
>>> df.iscorr.isnull().sum()
82
Drop them and you will be fine:
df = df[df.iscorr.notnull()]
Per the function's docstring:
Notes
------
`data` must define __getitem__ with the keys in the formula
terms args and kwargs are passed on to the model
instantiation. E.g., a numpy structured or rec array, a
dictionary, or a pandas DataFrame.
If `re_formula` is not provided, the default is a random
intercept for each group.
This method currently does not correctly handle missing
values, so missing values should be explicitly dropped from
the DataFrame before calling this method.
"""
Output:
>>> mdf.params
Intercept 0.032000
iscorr[T.True] 0.030670
Intercept RE -0.057462

Categories

Resources