ValueError: Found input variables with inconsistent numbers of samples: [4620309, 6003]

ValueError: Found input variables with inconsistent numbers of samples: [4620309, 6003] - python

I couldn't find where the problem is. Can someone help me with this? My code is:
# Import the necessary modules
from osgeo import gdal
import numpy as np
import pandas as pd
# --- Training labels (ground truth) ---
# Open the training ground-truth GeoTIFF using GDAL
datasetTrainingGT = gdal.Open(r'C:\Users\muham\Downloads\Project-20230123T143514Z-001\Project\S2A_MSIL1C_20220516_Train_GT.tif')
# Read the raster into a NumPy array
# (assumes the ground-truth raster is single-band, i.e. a 2-D array - TODO confirm)
trainGT2d = datasetTrainingGT.ReadAsArray()
# Swap the first two axes before flattening
trainGT2d = np.swapaxes(trainGT2d, 0, 1)
# Flatten to a single column with shape[0] * shape[1] rows
trainGT1d = trainGT2d.reshape(trainGT2d.shape[0] * trainGT2d.shape[1], 1)
# Wrap the label column in a Pandas DataFrame
dfTrainLabels = pd.DataFrame(trainGT1d)
# CSV export of the labels alone is disabled; only the .npy file is written
# dfTrainLabels.to_csv('train.csv', index=False)
np.save('train_gt.npy', trainGT1d)
# --- Training features ---
datasetTraining = gdal.Open(r'C:\Users\muham\Downloads\Project-20230123T143514Z-001\Project\S2A_MSIL1C_20220516_TrainingData.tif')
# Read the raster into a NumPy array
dataTraing = datasetTraining.ReadAsArray()
# Swap axes 0 and 2 so that the original first axis becomes the last one
# (presumably bands-first -> bands-last; TODO confirm the raster is multi-band)
dataTraing = np.swapaxes(dataTraing, 0, 2)
# Flatten the first two axes into rows; the remaining axis becomes the columns
dataTraining1d = dataTraing.reshape(dataTraing.shape[0] * dataTraing.shape[1], -1)
dfTrain = pd.DataFrame(dataTraining1d)
# Row-wise concatenation (default axis=0); not used again in this snippet
final_data = pd.concat([dfTrainLabels, dfTrain])
# Column-wise concatenation: label column first, then the feature columns
train_label_data = pd.concat([dfTrainLabels, dfTrain], axis=1)
# Five names: assumes one label column plus four feature columns
train_label_data.columns=['Code', 'Blue', 'Green', 'Red', 'NIR']
train_label_data.to_csv('train.csv')
np.save('train.npy', dataTraining1d)
# --- Test labels ---
# NOTE(review): this opens the same path as datasetTest below, not a separate
# ground-truth raster - confirm which file actually holds the test labels.
datasetTestGT = gdal.Open(r'C:\Users\muham\Downloads\Project-20230123T143514Z-001\Project/S2B_MSIL1C_20220528_Test.tif')
# Read the raster into a NumPy array
testGT2d = datasetTestGT.ReadAsArray()
# Drop index 0 along the first axis
testGT2d = testGT2d[1:, :]
# Swap the first two axes
testGT2d = np.swapaxes(testGT2d, 0, 1)
# Flatten the first two axes into rows; any remaining axis becomes the columns.
# NOTE(review): if the array is still 3-D here, the row count is
# shape[0] * shape[1] of the swapped array and will generally differ from the
# number of rows in dataTest1d - TODO confirm the shape at this point.
testGT1d = testGT2d.reshape(testGT2d.shape[0] * testGT2d.shape[1], -1)
# Wrap the flattened array in a Pandas DataFrame
df = pd.DataFrame(testGT1d)
# Export the DataFrame as a CSV file
df.to_csv('test_gt.csv')
np.save('test_gt.npy', testGT1d)
# --- Test features ---
datasetTest = gdal.Open(r'C:\Users\muham\Downloads\Project-20230123T143514Z-001\Project/S2B_MSIL1C_20220528_Test.tif')
# Read the raster into a NumPy array
dataTest2d = datasetTest.ReadAsArray()
# Swap axes 0 and 2 so that the original first axis becomes the last one
dataTest2d = np.swapaxes(dataTest2d, 0, 2)
# Flatten the first two axes into rows; the remaining axis becomes the columns
dataTest1d = dataTest2d.reshape(dataTest2d.shape[0] * dataTest2d.shape[1], -1)
np.save('test_all.npy', dataTest1d)
# Wrap the flattened array in a Pandas DataFrame
dfTest = pd.DataFrame(dataTest1d)
# Four names: assumes the flattened test array has exactly four columns
dfTest.columns=['Blue', 'Green', 'Red', 'NIR']
# Export the DataFrame as a CSV file
dfTest.to_csv('test.csv')
from sklearn.model_selection import train_test_split
# Split features and labels together, holding out 30% as the validation part.
# Both arrays must have the same number of rows for this call to succeed.
X_Test, X_Val, y_test, y_val = train_test_split(dataTest1d, testGT1d, stratify=testGT1d, test_size=0.30)
And I get an error in the last 3 rows:
from sklearn.model_selection import train_test_split
X_Test, X_Val, y_test, y_val = train_test_split(dataTest1d, testGT1d, stratify=testGT1d, test_size=0.30)
Traceback (most recent call last):
File "C:\Users\muham\AppData\Local\Temp\ipykernel_14476\3326811442.py", line 2, in
X_Test, X_Val, y_test, y_val = train_test_split(dataTest1d, testGT1d, stratify=testGT1d, test_size=0.30)
File "C:\Users\muham\anaconda3\lib\site-packages\sklearn\model_selection_split.py", line 2417, in train_test_split
arrays = indexable(*arrays)
File "C:\Users\muham\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 378, in indexable
check_consistent_length(*result)
File "C:\Users\muham\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 332, in check_consistent_length
raise ValueError(
ValueError: Found input variables with inconsistent numbers of samples: [4620309, 6003]

Related

ValueError: y should be a 1d array, got an array of shape (1, 375) instead

I wrote some code that deletes curse words, but it says
ValueError: y should be a 1d array, got an array of shape (1, 375) instead.
As you can see, I tried to reshape it, but it didn't work. I have included the full error below the code.
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns; sns.set()
from sklearn.datasets import make_blobs
import pandas as pd
df = pd.read_excel('data.xls')
def handle_non_numerical_data(df):
    """Replace every non-numeric column of ``df`` with integer codes.

    Columns whose dtype is ``int64`` or ``float64`` are left untouched. In
    every other column each distinct value is assigned an integer, numbered
    from 0 in order of first appearance, and the column is overwritten with
    those codes.

    Args:
        df: DataFrame to encode. It is modified in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    for column in df.columns.values:
        # Already numeric: nothing to encode.
        if df[column].dtype == np.int64 or df[column].dtype == np.float64:
            continue
        column_contents = df[column].values.tolist()
        # dict.fromkeys keeps first-appearance order, so the codes are
        # reproducible between runs (iterating a set of strings is not).
        text_digit_vals = {
            value: code
            for code, value in enumerate(dict.fromkeys(column_contents))
        }
        df[column] = [text_digit_vals[value] for value in column_contents]
    return df
df = handle_non_numerical_data(df)
X = df['str']
X = X.values.reshape(1, -1)
y = df['curse']
y = y.values.reshape(1,len(y))
plt.show()
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X,y)
rng = np.random.RandomState(0)
X_new=[-6,-14]+[14,18]*rng.rand(1000,2)
y_new=model.predict(X_new)
plt.scatter(X[:,0],X[:,1],c=y,s=50,cmap='RdBu')
lim = plt.axis()
plt.scatter(X_new[:,0],X_new[:,1],c=y_new,s=20,cmap='RdBu',alpha=0.2)
plt.axis(lim)
plt.show()
Traceback (most recent call last):
File "C:\Users\pc1\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\naive_bayes.py", line 207, in fit
X, y = self._validate_data(X, y)
File "C:\Users\pc1\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py", line 433, in _validate_data
X, y = check_X_y(X, y, **check_params)
File "C:\Users\pc1\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "C:\Users\pc1\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py", line 883, in check_X_y
y = column_or_1d(y, warn=True)
File "C:\Users\pc1\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "C:\Users\pc1\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py", line 921, in column_or_1d
raise ValueError(
ValueError: y should be a 1d array, got an array of shape (1, 375) instead.

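You reshaped y to (1, n), which is a 2D array, but fit() expects a 1D array of labels. Use y = y.values.ravel() (shape (n,)) instead; likewise X needs one row per sample, e.g. X.values.reshape(-1, 1) rather than reshape(1, -1).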

Tuple Index out of Range PIL when using Image.fromarray()

I am trying to process some CIFAR10 image data into image tiles for plotting with PIL on a black canvas. I have had success doing this while extracting features from a trained model but I keep getting IndexError: Tuple Index out of range when using Image.fromarray.
My features are shaped to be test data of shape (10000,3072). The data is 32x32x3 images.
I load cifar10 data and then flatten the data but keep getting this error.
Here is my code. Some of it is borrowed from https://medium.com/@pslinge144/representation-learning-cifar-10-23b0d9833c40
import numpy as np
from sklearn.manifold import TSNE
from time import time
from pathlib import Path
from PIL import Image
from time import time
from keras.datasets import cifar10
# Load the raw CIFAR-10 data
_, (X_test, y_test) = cifar10.load_data()
# normalize the xtest data
X_test = X_test.astype('float32')
X_test /= 255.0
features = X_test # this is (10000, 32, 32, 3) numpy array
features = np.reshape(features, (10000, 3072)) # flatten to 2d array
print(features.shape)
perplexities = [5, 30, 50, 100]
for perplexity in perplexities:
print("Starting t-SNE on images now!")
tsne = TSNE(n_components = 2, init = 'random', random_state = 0, perplexity = perplexity, learning_rate = 200).fit_transform(features)
tx, ty = tsne[:,0], tsne[:,1] # grab tsne first and 2nd dimensions
# min max normalize for plotting
tx = (tx-np.min(tx)) / (np.max(tx) - np.min(tx))
ty = (ty-np.min(ty)) / (np.max(ty) - np.min(ty))
width = 4000
height = 3000
max_dim = 100
full_image = Image.new('RGB', (width, height))
for idx, x in enumerate(features):
tile = Image.fromarray(np.uint8(x * 255), 'RGB') # rescale pixel values to [0,255] scale
rs = max(1, tile.width / max_dim, tile.height / max_dim)
tile = tile.resize((int(tile.width / rs),int(tile.height / rs)),Image.ANTIALIAS)
full_image.paste(tile, (int((width-max_dim) * tx[idx]),int((height-max_dim) * ty[idx])))
plots_output_path = Path('../data/processed/tSNE_plots').resolve()
filename = "tsne_perplex%d_plot.png" % (perplexity)
fullpath = plots_output_path.joinpath(filename).resolve()
full_image.save(str(fullpath))
Here is the error:
Traceback (most recent call last):
File "tSNE_image_thumbnail.py", line 80, in <module>
tSNE_image(x_test, 1000, 200, plots_output_path, 2)
File "tSNE_image_thumbnail.py", line 56, in tSNE_image
tile = Image.fromarray(np.uint8(x * 255), 'RGB')
File "/home/zw/src/image_classification_ML/venv/lib/python3.8/site-packages/PIL/Image.py", line 2728, in fromarray
size = shape[1], shape[0]
IndexError: tuple index out of range
Again, this code works fine when extracting my features from my CNN model and using it on a shape (10000, 512) dense layer. Not sure why this is giving me issues. Any ideas? Thanks in advance.

You are providing arrays of length 3072 in your line
tile = Image.fromarray(np.uint8(x * 255), 'RGB')
You can verify this by calling np.uint8(x * 255).shape for any x; it returns (3072,).
But an 'RGB' image needs a 3-dimensional array of shape (height, width, 3), not a 1-dimensional one.
That is why you get the error tuple index out of range: PIL reads shape[1] and shape[0] to get the image size, and a 1-dimensional shape has no shape[1].
So instead of (3072,) you need a shape with three entries. Your images are 32x32x3, so (32, 32, 3) maps the one-dimensional array of 3072 values back to a 32 x 32 pixel image with 3 channels (32 * 32 * 3 = 3072).
So you could change the line in your code to
tile = Image.fromarray(np.uint8(x * 255).reshape(32, 32, 3), 'RGB')
In general, the shape should always match the actual image dimensions.

IndexError when ploting sklearn manifold TSNE

I try to run a t-sne but python shows me this error:
IndexError: only integers, slices (:), ellipsis (...), numpy.newaxis (None) and integer or boolean arrays are valid indices
Data is being provided by this link.
Here's the code:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
#Step 1 - Download the data
dataframe_all = pd.read_csv('https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv')
num_rows = dataframe_all.shape[0]
#Step 2 - Clearn the data
#count the number of missing elements (NaN) in each column
counter_nan = dataframe_all.isnull().sum()
counter_without_nan = counter_nan[counter_nan==0]
#remove the columns with missing elements
dataframe_all = dataframe_all[counter_without_nan.keys()]
#remove the first 7 columns which contain no descriminative information
dataframe_all = dataframe_all.ix[:,7:]
#Step 3: Create feature vectors
x = dataframe_all.ix[:,:-1].values
standard_scalar = StandardScaler()
x_std = standard_scalar.fit_transform(x)
# t distributed stochastic neighbour embedding (t-SNE) visualization
tsne = TSNE(n_components=2, random_state = 0)
x_test_2d = tsne.fit_transform(x_std)
#scatter plot the sample points among 5 classes
markers=('s','d','o','^','v')
color_map = {0:'red', 1:'blue', 2:'lightgreen', 3:'purple', 4:'cyan'}
plt.figure()
for idx, cl in enumerate(np.unique(x_test_2d)):
plt.scatter(x=x_test_2d[cl, 0],y =x_test_2d[cl, 1], c=color_map[idx], marker=markers[idx], label=cl)
plt.show()
What do I have to change in order to make this work?

The error is due to the following line:
plt.scatter(x_test_2d[cl, 0], x_test_2d[cl, 1], c=color_map[idx], marker=markers[idx])
Here, cl takes non-integer values (from np.unique(x_test_2d)), and that raises the error. For example, the last value that cl takes is 99.46295, so x_test_2d[cl, 0] becomes x_test_2d[99.46295, 0], which is not a valid index.
Define a variable y that holds the class labels, then use:
# variable holding the classes
y = dataframe_all.classe.values
y = np.array([ord(i) for i in y])
#scatter plot the sample points among 5 classes
plt.figure()
plt.scatter(x_test_2d[:, 0], x_test_2d[:, 1], c = y)
plt.show()
FULL CODE:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
# Step 1 - Download the data
dataframe_all = pd.read_csv('https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv')
num_rows = dataframe_all.shape[0]
# Step 2 - Clean the data
# count the number of missing elements (NaN) in each column
counter_nan = dataframe_all.isnull().sum()
counter_without_nan = counter_nan[counter_nan==0]
# keep only the columns that have no missing elements
dataframe_all = dataframe_all[counter_without_nan.keys()]
# remove the first 7 columns which contain no discriminative information
# (.ix was removed in pandas 1.0; .iloc is the positional replacement)
dataframe_all = dataframe_all.iloc[:, 7:]
# Step 3 - Create feature vectors from every column except the last one
x = dataframe_all.iloc[:, :-1].values
standard_scalar = StandardScaler()
x_std = standard_scalar.fit_transform(x)
# t-distributed stochastic neighbour embedding (t-SNE) visualization
tsne = TSNE(n_components=2, random_state = 0)
x_test_2d = tsne.fit_transform(x_std)
# variable holding the classes
y = dataframe_all.classe.values # you need this for the colors
y = np.array([ord(i) for i in y]) # convert letters to numbers
# scatter plot of all sample points, coloured by class
plt.figure()
plt.scatter(x_test_2d[:, 0], x_test_2d[:, 1], c = y)
plt.show()

H5PY problem saving composite numpy arrays

In an attempt to reverse-engineer a file format, I have arrived at the following minimal example for creating a composite numpy datatype and saving it to HDF5. The original file seems to be storing datasets of the data type below. However, I do not seem to be able to write such datasets to a file.
import numpy as np
import h5py
data = ("Many cats".encode(), np.linspace(0, 1, 20))
data_type = [('index', 'S' + str(len(data[0]))), ('values', '<f8', (20,))]
arr = np.array(data, dtype=data_type)
print(arr)
h5f = h5py.File("lol.h5", 'w')
dset = h5f.create_dataset("data", arr, dtype=data_type)
h5f.close()
This code crashes with the error
Traceback (most recent call last):
File "test.py", line 13, in
dset = h5f.create_dataset("data", arr, dtype=data_type)
File "/opt/anaconda3/lib/python3.7/site-packages/h5py/_hl/group.py", line
116, in create_dataset
dsid = dataset.make_new_dset(self, shape, dtype, data, **kwds)
File "/opt/anaconda3/lib/python3.7/site-packages/h5py/_hl/dataset.py", line
75, in make_new_dset
shape = tuple(shape)
TypeError: iteration over a 0-d array
How can I overcome this issue?

I restructured/reordered your code to get it to work with h5py.
The code below works for 1 row. You will have to adjust it to make the number of rows variable.
import numpy as np
import h5py
data = ("Many cats".encode(), np.linspace(0, 1, 20))
data_type = [('index', 'S' + str(len(data[0]))), ('values', '<f8', (20,))]
# Allocate a 1-element array (shape (1,)) instead of a 0-d scalar, then fill
# the single record field by field.
arr = np.zeros((1,), dtype=data_type)
arr[0]['index'] = data[0]
arr[0]['values'] = data[1]
# The context manager closes the file even if create_dataset raises.
with h5py.File("lol.h5", 'w') as h5f:
    # Pass the array through data=; shape and dtype are taken from it.
    dset = h5f.create_dataset("data", data=arr)

How do I select a particular column of a Numpy array read from a CSV?

I'm trying:
import numpy as np
housing_data = np.loadtxt('Housing.csv', delimiter=',')
print(housing_data)
print(housing_data.shape)
x1 = housing_data[:,:,0]
x2 = housing_data[:,:,1]
y = housing_data[:,:,2]
print(x1)
print(x2)
print(y)
My data has shape (47, 3) and looks like:
[[2.104e+03 3.000e+00 3.999e+05]
[1.600e+03 3.000e+00 3.299e+05]
[2.400e+03 3.000e+00 3.690e+05]
....
I am trying to set the first column to x1, the second to x2 and the third to y. But my code doesn't appear to work. What am I doing wrong?

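Your array is 2-dimensional (shape (47, 3)), so it takes two indices, not three: housing_data[:, 0] selects the first column. I have created a dummy CSV file with random data. I would do it like this: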
import numpy as np
import pandas as pd
# read file using pandas, without header and convert it to numpy arrays
housing_data = pd.read_csv('Housing.csv', header=None).values
# print housing data
print(housing_data)
print(housing_data.shape)
# slice through the data
x1 = housing_data[:,0]
x2 = housing_data[:,1]
y = housing_data[:,2]
print(x1)
print(x2)
print(y)
The output looks like this:

For selection with NumPy in Python, you can use:
#Shape (2,2) from top right corner
data[:2,1:]
#Shape bottom row
data[2]
#Shape bottom row
data[2,:]
or with conditions:
data[data>2]
Maybe you could check your .csv file and datatypes:
data.astype(float)
data = np.arange(3, dtype=np.uint8)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

ValueError: Found input variables with inconsistent numbers of samples: [4620309, 6003] - python

Related

ValueError: y should be a 1d array, got an array of shape (1, 375) instead

Tuple Index out of Range PIL when using Image.fromarray()

IndexError when ploting sklearn manifold TSNE

H5PY problem saving composite numpy arrays

How do I select a particular column of a Numpy array read from a CSV?

Categories

Resources