apply min-max scaler to multiple datasets using a loop - python

I have n dataframes, and I want to apply a min-max scaler to each of them. Here is what I have for a single dataframe:
import pandas as pd
from sklearn import preprocessing

x = df1.values  # returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
x_scaled1 = pd.DataFrame(x_scaled, columns=df1.columns)
I tried several ways to do the same for all n dataframes in a loop, but everything failed.

Please list out the methods you have tried for your problem.
However, you can store the dataframe objects in a list and then scale each one as you loop through it. I am doing it for two dataframes below, but the same applies for any number of them.
import numpy as np
import pandas as pd
from sklearn import preprocessing

df1 = {'col1': np.random.randint(1, 26, 10), 'col2': np.random.randint(1, 26, 10)}
df2 = {'col1': np.random.randint(1, 26, 10), 'col2': np.random.randint(1, 26, 10)}
df1 = pd.DataFrame(df1)
df2 = pd.DataFrame(df2)

df_list = [df1, df2]

def loop_scaler(df_list):
    scaled_df_list = []
    for df in df_list:
        df_array = df.values
        min_max_scaler = preprocessing.MinMaxScaler()
        df_scaled = min_max_scaler.fit_transform(df_array)
        df_scaled = pd.DataFrame(df_scaled, columns=df.columns)
        scaled_df_list.append(df_scaled)
    return scaled_df_list
You can print and look at the scaled output:
scaled_df_list = loop_scaler(df_list)
for scaled_df in scaled_df_list:
    print(scaled_df.head(5))
or
scaled_df1, scaled_df2 = loop_scaler(df_list)
print(scaled_df1)
print(scaled_df2)
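A small variant you might find useful (a sketch, assuming you later want the original scale back, e.g. for model predictions): return the fitted scaler together with each scaled frame so you can call its inverse_transform afterwards. loop_scaler_keep is a hypothetical name, not part of the answer above.
from sklearn import preprocessing
import pandas as pd

def loop_scaler_keep(df_list):
    # same loop as loop_scaler, but each fitted scaler is kept
    results = []
    for df in df_list:
        scaler = preprocessing.MinMaxScaler()
        scaled = pd.DataFrame(scaler.fit_transform(df.values),
                              columns=df.columns, index=df.index)
        results.append((scaled, scaler))
    return results

(scaled_df1, scaler1), (scaled_df2, scaler2) = loop_scaler_keep(df_list)
# undo the scaling on the first frame
restored_df1 = pd.DataFrame(scaler1.inverse_transform(scaled_df1),
                            columns=scaled_df1.columns)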

import pandas as pd
from sklearn.preprocessing import MinMaxScaler
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
new_data = pd.DataFrame(data)
scaler = MinMaxScaler()
transformed_data = pd.DataFrame(scaler.fit_transform(new_data))
This may help to solve your problem.
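One small point worth noting (a sketch building on the snippet above): passing columns and index back into the DataFrame constructor keeps the original labels, which the bare pd.DataFrame(...) call drops.
# keep the original column labels and index on the scaled frame
transformed_data = pd.DataFrame(scaler.fit_transform(new_data),
                                columns=new_data.columns,
                                index=new_data.index)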

Related

getting the dtype of the elements in a NumPy array

I am working to clean and manipulate CSV data in NumPy, and I am getting an AttributeError. load_metrics("covid_sentiment_metrics.csv") gives me a numpy array as output, which is passed to the next function as a parameter:
def unstructured_to_structured(data, indexes):
    '''unstructured to structured'''
    data_1 = np.delete(data, 0, 0)
    for i in range(np.shape(data_1)[1]):
        if i not in indexes:
            data_1[:, i] = (data_1[:, i]).astype(np.float64)
    data_1 = tuple([tuple(i) for i in data_1])
    return data_1
data = load_metrics("covid_sentiment_metrics.csv")
data = unstructured_to_structured(data, [0, 1, 7, 8])
print(data[5][0].dtype)
>>AttributeError: 'float' object has no attribute 'dtype'
The expected return should be float64 in this case. I am not sure what I am doing wrong.
(This is an addition to the previous post.)
import numpy as np
import pandas as pd

# this function returns a numpy array from a CSV file
def load_metrics(filename):
    '''read data as numpy array'''
    df = pd.read_csv(filename).astype(str)
    df.loc[-1] = df.columns.to_list()
    df = df.sort_index().reset_index(drop=True)
    df.drop(columns=['text', 'screen_name', 'followers', 'friends', 'user_ID', 'country_region'], inplace=True)
    df_numpy = df.to_numpy()
    return df_numpy
# the return from the above function is passed as a parameter
def unstructured_to_structured(data, indexes):
    data_1 = np.delete(data, 0, 0)
    # I am trying to change the data type of the columns that are not listed in indexes
    for i in range(np.shape(data_1)[1]):
        if i not in indexes:
            data_1[:, i] = (data_1[:, i]).astype(np.float64)
    data_1 = tuple([tuple(i) for i in data_1])
    return data_1
data = load_metrics("covid_sentiment_metrics.csv")
data = unstructured_to_structured(data, [0, 1, 7, 8])
print(data[5][0].dtype)
>>AttributeError: 'float' object has no attribute 'dtype'
I hope this suffices to make the situation clear.
link to csv: https://www.dropbox.com/s/q3oq5zq71dd6sjx/covid_sentiment_metrics.csv?dl=0
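For what it's worth, a likely cause (my reading of the traceback, not verified against the full CSV): assigning the converted column back into data_1 casts it to the array's own dtype, so the float64 values do not survive, and the final tuple elements end up as plain Python objects with no .dtype attribute. A NumPy structured array keeps a real dtype per field. This is only a sketch; the field names and the 'U30' string width are assumptions, not taken from the data.
import numpy as np

def unstructured_to_structured_sketch(data, indexes):
    body = np.delete(data, 0, 0)  # drop the header row, as in the original
    # one field per column: text fields for `indexes`, float64 for the rest
    dtypes = [(f'col{i}', 'U30' if i in indexes else 'f8')
              for i in range(body.shape[1])]
    rows = [tuple(v if i in indexes else float(v) for i, v in enumerate(row))
            for row in body]
    return np.array(rows, dtype=dtypes)

# structured = unstructured_to_structured_sketch(data, [0, 1, 7, 8])
# print(structured[5]['col2'].dtype)  # float64 for a converted field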

how to scale back the output of a model where the data was normalized with a custom function

I have a data frame, and I normalized the data for training and testing an LSTM model as:
x_normalized = (x_unnormalized - x_min) / (x_max - x_min)
where x_min and x_max are the minimum and maximum of each entire row.
As shown in the figure, I choose the last column as the test data.
The model works, but in this setup the y_prediction is normalized, and I don't know how to see y_prediction in real values.
Is there any simple solution for that?
Here is the simple code and the normalization:
import pandas as pd

df = pd.DataFrame()
df['x1'] = [1, 2, 4]
df['x2'] = [5, 9, 5]
df['x3'] = [3, 21, 10]
df['x4'] = [8, 32, 3]
df['x5'] = [8, 32, 15]
df['x6'] = [2, 5, 15]

def norm(df):
    MIN = df.min(1)
    MAX = df.max(1)
    return df.sub(MIN, 0).div(MAX - MIN, 0)

df_normalized = norm(df)
train = df_normalized.iloc[:, 0:5]
test = df_normalized.iloc[:, 5]
Following your formula:
x_unnormalized = x_normalized * (x_max - x_min) + x_min
So you just need to save the values of x_max and x_min from the original dataframe and then have a function execute the formula above.
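A minimal sketch of that idea, reusing the toy frame and the norm function above; denorm is a hypothetical helper name.
import numpy as np

MIN = df.min(1)  # per-row minimums of the original, unnormalized frame
MAX = df.max(1)  # per-row maximums

def denorm(df_normalized, MIN, MAX):
    # x_unnormalized = x_normalized * (x_max - x_min) + x_min, row-wise
    return df_normalized.mul(MAX - MIN, 0).add(MIN, 0)

# round-trip check on the toy frame:
restored = denorm(norm(df), MIN, MAX)
print(np.allclose(restored, df))  # True: the round trip recovers the data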

How to generate letters instead of numbers in a pandas MultiIndex table?

I have a MultiIndex table in pandas, and I need to generate random string values instead of numbers: letters from a to z, chosen at random. How do I do it?
import pandas as pd
import numpy as np

index = pd.MultiIndex.from_product([[2020], [1, 2, 3, 4]],
                                   names=['year', 'q'])
columns = pd.MultiIndex.from_product([['Items1', 'Items2', 'Items3'], ['new', 'old']],
                                     names=['subject', 'type'])
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 20
data += 50
Ldata = pd.DataFrame(data, index=index, columns=columns)
Ldata
I figured it out:
import string
np.random.seed(123)
data = list(np.random.choice(list(string.ascii_lowercase), (4, 6)))
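Putting it together (a sketch reusing the index and columns objects from the snippet above): the same MultiIndex frame, now filled with random lowercase letters.
import string
import numpy as np
import pandas as pd

np.random.seed(123)
letters = np.random.choice(list(string.ascii_lowercase), (4, 6))
Ldata = pd.DataFrame(letters, index=index, columns=columns)
print(Ldata)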

How can I make a dataset of the elements of matrices in a dataframe?

I have a dataset of 3 parameters 'A', 'B', 'C' in a .TXT file. After I print them as 24x20 matrices, I need to collect the 1st elements of 'A', 'B', 'C' and put them in long arrays in a pandas dataframe, then the 2nd elements of each, then the 3rd, and so on up to the 480th elements.
My data in the text file looks like this:
id_set: 000
A: -2.46882615679
B: -2.26408246559
C: -325.004619528
I already made a pandas dataframe that includes 3 columns 'A', 'B', 'C' plus an index, and defined functions to print the 24x20 matrices the right way. A simple example via 2x2 matrices:
1st cycle: A = [1,2,    B = [4,5,    C = [8,9,
                3,4]         6,7]         10,11]
2nd cycle: A = [0,8,    B = [1,9,    C = [10,1,
                2,5]         4,8]          2,7]
Reshape to this form:
A(1,1), B(1,1), C(1,1), A(1,2), B(1,2), C(1,2), .....
Result = [1,4,8,2,5,9,3,6,10,4,7,11]    # 1st cycle
         [0,1,10,8,9,1,2,4,2,5,8,7]     # 2nd cycle
My script is the following:
import numpy as np
import pandas as pd
import os

def normalize(value, min_value, max_value, min_norm, max_norm):
    new_value = ((max_norm - min_norm) * ((value - min_value) / (max_value - min_value))) + min_norm
    return new_value

dft = pd.read_csv(r'D:\mc25.TXT', header=None)
id_set = dft[dft.index % 4 == 0].astype('int').values
A = dft[dft.index % 4 == 1].values
B = dft[dft.index % 4 == 2].values
C = dft[dft.index % 4 == 3].values
data = {'A': A[:, 0], 'B': B[:, 0], 'C': C[:, 0]}
df = pd.DataFrame(data, columns=['A', 'B', 'C'], index=id_set[:, 0])

# next iteration creates all plots; change the number of cycles accordingly
cycles = int(len(df) / 480)
print(cycles)
for cycle in range(0, 10):
    count = '{:04}'.format(cycle)
    j = cycle * 480
    for i in df:
        try:
            os.mkdir(i)
        except:
            pass
        min_val = df[i].min()
        min_nor = -1
        max_val = df[i].max()
        max_nor = 1
        ordered_data = mkdf(df.iloc[j:j + 480][i])
        csv = print_df(ordered_data)
        # print .csv files containing the matrix of each parameter, named by cycle
        csv.to_csv(f'{i}/{i}{count}.csv', header=None, index=None)
        if 'C' in i:
            min_nor = -40
            max_nor = 150
            # applying normalization for C between [-40, +150]
            new_value3 = normalize(df['C'].iloc[j:j + 480], min_val, max_val, -40, 150)
            df3 = print_df(mkdf(new_value3))
            df3.to_csv(f'{i}/norm{i}{count}.csv', header=None, index=None)
        else:
            # applying normalization for A, B between [-1, +1]
            new_value1 = normalize(df['A'].iloc[j:j + 480], min_val, max_val, -1, 1)
            new_value2 = normalize(df['B'].iloc[j:j + 480], min_val, max_val, -1, 1)
            df1 = print_df(mkdf(new_value1))
            df2 = print_df(mkdf(new_value2))
            df1.to_csv(f'{i}/norm{i}{count}.csv', header=None, index=None)
            df2.to_csv(f'{i}/norm{i}{count}.csv', header=None, index=None)
Note 2: I provided a dataset in a text file for 3 cycles:
Text dataset
I am not sure I understood your question fully, but here is a solution:
Convert your data frame to a 2d numpy array using to_numpy() (the old as_matrix() has been removed from pandas), then use ravel() to get a vector of size 480 * 3; then loop over your cycles and use vstack to stack the rows of your result. Here is the code with your example data:
import numpy as np
import pandas as pd

A = [[1, 2, 3, 4], [10, 20, 30, 40]]
B = [[4, 5, 6, 7], [40, 50, 60, 70]]
C = [[8, 9, 10, 11], [80, 90, 100, 110]]
cycles = 2
for cycle in range(cycles):
    data = {'A': A[cycle], 'B': B[cycle], 'C': C[cycle]}
    df = pd.DataFrame(data)
    D = df.to_numpy().ravel()
    if cycle == 0:
        Results = np.array(D)
    else:
        Results = np.vstack((Results, D))
# Output: Results = array([[  1,   4,   8,   2,   5,   9,   3,   6,  10,   4,   7,  11],
#                          [ 10,  40,  80,  20,  50,  90,  30,  60, 100,  40,  70, 110]], dtype=int64)
np.savetxt("Results.csv", Results, delimiter=",")
Is this what you wanted?
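As a side note, collecting the rows in a list first avoids the cycle == 0 special case; here is a sketch using the same A, B, C and cycles as above:
rows = []
for cycle in range(cycles):
    df = pd.DataFrame({'A': A[cycle], 'B': B[cycle], 'C': C[cycle]})
    rows.append(df.to_numpy().ravel())  # row-major: A, B, C interleaved per row
Results = np.vstack(rows)
np.savetxt("Results.csv", Results, delimiter=",")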

Change the Value of Specific Columns Based on a Condition in a DataFrame Python

I am not sure how to automate this for a specific set of columns.
The code should go into a specific column and check its values: if a value is greater than or equal to 10, replace it with 100, otherwise with 0. But it should do that for all the columns, rather than my specifying the column name each time.
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000,
                           n_features=6,
                           n_informative=3,
                           n_classes=2,
                           random_state=0,
                           scale=10,
                           shuffle=False)

# creating a DataFrame
df = pd.DataFrame({'car': X[:, 0],
                   'ball': X[:, 1],
                   'Feature 3': 5,
                   'Feature 4': X[:, 3],
                   'Feature 5': X[:, 4],
                   'Feature 6': X[:, 5],
                   'Class': y})
df.loc[df['Feature 6'] > 10, 'Feature 6'] = 100
The set of columns will be extracted using the following line:
spike_cols = [col for col in df.columns if "tu" in str(col)]
So basically, change the values of any column whose name contains "tu".
I think you need numpy.where with the DataFrame constructor, because numpy.where returns a 2d numpy array:
df = pd.DataFrame(np.where(df >= 10, 100, 0), columns=df.columns, index=df.index)
Thank you @Wen for the solution with DataFrame.ge and multiplying the boolean mask by 100; True is treated as 1 and False as 0:
df = df.ge(10).mul(100)
EDIT: If you want the solution applied only to some columns:
df[spike_cols] = pd.DataFrame(np.where(df[spike_cols] >= 10, 100, 0),
                              columns=spike_cols, index=df.index)
df[spike_cols] = df[spike_cols].ge(10).mul(100)
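A quick self-contained check of the column-subset version (the column names here are made up for illustration; they just need to contain "tu"):
import pandas as pd

df = pd.DataFrame({'tu_a': [3, 12, 9], 'tu_b': [15, 1, 10], 'other': [7, 20, 2]})
spike_cols = [col for col in df.columns if "tu" in str(col)]
df[spike_cols] = df[spike_cols].ge(10).mul(100)
print(df)
#    tu_a  tu_b  other
# 0     0   100      7
# 1   100     0     20
# 2     0   100      2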
