How to know the order of eigen values in PCA - python

I performed a PCA analysis in Python and obtained the eigenvalues, but I don't know which variables of my dataset are represented in the components.
Is there a way to know which components represent each variable of my data?
For example: 4.669473069609005 corresponds to sillas, etc.
here is the file:
https://storage.googleapis.com/min_ambiente/servi_acc/datos.csv
here is the code:
# Some of these libraries are only needed for other methods I implemented here.
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from google.cloud import bigquery
from sklearn.preprocessing import StandardScaler
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
from factor_analyzer.factor_analyzer import calculate_kmo
from factor_analyzer import FactorAnalyzer
%matplotlib inline
# Load the CSV: upload it via google.colab's files.upload(), then read it with pandas.
from google.colab import files
uploaded = files.upload()
data = pd.read_csv("datos.csv")
# Replace missing entries with 0.
data = data.fillna(0)
a, b = data.shape
# Keep every column except the last one.
X = data.iloc[:, :b - 1]
X.head()
enter image description here
#####################################################
### Standardize and compute the covariance matrix ###
#####################################################
# Standardize features by removing the mean and scaling to unit variance.
X_std = StandardScaler().fit_transform(X)
mean_vec = np.mean(X_std, axis=0)
# Manual covariance; it is overwritten by the np.cov call just below.
cov_mat = (X_std - mean_vec).T.dot((X_std - mean_vec)) / (X_std.shape[0] - 1)

### Eigenvalues and eigenvectors obtained from the covariance matrix
cov_mat = np.cov(X_std.T)
eig_vals, eig_vecs = np.linalg.eig(cov_mat)

# NOTE(review): lst1, lst2 and eig_pairs were used but never defined in the
# posted snippet. The definitions below are the presumed intent -- confirm.
lst1 = list(eig_vals)   # presumably the eigenvalues
lst2 = list(X.columns)  # presumably the column names of the data
# This pairing is purely positional. np.linalg.eig does not return the
# eigenvalues in any particular order, and each eigenvalue belongs to a
# principal component (the matching column of eig_vecs, which mixes all
# variables), not to a single original column.
dictionary = dict(zip(lst2, lst1))
print(dictionary)

# Pair each eigenvalue with its eigenvector: eig_vecs[:, i] goes with eig_vals[i].
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]

### Print from the highest to the lowest
# Sort on the eigenvalue only; comparing whole tuples would fall through to
# comparing the eigenvector arrays when two eigenvalues are equal.
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
print('eigenvalues in descending order :')
for i in eig_pairs:
    print(i[0])

Related

python time series synthetic data using ydata-synthetic package - Time series GAN

Hello, as the title says, I am trying to use the ydata-synthetic package for a time-series GAN.
At first I thought that if I put in integers the output would also be integers, but it wasn't: the output data are decimal numbers. I am using ydata-synthetic (https://github.com/ydataai/ydata-synthetic).
Here is my code for generating the data; please help me.
#Importing the required libs for the exercise
from os import path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ydata_synthetic.synthesizers import ModelParameters
from ydata_synthetic.preprocessing.timeseries import processed_stock
from ydata_synthetic.synthesizers.timeseries import TimeGAN
import torch
# Dummy input: 100 rows x 1 column of random integers in [0, 600000).
arr_data = np.random.randint(0,600000,(100,1))

#Specific to TimeGANs
#stock_data
seq_len = 20
n_seq = 1  # number of columns
hidden_dim = 24
gamma = 1
noise_dim = 32
dim = 128
batch_size = len(arr_data) - seq_len
log_step = 100
learning_rate = 5e-4

gan_args = ModelParameters(batch_size=batch_size,
                           lr=learning_rate,
                           noise_dim=noise_dim,
                           layers_dim=dim)

# Build overlapping windows of seq_len consecutive rows. The window length
# uses seq_len (instead of a hard-coded 20) so it stays in sync with the
# range bound and with the value passed to TimeGAN.
lst_temp = []
for i in range(0, len(arr_data) - seq_len):
    _x = arr_data[i:i + seq_len]
    lst_temp.append(_x)
# Stack the windows into one ndarray before handing them to torch.
tens_rand_data = torch.tensor(np.array(lst_temp))
lst_rand_data = tens_rand_data.numpy()

# Reuse the variables defined above rather than repeating the literals.
synth = TimeGAN(model_parameters=gan_args, hidden_dim=hidden_dim, seq_len=seq_len, n_seq=n_seq, gamma=gamma)
synth.train(lst_rand_data, train_steps=10)
synth_data = synth.sample(len(lst_rand_data))
print(synth_data.shape)

# Compare the last real window with the last synthetic window, column by column.
cols = ['Car price']
for j, col in enumerate(cols):
    df = pd.DataFrame({'Real': lst_rand_data[-1][:, j], 'Synthetic': synth_data[-1][:, j]})
    # secondary_y must name an existing column; the frame's column is
    # 'Synthetic' (the original passed 'Synthetic data', which matches nothing).
    df.plot(title = "Car price", secondary_y='Synthetic', style=['-', '--'])
    print(df)
enter image description here
Your input should be processed using a MinMaxScaler before fitting into TimeGAN, and you will always receive decimal output between 0 and 1 due to sigmoid activation on the last layer of its Generator.
You can change your code in 2 ways:
Change your input from integer to decimal range [0,1].
arr_data = np.random.randint(0,600000,(100,1))
into
arr_data = np.random.uniform(0,1,(100,1))
This way your dummy input doesn't need to be scaled since it's already in [0,1].
Use MinMaxScaler to scale your data
from sklearn.preprocessing import MinMaxScaler
arr_data = np.random.randint(0,600000,(100,1))
# Fit the scaler on the data, then map the same data into [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(arr_data)
scaled_data = scaler.transform(arr_data)
...
Please note that you will always receive decimal output from [0,1] when using TimeGAN. Now if you want to inverse synthetic data into integer, consider using inverse transform.

2D output on Linear regression model

I'm getting the following error from my code:
ValueError: Expected 2D array, got scalar array instead:
array=99.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
Here is the code used:
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
# Load the spreadsheet and keep only the two columns used below.
Physical_activity_df = pd.read_excel('C:/Users/Usuario/Desktop/LW_docs/Physical_activity_nopass.xlsx')
prediction_df = Physical_activity_df[['Activity_Score','Calories']]
# Scatter plot of Calories against Activity_Score.
prediction_df.plot(kind='scatter', x= 'Activity_Score', y= 'Calories')
plt.show()
#change to df variables
# Each column is wrapped in its own single-column DataFrame before fitting.
activity_score = pd.DataFrame(prediction_df['Activity_Score'])
calories = pd.DataFrame(prediction_df['Calories'])
# Fit a linear regression of Calories on Activity_Score.
lm = linear_model.LinearRegression()
model = lm.fit(activity_score,calories)
#predict new values for calories (FROM HERE COMES THE ERROR)
# NOTE: predict() receives the bare scalar 99 here, which is what raises
# "Expected 2D array, got scalar array instead". Per that message the input
# must be 2D, e.g. [[99]] or np.array(99).reshape(1, -1) for a single sample.
activity_score_new = 99
calories_predict = model.predict(activity_score_new)
calories_predict
Any idea about how to fix this issue? Thanks!

How to plot efficient frontier with pypfopt plotting

Hey guys, could someone help me?
I'm new here and to Python coding too, so I'm at beginner level.
I was trying to plot my efficient frontier using the pypfopt lib and I ran into some trouble with the parameters of the Plotting.plot_efficient_frontier command that is used to plot the graph.
I added the picture of my code in the post.
#importing the libs
import pandas as pd
import numpy as np
import pandas_datareader.data as web
import datetime as dt
# NOTE(review): this binds the top-level matplotlib package to `plt`; the
# alias is normally used for matplotlib.pyplot -- confirm which was intended.
import matplotlib as plt
from pypfopt.expected_returns import mean_historical_return
from pypfopt.risk_models import CovarianceShrinkage
#defining the analysis period
start = dt.datetime(2019,1,1)
end = dt.datetime(2019,12,31)
#asset tickers
tickers = ["ITUB4.SA","B3SA3.SA","PETR4.SA","ABEV3.SA","ELET3.SA"]
#getting the data
carteira = web.DataReader(tickers,"yahoo",start,end)
#creating another dataframe with only the adjusted closing prices
prices = carteira['Adj Close']
#expected returns
#covariance matrix
e_r = mean_historical_return(prices)
cov_matrix = CovarianceShrinkage(prices).ledoit_wolf()
from pypfopt.efficient_frontier import EfficientFrontier
#border problem solution
#selecting the portfolio with the best Sharpe ratio
ef = EfficientFrontier(e_r, cov_matrix)
weights = ef.max_sharpe()
# NOTE(review): PyPortfolioOpt names this module `plotting` (lowercase); the
# capitalised `Plotting` is presumably what makes this import fail.
from pypfopt import Plotting
Plotting.plot_efficient_frontier(ef ,points=100 , show_assets = True)
[n ][1]
[1]: https://i.stack.imgur.com/NaeJI.png
Try it in lowercase, like:
from pypfopt import plotting

Principal Component Analysis in Python intrinsic dimension map to feature names?

Objective: Get data from DB to pandas DataFrame, preprocess, run PCA, get intrinsic dimension, use principal features in multi linear regression.
1) I have connected to my MS SQL db and fetched all data from sql stmt to pandas data frame. ---> shape = 54K rows X 90 features (column names)
2) Next, I have preprocessed the data.
3) Next, I have built pipeline to scale, fit, transform the data to StandardScaler and PCA with 20 components.
Now after building my bar graph of PCA features on X axis and Explained Var on Y axis I see I can use intrinsic dimension 7.
How can I translate 7 to the feature/column names? Does 7 mean take the first 7 columns of my dataframe?
Purpose: Next I will use these columns in my multi linear regression.
If needed here is what I have done so far:
from sqlalchemy import create_engine
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pyodbc
# NOTE(review): SQLAlchemy URLs have the form
# dialect+driver://user:password@host/dbname; the '#' before servername looks
# like a placeholder or typo for '@' -- confirm before running.
engine = create_engine('mssql+pyodbc://uid:pwd#servername/dbname?driver=SQL+Server+Native+Client+11.0')
con = engine.connect()
stmt = """Select Col1 ... Col90 from table where Col1 = alpha, ..."""
rs = con.execute(stmt)

# Materialise the result set as a DataFrame and label it with the query's column names.
df = pd.DataFrame(rs.fetchall())
df.columns = rs.keys()

# Keep the columns from 'Age' onwards.
df_number_columns = df.loc[:,'Age':]

# Pipeline: standardise the columns, then project onto 20 principal components.
scaler = StandardScaler()
model = PCA(n_components=20)
pipeline = make_pipeline(scaler, model)

# DataFrame.convert_objects(convert_numeric=True) has been removed from
# pandas; pd.to_numeric with errors='coerce' does the same coercion, turning
# values that cannot be parsed as numbers into NaN.
df_number_columns = df_number_columns.apply(pd.to_numeric, errors='coerce')

# Re-apply the original column names from 'Age' onwards.
column_names = np.array(rs.keys())
idx_2_start = np.where(column_names == 'Age')[0][0]
df_number_columns.columns = column_names[idx_2_start:]

pipeline.fit_transform(df_number_columns)

# Bar chart of the explained variance of each fitted component.
features = range(model.n_components_)
plt.bar(features, model.explained_variance_)
plt.xlabel('PCA Feature')
plt.ylabel('Variance')
plt.xticks(features)
plt.show()

Using PCA on Train and testset

I'm trying to reduce the number of features I'm using in my TensorFlow model. For that I'm trying to use PCA. Here is the code that I've written:
from sklearn.decomposition import PCA
from sklearn import preprocessing
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
number_of_PCA = 20

# Read both CSV files
training_file = 'Training.csv'
testing_file = 'Test.csv'
dataframe_train, dataframe_test = (
    pd.read_csv(csv_path) for csv_path in (training_file, testing_file)
)

# Training values: all columns except the last two are features; the
# second-to-last column holds the class labels and the last one goes to
# train_onehot.
features_labels_train = dataframe_train.columns.values[:-2]
feature_values_train = dataframe_train.iloc[:, :-2]
class_labels_train, train_onehot = dataframe_train.iloc[:, -2], dataframe_train.iloc[:, -1]

# Test values: same column layout as the training frame.
feature_labels_test = dataframe_test.columns.values[:-2]
features_values_test = dataframe_test.iloc[:, :-2]
class_labels_test, test_onehot = dataframe_test.iloc[:, -2], dataframe_test.iloc[:, -1]

# Standardisation: the scaler is fitted on the training features only and
# then applied to both sets.
stdsc = preprocessing.StandardScaler().fit(feature_values_train)
np_scaled_train = stdsc.transform(feature_values_train)
np_scaled_test = stdsc.transform(features_values_test)

# PCA: fitted on the scaled training data; the test data is only projected
# onto the components learned from it.
pca = PCA(n_components=number_of_PCA)
X_train_pca = pca.fit_transform(np_scaled_train)  # This is the result
X_test_pca = pca.transform(np_scaled_test)
......................................................
When I run TensorFlow on the result, I get heavy overfitting, as shown below:
I'm assuming that the way I'm using PCA on the test set is the issue.
Does anyone here have an idea what I'm missing?

Categories

Resources