Normalization between 0 and 1 Python with >10,000 data - python

I input:
Data= (np.max(PM10_data)-np.min(PM10_data))
print(Data)
to normalize my data, but when I check it by printing the data out, it shows:
[15442 rows x 3 columns]
PM10 39.33
Unnamed: 62 1.00
dtype: float64
May I know how I could properly normalize more than 10,000 data points?
Below is the script I have:
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import numpy as np
import statsmodels.api as sm
from sklearn import preprocessing
#import PM10 data
PM10_data= pd.read_excel(r"C:\Users\Jamie Tsoi\OneDrive\Desktop\FYP\ToUG_Office.xlsx",sheet_name='PM10')
PM10_data=PM10_data.dropna(axis=1,how='any')
PM10_data.max()
#Check null
PM10_data.isna().sum()
#Check null
PM10_data['PM10']
PM10_data.columns = PM10_data.columns.str.strip()
#Check null
PM10_data['PM10']=pd.to_numeric(PM10_data['PM10'])
print(PM10_data)
Data= (np.max(PM10_data)-np.min(PM10_data))
print(Data)

Related

Creating a Line graph from data within an excel spreadsheet

I have data from an Excel spreadsheet concerning the UK economy, but how do I make a line graph from that information? This is what I have so far:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
df = (pd.read_excel('Uk_data.xlsx').dropna())
dataset = df.to_dict()
print(df[0:])
plt.figure(figsize=(9,9))
plt.style.use('seaborn')
plt.title("UK GDP Since 2017")
plt.show()
print(df.head())

ValueError: Endog and Exog are in different size

I have encountered ValueError: Endog and Exog are in different size.
When I type len(y) or len(y_scaled), it returns 0, but it is supposed to be 5. I hope someone can help. Thanks in advance.
import datetime
import dateutil
import pandas_datareader.data as wb
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
year=5
tickers =["0200.KL"]
ohlc = wb.DataReader(tickers, data_source="yahoo",start=datetime.date.today()-dateutil.relativedelta.relativedelta(years=year),end=datetime.date.today())
n=5 #get 5 consecutive data
df =ohlc.copy()
series=df["Adj Close"]
slopes=[i*0 for i in range(n-1)]
for i in range(n,len(series)+1):
y=series[i-n:n]
x=np.array(range(n))
#normalize x and y variable
y_scaled=(y-y.min())/(y.max()-y.min())
X_scaled=(x-x.min())/(x.max()-x.min())
#add a constant to the equation
X_scaled=sm.add_constant(X_scaled)
model=sm.OLS(y_scaled,X_scaled)
results=model.fit()
slopes.append(results.params[-1])
#slope coefficient is the theta in radians
slopes_angle=np.rad2degree(np.arctan(np.array(slopes)))
np.array(slopes_angle)
Solved. Thank you.
It should be y=df["Adj Close"][i-n:i] instead of y=series[i-n:n]: the end of the slice has to advance with i so that every window contains exactly n values.
The full code is below:
import datetime
import dateutil
import pandas_datareader.data as wb
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
%matplotlib inline
year=1
tickers ="AAPL"
ohlc = wb.DataReader(tickers, data_source="yahoo",start=datetime.date.today()-dateutil.relativedelta.relativedelta(years=year),end=datetime.date.today())
n=5 #get 5 consecutive datas
df =ohlc.copy()
slopes=[i*0 for i in range(n-1)]
for i in range(n,len(df)+1):
y=df["Adj Close"][i-n:i]
x=np.array(range(n))
#normalize x and y variable
y_scaled=(y-y.min())/(y.max()-y.min())
X_scaled=(x-x.min())/(x.max()-x.min())
#add a constant to the equation
X_scaled=sm.add_constant(X_scaled)
model=sm.OLS(y_scaled,X_scaled)
results=model.fit()
slopes.append(results.params[-1])
#slope coefficient is the theta in radians
slopes_angle=np.rad2deg(np.arctan(np.array(slopes)))
slopes_angle=np.array(slopes_angle)
plt.plot(slopes_angle)
plt.title("Slope Coefficient of 5 Consecutive Stock Price Data")
plt.ylabel("Slope Coefficient")
plt.xlabel("Period")
plt.show()

Taking data from specific columns in a dataset

I need to take data from only 3 columns in my dataset, how do I do this? I am trying to make a correlation graph. This is my code:
import matplotlib.pyplot as plt
import pandas as pd
crimedata = pd.read_csv('MasterFileCSV.csv')
crime_df = pd.DataFrame(crimedata)
plt.matshow(crime_df.corr())
plt.show

How to plot dates from a csv file?

I am new to Python and I am having some issues plotting my dates from a CSV file.
The code is the following:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from pandas import DataFrame
import matplotlib.pyplot as plt
df = pd.read_csv(r"file.csv",index_col=0)
print(df.describe())
BHSI_cycle, BHSI_trend = sm.tsa.filters.hpfilter(df['BHSI-TCA'])
df['BHSI_trend'] = BHSI_trend
df['BHSI_cycle'] = BHSI_cycle
BHSI_plot = df[['BHSI-TCA','BHSI_trend']].plot(figsize=(12,10))
plt.show(BHSI_plot)
BHSI_plot2 = df[['BHSI_cycle']].plot(figsize=(12,10))
plt.show(BHSI_plot2)
And the CSV file is:
Date BHSI-TCA
23/05/2006 14821
25/05/2006 14878
30/05/2006 14837
How can I plot the dates?
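Try parsing the dates properly when you import from the CSV. Your dates are day-first (e.g. 23/05/2006), so also pass dayfirst=True so that they are not read as month-first.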
df = pd.read_csv(r"file.csv", index_col=0, parse_dates=<your_date_column>)

How can I show True/False graph on Python

I have a CSV file and I want to show this data on a graph. I have date, place and status data, but I don't need place, so I fetch the data like this.
And going like this
Here is my code. How can I get a graph with 1/0 values according to the date value? Which method should I use? Thanks.
import pandas as pd
from pandas import DataFrame
import datetime
import pandas.io.data
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import pylab
rows_list=[]
df=pd.read_csv('filepath',header=None,parse_dates=True,prefix='column')
for row in df.iterrows():
if row[1][1]=='Beweging in de living':
if row[1][2]=='OPEN': rows_list.append([row[1][0],'1'])
else: rows_list.append([row[1][0],'0'])
df2 = pd.DataFrame(rows_list)
df3=df2.set_index(0)
print df3
plt.plot(df3)
plt.show()

Categories

Resources