I am having trouble with the last subplot. The last crosstab plot appears by itself in a separate figure, and the subplot figure shows the first 2 subplots but the 3rd one is empty and contains no data. How can I plot it so that all 3 graphs come up in one figure and share the same Y axis ('Frequency')?
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
#Data Exploration
# Load the statsmodels "fair" dataset as a DataFrame and derive a binary
# `affair` column: 1 where `affairs` is greater than 0, otherwise 0.
data = sm.datasets.fair.load_pandas().data
data['affair'] = np.where(data['affairs'] > 0 , 1,0)
print(data)
# Column means grouped by the affair flag, then by marriage rating.
print(data.groupby('affair').mean())
print(data.groupby('rate_marriage').mean())
# Subplot 331: histogram of the education column.
plt.subplot(331)
data['educ'].hist()
plt.title('Histogram of Education')
plt.xlabel('Education Level')
plt.ylabel('Frequency')
# Subplot 332: histogram of the marriage rating column.
plt.subplot(332)
data['rate_marriage'].hist()
plt.title('Histogram of Marriage Rating')
plt.xlabel('Marriage Rating')
plt.ylabel('Frequency')
# Subplot 333: bar chart of marriage rating counts split by affair status.
# NOTE(review): no `ax=` is passed to the pandas plot call below, so it does
# not appear to target the subplot selected here - this matches the reported
# symptom (crosstab drawn on its own, third subplot left empty).
plt.subplot(333)
pd.crosstab(data['rate_marriage'], data['affair'].astype(bool)).plot(kind='bar')
plt.title('Marriage Rating distribution by affair Status')
plt.xlabel('Marriage Rating')
plt.ylabel('Frequency')
plt.show()
You need to tell the pandas plotting function where to plot the data.
This can be achieved through the ax keyword.
# Select the target subplot first, then hand its axes to pandas through the
# `ax` keyword so the bars are drawn inside that subplot.
ax = plt.subplot(333)
affair_by_rating = pd.crosstab(data['rate_marriage'], data['affair'].astype(bool))
affair_by_rating.plot(kind='bar', ax=ax)
Related
My purpose is to create an anomaly graph for a stock that has dates and closing prices. I tried to create outliers, but the lines do not appear where I want them. For example, I want the outliers to be in 2019 and after 2020, where there are drastic changes. The x-axis holds dates, and my problem is that I don't know how to write the outliers at specific dates.
I thought of writing y["2019"]=40, for example, but it doesn't do anything.
from pandas import read_csv
from matplotlib import pyplot
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
#from IPython.core.debugger import set_trace
#import data
# Read the AAPL price history; the first row of the file is the header.
# NOTE(review): the `squeeze` keyword is not accepted by newer pandas
# releases - confirm against the pandas version in use.
AAPL= pd.read_csv('AAPL.csv', header=0, squeeze=True)
# x: the 'Date' column parsed to datetimes; y: the 'Close/Last' column.
x=AAPL['Date']
x=pd.to_datetime(x)
y=AAPL['Close/Last']
# First figure: the closing price over time, as loaded.
plt.figure(figsize=(15,7))
plt.plot(x, y, label="Close")
plt.title("AAPL")
plt.xlabel("Time")
plt.ylabel("Close")
plt.xticks(rotation=0)
plt.grid()
plt.show()
# Overwrite three values of y, selected by index label (5, 60, 85) rather
# than by date - presumably these are the hand-placed outliers.
y[5] = 5
y[60] =55
y[85] = 1.4
# Assigned but not used anywhere in the visible code.
n_outliers = 3
# Second figure: the modified series as a line with a marker on every point.
plt.figure(figsize=(15,7))
plt.plot(x,y)
plt.scatter(x,y)
plt.grid()
plt.ylabel('Y')
plt.xlabel('x')
plt.show()
Thank you in advance
I am trying to use TSNE to visualize data based on a Category to show me if the data is separable.
I have been trying to do this for the past two days but I am not getting a scatter plot showing the different categories plotted to enable me to see the relationship.
Instead, it is plotting all the data in a straight linear line, which cannot be correct as there are 5 different distinct attributes with the column I am trying to use as a label and legend.
What do I do to correct this?
import label as label
import pandas as pd
from matplotlib.cm import get_cmap
from matplotlib.colors import rgb2hex
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import numpy as np
# #region Loading Data
filename = 'Dataset/test.csv'
df = pd.read_csv(filename)
# Remove the 'Activity' column from df and keep it as the label series.
# This rebinds the name `label`, which the import section also uses.
label = df.pop('Activity')
label_counts = label.value_counts()
# # Scale Data
scale = StandardScaler()
# NOTE(review): despite the name, this holds only the standardized features;
# TSNE is imported but never applied in the visible code.
tsne_data= scale.fit_transform(df)
fig, axa = plt.subplots(2, 1, figsize=(15,10))
# Distinct values of the label column.
group = label.unique()
# NOTE(review): the loop body's indentation appears to have been lost in this
# copy; `i` and `labels` are not used by the statements below.
for i , labels in label.iteritems():
# mask =(label = group)
# The same array is passed as both x and y, and the full `group` array is
# passed as the label, so no per-category selection takes place here.
axa[0].scatter(x = tsne_data, y = tsne_data, label = group)
# `plt.legend` is only referenced, not called (no parentheses).
plt.legend
plt.show()
I'm having trouble using the scatter to create a scatter plot. Can someone help me? I've highlighted the line causing the error:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
from sklearn.preprocessing import StandardScaler
# Load the CSV and wrap it in a DataFrame.
data = pd.read_csv('vetl8.csv')
df = pd.DataFrame(data=data)
clusterNum = 3
# Feature matrix: every column except the first, with NaNs replaced by
# np.nan_to_num.
X = df.iloc[:, 1:].values
X = np.nan_to_num(X)
# Standardized copy of X. It is not used again in the visible code: the model
# below is fitted on the unscaled X.
Clus_dataSet = StandardScaler().fit_transform(X)
k_means = KMeans(init="k-means++", n_clusters=clusterNum, n_init=12)
k_means.fit(X)
# Store the cluster assignment of each row and save the labelled frame.
labels = k_means.labels_
df["Labels"] = labels
df.to_csv('dfkmeans.csv')
# The line the question flags as failing. NOTE(review): `df[2]` / `df[1]`
# look up columns by label, not by position - presumably the CSV's columns are
# not labelled 1 and 2; verify against the file's header.
plt.scatter(df[2], df[1], c=labels) **#Here**
plt.xlabel('K', fontsize=18)
plt.ylabel('g', fontsize=16)
plt.show()
#data set correct
You are close, just a minor adjustment to access the x-y columns by number should fix it:
# Translate the positional indices into the actual column labels first, then
# select the columns by those labels for the scatter plot.
x_col = df.columns[2]
y_col = df.columns[1]
plt.scatter(df[x_col], df[y_col], c=df["Labels"])
I am new to SciKit-Learn and I have been working on a regression problem (king county csv) on kaggle. I have been training a regression model to predict the price of the house and I wanted to plot the graph but I have no idea how to do so. I am using python 3.6. Any advice or suggestion would be greatly appreciated.
#importing numpy and pandas, seaborn
import numpy as np #linear algebra
import pandas as pd #datapreprocessing, CSV file I/O
import seaborn as sns #for plotting graphs
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
# Load the King County house data and drop the two columns that are not used
# as features.
data = pd.read_csv('kc_house_data.csv')
data = data.drop(['date', 'id'], axis=1)

# Target is the sale price; every remaining column is a predictor.
Y = data['price'].values
X = data.drop('price', axis=1).values

# Hold out 30% of the rows; the seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.30, random_state=21
)

reg = LinearRegression()
# The folds are not shuffled, so a seed would have no effect on them; current
# scikit-learn raises a ValueError when random_state is given without
# shuffle=True, so it is left out.
kfold = KFold(n_splits=15)
cv_results = cross_val_score(reg, X_train, Y_train, cv=kfold, scoring='r2')
print(cv_results)
# Mean R^2 over the folds as a percentage; printed so the value is visible
# when this runs as a script rather than in a notebook cell.
print(round(np.mean(cv_results) * 100, 2))
This is the code from sklearn: https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html
You can use matplotlib for plotting
import matplotlib.pyplot as plt

# One 16x9 figure with a single axes; draw the per-fold scores as a line
# against the fold index.
fig, ax = plt.subplots(figsize=(16, 9))
ax.plot(cv_results)
plt.show()
There can be multiple type of plots you can use like simple line plot or scatter plot.
plt.barh(x, y) # for a horizontal bar graph (bars at positions x, lengths y)
plt.plot(x,y) # for a line graph
plt.scatter(x,y) # for a scatter graph
Seaborn is a very useful visualization library. So much so that you can use 'seaborn.regplot' to directly plot the data and regression-model-fit line. It directly takes in the predictor variable and response variable, and spits out the plot of data points and best fit line. Here is the link on how to use it:
https://seaborn.pydata.org/generated/seaborn.regplot.html
I have also done the same competition on kaggle.
For regressions I would go for a scatter plot:
# The plotting functions live in the pyplot submodule, not in the top-level
# matplotlib package.
import matplotlib.pyplot as plt
# Scatter plot of the predictor against the response.
plt.scatter(x, y)
As for the visualisations on that particular competition I would use the following code:
# visualising some more outliers in the data values
# One tall figure; the axes are added one per feature inside the loop.
# (Creating it through plt.subplots(nrows=0) is rejected by current
# Matplotlib, which requires a positive number of rows.)
fig = plt.figure(figsize=(12, 120))
plt.subplots_adjust(right=2, top=2)
n_rows = len(list(numeric))
for i, feature in enumerate(list(train[numeric]), 1):
    # Stop at 'MiscVal': it and every feature after it are not plotted.
    if feature == 'MiscVal':
        break
    # Three plots per row, filled in feature order.
    plt.subplot(n_rows, 3, i)
    sns.scatterplot(x=feature, y='SalePrice', hue='SalePrice', palette='Blues', data=train)
    plt.xlabel('{}'.format(feature), size=15, labelpad=12.5)
    plt.ylabel('SalePrice', size=15, labelpad=12.5)
    plt.tick_params(axis='x', labelsize=12)
    plt.tick_params(axis='y', labelsize=12)
    plt.legend(loc='best', prop={'size': 10})
plt.show()
I have actually uploaded the full code for that competition on my GitHub if you want to have a look ;) (I am currently in the top 14% on that competition).
I have a programme that uses matplotlib and pandas to plot the rolling mean and standard deviation of the price of bitcoin. I'm wondering how I can plot the z values (the number of standard deviations the price is from the mean).
import pandas as pd
from matplotlib import pyplot as plt
# Raw string literal: in a normal string the "\U" of "C:\Users" starts a
# Unicode escape, which makes the file fail to parse on Python 3.
btc_1_day = pd.read_csv(r'C:\Users\Oliver\Desktop\data\data1_btcusdt_1day.csv')
df1_btc = pd.DataFrame(btc_1_day)
# 10-row rolling mean and standard deviation of the closing price.
df1_btc['SMA_10'] = df1_btc.price_close.rolling(10).mean()
df1_btc['SMSD_10'] = df1_btc.price_close.rolling(10).std()
# Plot the closing price together with both rolling statistics.
plt.grid(True)
plt.plot(btc_1_day.price_close)
plt.plot(df1_btc['SMA_10'],label='10 day moving average')
plt.plot(df1_btc['SMSD_10'],label='10 day standard deviation')
plt.legend(loc=2)
plt.show()
Since I don't have your csv file, I'll show you how I would do this using some random data and a pandas dataframe. You can find the z score using stats.zscore(df['btc']), but that would give you numbers on a very different scale from the ones you're trying to plot in your example.
Plot 1:
Code 1:
import pandas as pd
from matplotlib import pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime
from scipy import stats
# data: a seeded random walk that stands in for the price series
np.random.seed(1234)
numdays = 100
steps = np.random.randint(low=-20, high=20, size=numdays)
df = pd.DataFrame({'btc': (steps.cumsum() + 100).tolist()})
# moving averages and standard deviations over a 10-row window
rolling_mean = df['btc'].rolling(10).mean()
rolling_std = df['btc'].rolling(10).std()
df['SMA_10'] = rolling_mean
df['SMSD_10+sigma'] = rolling_mean + rolling_std
df['SMSD_10-sigma'] = rolling_mean - rolling_std
# z-score computed over the whole 'btc' column (not per rolling window)
df['ZScore'] = stats.zscore(df['btc'])
# matplotlib: price and z-score on the same axes
plt.figure()
df['btc'].plot()
df['ZScore'].plot()
plt.show()
In order to illustrate your dataset together with averages and standard deviations for rolling windows, I'd rather use an approach such as:
Plot 2:
Code 2:
import pandas as pd
from matplotlib import pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime
from scipy import stats
# data: a seeded random walk that stands in for the price series
np.random.seed(1234)
numdays = 100
df = pd.DataFrame({'btc': (np.random.randint(low=-20, high=20, size=numdays).cumsum()+100).tolist()})
# moving average and a one-standard-deviation band over a 10-row window
window = df['btc'].rolling(10)
df['SMA_10'] = window.mean()
df['SMSD_10+sigma'] = df['SMA_10'] + window.std()
df['SMSD_10-sigma'] = df['SMA_10'] - window.std()
# matplotlib
plt.grid(True)
plt.plot(df['SMA_10'], label='10 day moving average', color='C1')
# The two band edges get distinct labels so the legend entries can be told
# apart.
plt.plot(df['SMSD_10+sigma'], label='10 day mean + 1 standard deviation',
         color='green',
         linewidth=0.5)
plt.plot(df['SMSD_10-sigma'], label='10 day mean - 1 standard deviation',
         color='green',
         linewidth=0.5)
# The price line is drawn once, last, so it sits on top of the other lines.
plt.plot(df['btc'], label='btc', color='blue', linewidth=1.5)
plt.legend(loc=2)
plt.show()