I want to visualize my CSV data as clusters.
This is my CSV data: (https://github.com/soma11soma11/EnergyDataSimulationChallenge/blob/challenge2/soma11soma/challenge2/analysis/Soma/total_watt.csv)
For your information, I was already able to visualize the CSV data as a 3D graph.
And this is my code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
MY_FILE = 'total_watt.csv'
df = pd.read_csv(MY_FILE, parse_dates=[0], header=None, names=['datetime', 'consumption'])
df['date'] = [x.date() for x in df['datetime']]
df['time'] = [x.time() for x in df['datetime']]
pv = df.pivot(index='time', columns='date', values='consumption')
# to avoid holes in the surface
pv = pv.fillna(0.0)
xx, yy = np.mgrid[0:len(pv),0:len(pv.columns)]
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
surf=ax.plot_surface(xx, yy, pv.values, cmap='jet', cstride=1, rstride=1)
fig.colorbar(surf, shrink=0.5, aspect=10)
dates = [x.strftime('%m-%d') for x in pv.columns]
times = [x.strftime('%H:%M') for x in pv.index]
ax.set_title('Energy consumptions Clusters', color='lightseagreen')
ax.set_xlabel('time', color='darkturquoise')
ax.set_ylabel('date(year 2011)', color='darkturquoise')
ax.set_zlabel('energy consumption', color='darkturquoise')
ax.set_xticks(xx[::10,0])
ax.set_xticklabels(times[::10], color='lightseagreen')
ax.set_yticks(yy[0,::10])
ax.set_yticklabels(dates[::10], color='lightseagreen')
ax.set_facecolor('black')  # set_axis_bgcolor was removed in newer Matplotlib releases
plt.show()
#Thanks for reading! Looking forward to the Skype Interview.
And this is the graph I got from this code.
I think I should change some parts of this code in order to cluster the data into three groups: high, medium, and low energy consumption.
The image I want to get from clustering the data looks like this (2D, three colours).
Should I use k-means?
Here is the result using KMeans.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
from sklearn.cluster import KMeans
MY_FILE = '/home/Jian/Downloads/total_watt.csv'
df = pd.read_csv(MY_FILE, parse_dates=[0], header=None, names=['datetime', 'consumption'])
df['date'] = [x.date() for x in df['datetime']]
df['time'] = [x.time() for x in df['datetime']]
stacked = df.pivot(index='time', columns='date', values='consumption').fillna(0).stack()
# do unsupervised clustering
# =============================================
estimator = KMeans(n_clusters=3, random_state=0)
X = stacked.values.reshape(len(stacked), 1)
cluster = estimator.fit_predict(X)
# check the mean value of each cluster
X[cluster==0].mean() # Out[53]: 324.73175293698534
X[cluster==1].mean() # Out[54]: 6320.8504071851467
X[cluster==2].mean() # Out[55]: 1831.1473140192766
# plotting
# =============================================
fig, ax = plt.subplots(figsize=(10, 8))
x = stacked.index.codes[0]  # MultiIndex.labels was renamed to .codes in newer pandas
y = stacked.index.codes[1]
ax.scatter(x[cluster==0], y[cluster==0], label='mean: {}'.format(X[cluster==0].mean()), c='g', alpha=0.8)
ax.scatter(x[cluster==1], y[cluster==1], label='mean: {}'.format(X[cluster==1].mean()), c='r', alpha=0.8)
ax.scatter(x[cluster==2], y[cluster==2], label='mean: {}'.format(X[cluster==2].mean()), c='b', alpha=0.8)
ax.legend(loc='best')
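KMeans numbers its clusters arbitrarily, so if you want them named "low", "medium" and "high" as in the question, one hedged addition (reusing the X and cluster arrays from the answer above) is to sort the clusters by their mean consumption:
import numpy as np
# Order the three clusters by mean consumption and map each data point to a
# human-readable label, independent of how KMeans happened to number them.
means = np.array([X[cluster == k].mean() for k in range(3)])
order = np.argsort(means)  # cluster ids from lowest to highest mean
names = {order[0]: 'low', order[1]: 'medium', order[2]: 'high'}
labels = np.array([names[c] for c in cluster])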
The two red lines in the graph are the two extrapolated lines. The upper line works well, but the lower line seems to take into account data outside the range of time2 and temp2, which makes it look awkward after the point t=420s. I would like to know how to fix this.
On a less important note: how can I remove the extra ticks on the x-axis to the left of the origin? Thanks a lot.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
AutoMinorLocator)
from sklearn.linear_model import LinearRegression
from scipy.interpolate import InterpolatedUnivariateSpline, interp1d
%matplotlib inline
file = pd.read_excel("T8.xlsx","Phthalansäureanhydrid",usecols=[2,3])
X = file['Zeit(s)']
Y = file['Temperatur(Celcius Grad)']
fig, ax = plt.subplots()
ax.plot(X,Y,'-',color='#10A5F3', label="Phthalansäureanhydrid")
ax.grid(True, which='major', axis='both', color='#F19211', linestyle='-')
#ax.grid(True, which='minor', axis='both', color='#F19211', linestyle='--')
ax.spines['left'].set_position('zero')
ax.spines['right'].set_color('none')
#ax.spines['bottom'].set_position('zero')
ax.spines['top'].set_color('none')
#ax.legend(loc='upper center', frameon=True)
#major & minor ticks
ax.xaxis.set_major_locator(MultipleLocator(100))
ax.xaxis.set_major_formatter(FormatStrFormatter('%d'))
ax.xaxis.set_minor_locator(MultipleLocator(10))
#extrapolation - first line
temp1 = []
time1 = []
xnew1 = []
for i in file.index:
    if i > 630:
        temp1.append(file['Temperatur(Celcius Grad)'][i])
        time1.append(file['Zeit(s)'][i])
    else:
        xnew1.append(file['Zeit(s)'][i])
order = 1
extrapo1 = InterpolatedUnivariateSpline(time1, temp1, k=1)
ynew1 = extrapo1(xnew1)
#extrapolation - second line
temp2 = []
time2 = []
xnew2 = []
for i in file.index:
    if 100 < i < 400:
        temp2.append(file['Temperatur(Celcius Grad)'][i])
        time2.append(file['Zeit(s)'][i])
    if i > 200:
        xnew2.append(file['Zeit(s)'][i])
ynew2 = []
f = interp1d(time2, temp2, fill_value='extrapolate')  # interp1d imported above
for i in xnew2:
    ynew2.append(f(i))
plt.xlabel(r'Zeit[s]')
plt.ylabel(r'Temperatur[c]')
plt.plot(xnew1,ynew1,'-', color = '#B94A4D')
plt.plot(xnew2,ynew2,'-', color = '#B94A4D')
plt.savefig('kmn.pdf')
Link to the data: https://docs.google.com/spreadsheets/d/1xznXj-aA-Szq2s4KWb-qPWYxZbQNrA5FgUCQT6i7oVo/edit?usp=sharing
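No answer is included here, but as a hedged sketch (reusing time2, temp2, xnew2 and ax from the code above): the lower line bends because interp1d follows the individual samples inside the 100 < i < 400 window and then extrapolates from its last segment; fitting one least-squares line to the whole window keeps the extension straight, and clamping the x-axis at zero removes the stray ticks.
import numpy as np
# Fit a single straight line to the whole (time2, temp2) window instead of
# interpolating point-to-point, so the part beyond the window follows the
# overall trend rather than the last two samples.
slope, intercept = np.polyfit(time2, temp2, 1)
ynew2_fit = [slope * t + intercept for t in xnew2]
plt.plot(xnew2, ynew2_fit, '-', color='#B94A4D')
# Less important note: clamp the x-axis at zero to drop the ticks left of the origin.
ax.set_xlim(left=0)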
I did a cubic regression on the data below. How can I plot the regression line with the x values starting from 0 rather than from the minimum x?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
df = pd.DataFrame({'x':list(range(3,18)),'y':[-4,-2,0,3,5,8,12,17,21,23,24,25,26,26,24]})
x = df['x'].values.reshape(-1,1)
y = df['y'].values.reshape(-1,1)
cubic = PolynomialFeatures(degree=3)
x_cubic = cubic.fit_transform(x)
cubic.fit(x_cubic, y)
model = LinearRegression()
model.fit(x_cubic, y)
fig, ax = plt.subplots()
ax.scatter(x, y, color = 'blue')
pred = model.predict(cubic.fit_transform(x))
ax.plot(x, pred, color = 'red')
ax.set_xlim(0)
ax.set_ylim(-20)
This is what I have now.
How can I get a plot like this?
Try creating an extended x range like this and predicting with your existing model. Add this to the bottom of your code.
ex_x = np.arange(0,4).reshape(-1,1)
ex_pred = model.predict(cubic.fit_transform(ex_x))
ax.plot(ex_x, ex_pred, color='red', linestyle='--')
Output:
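If a smoother dashed segment is preferred, the same idea works with a denser grid instead of whole numbers (this is just a hedged variation on the answer above, reusing model, cubic and ax):
# Evaluate the fitted cubic on a fine grid from 0 up to the first observed x
# so the dashed extension looks smooth rather than piecewise linear.
ex_x = np.linspace(0, 3, 50).reshape(-1, 1)
ex_pred = model.predict(cubic.fit_transform(ex_x))
ax.plot(ex_x, ex_pred, color='red', linestyle='--')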
I have a 2D array of relational data with labels (first row and column).
When I created the dendrogram, my labels overlapped.
How can I make the labels separate evenly?
import numpy as np
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt

file = open(fileName)
line = file.readline()
file.close()
populations=line.split('\t')
del populations[0]
data = np.loadtxt(fileName, delimiter="\t",skiprows=1,usecols=range(1,len(populations)+1 ))
fig, ax = plt.subplots()
Y1 = sch.linkage(data, method='ward',optimal_ordering=True)
Z1 = sch.dendrogram(Y1, orientation='top')
ind1= Z1['leaves']
arr = np.array(populations)
populations = arr[ind1]
ax.set_xticks([])
ax.set_xticks(np.arange(len(populations)))
ax.set_xticklabels(populations )
plt.xticks(rotation=90)
plt.show()
I think it may be easier to simply specify the labels when constructing the dendrogram, since they are known at that point. Something like the following:
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt
import numpy as np                     # Only needed for the random sample data
np.random.seed(1)                      # Seeded for reproducibility
populations = np.arange(10)            # Ten labels, one per observation
data = np.abs(np.random.randn(10, 2))  # Ten 2D observations, so each label gets its own leaf
fig, ax = plt.subplots()
Y1 = sch.linkage(data, method='ward',optimal_ordering=True)
Z1 = sch.dendrogram(Y1, orientation='top', labels=populations)
plt.show()
Would give you
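If the labels still collide (the original problem), dendrogram can also rotate and shrink them directly; a small hedged addition to the snippet above, reusing Y1 and populations:
# leaf_rotation and leaf_font_size are standard dendrogram options that help
# keep long labels from overlapping.
Z1 = sch.dendrogram(Y1, orientation='top', labels=populations,
                    leaf_rotation=90, leaf_font_size=8)
plt.tight_layout()
plt.show()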
I am trying to create a color mesh plot but the data points and their corresponding colors appear too small.
My script is:
import pandas as pd
import numpy as np
from mpl_toolkits.basemap import Basemap, cm
import matplotlib.pyplot as plt
df = pd.read_csv('data.csv', usecols=[1,2,4])
df = df.apply(pd.to_numeric)
val_pivot_df = df.pivot(index='Latitude', columns='Longitude', values='Bin 1')
lons = val_pivot_df.columns.astype(float)
lats = val_pivot_df.index.astype(float)
fig, ax = plt.subplots(1, figsize=(8,8))
m = Basemap(projection='merc',
llcrnrlat=df.dropna().min().Latitude-5
, urcrnrlat=df.dropna().max().Latitude+5
, llcrnrlon=df.dropna().min().Longitude-5
, urcrnrlon=df.dropna().max().Longitude+5
, resolution='i', area_thresh=10000
)
m.drawcoastlines()
m.drawstates()
m.drawcountries()
m.fillcontinents(color='gray', lake_color='white')
m.drawmapboundary(fill_color='0.3')
x, y = np.meshgrid(lons,lats)
px,py = m(x,y)
data_values = val_pivot_df.values
masked_data = np.ma.masked_invalid(data_values)
cmap = plt.cm.viridis
m.pcolormesh(px, py, masked_data, vmin=0, vmax=8000)
m.colorbar()
plt.show()
I'm looking to make the marker of each data point larger, but I can't seem to find any documentation on how to do this for pcolormesh.
There is no marker in a pcolormesh. The size of the colored areas in a pcolor plot is determined by the underlying grid. As an example, if the grid in x direction was [0,1,5,105], the last column would be 100 times larger in size than the first.
import matplotlib.pyplot as plt
import numpy as np; np.random.seed(1)
x = [0,1,5,25,27,100]
y = [0,10,20,64,66,100]
X,Y = np.meshgrid(x,y)
Z = np.random.rand(len(y)-1, len(x)-1)
plt.pcolormesh(X,Y,Z)
plt.show()
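If visible markers per data point are the actual goal, one hedged alternative (reusing px, py and data_values from the question's code, with an arbitrary s= value) is to draw a scatter plot on the same Basemap instead of a pcolormesh:
# Draw one marker per data point and control its size explicitly with s=.
sc = m.scatter(px.ravel(), py.ravel(), c=data_values.ravel(),
               s=80, cmap='viridis', vmin=0, vmax=8000)
m.colorbar(sc)
plt.show()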
Is it possible to extract the data from a sns.kdeplot() before plotting?
i.e. without using the call
y.get_lines()[0].get_data() after plotting.
This can be done by extracting the line data from the matplotlib Axes object:
import numpy as np
import matplotlib.pyplot as plt  # needed for plt.subplots below
from seaborn import kdeplot
my_data = np.random.randn(1000)
my_kde = kdeplot(my_data)
line = my_kde.lines[0]
x, y = line.get_data()
fig, ax = plt.subplots()
ax.plot(x[x>0], y[x>0])
Alternatively, the statsmodels way:
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
dens = sm.nonparametric.KDEUnivariate(np.random.randn(1000))
dens.fit()
x = np.linspace(0, 1, 100)  # restrict range to (0, 1)
y = dens.evaluate(x)
plt.plot(x,y)
Based on the statsmodels documentation:
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
# generate a bimodal distribution
X1 = np.random.normal(100, 10, 250)
X2 = np.random.normal(10, 20, 250)
X = np.concatenate([X1, X2])
# get density from seaborn
x, y = sns.kdeplot(X).lines[0].get_data()
# get density from statsmodel
kde = sm.nonparametric.KDEUnivariate(X).fit()
xx, yy = (kde.support, kde.density)
# compare outputs
plt.plot(x, y, label='from sns')
plt.plot(xx, yy, label='from statsmodels')
plt.legend()
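For completeness, a third route that avoids plotting entirely is scipy's gaussian_kde; this is a hedged sketch (not from either answer above), reusing the bimodal X defined just before:
from scipy.stats import gaussian_kde
# Evaluate a Gaussian KDE on an explicit grid without creating any plot.
kde_sp = gaussian_kde(X)
grid = np.linspace(X.min(), X.max(), 200)
density = kde_sp(grid)  # same length as grid; plot or process as needed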