Matplotlib PCA sample not working after altering dimensions - python

I am trying to learn how to use matplotlib.mlabPCA. Below I have the following code:
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.mlab import PCA as mlabPCA
from mpl_toolkits.mplot3d import Axes3D, proj3d
np.random.seed(234234782384239784)
DIMENSIONS = 3
mu_vec1 = np.array([0 for i in xrange(DIMENSIONS)])
cov_mat1 = np.identity(DIMENSIONS)
class1_sample = np.random.multivariate_normal(mu_vec1, cov_mat1, 20).T
assert class1_sample.shape == (DIMENSIONS, 20)
mu_vec2 = np.array([3 for i in xrange(DIMENSIONS)])
cov_mat2 = np.identity(DIMENSIONS)
class2_sample = np.random.multivariate_normal(mu_vec2, cov_mat2, 20).T
assert class2_sample.shape == (DIMENSIONS, 20)
# Combine the two together
all_samples = np.vstack([class1_sample.T, class2_sample.T])
all_samples = all_samples.T
assert all_samples.shape == (DIMENSIONS, 40)
mlab_pca = mlabPCA(all_samples.T)
# 2d plotting
plt.plot(mlab_pca.Y[0:20, 0],
mlab_pca.Y[0:20, 1],
'o', markersize=7, color='blue', alpha=0.5, label='class1')
plt.plot(mlab_pca.Y[20:40, 0],
mlab_pca.Y[20:40, 1],
'^', markersize=7, color='red', alpha=0.5, label='class2')
plt.xlabel('x_values')
plt.ylabel('y_values')
plt.xlim([-4, 4])
plt.ylim([-4, 4])
plt.legend()
plt.title('Transformed samples with class labels from matplotlib.mlab.PCA()')
plt.show()
As you can see, PCA works pretty well and I get the following graph:
However, when I try to change DIMENSIONS = 100 (I am trying to simulate spectral data analysis), I am getting this error:
RuntimeError: we assume data in a is organized with numrows>numcols
"Ok sure, I can just apply PCA onto the transpose of this matrix instead." I told myself naively.
DIMENSIONS = 100
...
mlab_pca = mlabPCA(all_samples)
plt.plot(mlab_pca.Y[0, 0:20],
mlab_pca.Y[1, 0:20],
'o', markersize=7, color='blue', alpha=0.5, label='class1')
plt.plot(mlab_pca.Y[0, 20:40],
mlab_pca.Y[1, 20:40],
'^', markersize=7, color='red', alpha=0.5, label='class2')
...
My resulting plot looks completely off!
Am I doing something wrong? Or is adding that many dimension actually messing up my data?

I would not expect the points to separate. PCA(X) is not the same thing as PCA(X.T).T
It seems that requiring numrows > numcols is a limitation of matplotlib PCA.
Both R's prcomp and Python's sklearn PCA can take matrices with either numrows > numcols or numcols > numrows.

Related

Colorbars for 2D plots using Numpy, Matplotlib and os (rendering probelm)?

I have multiple .plx files that contain two column of numbers formatted as strings (1.plx , 2.plx...)
I managed to modify a code to load the data, convert it to floats, and plot it with the appropriate colorbar, but there are two issues I couldn't solve:
The color of the lines does not update
The lines rendering looks wrong (probably due to duplicates)
I want to try to avoid that rendering problem by plotting a numpy matrix, so I want to :
Load the data
store it in a numpy matrix (outside the loop so that I can do other data processing stuff)
create a 2D plot with the colorbar
Here is my attempt and the result:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import os
IdVg = [IdVg for IdVg in os.listdir() if IdVg.endswith(".plx")]
n_lines = 20
steps = np.linspace(0.1, 50, 20)
norm = mpl.colors.Normalize(vmin=steps.min(), vmax=steps.max())
cmap = mpl.cm.ScalarMappable(norm=norm, cmap=mpl.cm.BuPu)
cmap.set_array([])
for i in IdVg:
x, y = np.loadtxt(i, delimiter=' ', unpack=True, skiprows= 1, dtype=str)
x = x.astype(np.float64)
y = y.astype(np.float64)
for z, ai in enumerate(steps.T): # Problem here, I want to store x, y values in a 40XN matrix
# (x1, y1, x2, y2...x20, y20) and find a way to plot them
# using Matplotlib and numpy
plt.plot(x, y, c=cmap.to_rgba(z+1))
plt.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
plt.xlabel('$V_{GS}$ (V)', fontsize=14)
plt.ylabel('$I_{DS}$ (A)', fontsize=14)
plt.tick_params(axis='both', labelsize='12')
plt.grid(True, which="both", ls="-")
plt.colorbar(cmap, ticks=steps)
plt.show()
Thanks !
Since you didn't provide data, I'm going to generate my own. I assume you want to obtain the following result:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import os
n_lines = 20
steps = np.linspace(0.1, 50, 20)
norm = mpl.colors.Normalize(vmin=steps.min(), vmax=steps.max())
norm_steps = norm(steps)
cmap = mpl.cm.BuPu
plt.figure()
x = np.linspace(0, np.pi / 2)
for i in range(n_lines):
y = i / n_lines * np.sin(x)
plt.plot(x, y, c=cmap(norm_steps[i]))
plt.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
plt.xlabel('$V_{GS}$ (V)', fontsize=14)
plt.ylabel('$I_{DS}$ (A)', fontsize=14)
plt.tick_params(axis='both', labelsize='12')
plt.grid(True, which="both", ls="-")
plt.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=mpl.cm.BuPu), ticks=steps)
plt.show()
Obviously, you would have to change the colormap to something more readable in the lower values!

How to overlay a pcolormesh with binary information in Python

Given a 2D array
172,47,117
192,67,251
195,103,9
211,21,242
The objective is to place markers (e.g., shape, line) overlapping to the 2D images with reference to a binary 2D coordinate below
0,1,0
0,0,0
0,1,0
1,1,0
Specifically, a marker will be place if the cell is equivalent to 1.
The expected output is as below. In this example, the marker is in the form of horizontal red line. However, the marker can be of any other shape that make the code more straight forward.
May I know how to achieve this with any graphical packages in Python?
The above coordinate can be reproduced by following
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(0)
X=np.random.randint(256, size=(4, 3))
arrshape=np.random.randint(2, size=(4, 3))
fig = plt.figure(figsize=(8,6))
plt.pcolormesh(X,cmap="plasma")
plt.title("Plot 2D array")
plt.colorbar()
plt.show()
You can throw in a scatter:
x,y = X.shape
xs,ys = np.ogrid[:x,:y]
# the non-zero coordinates
u = np.argwhere(arrshape)
plt.scatter(ys[:,u[:,1]].ravel()+.5,
xs[u[:,0]].ravel()+0.5,
marker='x', color='r', s=100)
Output:
One way to do it is with matplotlib
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(0)
X=np.random.randint(256, size=(4, 3))
arrshape=np.random.randint(2, size=(4, 3))
fig = plt.figure(figsize=(8,6))
plt.pcolormesh(X,cmap="plasma")
plt.title("Plot 2D array")
plt.colorbar()
markers = [
[0,1,0],
[0,0,0],
[0,1,0],
[1,1,0]]
draw = []
for y, row in enumerate(markers):
prev = 0
for x, v in enumerate(row):
if v == 1:
plt.plot([x+0.25, x+0.75], [y+0.5, y+0.5], 'r-', linewidth=5)
if prev == 1:
plt.plot([x-0.5, x+0.5], [y+0.5, y+0.5], 'r-', linewidth=5)
prev = v
plt.show()
Output:
(Since your markers are upside-down)

How to shade a region under a curve

I would like to shade a region under a curve. This is my attempt:
from scipy.stats import lognorm
import matplotlib.pyplot as plt
import numpy as np
xpoints = np.linspace(0,10,100)
plt.vlines(2, 0, lognorm.pdf(2,1), color='r', linestyles='solid')
plt.vlines(3, 0, lognorm.pdf(3,1), color='r', linestyles='solid')
plt.fill_between([2,3], [lognorm.pdf(2,1), lognorm.pdf(3,1)], color='red')
plt.plot(xpoints, lognorm.pdf(xpoints,1))
However this doesn't shade under the curve properly.
How do you do this properly?
Using where
Using the where argument of fill_between allows to select the range over which the filling should occur.
from scipy.stats import lognorm
import matplotlib.pyplot as plt
import numpy as np
x = np.linspace(0, 10, 100)
y = lognorm.pdf(x, 1)
plt.vlines(2, 0, lognorm.pdf(2, 1), color='r', linestyles='solid')
plt.vlines(3, 0, lognorm.pdf(3, 1), color='r', linestyles='solid')
plt.fill_between(x, y, where=((x >= 2) & (x <= 3)), color='red')
plt.plot(x, y)
plt.show()
A problem with this may be that the point of the original curve which is closest to the boundary chosen may still be too far away, such that gaps may occur.
Worthwhile to note that if you choose the points dense enough, or for that matter, just intelligently enough, such problems would be circumvented. I.e. using 101 points, such that 2.0 and 3.0 are actually part of the data,
x = np.linspace(0, 10, 101)
would result in a nice picture:
Plotting a refined version of the curve.
It may hence make sense to reevaluate your function on a denser grid and plot it independently.
from scipy.stats import lognorm
import matplotlib.pyplot as plt
import numpy as np
x = np.linspace(0, 10, 100)
y = lognorm.pdf(x, 1)
plt.vlines(2, 0, lognorm.pdf(2, 1), color='r', linestyles='solid')
plt.vlines(3, 0, lognorm.pdf(3, 1), color='r', linestyles='solid')
xf = np.linspace(2, 3, 301)
yf = lognorm.pdf(xf, 1)
plt.fill_between(xf, yf, color='red')
plt.plot(x, y)
plt.show()
you are filling based on 2 points only, try this instead:
plt.fill_between(xpoints[20:31], [lognorm.pdf(i,1) for i in xpoints[20:31]], color='red')

Error: K-Mean Clustering Algorithm data plots is not visible in Python

Hi I want to implement the K-Means Clustering Algorithm.
For this I am getting data from sample.csv file and apply K-Means clustering on it. Here is my source code
## K-Means.py
# clustering dataset
import pandas
from sklearn.cluster import KMeans
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
variables = pandas.read_csv("/Users/srikanth/Desktop/sample1.csv")
print(variables)
x1 = variables[['X']]
x2 = variables[['Y']]
print(x1)
print(x2)
plt.plot()
plt.xlim([0, 10])
plt.ylim([0, 10])
plt.title('Dataset')
plt.xlabel('X - Values')
plt.ylabel('Y - Values')
plt.scatter(x1, x2)
plt.show()
# create new plot and data
plt.plot()
X = np.array(list(zip(x1, x2))).reshape(len(x1), 2)
colors = ['b', 'g', 'r']
markers = ['o', 'v', 's']
# KMeans algorithm
K = 3
kmeans_model = KMeans(n_clusters=K).fit(X)
plt.plot()
for i, l in enumerate(kmeans_model.labels_):
plt.plot(x1[i], x2[i], color=colors[l], marker=markers[l],ls='None')
plt.xlim([0, 10])
plt.ylim([0, 10])
plt.show()
After I run the above code in terminal, the output like:
The above image don't show any clustered data plots, So I want to see my clustered data plots visually. How can I fix this.
I am new to this area.
Thank you
from sklearn.cluster import KMeans
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
variables = pandas.read_csv("/Users/srikanth/Desktop/sample1.csv")
print(variables)
x1 = variables[['X']]
x2 = variables[['Y']]
plt.plot()
plt.xlim([150, 190])
plt.ylim([40, 90])
plt.title('Dataset')
plt.xlabel('X - Values')
plt.ylabel('Y - Values')
plt.scatter(x1, x2)
plt.show()
The scatter that it produces for the 10 points is :
For the code using the kmeans-clustering model, you are plotting for every label in the the model which will produce 10 plots. Just changing the limits should do the magic.

How to smoothen data in Python?

I am trying to smoothen a scatter plot shown below using SciPy's B-spline representation of 1-D curve. The data is available here.
The code I used is:
import matplotlib.pyplot as plt
import numpy as np
from scipy import interpolate
data = np.genfromtxt("spline_data.dat", delimiter = '\t')
x = 1000 / data[:, 0]
y = data[:, 1]
x_int = np.linspace(x[0], x[-1], 100)
tck = interpolate.splrep(x, y, k = 3, s = 1)
y_int = interpolate.splev(x_int, tck, der = 0)
fig = plt.figure(figsize = (5.15,5.15))
plt.subplot(111)
plt.plot(x, y, marker = 'o', linestyle='')
plt.plot(x_int, y_int, linestyle = '-', linewidth = 0.75, color='k')
plt.xlabel("X")
plt.ylabel("Y")
plt.show()
I tried changing the order of the spline and the smoothing condition, but I am not getting a smooth plot.
B-spline interpolation should be able to smoothen the data but what is wrong? Any alternate method to smoothen this data?
Use a larger smoothing parameter. For example, s=1000:
tck = interpolate.splrep(x, y, k=3, s=1000)
This produces:
Assuming we are dealing with noisy observations of some phenomena, Gaussian Process Regression might also be a good choice. Knowledge about the variance of the noise can be included into the parameters (nugget) and other parameters can be found using Maximum Likelihood estimation. Here's a simple example of how it could be applied:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.gaussian_process import GaussianProcess
data = np.genfromtxt("spline_data.dat", delimiter='\t')
x = 1000 / data[:, 0]
y = data[:, 1]
x_pred = np.linspace(x[0], x[-1], 100)
# <GP regression>
gp = GaussianProcess(theta0=1, thetaL=0.00001, thetaU=1000, nugget=0.000001)
gp.fit(np.atleast_2d(x).T, y)
y_pred = gp.predict(np.atleast_2d(x_pred).T)
# </GP regression>
fig = plt.figure(figsize=(5.15, 5.15))
plt.subplot(111)
plt.plot(x, y, marker='o', linestyle='')
plt.plot(x_pred, y_pred, linestyle='-', linewidth=0.75, color='k')
plt.xlabel("X")
plt.ylabel("Y")
plt.show()
which will give:
In your specific case, you could also try changing the last argument of the np.linspace function to a smaller number, np.linspace(x[0], x[-1], 10), for example.
Demo code:
import matplotlib.pyplot as plt
import numpy as np
from scipy import interpolate
data = np.random.rand(100,2)
tempx = list(data[:, 0])
tempy = list(data[:, 1])
x = np.array(sorted([point*10 + tempx.index(point) for point in tempx]))
y = np.array([point*10 + tempy.index(point) for point in tempy])
x_int = np.linspace(x[0], x[-1], 10)
tck = interpolate.splrep(x, y, k = 3, s = 1)
y_int = interpolate.splev(x_int, tck, der = 0)
fig = plt.figure(figsize = (5.15,5.15))
plt.subplot(111)
plt.plot(x, y, marker = 'o', linestyle='')
plt.plot(x_int, y_int, linestyle = '-', linewidth = 0.75, color='k')
plt.xlabel("X")
plt.ylabel("Y")
plt.show()
You could also smooth the data with a rolling_mean in pandas:
import pandas as pd
data = [...(your data here)...]
smoothendData = pd.rolling_mean(data,5)
the second argument of rolling_mean is the moving average (rolling mean) period. You can also reverse the data 'data.reverse', take a rolling_mean of the data that way, and combine it with the forward rolling mean. Another option is exponentially weighted moving averages:
Pandas: Exponential smoothing function for column
or using bandpass filters:
fft bandpass filter in python
http://docs.scipy.org/doc/scipy/reference/signal.html

Categories

Resources