I'm having trouble with my code. It's only showing the data for one graph and the other graph is blank. I can't figure why it's not working.
I'm using the subplot() function and my guess is that the the reason might be the way my function are formatted.
import numpy as np
import matplotlib.pyplot as plt
import cvxopt as opt
from cvxopt import blas, solvers
import pandas as pd
import mpld3
from mpld3 import plugins
np.random.seed(123)
solvers.options['show_progress'] = False
n_assets = 4
n_obs = 1000 # original 1000
return_vec = np.random.randn(n_assets, n_obs)
def rand_weights(n):
k = np.random.rand(n)
return k / sum(k) print(rand_weights(n_assets)) print(rand_weights(n_assets))
def random_portfolio(returns):
p = np.asmatrix(np.mean(returns,axis=1))
w = np.asmatrix(rand_weights(returns.shape[0]))
C = np.asmatrix(np.cov(returns))
mu = w * p.T
sigma = np.sqrt(w * C * w.T)
#this recursion reduces outlier to keep the graph nice
if sigma > 2:
return random_portfolio(returns)
return mu, sigma
n_portfolios = 500
means, stds = np.column_stack([random_portfolio(return_vec) for _ in range(n_portfolios)])
plt.plot(return_vec.T, alpha=.4);
plt.xlabel('time')
plt.ylabel('returns')
plt.figure(1)
plt.subplot(212)
plt.plot(stds, means, 'o', markersize = 5)
plt.xlabel('std')
plt.ylabel('mean')
plt.title('Mean and standard deviation of returns of randomly generated portfolios')
plt.subplot(211)
plt.figure(1)
plt.show()
You need to move the line plt.subplot(211) before your first call to plt.plot. This is because calls to plt.subplot must precede the actual plotting in that subplot.
Related
This is not a duplicate question since other answers only explain how to plot the cross-correlation function and do not explain how you can get the time difference.
Given a sin signal and shifted version, we should be able to get the time delay between them.
I have created a sin signal and shifted it by t_d=0.05. The following is my code and its output:
import numpy as np
import matplotlib.pyplot as plt
fs = 1000
x = np.linspace(0, 1, fs)
f = 5
t_shift = 0.05
y = np.sin(2*np.pi*f*x)
y_shifted = np.sin(2*np.pi*f*(x-t_shift))
fig, ax = plt.subplots()
ax.plot(x, y, x, y_shifted)
plt.show()
By normalizing signals and applying numpy.correlate we get the following:
y_norm = (y-y.mean())/y.std()
y_shifted_norm = (y_shifted - y_shifted.mean())/y_shifted.std()
cc = np.correlate(y_norm, y_shifted_norm, 'full')
fig, ax = plt.subplots()
ax.plot(range(len(cc)), cc)
plt.show()
Question
From the indices of cross-correlation function, how can I get t_shift=0.05?
#Sepide. It seems to me as if you are trying to maximise the correlation between the signal y and a shifted version of y_shifted. This might be accomplished using np.correlate() but it seems nontrivial indeed to recover the time shifts in the signals. In the solution below I manually shift the time series and compute the correlation coefficient using np.corrcoef. As soon as this Pearson correlation coefficient equals 1, the two signals are aligned.
import numpy as np
import matplotlib.pyplot as plt
# Setting
fs = 1000
x = np.linspace(0, 1, fs)
f = 5
t_shift = 0.05
t_step = 1/fs
# Data
y = np.sin(2*np.pi*f*x)
y_shifted = np.sin(2*np.pi*f*(x-t_shift))
# Compute correlation
MaxTimeShift = 200
CorrelationList = np.empty((MaxTimeShift,1));
CorrelationList[:] = np.NaN
# Compute correlation for various shifts
for iter in range(MaxTimeShift):
CorrelationList[iter] = np.corrcoef( y[0:801].T, y_shifted[iter:(801+iter)].T)[0,1]
# Plot 1
plt.figure(1)
plt.plot(x, y, x, y_shifted)
plt.show()
# Plot 2
plt.figure(2)
ShiftList = t_step*np.arange(MaxTimeShift)
plt.plot(ShiftList, CorrelationList)
plt.title("Correlation coefficient")
plt.show()
print("The time shift between the signals is: ", ShiftList[np.argmax(CorrelationList)])
I have a Data Frame that contains two columns named, "thousands of dollars per year", and "EMPLOY".
I create a new variable in this data frame named "cubic_Root" by computing the data in df['thousands of dollars per year']
df['cubic_Root'] = -1 / df['thousands of dollars per year'] ** (1. / 3)
The data in df['cubic_Root'] like that:
ID cubic_Root
1 -0.629961
2 -0.405480
3 -0.329317
4 -0.480750
5 -0.305711
6 -0.449644
7 -0.449644
8 -0.480750
Now! How can I draw a normal probability plot by using the data in df['cubic_Root'].
You want the "Probability" Plots.
So for a single plot, you'd have something like below.
import scipy.stats
import numpy as np
import matplotlib.pyplot as plt
# 100 values from a normal distribution with a std of 3 and a mean of 0.5
data = 3.0 * np.random.randn(100) + 0.5
counts, start, dx, _ = scipy.stats.cumfreq(data, numbins=20)
x = np.arange(counts.size) * dx + start
plt.plot(x, counts, 'ro')
plt.xlabel('Value')
plt.ylabel('Cumulative Frequency')
plt.show()
If you want to plot a distribution, and you know it, define it as a function, and plot it as so:
import numpy as np
from matplotlib import pyplot as plt
def my_dist(x):
return np.exp(-x ** 2)
x = np.arange(-100, 100)
p = my_dist(x)
plt.plot(x, p)
plt.show()
If you don't have the exact distribution as an analytical function, perhaps you can generate a large sample, take a histogram and somehow smooth the data:
import numpy as np
from scipy.interpolate import UnivariateSpline
from matplotlib import pyplot as plt
N = 1000
n = N/10
s = np.random.normal(size=N) # generate your data sample with N elements
p, x = np.histogram(s, bins=n) # bin it into n = N/10 bins
x = x[:-1] + (x[1] - x[0])/2 # convert bin edges to centers
f = UnivariateSpline(x, p, s=n)
plt.plot(x, f(x))
plt.show()
You can increase or decrease s (smoothing factor) within the UnivariateSpline function call to increase or decrease smoothing. For example, using the two you get:
Probability density Function (PDF) of inter-arrival time of events.
import numpy as np
import scipy.stats
# generate data samples
data = scipy.stats.expon.rvs(loc=0, scale=1, size=1000, random_state=123)
A kernel density estimation can then be obtained by simply calling
scipy.stats.gaussian_kde(data,bw_method=bw)
where bw is an (optional) parameter for the estimation procedure. For this data set, and considering three values for bw the fit is as shown below
# test values for the bw_method option ('None' is the default value)
bw_values = [None, 0.1, 0.01]
# generate a list of kde estimators for each bw
kde = [scipy.stats.gaussian_kde(data,bw_method=bw) for bw in bw_values]
# plot (normalized) histogram of the data
import matplotlib.pyplot as plt
plt.hist(data, 50, normed=1, facecolor='green', alpha=0.5);
# plot density estimates
t_range = np.linspace(-2,8,200)
for i, bw in enumerate(bw_values):
plt.plot(t_range,kde[i](t_range),lw=2, label='bw = '+str(bw))
plt.xlim(-1,6)
plt.legend(loc='best')
Reference:
Python: Matplotlib - probability plot for several data set
how to plot Probability density Function (PDF) of inter-arrival time of events?
I am trying to estimate the probability density function of my data. IN my case, the data is a satellite image with a shape 8200 x 8100.
Below, I present you the code of PDF (the function 'is_outlier' is borrowed by a guy that post this code on here ). As we can see, the PDF is in figure 1 too dense. I guess, this is due to the thousands of pixels that the satellite image is composed of. This is very ugly.
My question is, how can I plot a PDF that is not too dense? something like shown in figure 2 for example.
lst = 'satellite_img.tif' #import the image
lst_flat = lst.flatten() #create 1D array
#the function below removes the outliers
def is_outlier(points, thres=3.5):
if len(points.shape) == 1:
points = points[:,None]
median = np.median(points, axis=0)
diff = np.sum((points - median)**2, axis=-1)
diff = np.sqrt(diff)
med_abs_deviation = np.median(diff)
modified_z_score = 0.6745 * diff / med_abs_deviation
return modified_z_score > thres
lst_flat = np.r_[lst_flat]
lst_flat_filtered = lst_flat[~is_outlier(lst_flat)]
fit = stats.norm.pdf(lst_flat_filtered, np.mean(lst_flat_filtered), np.std(lst_flat_filtered))
plt.plot(lst_flat_filtered, fit)
plt.hist(lst_flat_filtered, bins=30, normed=True)
plt.show()
figure 1
figure 2
The issue is that the x values in the PDF plot are not sorted, so the plotted line is going back and forwards between random points, creating the mess you see.
Two options:
Don't plot the line, just plot points (not great if you have lots of points, but will confirm if what I said above is right or not):
plt.plot(lst_flat_filtered, fit, 'bo')
Sort the lst_flat_filtered array before calculating the PDF and plotting it:
lst_flat = np.r_[lst_flat]
lst_flat_filtered = np.sort(lst_flat[~is_outlier(lst_flat)]) # Changed this line
fit = stats.norm.pdf(lst_flat_filtered, np.mean(lst_flat_filtered), np.std(lst_flat_filtered))
plt.plot(lst_flat_filtered, fit)
Here's some minimal examples showing these behaviours:
Reproducing your problem:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
lst_flat_filtered = np.random.normal(7, 5, 1000)
fit = stats.norm.pdf(lst_flat_filtered, np.mean(lst_flat_filtered), np.std(lst_flat_filtered))
plt.hist(lst_flat_filtered, bins=30, normed=True)
plt.plot(lst_flat_filtered, fit)
plt.show()
Plotting points
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
lst_flat_filtered = np.random.normal(7, 5, 1000)
fit = stats.norm.pdf(lst_flat_filtered, np.mean(lst_flat_filtered), np.std(lst_flat_filtered))
plt.hist(lst_flat_filtered, bins=30, normed=True)
plt.plot(lst_flat_filtered, fit, 'bo')
plt.show()
Sorting the data
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
lst_flat_filtered = np.sort(np.random.normal(7, 5, 1000))
fit = stats.norm.pdf(lst_flat_filtered, np.mean(lst_flat_filtered), np.std(lst_flat_filtered))
plt.hist(lst_flat_filtered, bins=30, normed=True)
plt.plot(lst_flat_filtered, fit)
plt.show()
I have data in the form x-y-z and want to create a power spectrum along x-y. Here is a basic example I am posting to check where I might be going wrong with my actual data:
import numpy as np
from matplotlib import pyplot as plt
fq = 10; N = 20
x = np.linspace(0,8,N); y = x
space = x[1] -x[0]
xx, yy = np.meshgrid(x,y)
fnc = np.sin(2*np.pi*fq*xx)
ft = np.fft.fft2(fnc)
ft = np.fft.fftshift(ft)
freq_x = np.fft.fftfreq(ft.shape[0], d=space)
freq_y = np.fft.fftfreq(ft.shape[1], d=space)
plt.imshow(
abs(ft),
aspect='auto',
extent=(freq_x.min(),freq_x.max(),freq_y.min(),freq_y.max())
)
plt.figure()
plt.imshow(fnc)
This results in the following function & frequency figures with the incorrect frequency. Thanks.
One of your problems is that matplotlib's imshow using a different coordinate system to what you expect. Provide a origin='lower' argument, and the peaks now appear at y=0, as expected.
Another problem that you have is that fftfreq needs to be told your timestep, which in your case is 8 / (N - 1)
import numpy as np
from matplotlib import pyplot as plt
fq = 10; N = 20
x = np.linspace(0,8,N); y = x
xx, yy = np.meshgrid(x,y)
fnc = np.sin(2*np.pi*fq*xx)
ft = np.fft.fft2(fnc)
ft = np.fft.fftshift(ft)
freq_x = np.fft.fftfreq(ft.shape[0], d=8 / (N - 1)) # this takes an argument for the timestep
freq_y = np.fft.fftfreq(ft.shape[1], d=8 / (N - 1))
plt.imshow(
abs(ft),
aspect='auto',
extent=(freq_x.min(),freq_x.max(),freq_y.min(),freq_y.max()),
origin='lower' , # this fixes your problem
interpolation='nearest', # this makes it easier to see what is happening
cmap='viridis' # let's use a better color map too
)
plt.grid()
plt.show()
You may say "but the frequency is 10, not 0.5!" However, if you want to sample a frequency of 10, you need to sample a lot faster than 8/19! Nyquist's theorem says you need to exceed a sampling rate of 20 to have any hope at all
I am trying to make a profile plot for two columns of a pandas.DataFrame. I would not expect this to be in pandas directly but it seems there is nothing in matplotlib either. I have searched around and cannot find it in any package other than rootpy. Before I take the time to write this myself I thought I would ask if there was a small package that contained profile histograms, perhaps where they are known by a different name.
If you don't know what I mean by "profile histogram" have a look at the ROOT implementation. http://root.cern.ch/root/html/TProfile.html
You can easily do it using scipy.stats.binned_statistic.
import scipy.stats
import numpy
import matplotlib.pyplot as plt
x = numpy.random.rand(10000)
y = x + scipy.stats.norm(0, 0.2).rvs(10000)
means_result = scipy.stats.binned_statistic(x, [y, y**2], bins=50, range=(0,1), statistic='mean')
means, means2 = means_result.statistic
standard_deviations = numpy.sqrt(means2 - means**2)
bin_edges = means_result.bin_edges
bin_centers = (bin_edges[:-1] + bin_edges[1:])/2.
plt.errorbar(x=bin_centers, y=means, yerr=standard_deviations, linestyle='none', marker='.')
Use seaborn. Data as from #MaxNoe
import numpy as np
import seaborn as sns
# just some random numbers to get started
x = np.random.uniform(-2, 2, 10000)
y = np.random.normal(x**2, np.abs(x) + 1)
sns.regplot(x=x, y=y, x_bins=10, fit_reg=None)
You can do much more (error bands are from bootstrap, you can change the estimator on the y-axis, add regression, ...)
While #Keith's answer seems to fit what you mean, it is quite a lot of code. I think this can be done much simpler, so one gets the key concepts and can adjust and build on top of it.
Let me stress one thing: what ROOT is calling a ProfileHistogram is not a special kind of plot. It is an errorbar plot. Which can simply be done in matplotlib.
It is a special kind of computation and that's not the task of a plotting library. This lies in the pandas realm, and pandas is great at stuff like this. It's symptomatical for ROOT as the giant monolithic pile it is to have an extra class for this.
So what you want to do is: discretize in some variable x and for each bin, calculate something in another variable y.
This can easily done using np.digitize together with the pandas groupy and aggregate methods.
Putting it all together:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# just some random numbers to get startet
x = np.random.uniform(-2, 2, 10000)
y = np.random.normal(x**2, np.abs(x) + 1)
df = pd.DataFrame({'x': x, 'y': y})
# calculate in which bin row belongs base on `x`
# bins needs the bin edges, so this will give as 100 equally sized bins
bins = np.linspace(-2, 2, 101)
df['bin'] = np.digitize(x, bins=bins)
bin_centers = 0.5 * (bins[:-1] + bins[1:])
bin_width = bins[1] - bins[0]
# grouby bin, so we can calculate stuff
binned = df.groupby('bin')
# calculate mean and standard error of the mean for y in each bin
result = binned['y'].agg(['mean', 'sem'])
result['x'] = bin_centers
result['xerr'] = bin_width / 2
# plot it
result.plot(
x='x',
y='mean',
xerr='xerr',
yerr='sem',
linestyle='none',
capsize=0,
color='black',
)
plt.savefig('result.png', dpi=300)
Just like ROOT ;)
I made a module myself for this functionality.
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt
def Profile(x,y,nbins,xmin,xmax,ax):
df = DataFrame({'x' : x , 'y' : y})
binedges = xmin + ((xmax-xmin)/nbins) * np.arange(nbins+1)
df['bin'] = np.digitize(df['x'],binedges)
bincenters = xmin + ((xmax-xmin)/nbins)*np.arange(nbins) + ((xmax-xmin)/(2*nbins))
ProfileFrame = DataFrame({'bincenters' : bincenters, 'N' : df['bin'].value_counts(sort=False)},index=range(1,nbins+1))
bins = ProfileFrame.index.values
for bin in bins:
ProfileFrame.ix[bin,'ymean'] = df.ix[df['bin']==bin,'y'].mean()
ProfileFrame.ix[bin,'yStandDev'] = df.ix[df['bin']==bin,'y'].std()
ProfileFrame.ix[bin,'yMeanError'] = ProfileFrame.ix[bin,'yStandDev'] / np.sqrt(ProfileFrame.ix[bin,'N'])
ax.errorbar(ProfileFrame['bincenters'], ProfileFrame['ymean'], yerr=ProfileFrame['yMeanError'], xerr=(xmax-xmin)/(2*nbins), fmt=None)
return ax
def Profile_Matrix(frame):
#Much of this is stolen from https://github.com/pydata/pandas/blob/master/pandas/tools/plotting.py
import pandas.core.common as com
import pandas.tools.plotting as plots
from pandas.compat import lrange
from matplotlib.artist import setp
range_padding=0.05
df = frame._get_numeric_data()
n = df.columns.size
fig, axes = plots._subplots(nrows=n, ncols=n, squeeze=False)
# no gaps between subplots
fig.subplots_adjust(wspace=0, hspace=0)
mask = com.notnull(df)
boundaries_list = []
for a in df.columns:
values = df[a].values[mask[a].values]
rmin_, rmax_ = np.min(values), np.max(values)
rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
boundaries_list.append((rmin_ - rdelta_ext, rmax_+ rdelta_ext))
for i, a in zip(lrange(n), df.columns):
for j, b in zip(lrange(n), df.columns):
common = (mask[a] & mask[b]).values
nbins = 100
(xmin,xmax) = boundaries_list[i]
ax = axes[i, j]
Profile(df[a][common],df[b][common],nbins,xmin,xmax,ax)
ax.set_xlabel('')
ax.set_ylabel('')
plots._label_axis(ax, kind='x', label=b, position='bottom', rotate=True)
plots._label_axis(ax, kind='y', label=a, position='left')
if j!= 0:
ax.yaxis.set_visible(False)
if i != n-1:
ax.xaxis.set_visible(False)
for ax in axes.flat:
setp(ax.get_xticklabels(), fontsize=8)
setp(ax.get_yticklabels(), fontsize=8)
return axes
To my knowledge matplotlib doesn't still allow to directly produce profile histograms.
You can instead give a look at Hippodraw, a package developed at SLAC, that can be used as a Python extension module.
Here there is a Profile histogram example:
http://www.slac.stanford.edu/grp/ek/hippodraw/datareps_root.html#datareps_profilehist