I have created a tornado plot taking inspiration from here. It has input variables labelled on the y-axis (a1,b1,c1...) and their respective correlation coefficients plotted next to them. See pic below:
I then sorted the correlation coefficients so that the one with the highest absolute value (keeping its sign) is plotted first, then the next highest, and so on, using sorted(values, key=abs, reverse=True). See the result below:
Notice that in the second picture, even though the bars are sorted in descending order of absolute value, the y-axis labels stay in their original order.
Question: How do I tie each y-axis label (variable) to its correlation coefficient so that the label always stays with its own bar?
Below is my code:
import numpy as np
from matplotlib import pyplot as plt
#####Importing Data from csv file#####
# Each CSV is read with its header row skipped; the four float columns are
# exposed as the named fields 'a', 'b', 'c' and 'x0'.
dataset1 = np.genfromtxt('dataSet1.csv', dtype = float, delimiter = ',', skip_header = 1, names = ['a', 'b', 'c', 'x0'])
dataset2 = np.genfromtxt('dataSet2.csv', dtype = float, delimiter = ',', skip_header = 1, names = ['a', 'b', 'c', 'x0'])
dataset3 = np.genfromtxt('dataSet3.csv', dtype = float, delimiter = ',', skip_header = 1, names = ['a', 'b', 'c', 'x0'])

# Correlation matrix of every input column against 'x0', per dataset.
corr1 = np.corrcoef(dataset1['a'],dataset1['x0'])
corr2 = np.corrcoef(dataset1['b'],dataset1['x0'])
corr3 = np.corrcoef(dataset1['c'],dataset1['x0'])
corr4 = np.corrcoef(dataset2['a'],dataset2['x0'])
corr5 = np.corrcoef(dataset2['b'],dataset2['x0'])
corr6 = np.corrcoef(dataset2['c'],dataset2['x0'])
corr7 = np.corrcoef(dataset3['a'],dataset3['x0'])
corr8 = np.corrcoef(dataset3['b'],dataset3['x0'])
corr9 = np.corrcoef(dataset3['c'],dataset3['x0'])
np.set_printoptions(precision=4)

variables = ['a1','b1','c1','a2','b2','c2','a3','b3','c3']
base = 0

# Element [0, 1] of each 2x2 matrix is the input-vs-x0 coefficient.
values = np.array([corr1[0,1],corr2[0,1],corr3[0,1],
                   corr4[0,1],corr5[0,1],corr6[0,1],
                   corr7[0,1],corr8[0,1],corr9[0,1]])
# Largest magnitude first; each coefficient keeps its sign.
# NOTE(review): only `values` is reordered here. `variables` keeps its
# original order, so the tick labels set below no longer match the bars.
values = sorted(values,key=abs, reverse=True)

# The y position for each variable
ys = range(len(values))[::-1] # top to bottom

# Plot the bars, one by one
for y, value in zip(ys, values):
    # Each bar is a "broken" horizontal bar chart.
    # broken_barh takes (xmin, xwidth) pairs, so the width is the
    # coefficient itself and the bar spans base .. base + value.
    plt.broken_barh(
        [(base, value)],
        (y - 0.4, 0.8),
        facecolors=['red', 'red'], # Try different colors if you like
        edgecolors=['black', 'black'],
        linewidth=1)

# Draw a vertical line down the middle
plt.axvline(base, color='black')

# Position the x-axis on the top/bottom, hide all the other spines (=axis lines)
axes = plt.gca() # (gca = get current axes)
axes.spines['left'].set_visible(False)
axes.spines['right'].set_visible(False)
axes.spines['top'].set_visible(False)
axes.xaxis.set_ticks_position('bottom')

# Make the y-axis display the variables
plt.yticks(ys, variables)
plt.ylim(-2, len(variables))
plt.show()
Many thanks in advance
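Use the built-in zip function: it pairs the i-th elements of each argument sequence or iterable into tuples (a list of tuples in Python 2, an iterator in Python 3). Be aware that the result is truncated to the length of the shortest argument. For example, sort the labels together with the values so that each label stays attached to its coefficient: values, variables = zip(*sorted(zip(values, variables), key=lambda pair: abs(pair[0]), reverse=True)).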
Related
This is a code for a waterfall chart. I'd kindly like to ask:
if there is a way to simplify this code. The code is far too long and I'm sure there is a lot of extra lines of code that could be reduced.
How can I make the first and last bars black? Since I am creating a waterfall chart, I want the first and last bars to always be black, and the bars in between to be green or red depending on whether the value is positive or negative.
Bars greater than zero green.
Bars less than zero red.
Any help would be greatly appreciated.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
#Use python 2.7+ syntax to format currency
def money(x, pos):
    """Format a tick value as whole dollars with thousands separators.

    The two args are the value and tick position.

    x: the numeric tick value to format
    pos: the tick position (not used by the formatting)

    Returns a string such as "$350,000".
    """
    return "${:,.0f}".format(x)
formatter = FuncFormatter(money)

#Data to plot. Do not include a total, it will be calculated
index = ['sales','returns','credit fees','rebates','late charges','shipping']
data = {'amount': [350000,-30000,-7500,-25000,95000,-7000]}

#Store data and create a blank series to use for the waterfall
trans = pd.DataFrame(data=data,index=index)
blank = trans.amount.cumsum().shift(1).fillna(0)

#Get the net total number for the final element in the waterfall
total = trans.sum().amount
trans.loc["net"]= total
blank.loc["net"] = total

#The steps graphically show the levels as well as used for label placement
step = blank.reset_index(drop=True).repeat(3).shift(-1)
step[1::3] = np.nan

#When plotting the last element, we want to show the full bar,
#Set the blank to 0
blank.loc["net"] = 0

#Plot and label
my_plot = trans.plot(kind='bar', stacked=True, bottom=blank,legend=None, figsize=(10, 5), title="2014 Sales Waterfall")
my_plot.plot(step.index, step.values,'k')
my_plot.set_xlabel("Transaction Types")

#Format the axis for dollars
my_plot.yaxis.set_major_formatter(formatter)

#Get the y-axis position for the labels
y_height = trans.amount.cumsum().shift(1).fillna(0)

#Get an offset so labels don't sit right on top of the bar
# Take the scalar column maximum so the offsets are plain numbers, and
# avoid shadowing the builtin max().
max_amount = trans['amount'].max()
neg_offset = max_amount / 25
pos_offset = max_amount / 50
plot_offset = int(max_amount / 15)

#Start label loop
for loop, (label, amount) in enumerate(trans['amount'].items()):
    # For the last item in the list, we don't want to double count.
    # The net row is recognised by its label, so an ordinary row that
    # happens to equal the total is not mistaken for it.
    if label == "net":
        y = y_height.iloc[loop]
    else:
        y = y_height.iloc[loop] + amount
    # Determine if we want a neg or pos offset
    if amount > 0:
        y += pos_offset
    else:
        y -= neg_offset
    my_plot.annotate("{:,.0f}".format(amount),(loop,y),ha="center")

#Scale up the y axis so there is room for the labels
my_plot.set_ylim(0,blank.max()+plot_offset)

#Rotate the labels
my_plot.set_xticklabels(trans.index,rotation=0)
my_plot.get_figure().savefig("waterfall.png",dpi=200,bbox_inches='tight')
Answer to questions 2, 3 and 4: set the colors of the bar patches after plotting them:
# Bar colour by sign: 0 -> black, 1 -> green, -1 -> red
sign_colours = {0: 'k', 1: 'g', -1: 'r'}
# The first and last bars are forced to 0 (black); the bars in between take
# the sign of their amount. The slice is positional.
bar_signs = np.r_[0, np.sign(trans.amount.iloc[1:-1]), 0]
for p, c in zip(my_plot.containers[0].patches, bar_signs):
    p.set_color(sign_colours[c])
I want to plot an infinite (never-ending) line through two points that are stored in a pandas Series. I can successfully plot a standard line segment between the points, but I don't want the line to "end" at the second point; it should continue beyond it. Expanding on this, I would also like to extract the values of this extended line into a new dataframe so that I can look up the line's value for any given x value.
# Daily AAPL prices from 2021-01-01 onwards, without the adjusted close
raw = yf.download("AAPL", start="2021-01-01", interval = "1d")
raw = raw.drop(columns=['Adj Close'])

# Skip the first 30 rows and switch to lower-case column names
column_names = {"Open": "open", "High": "high", "Low": "low", "Close": "close", "Volume": "volume"}
data = raw[30:].rename(columns=column_names)

# Row positions of the local maxima of 'high' and local minima of 'low'
local_max = argrelextrema(data['high'].values, np.greater)[0]
local_min = argrelextrema(data['low'].values, np.less)[0]
highs = data.iloc[local_max]
lows = data.iloc[local_min]

# The two largest local highs and the two smallest local lows
highesttwo = highs["high"].nlargest(2)
lowesttwo = lows["low"].nsmallest(2)

fig = plt.figure(figsize=[10,7])
data['high'].plot(marker='o', markevery=local_max)
data['low'].plot(marker='o', markevery=local_min)
# Each pair is drawn as a plain segment between its two points
for pair in (highesttwo, lowesttwo):
    pair.plot()
plt.show()
Currently my plot looks like this:
However, I want it to look like this, and I also want to be able to get the value of the line for any given x value.
This can be done in a few steps as shown in the following example where the lines are computed with element-wise operations (i.e. vectorized) using the slope-intercept form of the line equation.
The stock data has a frequency based on the opening dates of the stock exchange. This frequency is not automatically recognized by pandas, therefore the .plot method produces a plot with a continuous date for the x-axis and includes the days with no data. This can be avoided by setting the argument use_index=False so that the x-axis uses integers starting from zero instead.
The challenge is to then create nicely formatted tick labels. The following example attempts to imitate the pandas tick format by using list comprehensions to select the tick locations and format the labels. These will need to be adjusted if the date range is significantly lengthened or shortened.
import numpy as np # v 1.19.2
import pandas as pd # v 1.2.3
import matplotlib.pyplot as plt # v 3.3.4
from scipy.signal import argrelextrema # v 1.6.1
import yfinance as yf # v 0.1.54
# Import data
prices = yf.download('AAPL', start='2021-01-04', end='2021-03-15', interval='1d')
data = prices.drop(columns=['Adj Close']).rename(
    columns={'Open': 'open', 'High': 'high', 'Low': 'low',
             'Close': 'close', 'Volume': 'volume'})
dates = data.index

# Extract points; resetting the index gives highs/lows integer row numbers
# that serve as x values
local_max = argrelextrema(data['high'].values, np.greater)[0]
local_min = argrelextrema(data['low'].values, np.less)[0]
numbered = data.reset_index()
highs = numbered.iloc[local_max, :]
lows = numbered.iloc[local_min, :]
htwo = highs['high'].nlargest(2).sort_index()
ltwo = lows['low'].nsmallest(2).sort_index()

# Compute slope and y-intercept for each line
slope_high, intercept_high = np.polyfit(htwo.index, htwo, 1)
slope_low, intercept_low = np.polyfit(ltwo.index, ltwo, 1)

# One dataframe per line: x runs from the first of the two points to the end
# of the dataset, y follows from the slope-intercept form
# High
line_high = pd.DataFrame({'x': range(htwo.index[0], len(data))})
line_high['y'] = slope_high*line_high['x'] + intercept_high
# Low
line_low = pd.DataFrame({'x': range(ltwo.index[0], len(data))})
line_low['y'] = slope_low*line_low['x'] + intercept_low

# Plot data using pandas plotting function and add lines with matplotlib function
fig = plt.figure(figsize=[10,6])
ax = data['high'].plot(marker='o', markevery=local_max, use_index=False)
data['low'].plot(marker='o', markevery=local_min, use_index=False)
for line in (line_high, line_low):
    ax.plot(line['x'], line['y'])
ax.set_xlim(0, len(data)-1)

# Set major and minor tick locations: a major tick on the first row and on
# every row whose month differs from the previous row's
tks_maj = [idx for idx, timestamp in enumerate(dates)
           if idx == 0 or timestamp.month != dates[idx-1].month]
tks_min = range(len(data))
ax.set_xticks(tks_maj)
ax.set_xticks(tks_min, minor=True)

# Format major and minor tick labels: the year is shown on the first major
# tick and whenever it differs from the previous major tick's year
major_dates = dates[tks_maj]
labels_maj = []
for idx, ts in enumerate(major_dates):
    show_year = idx == 0 or ts.year != major_dates[idx-1].year
    labels_maj.append(ts.strftime('\n%b\n%Y') if show_year else ts.strftime('\n%b'))
labels_min = [ts.strftime('%d') if (idx+3)%5 == 0 else ''
              for idx, ts in enumerate(dates[tks_min])]
ax.set_xticklabels(labels_maj)
ax.set_xticklabels(labels_min, minor=True)
plt.show()
You can find more examples of tick formatting here and here in Solution 1.
Date string format codes
I recently started to use Python, and I can't understand how to plot a confidence interval for a given datum (or set of data).
I already have a function that computes, given a set of measurements, a higher and lower bound depending on the confidence level that I pass to it, but how can I use those two values to plot a confidence interval?
There are several ways to accomplish what you are asking for:
Using only matplotlib
from matplotlib import pyplot as plt
import numpy as np
# Example data: 20 evenly spaced x values and the straight line y = 3x
x = np.linspace(0.1, 9.9, 20)
y = 3.0 * x

# Half-width of the shaded band
ci = 1.96 * np.std(y)/np.sqrt(len(x))
band_lower = y - ci
band_upper = y + ci

fig, ax = plt.subplots()
ax.plot(x, y)
# Shade the region between the two bounds
ax.fill_between(x, band_lower, band_upper, color='b', alpha=.1)
fill_between does what you are looking for. For more information on how to use this function, see: https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.fill_between.html
Output
Alternatively, go for seaborn, which supports this using lineplot or regplot,
see: https://seaborn.pydata.org/generated/seaborn.lineplot.html
Let's assume that we have three categories and lower and upper bounds of confidence intervals of a certain estimator across these three categories:
# Lower and upper confidence bounds for three categories, one row each
data_dict = {
    'category': ['category 1','category 2','category 3'],
    'lower': [0.1,0.2,0.15],
    'upper': [0.22,0.3,0.21],
}
dataset = pd.DataFrame(data_dict)
You can plot the confidence interval for each of these categories using the following code:
# Draw one horizontal segment per category, from its lower to its upper bound
for y, (lower, upper) in enumerate(zip(dataset['lower'], dataset['upper'])):
    # The format string gives marker and line style only; the colour comes
    # from the keyword, so it is not defined twice.
    plt.plot((lower, upper), (y, y), 'o-', color='orange')
# Label each row with its category name
plt.yticks(range(len(dataset)), list(dataset['category']))
Resulting with the following graph:
import matplotlib.pyplot as plt
import statistics
from math import sqrt
def plot_confidence_interval(x, values, z=1.96, color='#2187bb', horizontal_line_width=0.25):
    """Draw the mean of `values` at `x` with a capped confidence-interval bar.

    x: the x position at which the bar is drawn
    values: the repeated measurements whose mean and spread are shown
    z: multiplier applied to the standard error (default 1.96)
    color: colour of the bar and its caps
    horizontal_line_width: total width of each horizontal cap

    Returns a tuple (mean, confidence_interval), where confidence_interval is
    the half-width z * stdev / sqrt(len(values)).

    statistics.stdev needs at least two values and raises
    statistics.StatisticsError otherwise.
    """
    mean = statistics.mean(values)
    stdev = statistics.stdev(values)
    confidence_interval = z * stdev / sqrt(len(values))

    # Horizontal extent of the caps, centred on x
    left = x - horizontal_line_width / 2
    right = x + horizontal_line_width / 2
    # Vertical extent of the interval
    lower = mean - confidence_interval
    upper = mean + confidence_interval

    plt.plot([x, x], [lower, upper], color=color)          # vertical bar
    plt.plot([left, right], [lower, lower], color=color)   # lower cap
    plt.plot([left, right], [upper, upper], color=color)   # upper cap
    plt.plot(x, mean, 'o', color='#f44336')                # mean marker

    return mean, confidence_interval
plt.xticks([1, 2, 3, 4], ['FF', 'BF', 'FFD', 'BFD'])
plt.title('Confidence Interval')

# One list of measurements per x position, positions counted from 1
samples = [
    [10, 11, 42, 45, 44],
    [10, 21, 42, 45, 44],
    [20, 2, 4, 45, 44],
    [30, 31, 42, 45, 44],
]
for position, measurements in enumerate(samples, start=1):
    plot_confidence_interval(position, measurements)
plt.show()
x: The x value of the input.
values: An array containing the repeated values (usually measured values) of y corresponding to the value of x.
z: The critical value of the z-distribution. Using 1.96 corresponds to the critical value of 95%.
Result:
For a confidence interval across categories, building on what omer sagi suggested, let's say if we have a Pandas data frame with a column that contains categories (like category 1, category 2, and category 3) and another that has continuous data (like some kind of rating), here's a function using pd.groupby() and scipy.stats to plot difference in means across groups with confidence intervals:
import pandas as pd
import numpy as np
import scipy.stats as st
def plot_diff_in_means(data: pd.DataFrame, col1: str, col2: str):
    """
    Given data, plots difference in means with confidence intervals across groups

    col1: categorical data with groups
    col2: continuous data for the means

    One horizontal line is drawn per group: lower bound, mean and upper bound
    of a 95% t-interval. Rows are placed at y = 0, 1, 2, ... in group order
    and labelled with the group names.
    """
    # Imported locally because this snippet's own imports do not include plt.
    import matplotlib.pyplot as plt

    grouped = data.groupby(col1)[col2]

    n = grouped.count()
    # n contains a pd.Series with sample size for each category
    cat = list(n.index)
    # 'cat' has the names of the categories, like 'category 1', 'category 2'
    mean = grouped.mean()
    # The average value of col2 across the categories
    std = grouped.std()
    se = std / np.sqrt(n)
    # Standard deviation and standard error

    # The confidence level is passed positionally: SciPy renamed this
    # keyword from `alpha` to `confidence`, so the positional form works
    # with both spellings. The interval is computed once and unpacked.
    lower, upper = st.t.interval(0.95, df=n - 1, loc=mean, scale=se)
    # Calculates the upper and lower bounds using SciPy

    # Rows are drawn at numeric positions so they always match the tick
    # positions set below, whatever type the group names have.
    for y, (low, mid, high) in enumerate(zip(lower, mean, upper)):
        plt.plot((low, mid, high), (y, y, y), 'b.-')
        # for 'b.-': 'b' means 'blue', '.' means dot, '-' means solid line
    plt.yticks(range(len(cat)), cat)
Given hypothetical data:
# Three groups ('a', 'b', 'c') of ten evenly spaced ratings each
cat = [label for label in ('a', 'b', 'c') for _ in range(10)]
a = np.linspace(0.1, 5.0, 10)
b = np.linspace(0.5, 7.0, 10)
c = np.linspace(7.5, 20.0, 10)
rating = np.concatenate([a, b, c])

# One column of group labels, one column of ratings
dat_dict = {'cat': cat, 'rating': rating}
test_dat = pd.DataFrame(dat_dict)
which would look like this (but with more rows of course):
cat
rating
a
0.10000
a
0.64444
b
0.50000
b
1.22222
c
7.50000
c
8.88889
We can use the function to plot a difference in means with a confidence interval:
plot_diff_in_means(data = test_dat, col1 = 'cat', col2 = 'rating')
which gives us the following graph:
I'm trying to optimise several functions using the brute-force method of lmfit (based on scipy minimize). The function I'm minimizing can have a variable number of parameters passed into it (each parameter with variable optimisation range)
I've made a simple example to demonstrate.
import numpy as np
from matplotlib import pyplot as plt
import lmfit
def my_fun(param): # function to be optimised
    """Return the negated value of the example objective.

    param: mapping with numeric entries 'a', 'b', 'c', 'd' and 'e'

    The objective is 0.1*a**2 + 2*b - 5*sqrt(c) - d + e; its negation is
    returned.
    """
    objective = (.1 * param['a']**2 + 2. * param['b']
                 - 5. * param['c']**0.5 - param['d'] + param['e'])
    return -1. * objective
def brute_wrapper(optimiser_parameters):
    """ so I can optimise my_fun() across any parameter set

    optimiser_parameters: object whose valuesdict() returns a mapping of
    parameter name to current value.

    Values for 'a'..'e' that the optimiser supplies replace the defaults
    below; names it does not supply keep their default. Names other than
    'a'..'e' are ignored. Returns my_fun() evaluated on the merged set.
    """
    defaults = {'a': 1., 'b': 2., 'c': 3., 'd': 4., 'e': 5.}
    parameters = optimiser_parameters.valuesdict()
    # replace parameters established in optimiser
    merged = {key: (parameters[key] if key in parameters else default)
              for key, default in defaults.items()}
    return my_fun(merged) # fitness indicator
I can plot the results easily if I'm only varying two parameters, like so:
# calculating and plotting for 2
optimisers = lmfit.Parameters()
# (name, lower bound, upper bound) of every parameter to scan, in unit steps
for name, lower, upper in (("b", 1, 5), ("e", 5, 11)):
    optimisers.add(name, min=lower, max=upper, brute_step=1)
brute = lmfit.minimize(brute_wrapper, optimisers, method='brute')

fig, ax = plt.subplots(1)
x, y = brute.brute_grid
# Flip the sign of the values stored by the brute-force run
value = -1 * np.array(brute.brute_Jout)
image = ax.pcolormesh(x, y, value)
fig.colorbar(image)
x_name, y_name = brute.var_names[0], brute.var_names[1]
ax.set_xlabel(x_name)
ax.set_ylabel(y_name)
plt.show()
but for 3 or more parameters I'd like to arrange the heatmaps in a grid (one plot for each pairing (b~c, b~d, b~e, ..., d~e)) without showing any pair twice (see example at the end).
# calculating and plotting for 4
optimisers = lmfit.Parameters()
# (name, lower bound, upper bound) of every parameter to scan, in unit steps
for name, lower, upper in (("b", 1, 5), ("c", 2, 8), ("d", 1, 6), ("e", 5, 11)):
    optimisers.add(name, min=lower, max=upper, brute_step=1)
brute = lmfit.minimize(brute_wrapper, optimisers, method='brute')
# how to structure data for plot?
I tried to use corner.corner and dissected some code from plot_mcmc() in scipy with no luck.
How do I deconstruct the data from brute and make such a plot?
I made a crappy picture to show what I mean (universal colour bar is a fool's hope)
To permute over the pairs of variables you can use the itertools.combinations() on the variable names (or indices if you prefer).
>>>list(itertools.combinations(brute.var_names, 2))
[('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]
This will give you the x, y variable pairs for each plot.
You will also need to specify the matplotlib.pyplot.subplot position index for each plot
i.e. for 4 parameters the lower diagonal of the 3x3 grid with positions.
(1) (2) (3)
(4) (5) (6)
(7) (8) (9)
To match the order of the combinations() output above you need the numbers in the order [1, 4, 7, 5, 8, 9]
You can get this with something like this...
def get_tril_positions(n):
    """Lower square triangular position

    n: side length of the square subplot grid

    Returns a 1-D array of the 1-based subplot numbers on and below the
    diagonal of an n x n grid, listed column by column (top to bottom inside
    a column). For n = 3 that is [1, 4, 7, 5, 8, 9].
    """
    # Number the cells 1..n*n row by row, as subplot indices are numbered
    grid = (np.arange(n**2) + 1).reshape(n, n)
    # Zero the cells above the diagonal, then read column by column
    tril_pos = np.tril(grid).T.ravel()
    # Valid positions start at 1, so the zeros are exactly the masked cells
    return tril_pos[tril_pos != 0]
Where n is the number of parameters minus 1.
Assuming that for the x-y parameters you would like the minimum residual values (letting all other parameters vary) then you can collapse the brute_grid and the brute_Jout along the other axis using np.amin().
Now with the N-d array collapsed into a 2-d array you can plot it "normally" like a 2-d array.
Combining the above, I get something like this:
from itertools import combinations
# The grid has one row/column fewer than there are parameters
n = len(brute.var_names) - 1
combos = list(combinations(brute.var_names, 2))
positions = get_tril_positions(n)

for (xname, yname), pos in zip(combos, positions):
    # Specify subplot
    ax = plt.subplot(n, n, pos)

    # Find index for these variables
    xi = brute.var_names.index(xname)
    yi = brute.var_names.index(yname)

    # get the meshgrids for x and y
    X = brute.brute_grid[xi]
    Y = brute.brute_grid[yi]

    # Find other axis to collapse.
    axes = tuple(ii for ii in range(brute.brute_Jout.ndim) if ii not in (xi, yi))

    # Collapse to minimum Jout
    min_jout = np.amin(brute.brute_Jout, axis=axes)
    min_xgrid = np.amin(X, axis=axes)
    min_ygrid = np.amin(Y, axis=axes)

    mesh = ax.pcolormesh(min_xgrid, min_ygrid, min_jout)

    # Add colorbar to each plot. The mesh and its axes are passed
    # explicitly so the colorbar does not depend on pyplot's notion of the
    # "current image".
    plt.colorbar(mesh, ax=ax)

    # Add labels to edge only: bottom row gets x labels, left column y labels
    if pos > n * (n - 1):
        ax.set_xlabel(xname)
    if (pos - 1) % n == 0:
        ax.set_ylabel(yname)

plt.tight_layout()
plt.show()
Which produces what you want.
Corner plot
Note: I did not multiply brute_Jout by -1, so you may need to use np.amax instead if you are working with your negated value.
I am trying to plot a graph something similar to this:
For that, I have written the following function in python
def plot_graph_perf(dataset):
    """Plot the hard-coded `plotter` entries and save the figure as watt.png.

    dataset: two-element sequence; dataset[1] supplies the x tick labels and
    dataset[0] the y tick labels. Ticks are placed at 0, 1, 2, ... for each
    axis.

    The figure is saved to 'watt.png' and the current figure is cleared
    afterwards.
    """
    #TODO: Give labels as power ranges in spaces of 1000
    plotter = ['0',
               '1200000-10', '1200000-14', '1200000-18',
               '1200000-2', '1200000-22', '1200000-26', '1200000-30',
               '1200000-34', '1200000-38', '1200000-42', '1200000-46',
               '1200000-6',
               '1600000-10', '1600000-14',
               '1600000-18', '1600000-2', '1600000-22',
               '1600000-26', '1600000-30', '1600000-34',
               '1600000-38', '1600000-42', '1600000-46',
               '1600000-6',
               '2000000-10', '2000000-14',
               '2000000-18', '2000000-2', '2000000-22',
               '2000000-26', '2000000-30', '2000000-34',
               '2000000-38', '2000000-42', '2000000-46',
               '2000000-6',
               '2400000-10', '2400000-14',
               '2400000-18', '2400000-2', '2400000-22',
               '2400000-26', '2400000-30', '2400000-34',
               '2400000-38', '2400000-42', '2400000-46',
               '2400000-6' ,
               '800000-10', '800000-14',
               '800000-18', '800000-2', '800000-22',
               '800000-26', '800000-30', '800000-34',
               '800000-38', '800000-42', '800000-46',
               '800000-6' ]

    # Tick positions are simply 0..len-1 for each axis
    x_axis_labels = dataset[1]
    x = list(range(len(x_axis_labels)))
    y_axis_labels = dataset[0]
    y = list(range(len(y_axis_labels)))

    # NOTE(review): this passes the strings themselves to plot() as the data
    # series; it does not place each string at an (x, y) position.
    plt.plot(plotter, color = 'g')
    plt.tight_layout(pad=1, h_pad=4, w_pad=None)
    plt.xticks(x,x_axis_labels, rotation='vertical')
    plt.yticks(y,y_axis_labels, rotation='horizontal')
    plt.xlabel('Power')
    plt.ylabel('perf')
    # NOTE(review): `file` is not defined in this function; presumably a
    # module-level string holding the input file name -- confirm.
    plt.title(file + ' | (Power)')

    fig = plt.gcf()
    fig.set_size_inches(28.5,10.5)
    plt.savefig('watt' + '.png',bbox_inches='tight', pad_inches=0.5,dpi=100)
    plt.clf()
Where dataset is two dimensional list something like this
dataset = [[],[]]
each sublist containing same number of elements as plotter.
I plotted dataset[0] and dataset[1] as y and x respectively, but was unable to plot the string values in plotter.
Can you please shed some light and help me plot the plotter values on the graph.
Thanks.
You have to call the text function for each word separately:
# One single-letter label per point, at random integer coordinates in 0..9
words = list("abcdefg")
xs = np.random.randint(0,10,len(words))
ys = np.random.randint(0,10,len(words))

# text() places one string at a time, so call it once per word
for x, y, s in zip(xs,ys,words):
    plt.text(x,y,s)

# text() does not rescale the axes, so widen the view to cover every
# possible coordinate; otherwise words outside the default range are hidden.
plt.axis([-1, 10, -1, 10])