stacked bar plot using matplotlib

stacked bar plot using matplotlib - python

I am generating bar plots using matplotlib and it looks like there is a bug with the stacked bar plot. The sum for each vertical stack should be 100. However, for X-AXIS ticks 65, 70, 75 and 80 we get completely arbitrary results which do not make any sense. I do not understand what the problem is. Please find the MWE below.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
header = ['a','b','c','d']
dataset= [('60.0', '65.0', '70.0', '75.0', '80.0', '85.0', '90.0', '95.0', '100.0', '105.0', '110.0', '115.0', '120.0', '125.0', '130.0', '135.0', '140.0', '145.0', '150.0', '155.0', '160.0', '165.0', '170.0', '175.0', '180.0', '185.0', '190.0', '195.0', '200.0'), (0.0, 25.0, 48.93617021276596, 83.01886792452831, 66.66666666666666, 66.66666666666666, 70.96774193548387, 84.61538461538461, 93.33333333333333, 85.0, 92.85714285714286, 93.75, 95.0, 100.0, 100.0, 100.0, 100.0, 80.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0), (0.0, 50.0, 36.17021276595745, 11.320754716981133, 26.666666666666668, 33.33333333333333, 29.03225806451613, 15.384615384615385, 6.666666666666667, 15.0, 7.142857142857142, 6.25, 5.0, 0.0, 0.0, 0.0, 0.0, 20.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (0.0, 12.5, 10.638297872340425, 3.7735849056603774, 4.444444444444445, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (100.0, 12.5, 4.25531914893617, 1.8867924528301887, 2.2222222222222223, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)]
X_AXIS = dataset[0]
matplotlib.rc('font', serif='Helvetica Neue')
matplotlib.rc('text', usetex='false')
matplotlib.rcParams.update({'font.size': 40})
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(18.5, 10.5)
configs = dataset[0]
N = len(configs)
ind = np.arange(N)
width = 0.4
p1 = plt.bar(ind, dataset[1], width, color='r')
p2 = plt.bar(ind, dataset[2], width, bottom=dataset[1], color='b')
p3 = plt.bar(ind, dataset[3], width, bottom=dataset[2], color='g')
p4 = plt.bar(ind, dataset[4], width, bottom=dataset[3], color='c')
plt.ylim([0,120])
plt.yticks(fontsize=12)
plt.ylabel(output, fontsize=12)
plt.xticks(ind, X_AXIS, fontsize=12, rotation=90)
plt.xlabel('test', fontsize=12)
plt.legend((p1[0], p2[0], p3[0], p4[0]), (header[0], header[1], header[2], header[3]), fontsize=12, ncol=4, framealpha=0, fancybox=True)
plt.show()

You need the bottom of each dataset to be the sum of all the datasets that came before. you may also need to convert the datasets to numpy arrays to add them together.
p1 = plt.bar(ind, dataset[1], width, color='r')
p2 = plt.bar(ind, dataset[2], width, bottom=dataset[1], color='b')
p3 = plt.bar(ind, dataset[3], width,
bottom=np.array(dataset[1])+np.array(dataset[2]), color='g')
p4 = plt.bar(ind, dataset[4], width,
bottom=np.array(dataset[1])+np.array(dataset[2])+np.array(dataset[3]),
color='c')
Alternatively, you could convert them to numpy arrays before you start plotting.
dataset1 = np.array(dataset[1])
dataset2 = np.array(dataset[2])
dataset3 = np.array(dataset[3])
dataset4 = np.array(dataset[4])
p1 = plt.bar(ind, dataset1, width, color='r')
p2 = plt.bar(ind, dataset2, width, bottom=dataset1, color='b')
p3 = plt.bar(ind, dataset3, width, bottom=dataset1+dataset2, color='g')
p4 = plt.bar(ind, dataset4, width, bottom=dataset1+dataset2+dataset3,
color='c')
Or finally if you want to avoid converting to numpy arrays, you could use a list comprehension:
p1 = plt.bar(ind, dataset[1], width, color='r')
p2 = plt.bar(ind, dataset[2], width, bottom=dataset[1], color='b')
p3 = plt.bar(ind, dataset[3], width,
bottom=[sum(x) for x in zip(dataset[1],dataset[2])], color='g')
p4 = plt.bar(ind, dataset[4], width,
bottom=[sum(x) for x in zip(dataset[1],dataset[2],dataset[3])],
color='c')

I found this such a pain that I wrote a function to do it. I'm sharing it in the hope that others find it useful:
import numpy as np
import matplotlib.pyplot as plt
def plot_stacked_bar(data, series_labels, category_labels=None,
show_values=False, value_format="{}", y_label=None,
colors=None, grid=True, reverse=False):
"""Plots a stacked bar chart with the data and labels provided.
Keyword arguments:
data -- 2-dimensional numpy array or nested list
containing data for each series in rows
series_labels -- list of series labels (these appear in
the legend)
category_labels -- list of category labels (these appear
on the x-axis)
show_values -- If True then numeric value labels will
be shown on each bar
value_format -- Format string for numeric value labels
(default is "{}")
y_label -- Label for y-axis (str)
colors -- List of color labels
grid -- If True display grid
reverse -- If True reverse the order that the
series are displayed (left-to-right
or right-to-left)
"""
ny = len(data[0])
ind = list(range(ny))
axes = []
cum_size = np.zeros(ny)
data = np.array(data)
if reverse:
data = np.flip(data, axis=1)
category_labels = reversed(category_labels)
for i, row_data in enumerate(data):
color = colors[i] if colors is not None else None
axes.append(plt.bar(ind, row_data, bottom=cum_size,
label=series_labels[i], color=color))
cum_size += row_data
if category_labels:
plt.xticks(ind, category_labels)
if y_label:
plt.ylabel(y_label)
plt.legend()
if grid:
plt.grid()
if show_values:
for axis in axes:
for bar in axis:
w, h = bar.get_width(), bar.get_height()
plt.text(bar.get_x() + w/2, bar.get_y() + h/2,
value_format.format(h), ha="center",
va="center")
Example:
plt.figure(figsize=(6, 4))
series_labels = ['Series 1', 'Series 2']
data = [
[0.2, 0.3, 0.35, 0.3],
[0.8, 0.7, 0.6, 0.5]
]
category_labels = ['Cat A', 'Cat B', 'Cat C', 'Cat D']
plot_stacked_bar(
data,
series_labels,
category_labels=category_labels,
show_values=True,
value_format="{:.1f}",
colors=['tab:orange', 'tab:green'],
y_label="Quantity (units)"
)
plt.savefig('bar.png')
plt.show()

This is probably your most convenient solution if you are willing to use Pandas:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
X_AXIS = ('60.0', '65.0', '70.0', '75.0', '80.0', '85.0', '90.0', '95.0', '100.0', '105.0', '110.0', '115.0', '120.0', '125.0', '130.0', '135.0', '140.0', '145.0', '150.0', '155.0', '160.0', '165.0', '170.0', '175.0', '180.0', '185.0', '190.0', '195.0', '200.0')
index = pd.Index(X_AXIS, name='test')
data = {'a': (0.0, 25.0, 48.94, 83.02, 66.67, 66.67, 70.97, 84.62, 93.33, 85.0, 92.86, 93.75, 95.0, 100.0, 100.0, 100.0, 100.0, 80.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0),
'b': (0.0, 50.0, 36.17, 11.32, 26.67, 33.33, 29.03, 15.38, 6.67, 15.0, 7.14, 6.25, 5.0, 0.0, 0.0, 0.0, 0.0, 20.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
'c': (0.0, 12.5, 10.64, 3.77, 4.45, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
'd': (100.0, 12.5, 4.26, 1.89, 2.22, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)}
df = pd.DataFrame(data, index=index)
ax = df.plot(kind='bar', stacked=True, figsize=(10, 6))
ax.set_ylabel('foo')
plt.legend(title='labels', bbox_to_anchor=(1.0, 1), loc='upper left')
# plt.savefig('stacked.png') # if needed
plt.show()

If you're interested in ordered stacking (longest bars at bottom), here is how you can do it:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
a = pd.DataFrame({'a':[0.25, 0.5, 0.15, 0], 'b':[0.15, 0.25, 0.35, 0.15],
'c':[0.50, 0.15, 0.5, 0.35], 'd':[0.35, 0.35, 0.25, 0.5],})
# a b c d
# 0 0.25 0.15 0.50 0.35
# 1 0.50 0.25 0.15 0.35
# 2 0.15 0.35 0.50 0.25
# 3 0.00 0.15 0.35 0.50
fig, ax = plt.subplots()
x = a.index
indexes = np.argsort(a.values).T
heights = np.sort(a.values).T
order = -1
bottoms = heights[::order].cumsum(axis=0)
bottoms = np.insert(bottoms, 0, np.zeros(len(bottoms[0])), axis=0)
mpp_colors = dict(zip(a.columns, plt.rcParams['axes.prop_cycle'].by_key()['color']))
for btms, (idxs, vals) in enumerate(list(zip(indexes, heights))[::order]):
mps = np.take(np.array(a.columns), idxs)
ax.bar(x, height=vals, bottom=bottoms[btms], color=[mpp_colors[m] for m in mps])
ax.set_ylim(bottom=0, top=2)
plt.legend((np.take(np.array(a.columns), np.argsort(a.values)[0]))[::order], loc='upper right')

Here's a solution with a seaborn-like API. You can find an example usage here.
def stackedbarplot(data, stack_order=None, palette=None, **barplot_kws):
"""
Create a stacked barplot
Inputs:
| data <pd.DataFrame>: A wideform dataframe where the index is the variable to stack, the columns are different samples (x-axis), and the cells the counts (y-axis)
| stack_order <array-like>: The order for bars to be stacked (Default: given order)
| palette <array-like>: The colors to use for each value of `stack_order` (Default: husl)
| barplot_kws: Arguments to pass to sns.barplot()
Author: Michael Silverstein
Usage: https://github.com/michaelsilverstein/Pandas-and-Plotting/blob/master/lessons/stacked_bar_chart.ipynb
"""
# Order df
if stack_order is None:
stack_order = data.index
# Create palette if none
if palette is None:
palette = dict(zip(stack_order, sns.husl_palette(len(stack_order))))
# Compute cumsum
cumsum = data.loc[stack_order].cumsum()
# Melt for passing to seaborn
cumsum_stacked = cumsum.stack().reset_index(name='count')
# Get name of variable to stack and sample
stack_name, sample_name = cumsum_stacked.columns[:2]
# Plot bar plot
for s in stack_order[::-1]:
# Subset to this stack level
d = cumsum_stacked[cumsum_stacked[stack_name].eq(s)]
sns.barplot(x=sample_name, y='count', hue=stack_name, palette=palette, data=d, **barplot_kws)
return plt.gca()

Related

python list comprehension and looping question

I have variables as follows:
J = range (1,16)
T = range (1,9)
x= {} # 0,1 decision variable to be determined
These variables turn into combinations of x[j,t].
I am trying to implement a constraint for unacceptable t types in T for x[j,t] combinations that make the x var = 0.
I have a dictionary 'U' with j's as the key and t types and values stored in a list. Zero value means t is unacceptable, 1 is acceptable. The index is range 1-9, not 0-8. So in the example below, j 2, type 3 (bc its at index 3 on range(1,9)) is the only acceptable value.
{1: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
2: [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
3: [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
4: [1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
5: [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
6: [1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
7: [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
8: [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
9: [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
10: [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
11: [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
12: [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
13: [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
14: [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
15: [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
}
I am struggling in trying to get the x[j,t] combinations bc of the misaligned index. I set it up like so:
for j,t in x:
if t in U[j]==0:
# do the thing... #addConstr(x[j,t], GRB.EQUAL,0)
So for j 2 the results I need are {(2,1):0, (2,2):0, (2,4):0, (2,5):0, (2,6):0, (2,7):0, (2,8):0} where the index value on range (1,9) becomes the t value in the tupledict.
Any pointers? Thank you!

Assuming your example data is stored in U, what you want to do is:
j_results = []
for j,types in U.items():
results = {}
for t in types:
if t == 0.0:
result[(j,int(t))] = 0
j_results.append(result)
j_results list will contain all results like you described:
for j 2 the results I need are {(2,1):0, (2,2):0, (2,4):0, (2,5):0, (2,6):0, (2,7):0, (2,8):0}
will be in j_result[1] (counter intuitive because your U data start from 1)
Note the int cast, because data you provided has floats, but results you provided are a tuple of ints.

pyspark explode one-hot encoded vector to each column with proper name

Applying one-hot encoding to multiple categorical column
X_cat = X.select(cat_cols)
str_indexer = [StringIndexer(inputCol=col, outputCol=col+"_si", handleInvalid="skip") for col in cat_cols]
ohe = [OneHotEncoder(inputCol=f"{col}_si", outputCol=f"{col}_ohe", dropLast=True) for col in cat_cols]
# ohe.setDropLast(False) # older version
pl = Pipeline(stages=str_indexer + ohe).fit(X_cat)
X_cat = pl.transform(X_cat)
si_cols = [col_nm for col_nm in X_cat.columns if col_nm.endswith("_si")]
ohe_cols = [col_nm for col_nm in X_cat.columns if col_nm.endswith("ohe")]
X_cat_ohe = X_cat.select(ohe_cols)
gives me
root
|-- workclass_ohe: vector (nullable = true)
|-- education_ohe: vector (nullable = true)
|-- marital-status_ohe: vector (nullable = true)
|-- occupation_ohe: vector (nullable = true)
+-------------+---------------+
|workclass_ohe| education_ohe|
+-------------+---------------+
|(8,[4],[1.0])| (15,[2],[1.0])|
|(8,[1],[1.0])| (15,[2],[1.0])|
|(8,[0],[1.0])| (15,[0],[1.0])|
|(8,[0],[1.0])| (15,[5],[1.0])|
which is basically
workclass_ohe education_ohe
0 (0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0) (0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1 (0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2 (1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3 (1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...
4 (1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) (0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
I want to explode value in vector to new column with proper name.
desired output
workclass_state-gov workclass_selfemp workclass_private workclass_middle education_1 education_2 education_3
0 0 0 1 0 0 1
0 1 0 1 0 0 1
1 0 0 0 1 0 0
...
From pyspark - Convert sparse vector obtained after one hot encoding into columns
I could add new columns however from X_cat_ohe I cannot figure out which value(ex: state-gov) corresponds to 0th vector, 1st vector and so on...

Thanks to Dummy Encoding using Pyspark I can extend it to multiple columns.
for col_nm in cat_cols:
category = X.select(col_nm).distinct().rdd.flatMap(lambda x:x).collect()
category = [col_nm + "_" + ct for ct in category]
exprs = [f.when(f.col(col_nm) == ct, 1).otherwise(0)\
.alias(str(ct)) for ct in category]
X = X.select(exprs+X.columns)

how do I pass multiple variables through multiple csv at once?

apologies I know this probably a simple question but I'm new to this! I made all these calculations from a csv file and now I'm trying to make the same calculation through every file for the whole month. In the first class,def get_holdings_info(d):, I'm working through my calculations. In the second class, def get_holdings_info_array(): I'm trying to pass the first class through every day of the month. I think my code is just about right, but for some reason I'm getting a return of 0's. Thanks in advance if you can help!
holdings_darray = ['01-03-2020','01-06-2020','01-07-2020','01-08-2020','01-09-2020','01-11-2020','01-14-2020','01-15-2020','01-17-2020','01-21-2020','01-22-2020','01-23-2020',
'01-24-2020','01-27-2020','01-28-2020','01-29-2020','01-30-2020','01-31-2020','02-04-2020']
account_names = ["CATALYST EXCEED DEFINED SHIELD FUND", "SCA/IB FBO CAT EXCEED DEF SHIELD FD"]
bond_name = ["Bond Paying Periodic Income"]
money_market_name = ["Money Market Fund"]
mutual_fund_name = ["Mutual Fund"]
def get_holdings_info(d):
sbhmv = 0
sbhbv = 0
sbhs = 0
setfhmv = 0
setthbv = 0
setfhs = 0
smmhmv = 0
smmhbv = 0
smmhs = 0
holdings_file = 'holdings/Holdings As Of ' + d + '.csv'
df = pd.read_csv(holdings_file, header=1)
account_names = ["Fund_1", "Fund_1"]
bond_name = ["Bond Paying Periodic Income"]
money_market_name = ["Money Market Fund"]
mutual_fund_name = ["Mutual Fund"]
sbh = df[df["Account Name"].isin(account_names) & df["Security Type Name"].isin(bond_name)]
sbhmv = sbh['Market Value'].sum()
sbhbv = sbh['Book Value'].sum()
sbhs = sbh['Shares'].sum()
setfh = df[df["Account Name"].isin(account_names) & df["Security Type Name"].isin(mutual_fund_name)]
setfhmv = setfh['Market Value'].sum()
setthbv = setfh['Book Value'].sum()
setfhs = setfh['Shares'].sum()
smmh = df[df["Account Name"].isin(account_names) & df["Security Type Name"].isin(money_market_name)]
smmhmv = smmh['Market Value'].sum()
smmhbv = smmh['Book Value'].sum()
smmhs = smmh['Shares'].sum()
return sbhmv, sbhbv, sbhs, setfhmv, setthbv, setfhs, smmhmv, smmhbv, smmhs
def get_holdings_info_array():
c = []
for f in holdings_darray:
c.append(get_holdings_info(f))
return(c)
print(get_holdings_info_array())
[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)]

What you are in need here, is of tuples. This SO question and it's answers pretty much cover every variable.
Your code, while syntactically correct, semantically is wrong. A python function finishes it's execution after encountering a return statement.
Your function get_holdings_info finishes after reaching this line
return sbhmv
and never executes the rest.
If you want to return all those values you can refer to the question I mention for all possibilities, but the simpler option would be to do:
return sbhmv, sbhbv, sbhs, setfh, setfhmv, setthbv, setfhs, smmh, smmhmv, smmhbv, smmhs
In which you return a tuple with all the values you need.

CVXOPT seemingly provides non-optimal result for this simple quadratic program

I am trying to solve a simple quadratic program using CVXOPT and am troubled by the fact that I can guess a feasible solution better than the optimum provided by the solver. The optimisation is of the form:
I will provide the definitions of P,q,G,h,A, and b at the end. When I import and run:
from cvxopt import matrix, spmatrix, solvers
# Code that creates matrices goes here
sol = solvers.qp(P, q, G, h, A, b)
the result is:
pcost dcost gap pres dres
0: 0.0000e+00 -5.5000e+00 6e+00 6e-17 4e+00
1: 0.0000e+00 -5.5000e-02 6e-02 1e-16 4e-02
2: 0.0000e+00 -5.5000e-04 6e-04 3e-16 4e-04
3: 0.0000e+00 -5.5000e-06 6e-06 1e-16 4e-06
4: 0.0000e+00 -5.5000e-08 6e-08 1e-16 4e-08
Optimal solution found.
Objective = 0.0
However I can define a different solution guessed_solution that is feasible and further minimises the objective:
guessed_solution = matrix([0.5,0.5,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,1.0])
# Check Ax = b; want to see zeroes
print(A * guessed_solution - b)
>>>
[ 0.00e+00]
[ 0.00e+00]
[ 2.78e-17]
# Check Gx <= h; want to see non-positive entries
print(G * guessed_solution - h)
>>>
[-5.00e-01]
[-5.00e-01]
[ 0.00e+00]
[ 0.00e+00]
[ 0.00e+00]
[ 0.00e+00]
[-5.00e-01]
[-5.00e-01]
[-1.00e+00]
[-1.00e+00]
[ 0.00e+00]
[ 0.00e+00]
[ 0.00e+00]
[-1.00e+00]
# Check objective
print(guessed_solution.T * P * guessed_solution + q.T * guessed_solution)
>>>[-6.67e-01]
This results in an objective that is -2/3, clearly less than 0. I presume that the 2.78e-17 error in the Ax=b test is not relevant.
Any help on resolving this would be appreciated! And below is the definition of relevant matrices in code (biggest matrix is 11 by 11).
P = matrix([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0],[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0/3.0, 0.0, 2.0/3.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0/3.0, 2.0/3.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0],[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0/3.0, 0.0, -2.0/3.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0/3.0, -2.0/3.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]).T
q = matrix([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
A = matrix([[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0],[1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],[0.0, 1.0, 1.0/3.0, 2.0/3.0, 0.0, -1.0, -1.0/3.0, -2.0/3.0, 0.0, 0.0, 0.0]]).T
b = matrix([1.0, 1.0, 0.0])
G = spmatrix([-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, -1.0, -1.0, -1.0], [0,1,2,3,4,5,6,7,8,9,10,11,12,13], [0,1,2,3,4,5,6,7,8,9,10,8,9,10])
h = matrix([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0])

Your quadratic form is not valid in regards to the assumptions.
It needs to be PSD (and symmetric).
Making it symmetric:
P = (P + P.T) / 2
will lead to cvxopt showing an error, which is due to P being indefinite:
import numpy as np
np_matrix = np.array(P)
print(np.linalg.eigvalsh(np_matrix))
#[-8.16496581e-01 -7.45355992e-01 -5.77350269e-01 -2.40008780e-16 -6.33511351e-17 -4.59089160e-17 -3.94415555e-22 5.54077304e-17 5.77350269e-01 7.45355992e-01 8.16496581e-01]
You got a solver designed for convex-optimization problems (if and only if P is PSD) feeded by some non-convex optimization problem. This won't work (in general).

For-Loop Execution in Python - Does the Executable Code Reset?

Trying to plot multiple lines on one graph using matplotlib and for loops, but the code doesn't work after the first iteration. Here's the code:
import csv
import matplotlib.pyplot as plt
r = csv.reader(open('CrimeStatebyState.csv', 'rb'))
line1 = r.next()
def crime_rate(*state):
for s in state:
orig_dict = {}
for n in range (1960,2006):
orig_dict[n] = []
for line in r:
if line[0] == s:
orig_dict[int(line[3])].append(int(line[4]))
for y in orig_dict:
orig_dict[y] = sum(orig_dict[y])
plt.plot(orig_dict.keys(), orig_dict.values(),'r')
print orig_dict.values()
print s
crime_rate("Alabama", "California", "New York")
Here's what it returns:
[39920, 38105, 41112, 44636, 53550, 55131, 61838, 65527, 71285, 75090, 85399, 86919, 84047, 91389, 107314, 125497, 139573, 136995, 147389, 159950, 190511, 191834, 182701, 162361, 155691, 158513, 173807, 181751, 188261, 190573, 198604, 219400, 217889, 204274, 206859, 206188, 205962, 211188, 200065, 192819, 202159, 192835, 200331, 201572, 201664, 197071]
Alabama
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
California
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
New York
**[[[Graph of Alabama's values]]]**
Why am I getting zeroes after the loop runs once? Is this why the other two graphs aren't showing up? Is there an issue with the sum function, the "for line in r" loop, or using *state?
Sorry if that's not enough information! Thanks to those kind/knowledgeable enough for helping.

It would appear that your csv reader is exhausted after you have processed the first state and therefore when you next call "for line in r:" on the next state there are no more lines to look at. You can confirm this by putting a print statement straight after it to see what it has to process e.g.
for line in r:
print "test" # Test print
if line[0] == s:
orig_dict[int(line[3])].append(int(line[4]))
If you re-define your csv reader within each state loop you should get your data correctly processed:
import csv
import matplotlib.pyplot as plt
def crime_rate(*state):
for s in state:
r = csv.reader(open('CrimeStatebyState.csv', 'rb'))
line1 = r.next()
orig_dict = {}
for n in range (1960,2006):
orig_dict[n] = []
for line in r:
if line[0] == s:
orig_dict[int(line[3])].append(int(line[4]))
for y in orig_dict:
orig_dict[y] = sum(orig_dict[y])
plt.plot(orig_dict.keys(), orig_dict.values(),'r')
print orig_dict.values()
print s
crime_rate("Alabama", "California", "New York")

Others have already explained the source of your error. May I suggest you use pandas for this task:
import pandas as pd
states = ["Alabama", "California", "New York"]
data = pd.read_csv('CrimeStatebyState.csv') # import data
df = data[(1996 <= data.Year) & (data.Year <= 2005)] # filter by year
pd.pivot_table(df, rows='Year', cols='State', values='Count')[states].plot()

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

stacked bar plot using matplotlib - python

Related

python list comprehension and looping question

pyspark explode one-hot encoded vector to each column with proper name

how do I pass multiple variables through multiple csv at once?

CVXOPT seemingly provides non-optimal result for this simple quadratic program

For-Loop Execution in Python - Does the Executable Code Reset?

Categories

Resources