How do you plot by Groupby? - python

I want to know how to plot a bar graph among groups 'new_build', X Axis shows the towns and Y Axis shows the percentage values from the calculation performed in the code below
df_House.groupby(['new_build', 'town'])['price_paid'].count()/df_House.groupby(['new_build', 'town'])['price_paid'].count().sum()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Toy data: one row per sale with its build category, town and price paid.
data = [
    ['a', 'x', 100],
    ['b', 'y', 200],
    ['c', 'z', 300],
    ['a', 'y', 400],
    ['b', 'z', 600],
    ['c', 'x', 100],
]
df = pd.DataFrame(data, columns=['new_build', 'town', 'price_paid'])

# Numerator: total price paid for every (new_build, town) pair.
df_town = df.groupby(['new_build', 'town']).agg({'price_paid': 'sum'})
# Denominator: total price paid per new_build group. The snippet previously
# divided by an undefined name (`state`), which raised NameError.
build_totals = df.groupby('new_build').agg({'price_paid': 'sum'})

# Dividing on the shared 'new_build' index level gives each town's share of
# its own group, expressed as a percentage (shares sum to 100 per group).
new_df = df_town.div(build_totals, level='new_build') * 100
new_df.reset_index(inplace=True)
sns.set()
new_df.set_index(['new_build', 'town']).price_paid.plot(kind='bar', stacked=True)
plt.ylabel('% of price_paid')
Not sure if you need a stacked graph to better represent the data but to keep things less complicated, I have created a bar graph performing the task you requested.

Related

ValueError due to a missing element in color map

I need to build a network where nodes (from df1) have specific colors based on labels from a different dataset (df2). Not all the nodes in df1 have a label assigned in df2 (for example, because they have not been labelled yet, so they currently have a NaN value).
The below code should provide a good example on what I mean:
import networkx as nx
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt, colors as mcolor
# Reproduction script from the question: build a graph from df1's edge list and
# colour each node according to the 'Attribute' value that df2 assigns to it.
# Sample DataFrames
df1 = pd.DataFrame({
'Node': ['A', 'A', 'B', 'B', 'B', 'Z'],
'Edge': ['B', 'D', 'N', 'A', 'X', 'C']
})
df2 = pd.DataFrame({
'Nodes': ['A', 'B', 'C', 'D', 'N', 'S', 'X'],
'Attribute': [-1, 0, -1.5, 1, 1, 9, 0]
})
# Simplified construction of `colour_map`:
# one colour per distinct attribute value, sampled from the jet colormap.
uni_val = df2['Attribute'].unique()
colors = plt.cm.jet(np.linspace(0, 1, len(uni_val)))
# Convert each colour to a hex string and pair it with its attribute value
mapper = dict(zip(uni_val, map(mcolor.to_hex, colors)))
# NOTE: this Series is indexed by df2's 'Nodes' only. fillna('black') replaces
# missing mapped values for rows that exist in df2; a graph node that is absent
# from df2 (such as 'Z') gets no entry at all.
color_map =df2.set_index('Nodes')['Attribute'].map(mapper).fillna('black')
G = nx.from_pandas_edgelist(df1, source='Node', target='Edge')
# Add Attribute to each node
nx.set_node_attributes(G, color_map, name="colour")
# Then draw with colours based on attribute values.
# The question reports that this call raises
# "ValueError: 'c' argument has 7 elements, which is inconsistent with 'x' and 'y' with size 8."
nx.draw(G,
node_color=nx.get_node_attributes(G, 'colour').values(),
with_labels=True)
plt.show()
Z is not in df2 because df2 was created considering only non-NA values.
I would like to assign the color black to unlabelled nodes, i.e., for those nodes that are not in df2.
Trying to run the code above, I am getting this error:
ValueError: 'c' argument has 7 elements, which is inconsistent with 'x' and 'y' with size 8.
It is clear that this error arises because the color black is never added for the missing node, which is not included in color_map.
What is not clear to me is how to fix the issue. I would appreciate some help figuring it out.
Since Z is not in df2 but is one of the nodes, instead of creating properties exclusively from df2, we should reindex the color_map using the graph's nodes with a fill_value:
# Create graph before color map:
G = nx.from_pandas_edgelist(df1, source='Node', target='Edge')
# Create Colour map. Ensure all nodes have a value via reindex using nodes
color_map = (
df2.set_index('Nodes')['Attribute'].map(mapper)
.reindex(G.nodes(), fill_value='black')
)
color_map without reindex
df2.set_index('Nodes')['Attribute'].map(mapper)
Nodes
A #000080
B #0080ff
C #7dff7a
D #ff9400
N #ff9400
S #800000
X #0080ff
Name: Attribute, dtype: object
nodes (using nodes here since this will be all nodes in the Graph, rather than just those in df2)
G.nodes()
['A', 'B', 'D', 'N', 'X', 'Z', 'C']
reindex to ensure all nodes are present in mapping:
df2.set_index('Nodes')['Attribute'].map(mapper).reindex(G.nodes(), fill_value='black')
Nodes
A #000080
B #0080ff
D #ff9400
N #ff9400
X #0080ff
Z black # <- Missing Nodes are added with specified value
C #7dff7a
Name: Attribute, dtype: object
Complete Code:
import networkx as nx
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt, colors as mcolor
# Example inputs: df1 is the edge list, df2 assigns an attribute to some nodes.
df1 = pd.DataFrame({
    'Node': ['A', 'A', 'B', 'B', 'B', 'Z'],
    'Edge': ['B', 'D', 'N', 'A', 'X', 'C'],
})
df2 = pd.DataFrame({
    'Nodes': ['A', 'B', 'C', 'D', 'N', 'S', 'X'],
    'Attribute': [-1, 0, -1.5, 1, 1, 9, 0],
})

# One hex colour per distinct attribute value, sampled from the jet colormap.
uni_val = df2['Attribute'].unique()
colors = plt.cm.jet(np.linspace(0, 1, len(uni_val)))
mapper = {value: mcolor.to_hex(rgba) for value, rgba in zip(uni_val, colors)}

# Build the graph first so that its node list can drive the colour lookup.
G = nx.from_pandas_edgelist(df1, source='Node', target='Edge')

# Colours for the nodes listed in df2, realigned to the graph's own nodes.
# Nodes the graph has but df2 lacks are filled with 'black'.
labelled_colours = df2.set_index('Nodes')['Attribute'].map(mapper)
color_map = labelled_colours.reindex(G.nodes(), fill_value='black')

# Store each node's colour on the graph under the "colour" attribute.
nx.set_node_attributes(G, color_map, name="colour")

# Draw the graph, reading the colours back from the node attributes.
nx.draw(
    G,
    node_color=nx.get_node_attributes(G, 'colour').values(),
    with_labels=True,
)
plt.show()

Hide non observed categories in a seaborn boxplot

I am currently working on a data analysis, and want to show some data distributions through seaborn boxplots.
I have a categorical data, 'seg1' which can in my dataset take 3 values ('Z1', 'Z3', 'Z4'). However, data in group 'Z4' is too exotic to be reported for me, and I would like to produce boxplots showing only categories 'Z1' and 'Z3'.
Filtering the data source of the plot did not work, as category 'Z4' is still shown, with no data points.
Is there any other solution than having to create a new CategoricalDtype with only ('Z1', 'Z3') and cast/project my data back on this new category?
I would simply like to hide 'Z4' category.
I am using seaborn 0.10.1 and matplotlib 3.3.1.
Thanks in advance for your answers.
My tries are below, and some data to reproduce.
Dummy data
dummy_cat = pd.CategoricalDtype(['a', 'b', 'c'])
df = pd.DataFrame({'col1': ['a', 'b', 'a', 'b'], 'col2': [12., 5., 3., 2]})
df.col1 = df.col1.astype(dummy_cat)
sns.boxplot(data=df, x='col1', y='col2')
Apply no filter
# One subplot row per indicator; squeeze=False keeps `axs` two-dimensional so
# axs[j, 0] is valid even when there is a single indicator.
# NOTE(review): `orders` and `indicators2` come from the asker's own data and
# are not defined in this snippet.
fig, axs = plt.subplots(figsize=(8, 25), nrows=len(indicators2), squeeze=False)
for j, indicator in enumerate(indicators2):
    # The loop body must be indented; the flattened original was a syntax error.
    sns.boxplot(data=orders, y=indicator, x='seg1', hue='origin2', ax=axs[j, 0], showfliers=False)
Which produces:
Filter data source
# Keep only the rows whose segment is one of the two categories of interest.
# NOTE(review): `orders` and `indicators2` come from the asker's own data and
# are not defined in this snippet.
mask_filter = orders.seg1.isin(['Z1', 'Z3'])
fig, axs = plt.subplots(figsize=(8, 25), nrows=len(indicators2), squeeze=False)
for j, indicator in enumerate(indicators2):
    # The loop body must be indented; the flattened original was a syntax error.
    sns.boxplot(data=orders.loc[mask_filter], y=indicator, x='seg1', hue='origin2', ax=axs[j, 0], showfliers=False)
Which produces:
To cut off the last (or first) x-value, set_xlim() can be used, e.g. ax.set_xlim(-0.5, 1.5).
Another option is to work with seaborn's order= parameter and only add the desired values in that list. Optionally that can be created programmatically:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Categorical dtype that declares three categories, although only 'a' and 'b'
# actually occur in the data below.
dummy_cat = pd.CategoricalDtype(['a', 'b', 'c'])
df = pd.DataFrame({'col1': ['a', 'b', 'a', 'b'], 'col2': [12., 5., 3., 2]})
df['col1'] = df['col1'].astype(dummy_cat)

# Keep the declared category order but drop categories with no observations.
# Compare by equality rather than str.contains: contains() does a regex /
# substring match, so category 'a' would also match a value such as 'ab'.
order = [cat for cat in dummy_cat.categories if (df['col1'] == cat).any()]
sns.boxplot(data=df, x='col1', y='col2', order=order)
plt.show()

Python + Matplotlib: multi-level treemap plot?

I recently saw this treemap chart from https://www.kaggle.com/philippsp/exploratory-analysis-instacart (two levels of hierarchy, colored, squarified treemap).
It is made with R ggplot2::treemap, by:
treemap(tmp,index=c("department","aisle"),vSize="n",title="",
palette="Set3",border.col="#FFFFFF")
I want to know how can I make this plot in Python?
I searched a bit, but didn't find any multi-level treemap example.
https://gist.github.com/gVallverdu/0b446d0061a785c808dbe79262a37eea
https://python-graph-gallery.com/200-basic-treemap-with-python/
You can use plotly. Here you can find several examples.
https://plotly.com/python/treemaps/
This is a very simple example with a multi-level structure.
import plotly.express as px
import pandas as pd
from collections import defaultdict
# Each row is one leaf of the hierarchy; the three columns are its levels from
# outermost to innermost. A plain dict literal is enough here: a defaultdict
# created without a factory behaves exactly like an ordinary dict.
data = pd.DataFrame({
    'level_1': ['A', 'A', 'A', 'B', 'B', 'B'],
    'level_2': ['X', 'X', 'Y', 'Z', 'Z', 'X'],
    'level_3': ['1', '2', '2', '1', '1', '2'],
})
fig = px.treemap(data, path=['level_1', 'level_2', 'level_3'])
fig.show()
The package matplotlib-extra provides a treemap function that supports multi-level treemap plot. For the dataset of G20, treemap can produce the similar treemap, such as:
import matplotlib.pyplot as plt
import mpl_extra.treemap as tr
fig, ax = plt.subplots(figsize=(7,7), dpi=100, subplot_kw=dict(aspect=1.156))
trc = tr.treemap(ax, df, area='gdp_mil_usd', fill='hdi', labels='country',
levels=['region', 'country'],
textprops={'c':'w', 'wrap':True,
'place':'top left', 'max_fontsize':20},
rectprops={'ec':'w'},
subgroup_rectprops={'region':{'ec':'grey', 'lw':2, 'fill':False,
'zorder':5}},
subgroup_textprops={'region':{'c':'k', 'alpha':0.5, 'fontstyle':'italic'}},
)
ax.axis('off')
cb = fig.colorbar(trc.mappable, ax=ax, shrink=0.5)
cb.ax.set_title('hdi')
cb.outline.set_edgecolor('w')
plt.show()
The obtained treemap is as follows:
For more information, you can see the project, which has some examples. The source code has an API docstring.

Non overlapping error bars in line plot

I am using Pandas and Matplotlib to create some plots. I want line plots with error bars on them. The code I am using currently looks like this
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
df = pd.DataFrame(index=[10,100,1000,10000], columns=['A', 'B', 'C', 'D', 'E', 'F'], data=np.random.rand(4,6))
df_yerr = pd.DataFrame(index=[10,100,1000,10000], columns=['A', 'B', 'C', 'D', 'E', 'F'], data=np.random.rand(4,6))
fig, ax = plt.subplots()
df.plot(yerr=df_yerr, ax=ax, fmt="o-", capsize=5)
ax.set_xscale("log")
plt.show()
With this code, I get 6 lines on a single plot (which is what I want). However, the error bars completely overlap, making the plot difficult to read.
Is there a way I could slightly shift the position of each point on the x-axis so that the error bars no longer overlap?
Here is a screenshot:
One way to achieve what you want is to plot the error bars 'by hand', but it is neither straightforward nor much better looking than your original. Basically, you let pandas produce the line plot and then iterate through the data frame columns, drawing a pyplot errorbar plot for each of them such that the index is slightly shifted sideways (in your case, with the logarithmic scale on the x axis, this means a shift by a factor). In the error bar plots, the marker size is set to zero:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# One fixed colour per column so each line and its error bars match.
colors = ['red', 'blue', 'green', 'yellow', 'purple', 'black']
df = pd.DataFrame(index=[10, 100, 1000, 10000], columns=['A', 'B', 'C', 'D', 'E', 'F'], data=np.random.rand(4, 6))
df_yerr = pd.DataFrame(index=[10, 100, 1000, 10000], columns=['A', 'B', 'C', 'D', 'E', 'F'], data=np.random.rand(4, 6))

fig, ax = plt.subplots()
# Let pandas draw the lines and markers at the true x positions.
df.plot(ax=ax, marker="o", color=colors)

# Draw the error bars separately, one column at a time, each nudged sideways.
# The x axis is logarithmic, so the offset is multiplicative: the factor starts
# just below 1 and grows by 2% per column.
factor = 0.95
for column, color in zip(df.columns, colors):
    ax.errorbar(
        df.index * factor, df[column].values, yerr=df_yerr[column].values,
        markersize=0, capsize=5, color=color,
        zorder=10,
    )
    # Must run once per column (inside the loop), otherwise every set of
    # error bars would share the same shift and overlap again.
    factor *= 1.02

ax.set_xscale("log")
plt.show()
As I said, the result is not pretty:
UPDATE
In my opinion a bar plot would be much more informative:
fig2,ax2 = plt.subplots()
df.plot(kind='bar',yerr=df_yerr, ax=ax2)
plt.show()
You can also address this by making the lines semi-transparent with alpha, for example:
df.plot(yerr=df_yerr, ax=ax, fmt="o-", capsize=5,alpha=0.5)
You can also check this link for reference

Color a heatmap in Python/Matplotlib according to requirement

I'm trying to make a heatmap with a specified requirement of the coloring. I want to set an interval for the data and judge that as ok and color it green, the rest of the results should be colored as red. Does anyone have a clue of how to do this??
I have attached a simple example using pandas and matplotlib for better understanding.
import numpy as np
from pandas import *
import matplotlib.pyplot as plt
Index= ['aaa', 'bbb', 'ccc', 'ddd', 'eee']
Cols = ['A', 'B', 'C', 'D']
data= abs(np.random.randn(5, 4))
df = DataFrame(data, index=Index, columns=Cols)
plt.pcolor(df)
plt.yticks(np.arange(0.5, len(df.index), 1), df.index)
plt.xticks(np.arange(0.5, len(df.columns), 1), df.columns)
plt.show()
There's more than one way to do this.
The easiest way is to just pass in a boolean array to pcolor and then choose a colormap where green is high and red is low.
For example:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
Index= ['aaa', 'bbb', 'ccc', 'ddd', 'eee']
Cols = ['A', 'B', 'C', 'D']
data= np.random.random((5, 4))
df = pd.DataFrame(data, index=Index, columns=Cols)
plt.pcolor(df > 0.5, cmap='RdYlGn')
plt.yticks(np.arange(0.5, len(df.index), 1), df.index)
plt.xticks(np.arange(0.5, len(df.columns), 1), df.columns)
plt.show()
Alternatively, as @Cyber mentioned, you could make a two-color colormap based on your values and use it:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
Index= ['aaa', 'bbb', 'ccc', 'ddd', 'eee']
Cols = ['A', 'B', 'C', 'D']
data= np.random.random((5, 4))
df = pd.DataFrame(data, index=Index, columns=Cols)
# Values from 0-0.5 will be red and 0.5-1 will be green
cmap, norm = mcolors.from_levels_and_colors([0, 0.5, 1], ['red', 'green'])
plt.pcolor(df, cmap=cmap, norm=norm)
plt.yticks(np.arange(0.5, len(df.index), 1), df.index)
plt.xticks(np.arange(0.5, len(df.columns), 1), df.columns)
plt.show()
(The color difference is just because the "RdYlGn" colormap uses darker greens and reds as its endpoints.)
On a side note, it's also considerably faster to use pcolormesh for this, rather than pcolor. For small arrays, it won't make a significant difference, but for large arrays pcolor is excessively slow. imshow is even faster yet, if you don't mind raster output. Use imshow(data, interpolation='nearest', aspect='auto', origin='lower') to match the defaults of pcolor and pcolormesh.
You can make a 2 color colormap.
Then you can set the cutoff value between red and green.

Categories

Resources