I am writing a simple script with matplotlib/seaborn to plot the data from a sample CSV file. However, when I call the sns.histplot() function in a for loop, the legend entry for each column is displayed twice. Any help would be greatly appreciated :)
Here's the code:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib
sns.set_style('darkgrid')
df = pd.read_csv('dm_office_sales.csv')
df['salary'] = df['salary'] * 3
df['sample salary'] = df['salary'] * 2
x = df['salary']
y = df['sales']
z = df['sample salary']
fig,ax = plt.subplots()
for i in [x,y,z]:
sns.histplot(data = i, bins=50, ax=ax, palette = 'bright',alpha=0.3, label='{}'.format(i.name))
plt.legend(numpoints=1)
plt.suptitle('Sales/Salary Histogram')
plt.show()
Pass just the columns in question in one step, instead of looping.
sns.histplot(data=df[['salary', 'sales', 'sample salary']], ...)
Here's a demo with the tips dataset:
tips = sns.load_dataset('tips')
fig, ax = plt.subplots()
sns.histplot(tips[['total_bill', 'tip']], bins=50,
ax=ax, alpha=0.3, palette='bright')
plt.show()
Related
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import pandas as pd
sns.set(style="darkgrid")
fig, ax = plt.subplots(figsize=(8, 5))
palette = sns.color_palette("bright", 6)
g = sns.scatterplot(ax=ax, x="Area", y="Rent/Sqft", hue="Region", marker='o', data=df, s=100, palette= palette)
g.legend(bbox_to_anchor=(1, 1), ncol=1)
g.set(xlim = (50000,250000))
How can I change the axis format from a plain number to a custom format? For example, 125000 to 125.00K.
IIUC you can format the xticks and set these:
In[60]:
#generate some pseudo data
df = pd.DataFrame({'num':[50000, 75000, 100000, 125000], 'Rent/Sqft':np.random.randn(4), 'Region':list('abcd')})
df
Out[60]:
num Rent/Sqft Region
0 50000 0.109196 a
1 75000 0.566553 b
2 100000 -0.274064 c
3 125000 -0.636492 d
In[61]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import pandas as pd
sns.set(style="darkgrid")
fig, ax = plt.subplots(figsize=(8, 5))
palette = sns.color_palette("bright", 4)
g = sns.scatterplot(ax=ax, x="num", y="Rent/Sqft", hue="Region", marker='o', data=df, s=100, palette= palette)
g.legend(bbox_to_anchor=(1, 1), ncol=1)
g.set(xlim = (50000,250000))
xlabels = ['{:,.2f}'.format(x) + 'K' for x in g.get_xticks()/1000]
g.set_xticklabels(xlabels)
Out[61]:
The key bit here is this line:
xlabels = ['{:,.2f}'.format(x) + 'K' for x in g.get_xticks()/1000]
g.set_xticklabels(xlabels)
So this divides all the tick values by 1000, formats them, and then sets the results as the xtick labels.
UPDATE
Thanks to @ScottBoston, who suggested a better method:
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: '{:,.2f}'.format(x/1000) + 'K'))
see the docs
The canonical way of formatting the tick labels in the standard units is to use an EngFormatter. There is also an example in the matplotlib docs.
Also see Tick locating and formatting
Here it might look as follows.
import numpy as np; np.random.seed(42)
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import pandas as pd
# Random demo data: x values between 50000 and 250000, small integer y values,
# and a three-level category column used for the hue.
df = pd.DataFrame({"xaxs" : np.random.randint(50000,250000, size=20),
                   "yaxs" : np.random.randint(7,15, size=20),
                   "col" : np.random.choice(list("ABC"), size=20)})
fig, ax = plt.subplots(figsize=(8, 5))
# The colours come from the "magma" palette passed here; no separate palette
# object needs to be built beforehand.
sns.scatterplot(ax=ax, x="xaxs", y="yaxs", hue="col", data=df,
                marker='o', s=100, palette="magma")
ax.legend(bbox_to_anchor=(1, 1), ncol=1)
ax.set(xlim = (50000,250000))
# EngFormatter labels the ticks in engineering notation (SI prefixes such as "k").
ax.xaxis.set_major_formatter(ticker.EngFormatter())
plt.show()
Using Seaborn without importing matplotlib:
import seaborn as sns
sns.set()
chart = sns.relplot(x="x_val", y="y_val", kind="line", data=my_data)
ticks = chart.axes[0][0].get_xticks()
xlabels = ['$' + '{:,.0f}'.format(x) for x in ticks]
chart.set_xticklabels(xlabels)
chart.fig
Thank you to EdChum's answer above for getting me 90% there.
Here's how I'm solving this: (similar to ScottBoston)
from matplotlib.ticker import FuncFormatter
f = lambda x, pos: f'{x/10**3:,.0f}K'
ax.xaxis.set_major_formatter(FuncFormatter(f))
We could also use the APIs ax.get_xticklabels(), get_text() and ax.set_xticklabels() to do it,
e.g.,
# Rebuild the x tick labels from their current text, expressed in thousands.
# The label text may use the Unicode minus sign, so it is replaced with an
# ASCII hyphen before calling float(). The division by 1000 has to happen on the
# number, inside format(); dividing the already-formatted string raises TypeError.
xlabels = ['{:.2f}k'.format(float(x.get_text().replace('−', '-')) / 1000) for x in g.get_xticklabels()]
g.set_xticklabels(xlabels)
Hello, how can I make a figure with scatter subplots using pandas? It works with plot, but not with scatter.
Here is an example:
import numpy as np
import pandas as pd
matrix = np.random.rand(200,5)
df = pd.DataFrame(matrix,columns=['index','A','B','C','D'])
#single plot, working with
df.plot(
kind='scatter',
x='index',
y='A',
s= 0.5
)
# not working
df.plot(
subplots=True,
kind='scatter',
x='index',
y=['A','B','C'],
s= 0.5
)
Error
raise ValueError(self._kind + " requires an x and y column")
ValueError: scatter requires an x and y column
Edit:
Solution to make a figure with subplots using df.plot
(Thanks to @Fourier)
import numpy as np
import pandas as pd
matrix = np.random.rand(200,5)#random data
df = pd.DataFrame(matrix,columns=['index','A','B','C','D']) #make df
#get a list for subplots
labels = list(df.columns)
labels.remove('index')
df.plot(
layout=(-1, 5),
kind="line",
x='index',
y=labels,
subplots = True,
sharex = True,
ls="none",
marker="o")
Would this work for you:
import pandas as pd
import numpy as np
df = pd.DataFrame({"index":np.arange(5),"A":np.random.rand(5),"B":np.random.rand(5),"C":np.random.rand(5)})
df.plot(kind="line", x="index", y=["A","B","C"], subplots=True, sharex=True, ls="none", marker="o")
Output
Note: This uses a line plot with invisible lines. For a scatter, I would go and loop over it.
# Draw one scatter plot per data column, each against the "index" column.
# The columns are selected by name rather than by position, so "index" is
# skipped wherever it sits in the frame (a positional [:-1] slice would keep
# "index" and drop the last data column when "index" comes first).
for column in df.columns.drop("index"):
    df.plot(kind="scatter", x="index", y=column)
EDIT
In order to add custom ylabels you can do the following:
axes = df.plot(kind='line', x="index", y=["A","B","C"], subplots=True, sharex=True, ls="none", marker="o", legend=False)
ylabels = ["foo","bar","baz"]
for ax, label in zip(axes, ylabels):
ax.set_ylabel(label)
I have the following code
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
info = {"Quiz":[1,2,5,4,3,2,6,5,7],
"Score":[1,6,4,2,8,9,10,5,7]}
df = pd.DataFrame.from_dict(info)
fig, ax = plt.subplots(figsize=(6,4))
sns.catplot(x="Quiz", y = 'Score', data=df, ax=ax)
plt.show()
This is what I am seeing.
Why are there two images showing?
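Reading the documentation, catplot doesn't return an ax object: it is a figure-level function that creates its own figure and returns a FacetGrid. That is why two images show up: the empty figure created by plt.subplots() and the one catplot creates itself. To draw on an existing ax, use the matching axes-level function instead, e.g. sns.stripplot(x="Quiz", y="Score", data=df, ax=ax).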
This is a followup question on this solution. There is automatic assignment of different colors when kind=line but for scatter plot that's not the case.
import pandas as pd
import matplotlib.pylab as plt
import numpy as np
# random df
df = pd.DataFrame(np.random.randint(0,10,size=(25, 3)), columns=['label','x','y'])
# plot groupby results on the same canvas
fig, ax = plt.subplots(figsize=(8,6))
df.groupby('label').plot(kind='scatter', x = "x", y = "y", ax=ax)
There is a connected issue here. Is there any simple workaround for this?
Update:
When I try the solution recommended by @ImportanceOfBeingErnest for a label column containing strings, it does not work!
df = pd.DataFrame(np.random.randint(0,10,size=(5, 2)), columns=['x','y'])
df['label'] = ['yes','no','yes','yes','no']
fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(x='x', y='y', c='label', data=df)
It throws following error,
ValueError: Invalid RGBA argument: 'yes'
During handling of the above exception, another exception occurred:
You can use sns:
df = pd.DataFrame(np.random.randint(0,10,size=(100, 2)), columns=['x','y'])
df['label'] = np.random.choice(['yes','no','yes','yes','no'], 100)
fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(x='x', y='y', hue='label', data=df)
plt.show()
Output:
Another option, as suggested in the comments, is to map the values to numbers via a categorical type:
fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(df.x, df.y, c = pd.Categorical(df.label).codes, cmap='tab20b')
plt.show()
Output:
You can loop over the groupby object and create one scatter per group. That is efficient for fewer than ~10 categories.
import pandas as pd
import matplotlib.pylab as plt
import numpy as np
# random df
df = pd.DataFrame(np.random.randint(0,10,size=(5, 2)), columns=['x','y'])
df['label'] = ['yes','no','yes','yes','no']
# plot groupby results on the same canvas
fig, ax = plt.subplots(figsize=(8,6))
for n, grp in df.groupby('label'):
ax.scatter(x = "x", y = "y", data=grp, label=n)
ax.legend(title="Label")
plt.show()
Alternatively you can create a single scatter like
import pandas as pd
import matplotlib.pylab as plt
import numpy as np
# random df
df = pd.DataFrame(np.random.randint(0,10,size=(5, 2)), columns=['x','y'])
df['label'] = ['yes','no','yes','yes','no']
# plot groupby results on the same canvas
fig, ax = plt.subplots(figsize=(8,6))
u, df["label_num"] = np.unique(df["label"], return_inverse=True)
sc = ax.scatter(x = "x", y = "y", c = "label_num", data=df)
ax.legend(sc.legend_elements()[0], u, title="Label")
plt.show()
In case the data is already grouped, I find the following solution useful.
df = pd.DataFrame(np.random.randint(0,10,size=(5, 2)), columns=['x','y'])
df['label'] = ['yes','no','yes','yes','no']
fig, ax = plt.subplots(figsize=(7,3))
def plot_grouped_df(grouped_df,
                    ax, x='x', y='y', cmap = plt.cm.autumn_r):
    """Scatter-plot every group of a grouped DataFrame on one Axes.

    Each group gets its own colour and a legend label equal to the group name.

    Args:
        grouped_df: Iterable of ``(name, group)`` pairs that also supports
            ``len()``; the call in this file passes ``df.groupby('label')``.
        ax: Axes that all groups are drawn on.
        x: Name of the column used for the x values.
        y: Name of the column used for the y values.
        cmap: Colormap callable; it is sampled on the interval [0.5, 1] to
            obtain one colour per group.
    """
    # One evenly spaced colour per group, taken from the upper half of the colormap.
    colors = cmap(np.linspace(0.5, 1, len(grouped_df)))
    for rgba, (name, group) in zip(colors, grouped_df):
        group.plot(ax=ax,
                   kind='scatter',
                   x=x, y=y,
                   # Pass the colour as a single-row 2-D array: a bare 1-D RGBA
                   # sequence is ambiguous for scatter and would be read as
                   # per-point values when a group has exactly four rows.
                   color=rgba.reshape(1, -1),
                   label=name)
# now we can use this function to plot the groupby data with categorical values
plot_grouped_df(df.groupby('label'),ax)
I have plotted a graph with two y axes and would now like to add two separate trendlines for each of the y plots.
This is my code:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline
amp_costs=pd.read_csv('/Users/Ampicillin_Costs.csv', index_col=None, usecols=[0,1,2])
amp_costs.columns=['PERIOD', 'ITEMS', 'COST PER ITEM']
ax=amp_costs.plot(x='PERIOD', y='COST PER ITEM', color='Blue', style='.', markersize=10)
amp_costs.plot(x='PERIOD', y='ITEMS', secondary_y=True,
color='Red', style='.', markersize=10, ax=ax)
Any guidance as to how to plot these two trend lines to this graph would be much appreciated!
Here is a quick example of how to use sklearn.linear_model.LinearRegression to make the trend line.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
plt.style.use('ggplot')
%matplotlib inline
# Synthetic data: ten periods with noisy linear trends for items and cost.
period = np.arange(10)
items = -2*period +1 + np.random.randint(-2,2,len(period))
cost = 35000*period +15000 + np.random.randint(-25000,25000,len(period))
data = np.vstack((period,items,cost)).T
# Plain list literal for the column names (the backslash-escaped brackets were
# markup residue and are not valid Python).
df = pd.DataFrame(data, columns=['P','ITEMS', 'COST']).set_index('P')
# Fit one linear model per series; the inputs are reshaped to column vectors
# because the estimator is given 2-D arrays.
lmcost = LinearRegression().fit(period.reshape(-1,1), cost.reshape(-1,1))
lmitems = LinearRegression().fit(period.reshape(-1,1), items.reshape(-1,1))
# Store the fitted values next to the raw series so both can be plotted from df.
df['ITEMS_LM'] = lmitems.predict(period.reshape(-1,1))
df['COST_LM'] = lmcost.predict(period.reshape(-1,1))
fig,ax = plt.subplots()
# Items on the primary y axis, cost on the secondary one; each trend line is
# dashed and shares the colour of its series.
df.ITEMS.plot(ax = ax, color = 'b')
df.ITEMS_LM.plot(ax = ax,color= 'b', linestyle= 'dashed')
df.COST.plot(ax = ax, secondary_y=True, color ='g')
df.COST_LM.plot(ax = ax, secondary_y=True, color = 'g', linestyle='dashed')