I have plotted a graph with two y axes and would now like to add two separate trendlines for each of the y plots.
This is my code:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline
amp_costs=pd.read_csv('/Users/Ampicillin_Costs.csv', index_col=None, usecols=[0,1,2])
amp_costs.columns=['PERIOD', 'ITEMS', 'COST PER ITEM']
ax=amp_costs.plot(x='PERIOD', y='COST PER ITEM', color='Blue', style='.', markersize=10)
amp_costs.plot(x='PERIOD', y='ITEMS', secondary_y=True,
color='Red', style='.', markersize=10, ax=ax)
Any guidance as to how to plot these two trend lines to this graph would be much appreciated!
Here is a quick example of how to use sklearn.linear_model.LinearRegression to make the trend line.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
plt.style.use('ggplot')
%matplotlib inline
period = np.arange(10)
items = -2*period +1 + np.random.randint(-2,2,len(period))
cost = 35000*period +15000 + np.random.randint(-25000,25000,len(period))
data = np.vstack((period,items,cost)).T
df = pd.DataFrame(data, columns=\['P','ITEMS', 'COST'\]).set_index('P')
lmcost = LinearRegression().fit(period.reshape(-1,1), cost.reshape(-1,1))
lmitems = LinearRegression().fit(period.reshape(-1,1), items.reshape(-1,1))
df['ITEMS_LM'] = lmitems.predict(period.reshape(-1,1))
df['COST_LM'] = lmcost.predict(period.reshape(-1,1))
fig,ax = plt.subplots()
df.ITEMS.plot(ax = ax, color = 'b')
df.ITEMS_LM.plot(ax = ax,color= 'b', linestyle= 'dashed')
df.COST.plot(ax = ax, secondary_y=True, color ='g')
df.COST_LM.plot(ax = ax, secondary_y=True, color = 'g', linestyle='dashed')
Related
I am writing a simple code with matplotlib/seaborn to plot the data of a sample csv file. However, when call the sns.histplot() function through a for loop, the legends of each column are displaying twice. Any help would be greatly appreciated:)
Here's the code:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib
sns.set_style('darkgrid')
df = pd.read_csv('dm_office_sales.csv')
df['salary'] = df['salary'] * 3
df['sample salary'] = df['salary'] * 2
x = df['salary']
y = df['sales']
z = df['sample salary']
fig,ax = plt.subplots()
for i in [x,y,z]:
sns.histplot(data = i, bins=50, ax=ax, palette = 'bright',alpha=0.3, label='{}'.format(i.name))
plt.legend(numpoints=1)
plt.suptitle('Sales/Salary Histogram')
plt.show()
Pass just the columns in question in one step, instead of looping.
sns.histplot(data=df[['salary', 'sales', 'sample salary']], ...)
Here's a demo with the tips dataset:
tips = sns.load_dataset('tips')
fig, ax = plt.subplots()
sns.histplot(tips[['total_bill', 'tip']], bins=50,
ax=ax, alpha=0.3, palette='bright')
plt.show()
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import pandas as pd
sns.set(style="darkgrid")
fig, ax = plt.subplots(figsize=(8, 5))
palette = sns.color_palette("bright", 6)
g = sns.scatterplot(ax=ax, x="Area", y="Rent/Sqft", hue="Region", marker='o', data=df, s=100, palette= palette)
g.legend(bbox_to_anchor=(1, 1), ncol=1)
g.set(xlim = (50000,250000))
How can I can change the axis format from a number to custom format? For example, 125000 to 125.00K
IIUC you can format the xticks and set these:
In[60]:
#generate some psuedo data
df = pd.DataFrame({'num':[50000, 75000, 100000, 125000], 'Rent/Sqft':np.random.randn(4), 'Region':list('abcd')})
df
Out[60]:
num Rent/Sqft Region
0 50000 0.109196 a
1 75000 0.566553 b
2 100000 -0.274064 c
3 125000 -0.636492 d
In[61]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import pandas as pd
sns.set(style="darkgrid")
fig, ax = plt.subplots(figsize=(8, 5))
palette = sns.color_palette("bright", 4)
g = sns.scatterplot(ax=ax, x="num", y="Rent/Sqft", hue="Region", marker='o', data=df, s=100, palette= palette)
g.legend(bbox_to_anchor=(1, 1), ncol=1)
g.set(xlim = (50000,250000))
xlabels = ['{:,.2f}'.format(x) + 'K' for x in g.get_xticks()/1000]
g.set_xticklabels(xlabels)
Out[61]:
The key bit here is this line:
xlabels = ['{:,.2f}'.format(x) + 'K' for x in g.get_xticks()/1000]
g.set_xticklabels(xlabels)
So this divides all the ticks by 1000 and then formats them and sets the xtick labels
UPDATE
Thanks to #ScottBoston who has suggested a better method:
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: '{:,.2f}'.format(x/1000) + 'K'))
see the docs
The canonical way of formatting the tick labels in the standard units is to use an EngFormatter. There is also an example in the matplotlib docs.
Also see Tick locating and formatting
Here it might look as follows.
import numpy as np; np.random.seed(42)
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import pandas as pd
df = pd.DataFrame({"xaxs" : np.random.randint(50000,250000, size=20),
"yaxs" : np.random.randint(7,15, size=20),
"col" : np.random.choice(list("ABC"), size=20)})
fig, ax = plt.subplots(figsize=(8, 5))
palette = sns.color_palette("bright", 6)
sns.scatterplot(ax=ax, x="xaxs", y="yaxs", hue="col", data=df,
marker='o', s=100, palette="magma")
ax.legend(bbox_to_anchor=(1, 1), ncol=1)
ax.set(xlim = (50000,250000))
ax.xaxis.set_major_formatter(ticker.EngFormatter())
plt.show()
Using Seaborn without importing matplotlib:
import seaborn as sns
sns.set()
chart = sns.relplot(x="x_val", y="y_val", kind="line", data=my_data)
ticks = chart.axes[0][0].get_xticks()
xlabels = ['$' + '{:,.0f}'.format(x) for x in ticks]
chart.set_xticklabels(xlabels)
chart.fig
Thank you to EdChum's answer above for getting me 90% there.
Here's how I'm solving this: (similar to ScottBoston)
from matplotlib.ticker import FuncFormatter
f = lambda x, pos: f'{x/10**3:,.0f}K'
ax.xaxis.set_major_formatter(FuncFormatter(f))
We could used the APIs: ax.get_xticklabels() , get_text() and ax.set_xticklabels do it.
e.g,
xlabels = ['{:.2f}k'.format(float(x.get_text().replace('−', '-')))/1000 for x in g.get_xticklabels()]
g.set_xticklabels(xlabels)
Hello how can i make a figure with scatter subplots using pandas? Its working with plot, but not with scatter.
Here an Example
import numpy as np
import pandas as pd
matrix = np.random.rand(200,5)
df = pd.DataFrame(matrix,columns=['index','A','B','C','D'])
#single plot, working with
df.plot(
kind='scatter',
x='index',
y='A',
s= 0.5
)
# not workig
df.plot(
subplots=True,
kind='scatter',
x='index',
y=['A','B','C'],
s= 0.5
)
Error
raise ValueError(self._kind + " requires an x and y column")
ValueError: scatter requires an x and y column
Edit:
Solution to make a figure with subplots with using df.plot
(Thanks to #Fourier)
import numpy as np
import pandas as pd
matrix = np.random.rand(200,5)#random data
df = pd.DataFrame(matrix,columns=['index','A','B','C','D']) #make df
#get a list for subplots
labels = list(df.columns)
labels.remove('index')
df.plot(
layout=(-1, 5),
kind="line",
x='index',
y=labels,
subplots = True,
sharex = True,
ls="none",
marker="o")
Would this work for you:
import pandas as pd
import numpy as np
df = pd.DataFrame({"index":np.arange(5),"A":np.random.rand(5),"B":np.random.rand(5),"C":np.random.rand(5)})
df.plot(kind="line", x="index", y=["A","B","C"], subplots=True, sharex=True, ls="none", marker="o")
Output
Note: This uses a line plot with invisible lines. For a scatter, I would go and loop over it.
for column in df.columns[:-1]: #[:-1] ignores the index column for my random sample
df.plot(kind="scatter", x="index", y=column)
EDIT
In order to add custom ylabels you can do the following:
axes = df.plot(kind='line', x="index", y=["A","B","C"], subplots=True, sharex=True, ls="none", marker="o", legend=False)
ylabels = ["foo","bar","baz"]
for ax, label in zip(axes, ylabels):
ax.set_ylabel(label)
Starting from the following example:
fig, ax = plt.subplots()
df = pd.DataFrame({'n1':[1,2,1,3], 'n2':[1,3,2,1], 'l':['a','b','c','d']})
for label in df['l']:
df.plot('n1','n2', kind='scatter', ax=ax, s=50, linewidth=0.1, label=label)
what I obtained is the following scatterplot:
I'm now trying to set a different color for each of the four points. I know that I can loop over a set of, for instance, 4 colors in a list like:
colorlist = ['b','r','c','y']
but since my real dataset comprise at least 20 different points, I was looking for a sort of "color generator" to loop within it.
The following method will create a list of colors as long as your dataframe, and then plot a point with a label with each color:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
fig, ax = plt.subplots()
df = pd.DataFrame({'n1':[1,2,1,3], 'n2':[1,3,2,1], 'l':['a','b','c','d']})
colormap = cm.viridis
colorlist = [colors.rgb2hex(colormap(i)) for i in np.linspace(0, 0.9, len(df['l']))]
for i,c in enumerate(colorlist):
x = df['n1'][i]
y = df['n2'][i]
l = df['l'][i]
ax.scatter(x, y, label=l, s=50, linewidth=0.1, c=c)
ax.legend()
plt.show()
IIUC you can do it this way:
import matplotlib.pyplot as plt
from matplotlib import colors
import pandas as pd
colorlist = list(colors.ColorConverter.colors.keys())
fig, ax = plt.subplots()
[df.iloc[[i]].plot.scatter('n1', 'n2', ax=ax, s=50, label=l,
color=colorlist[i % len(colorlist)])
for i,l in enumerate(df.l)]
colorlist:
In [223]: colorlist
Out[223]: ['m', 'b', 'g', 'r', 'k', 'y', 'c', 'w']
PS colorlist[i % len(colorlist)] - should always remain in the list bounds
How about this,
Here is the source code,
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from matplotlib import cm
fig, ax = plt.subplots()
df = pd.DataFrame({'n1':[1,2,1,3], 'n2':[1,3,2,1], 'l':['a','b','c','d']})
#colors = ['b','r','c','y']
nrof_labels = len(df['l'])
colors = cm.rainbow(np.linspace(0, 1, nrof_labels)) # create a bunch of colors
for i, r in df.iterrows():
ax.plot(r['n1'], r['n2'], 'o', markersize=10, color=colors[i], linewidth=0.1, label=r['l'])
ax.set_xlim(0.5, 3.5)
ax.set_ylim(0.5, 3.5)
plt.legend(loc='best')
plt.show()
Additionally, if df[l] has repeated elements and if the colors have to be assigned accordingly:
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
fig, ax = plt.subplots(figsize=(8,8))
df = pd.DataFrame({'n1':[1,2,1,3], 'n2':[1,3,2,1], 'l':['b','b','c','d']})
l_unq = df['l'].unique()
colormap = cm.viridis
colorlist = [colors.rgb2hex(colormap(i)) for i in np.linspace(0, 0.9, len(l_unq))]
for i,c in enumerate(colorlist):
x = df[df.l==l_unq[i]].n1
y = df[df.l==l_unq[i]].n2
l = l_unq[i]
ax.scatter(x, y, label=l, s=50, linewidth=0.1, c=c)
ax.set_xlabel('n1')
ax.set_ylabel('n2')
ax.legend()
plt.show()
Starting from the following example:
fig, ax = plt.subplots()
df = pd.DataFrame({'n1':[1,2,1,3], 'n2':[1,3,2,1], 'l':['a','b','c','d']})
for label in df['l']:
df.plot('n1','n2', kind='scatter', ax=ax, s=50, linewidth=0.1, label=label)
what I obtained is the following scatterplot:
I'm now trying to set a different color for each of the four points. I know that I can loop over a set of, for instance, 4 colors in a list like:
colorlist = ['b','r','c','y']
but since my real dataset comprise at least 20 different points, I was looking for a sort of "color generator" to loop within it.
The following method will create a list of colors as long as your dataframe, and then plot a point with a label with each color:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
fig, ax = plt.subplots()
df = pd.DataFrame({'n1':[1,2,1,3], 'n2':[1,3,2,1], 'l':['a','b','c','d']})
colormap = cm.viridis
colorlist = [colors.rgb2hex(colormap(i)) for i in np.linspace(0, 0.9, len(df['l']))]
for i,c in enumerate(colorlist):
x = df['n1'][i]
y = df['n2'][i]
l = df['l'][i]
ax.scatter(x, y, label=l, s=50, linewidth=0.1, c=c)
ax.legend()
plt.show()
IIUC you can do it this way:
import matplotlib.pyplot as plt
from matplotlib import colors
import pandas as pd
colorlist = list(colors.ColorConverter.colors.keys())
fig, ax = plt.subplots()
[df.iloc[[i]].plot.scatter('n1', 'n2', ax=ax, s=50, label=l,
color=colorlist[i % len(colorlist)])
for i,l in enumerate(df.l)]
colorlist:
In [223]: colorlist
Out[223]: ['m', 'b', 'g', 'r', 'k', 'y', 'c', 'w']
PS colorlist[i % len(colorlist)] - should always remain in the list bounds
How about this,
Here is the source code,
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from matplotlib import cm
fig, ax = plt.subplots()
df = pd.DataFrame({'n1':[1,2,1,3], 'n2':[1,3,2,1], 'l':['a','b','c','d']})
#colors = ['b','r','c','y']
nrof_labels = len(df['l'])
colors = cm.rainbow(np.linspace(0, 1, nrof_labels)) # create a bunch of colors
for i, r in df.iterrows():
ax.plot(r['n1'], r['n2'], 'o', markersize=10, color=colors[i], linewidth=0.1, label=r['l'])
ax.set_xlim(0.5, 3.5)
ax.set_ylim(0.5, 3.5)
plt.legend(loc='best')
plt.show()
Additionally, if df[l] has repeated elements and if the colors have to be assigned accordingly:
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
fig, ax = plt.subplots(figsize=(8,8))
df = pd.DataFrame({'n1':[1,2,1,3], 'n2':[1,3,2,1], 'l':['b','b','c','d']})
l_unq = df['l'].unique()
colormap = cm.viridis
colorlist = [colors.rgb2hex(colormap(i)) for i in np.linspace(0, 0.9, len(l_unq))]
for i,c in enumerate(colorlist):
x = df[df.l==l_unq[i]].n1
y = df[df.l==l_unq[i]].n2
l = l_unq[i]
ax.scatter(x, y, label=l, s=50, linewidth=0.1, c=c)
ax.set_xlabel('n1')
ax.set_ylabel('n2')
ax.legend()
plt.show()