Pandas groupby scatter plot in a single plot - python

This is a followup question on this solution. There is automatic assignment of different colors when kind=line but for scatter plot that's not the case.
import pandas as pd
import matplotlib.pylab as plt
import numpy as np
# random df
df = pd.DataFrame(np.random.randint(0,10,size=(25, 3)), columns=['label','x','y'])
# plot groupby results on the same canvas
fig, ax = plt.subplots(figsize=(8,6))
df.groupby('label').plot(kind='scatter', x = "x", y = "y", ax=ax)
There is a connected issue here. Is there any simple workaround for this?
Update:
When I try the solution recommended by @ImportanceOfBeingErnest for a label column with strings, it's not working!
df = pd.DataFrame(np.random.randint(0,10,size=(5, 2)), columns=['x','y'])
df['label'] = ['yes','no','yes','yes','no']
fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(x='x', y='y', c='label', data=df)
It throws following error,
ValueError: Invalid RGBA argument: 'yes'
During handling of the above exception, another exception occurred:

You can use sns:
import seaborn as sns

# 100 random points, each tagged with a string label
df = pd.DataFrame(np.random.randint(0, 10, size=(100, 2)), columns=['x', 'y'])
df['label'] = np.random.choice(['yes', 'no', 'yes', 'yes', 'no'], 100)
fig, ax = plt.subplots(figsize=(8, 6))
# hue= colors the points by the string label; ax= draws on the Axes created above
sns.scatterplot(x='x', y='y', hue='label', data=df, ax=ax)
plt.show()
Output:
Another option, as suggested in the comments, is to map the values to numbers via a categorical type:
fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(df.x, df.y, c = pd.Categorical(df.label).codes, cmap='tab20b')
plt.show()
Output:

You can loop over groupby and create a scatter per group. That is efficient for fewer than ~10 categories.
import pandas as pd
import matplotlib.pylab as plt
import numpy as np

# random df
df = pd.DataFrame(np.random.randint(0, 10, size=(5, 2)), columns=['x', 'y'])
df['label'] = ['yes', 'no', 'yes', 'yes', 'no']

# plot groupby results on the same canvas
fig, ax = plt.subplots(figsize=(8, 6))
# One scatter call per group; label=n gives each group its own legend entry.
for n, grp in df.groupby('label'):
    ax.scatter(x="x", y="y", data=grp, label=n)
# Build the legend once, after all groups have been drawn.
ax.legend(title="Label")
plt.show()
Alternatively you can create a single scatter like
import pandas as pd
import matplotlib.pylab as plt
import numpy as np
# random df
df = pd.DataFrame(np.random.randint(0,10,size=(5, 2)), columns=['x','y'])
df['label'] = ['yes','no','yes','yes','no']
# plot groupby results on the same canvas
fig, ax = plt.subplots(figsize=(8,6))
u, df["label_num"] = np.unique(df["label"], return_inverse=True)
sc = ax.scatter(x = "x", y = "y", c = "label_num", data=df)
ax.legend(sc.legend_elements()[0], u, title="Label")
plt.show()

In case we already have grouped data, I find the following solution useful.
df = pd.DataFrame(np.random.randint(0,10,size=(5, 2)), columns=['x','y'])
df['label'] = ['yes','no','yes','yes','no']
fig, ax = plt.subplots(figsize=(7,3))
def plot_grouped_df(grouped_df,
                    ax, x='x', y='y', cmap=plt.cm.autumn_r):
    """Draw one scatter series per group on a shared Axes.

    Parameters
    ----------
    grouped_df
        Sized iterable of ``(name, DataFrame)`` pairs, e.g. the result of
        ``df.groupby('label')``.
    ax
        Axes that every group is drawn on.
    x, y
        Names of the columns used for the horizontal and vertical axes.
    cmap
        Callable mapping an array of floats to colors. One color per group
        is sampled evenly between 0.5 and 1.
    """
    colors = cmap(np.linspace(0.5, 1, len(grouped_df)))
    for i, (name, group) in enumerate(grouped_df):
        group.plot(ax=ax,
                   kind='scatter',
                   x=x, y=y,
                   # A bare RGBA row is ambiguous for scatter: it is read as
                   # data values when the group has exactly 3 or 4 points.
                   # A single-row sequence always means "one color for all".
                   color=[colors[i]],
                   label=name)
# now we can use this function to plot the groupby data with categorical values
plot_grouped_df(df.groupby('label'),ax)

Related

Matplot legends are printing twice

I am writing some simple code with matplotlib/seaborn to plot the data of a sample csv file. However, when I call the sns.histplot() function in a for loop, the legend entries for each column are displayed twice. Any help would be greatly appreciated:)
Here's the code:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib
sns.set_style('darkgrid')
df = pd.read_csv('dm_office_sales.csv')
df['salary'] = df['salary'] * 3
df['sample salary'] = df['salary'] * 2
x = df['salary']
y = df['sales']
z = df['sample salary']
fig,ax = plt.subplots()
for i in [x,y,z]:
sns.histplot(data = i, bins=50, ax=ax, palette = 'bright',alpha=0.3, label='{}'.format(i.name))
plt.legend(numpoints=1)
plt.suptitle('Sales/Salary Histogram')
plt.show()
Pass just the columns in question in one step, instead of looping.
sns.histplot(data=df[['salary', 'sales', 'sample salary']], ...)
Here's a demo with the tips dataset:
tips = sns.load_dataset('tips')
fig, ax = plt.subplots()
sns.histplot(tips[['total_bill', 'tip']], bins=50,
ax=ax, alpha=0.3, palette='bright')
plt.show()

Plotting multiple pandas DataFrames in one *3D* scatterplot

I want to plot two dataframes in one 3D scatterplot.
This is the code I have for one dataframe:
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
...
sns.set(style = "darkgrid")
fig = plt.figure()
ax = fig.add_subplot(111, projection = '3d')
x = df['xitem']
y = df['yitem']
z = df['zitem']
ax.set_xlabel("X Label")
ax.set_ylabel("Y Label")
ax.set_zlabel("Z Label")
ax.scatter(x, y, z)
plt.show()
I can't figure out how to adjust this so I have two different dataframes plotted on the same plot but with different colors. How can I do this?
Edit: I'm looking for how to use two dataframes for a 3D plot specifically.
Assuming that you have two DataFrames called df1 and df2, both containing columns 'xitem', 'yitem', 'zitem', you can plot them in this way:
# Draw each DataFrame on the shared 3D axes with its own color.
for curr_df, c in zip((df1, df2), ('b', 'r')):
    # .values.T turns the three columns into three rows, unpacked as x, y, z
    ax.scatter(*curr_df[['xitem', 'yitem', 'zitem']].values.T, color=c)
Here is a complete example:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style = "darkgrid")

# Two random point clouds; the first is shifted by +1 on every axis
# so the two sets are visibly separated in the plot.
df1 = pd.DataFrame(
    data=np.random.random((100, 3)) + np.array([1, 1, 1]),
    columns=['xitem', 'yitem', 'zitem'],
)
df2 = pd.DataFrame(
    data=np.random.random((100, 3)),
    columns=['xitem', 'yitem', 'zitem'],
)

fig = plt.figure()
ax = fig.add_subplot(111, projection = '3d')
# One scatter call per DataFrame, each with its own color.
for curr_df, c in zip((df1, df2), ('b', 'r')):
    # .values.T turns the three columns into three rows, unpacked as x, y, z
    ax.scatter(*curr_df[['xitem', 'yitem', 'zitem']].values.T, color=c)
ax.set_xlabel("X Label")
ax.set_ylabel("Y Label")
ax.set_zlabel("Z Label")
plt.show()

Trying to make scatter plots in subplots using for-loops

I am trying to make subplots using for loop to go through my x variables in the dataframe. All plots would be a scatter plot.
X-variable: 'Protein', 'Fat', 'Sodium', 'Fiber', 'Carbo', 'Sugars'
y-variable: 'Cal'
This is where I am stuck
plt.subplot(2, 3, 2)
for i in range(3):
plt.scatter(i,sub['Cal'])
With this code:
import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_csv('data.csv')
# Every column except 'Cal' gets its own subplot.
columns = list(df.columns)
columns.remove('Cal')

# squeeze=False keeps `ax` a 2-D array even when only one column is left,
# so the indexing below never hits a bare Axes object.
fig, ax = plt.subplots(1, len(columns), figsize = (20, 5), squeeze=False)
for idx, col in enumerate(columns, 0):
    ax[0, idx].plot(df['Cal'], df[col], 'o')
    ax[0, idx].set_xlabel('Cal')
    ax[0, idx].set_title(col)
plt.show()
I get this subplot of scatter plots:
However, maybe it is a better choice to use a single scatterplot and use marker color in order to distinguish data type. See this code:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set_style('darkgrid')
df = pd.read_csv('data.csv')
# df.drop(columns = ['Sodium'], inplace = True) # <--- removes 'Sodium' column
table = df.melt('Cal', var_name = 'Type')
fig, ax = plt.subplots(1, 1, figsize = (10, 10))
sns.scatterplot(data = table,
x = 'Cal',
y = 'value',
hue = 'Type',
s = 200,
alpha = 0.5)
plt.show()
which gives this plot where all data are together:
The 'Sodium' values are different from others by far, so, if you remove this column with this line:
df.drop(columns = ['Sodium'], inplace = True)
you get a more readable plot:

How to use pandas df.plot.scatter to make a figure with subplots

Hello, how can I make a figure with scatter subplots using pandas? It works with plot, but not with scatter.
Here is an example:
import numpy as np
import pandas as pd
matrix = np.random.rand(200,5)
df = pd.DataFrame(matrix,columns=['index','A','B','C','D'])
#single plot, working with
df.plot(
kind='scatter',
x='index',
y='A',
s= 0.5
)
# not working
df.plot(
subplots=True,
kind='scatter',
x='index',
y=['A','B','C'],
s= 0.5
)
Error
raise ValueError(self._kind + " requires an x and y column")
ValueError: scatter requires an x and y column
Edit:
Solution to make a figure with subplots using df.plot
(Thanks to @Fourier)
import numpy as np
import pandas as pd
matrix = np.random.rand(200,5)#random data
df = pd.DataFrame(matrix,columns=['index','A','B','C','D']) #make df
#get a list for subplots
labels = list(df.columns)
labels.remove('index')
df.plot(
layout=(-1, 5),
kind="line",
x='index',
y=labels,
subplots = True,
sharex = True,
ls="none",
marker="o")
Would this work for you:
import pandas as pd
import numpy as np
df = pd.DataFrame({"index":np.arange(5),"A":np.random.rand(5),"B":np.random.rand(5),"C":np.random.rand(5)})
df.plot(kind="line", x="index", y=["A","B","C"], subplots=True, sharex=True, ls="none", marker="o")
Output
Note: This uses a line plot with invisible lines. For a scatter, I would go and loop over it.
# "index" is the x-axis, so plot every other column against it.
# (Slicing with [:-1] would drop the last data column instead of "index",
# because "index" is the first column of the frame defined above.)
for column in df.columns.drop("index"):
    df.plot(kind="scatter", x="index", y=column)
EDIT
In order to add custom ylabels you can do the following:
axes = df.plot(kind='line', x="index", y=["A","B","C"], subplots=True, sharex=True, ls="none", marker="o", legend=False)
ylabels = ["foo","bar","baz"]
# Pair each returned subplot Axes with its label, in order.
for ax, label in zip(axes, ylabels):
    ax.set_ylabel(label)

in pandas , add scatter plot to line plot

I am trying to add a scatter plot to a line plot by using the pandas plot function (in a Jupyter notebook).
I have tried the following code :
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# plot the line
a = pd.DataFrame({'a': [3,2,6,4]})
ax = a.plot.line()
# try to add the scatterplot
b = pd.DataFrame({'b': [5, 2]})
plot = b.reset_index().plot.scatter(x = 'index', y = 'b', c ='r', ax = ax)
plt.show()
I also checked the following various SO answers but couldn't find the solution.
If anyone can help me, that would be very appreciated.
EDIT:
Somehow the accepted answer works, but I realise that in my case the reason it was not working might have to do with the fact that I was using datetimes.
As in this code, I can't see the red dots...
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime as dt
%matplotlib inline
fig, ax = plt.subplots()
# plot the line
a = pd.DataFrame({'a': [3,2,6,4]}, index = pd.date_range(dt(2019,1,1), periods = 4))
plot = a.plot.line(ax = ax)
# try to add the scatterplot
b = pd.DataFrame({'b': [5, 2]}, index = [x.timestamp() for x in pd.date_range(dt(2019,1,1), periods = 2)])
plot = b.reset_index().plot.scatter(x = 'index', y = 'b', c ='r', ax = ax)
plt.show()
Any idea whats wrong here?
This should do it (just add fig, ax = plt.subplots() in the beginning):
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
fig, ax = plt.subplots()
# plot the line
a = pd.DataFrame({'a': [3,2,6,4]})
a.plot.line(ax=ax)
# try to add the scatterplot
b = pd.DataFrame({'b': [5, 2]})
plot = b.reset_index().plot.scatter(x = 'index', y = 'b', c ='r', ax = ax)
plt.show()
Edit:
This will work for datetimes:
from datetime import datetime as dt

import matplotlib.pyplot as plt
import pandas as pd
# %matplotlib inline
fig, ax = plt.subplots()
# plot the line; passing the DatetimeIndex as x lets matplotlib handle the dates
# (plt.plot_date is deprecated, a plain plot call does the same job)
a = pd.DataFrame({'a': [3,2,6,4]}, index = pd.date_range(dt(2019,1,1), periods = 4))
plot = ax.plot(a.index, a['a'], "-")
# add the scatterplot on the same Axes, again with real datetimes as x
b = pd.DataFrame({'b': [5, 2]}, index = pd.date_range(dt(2019,1,1), periods = 2))
plot = ax.scatter(x=b.index, y=b['b'], c='r')
plt.show()

Categories

Resources