Subplotting of Pandas.DataFrameGroupBy[group_name] does not yield expected results - python

This is a re-opening of my initial question with the same title, which was closed as a duplicate. As none of the suggested duplicates helped me to solve my problem, I am posting this question again.
I have a DataFrame with time series related to some devices which come from a hdf-file:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame
def open_dataset(file_name: str, name: str, combined_frame: DataFrame):
data_set: DataFrame = pd.read_hdf(file_name, key=name)
data_set['name'] = name
combined_frame = pd.concat([combined_frame, data_set], axis=0)
return combined_frame
if __name__ == '__main__':
names = ['YRT1IN1E', 'YRT1LE1', 'YRT1MH1', 'YR08DT1ML']
working_frame = DataFrame()
for name in names:
working_frame = open_dataset('data.h5', name, working_frame)
grouped_frame = working_frame.groupby('name')
fig, axs = plt.subplots(figsize=(10, 5),
nrows=4, ncols=1, # fix as above
gridspec_kw=dict(hspace=0), sharex=True)
axs = grouped_frame.get_group('YR08DT1ML').rawsum.plot()
axs = grouped_frame.get_group('YRT1LE1').voltage.plot()
axs = grouped_frame.get_group('YRT1MH1').current.plot()
axs = grouped_frame.get_group('YRT1IN1E').current.plot()
plt.show()
This produces the following output:
What am I doing wrong? I would like to have each of the plots in its own row, not all in one row.
The data file "data.h5" is available at: Google Drive
What I tried from the suggested posts:
Answer by joris, Mar 18, 2014 at 15:45 causes code to go into infinite loop, data is never plotted:
fig, axs = plt.subplots(nrows=2, ncols=2)
grouped_frame.get_group('YR08DT1ML').rawsum.plot(ax=axs[0,0])
grouped_frame.get_group('YR...').rawsum.plot(ax=axs[0,1])
grouped_frame.get_group('YR...').rawsum.plot(ax=axs[1,0])
grouped_frame.get_group('YR...').rawsum.plot(ax=axs[1,1])
A variation is leading to same result as I described above:
axes[0,0] = grouped_frame.get_group('YR08DT1ML').rawsum.plot()
axes[0,1] = grouped_frame.get_group('YR...').rawsum.plot()
...
Infinite loop happens as well for sedeh's, Jun 4, 2015 at 15:26 answer:
grouped_frame.get_group('YR08DT1ML').rawsum.plot(subplots=True, layout=(1,2))
...
Infinite loop happens as well for Justice_Lords, Mar 15, 2019 at 7:26 answer:
fig=plt.figure()
ax1=fig.add_subplot(4,1,1)
ax2=fig.add_subplot(4,1,2)
ax3=fig.add_subplot(4,1,3)
ax4=fig.add_subplot(4,1,4)
grouped_frame.get_group('YR08DT1ML').rawsum.plot(ax=ax1)
...
It seems to me that the problem is related to the fact that I plot with a pandas.DataFrameGroupBy and not a pandas.DataFrame

Seems like matplotlib was taking a long time to process the DatetimeIndex. Converting to a time and cleaning everything up did the trick:
names = ['YR08DT1ML', 'YRT1LE1', 'YRT1MH1', 'YRT1IN1E']
df = pd.concat([pd.read_hdf('data.h5', name) for name in names])
df.reset_index(inplace=True)
df.index = df['time'].dt.time
df.sort_index(inplace=True)
fig, axes = plt.subplots(figsize=(10, 5), nrows=4, ncols=1, gridspec_kw=dict(hspace=0), sharex=True)
cols = ['rawsum', 'voltage', 'current', 'current']
for ix, name in enumerate(names):
df.loc[df['nomen'].eq(name), cols[ix]]\
.plot(ax=axes[ix])
plt.show();
Hope this helps.

Thanks to #fishmulch's answer I found a way to do what I wanted. However, I want to provide an answer for my initial question how to plot the "groupby" data set. The following __main__ function provides the desired output with input file data.h5:
if __name__ == '__main__':
    # gridspec is not among the imports shown with the question's script,
    # so bring it into scope here; without it GridSpec raises NameError.
    from matplotlib import gridspec

    names = ['YRT1IN1E', 'YRT1LE1', 'YRT1MH1', 'YR08DT1ML']
    # Column to plot for each device, in the same order as ``names``.
    cols = ['current', 'voltage', 'current', 'rawsum']

    # Stack the per-device frames; open_dataset tags each with a 'name' column.
    working_frame = DataFrame()
    for name in names:
        working_frame = open_dataset('data.h5', name, working_frame)
    grouped_frame = working_frame.groupby('name')

    fig = plt.figure(1)
    # One grid row per device, a single column.
    gs = gridspec.GridSpec(len(names), 1)
    gs.update(wspace=0.0, hspace=0.0)  # set the spacing between axes.

    ax = None
    for row, (name, col) in enumerate(zip(names, cols)):
        data = grouped_frame.get_group(name)
        # sharex=None (first row) creates an independent x axis; every
        # later row shares the x axis of the row above it.
        ax = fig.add_subplot(gs[row], sharex=ax)
        ax.plot(data.get(col))
    plt.show()
... some beautification still needed ...

Related

Understanding why looping subplot does not work with loc()

Considering the following toy dataset:
# Toy dataset: 100 random integers in [0, 100) in column "A", plus a
# "new_ID" column that cycles through four sample labels (25 rows each).
df = pd.DataFrame(np.random.randint(0,100,size=(100, 1)), columns=["A"])
my_ID = ["control", "sample1", "sample2", "sample3"]
new_ID = 25 * my_ID
df["new_ID"] = new_ID
I make a subplot for df["A"] based on the unique values of df["new_ID"] as follows:
unique_list = list(df["new_ID"].unique())
fig, axs = plt.subplots(int(len(unique_list) / 2) , 2)
for unique_sample, ax in zip(unique_list, axs.flat):
sns.histplot(data=df[df["new_ID"] == unique_sample], x= "A", ax=ax)
ax.set_title(unique_sample)
The code works well when we filter inside the loop with df[df["new_ID"] == unique_sample].
However, the script will give us four same subplots if we write it as:
d = df.loc[df["new_ID"] == unique_sample]
fig, axs = plt.subplots(int(len(unique_list) / 2) , 2)
for unique_sample, ax in zip(unique_list, axs.flat):
g1 = sns.histplot(data=d, x= "A", ax=ax)
ax.set_title(unique_sample)
I am wondering why loc() cannot be used in iteration, although, we can use it in making single plots.
Cannot reproduce unfortunately. Using your code - there are four different plots. seaborn 0.11.2, pandas 1.3.5.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.DataFrame(np.random.randint(0,100,size=(100, 1)), columns=["A"])
my_ID = ["control", "sample1", "sample2", "sample3"]
new_ID = 25 * my_ID
df["new_ID"] = new_ID
unique_list = df["new_ID"].unique()
print(unique_list)
#I make a subplot for df["A"] based on the unique values of df["new_ID"] as follows:
fig, axs = plt.subplots(int(len(unique_list) / 2) , 2)
for unique_sample, ax in zip(unique_list, axs.flat):
sns.histplot(data=df[df["new_ID"] == unique_sample], x= "A", ax=ax)
ax.set_title(unique_sample)
plt.tight_layout()
For what it's worth, consider using sns.displot(kind="hist", ...) for this job! It has built in support for facet grid and will do the "facetting" by new_ID easily for you, you don't have to manage the axes manually.
This is all it takes to get the equivalent display:
sns.displot(data=df, col="new_ID", col_wrap=2, x="A");

How to plot value counts for each subset in matplotlib/seaborn?

I am relatively new to matplotlib and there is probably a better way to deal with the problem. I have tried sns.countplot(), which does not have a sorting option. So I tried to do it with a bar plot and pandas for counting:
my_data = pd.DataFrame({'actions': ['buy','buy','buy','observe','consult'] , 'places':['NY','AR','AR','NY','AR']})
fig, axs = plt.subplots(1, 2, figsize = (5,7))
axs = axs.ravel()
for place in my_data['places']:
x = 0
temp_df = my_data[my_data['places'] == place]
axs[x] = sns.barplot(y=temp_df.actions.value_counts().index, x=temp_df.actions.value_counts().values, color="#43B8E7",orient = 'h')
axs[x].set_title(place)
x=+1
where data look like
actions places
0 buy NY
1 buy AR
2 buy AR
3 observe NY
4 consult AR
and the code produces what's below. As you may have assumed, I need to plot NY as well, however, because of subsetting or something missed in the loop it does not work well. How to fix that? I feel that this is the easy one, however, cannot find it.
Are you looking for:
(my_data.groupby('places')['actions']
.value_counts().unstack('places')
.plot.bar(subplots=True)
)
Or similarly:
(pd.crosstab(my_data['actions'], my_data['places'])
.plot.bar(subplots=True)
)
Output:
If you want horizontal bars:
(pd.crosstab(my_data['actions'], my_data['places'])
.plot.barh(subplots=True, layout=[1,2])
)
Output:
Or we can fix your code:
fig, axs = plt.subplots(1, 2, figsize = (5,7))
axs = axs.ravel()
for ax,place in zip(axs,my_data['places'].unique()):
temp_df = my_data[my_data['places'] == place].actions.value_counts()
sns.barplot(y=temp_df.index, x=temp_df,
color="#43B8E7", ax=ax, orient = 'h')
ax.set_title(place)
Output (which isn't very well-aligned IMHO):
I would use a facetgrid since you're already using seaborn:
import pandas
import seaborn
axgrid = pandas.DataFrame({
'actions': ['buy','buy','buy','observe','consult'] ,
'places':['NY','AR','AR','NY','AR']
}).pipe((seaborn.catplot, 'data'),
y="actions", col="places",
order=['buy', 'consult', 'observe'],
kind="count"
)
And you get:

Plotting 3 different columns from a CSV file python

My goal is to use the sorted result data to plot "Month vs Mean Temp" graph for each year on the same window.
I've sorted the first two columns, which hold the year and the month respectively, and then saved the new sorted data into a file called NewFile, but I can't seem to get to a solution here. I used csv reader and now I'm using numpy.
Code:
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
csv1 = open('Data_5.1.csv')
data = np.array(list(csv.reader(csv1,delimiter=',').astype("string")
year = data[:,0]
mounth = data[:,1]
temp= data[:,3]
fig, ax = plt.subplots(figsize=(10,10))
ax.plot(year, mounth, label='mounth/year')
ax.plot(year, temp, label='year/temp')
plt.legend()
But it just throws an error saying:
File "<ipython-input-282-282e91df631f>", line 9
year = data[:,0]
^
SyntaxError: invalid syntax
I will put two links to the files, the Data_5.1 and the NewFile respectively
Data_5.1
NewFile
1 - You didn't close brackets in line 6, hence you are getting the error in line 8.
2 - astype("string") is not needed in line 6.
I fixed your code, but you will have to complete the subplotting. Good luck!
import numpy as np
import matplotlib.pyplot as plt
import csv
plt.style.use('ggplot')
# Read every CSV row into a 2-D array of strings. The context manager closes
# the file handle as soon as the rows have been consumed; newline='' is the
# csv module's recommended way to open files for csv.reader.
with open('Data_5.1.csv', newline='') as csv1:
    data = np.array(list(csv.reader(csv1, delimiter=',')))
year = data[:,0]
mounth = data[:,1]
temp = data[:,3]
fig, ax = plt.subplots(2,2) #This will create a 2x2 grid (4 subplots) in one window
ax[0,0].plot(year, mounth, label='mounth/year') #This will plot in the 0,0 subplot
ax[0,1].plot(year, temp, label='year/temp') #This will plot in the 0,1 subplot
'''
For you to continue.
'''
# plt.legend() acts on the current axes only (the last, still empty subplot),
# so attach a legend to each subplot that actually has labelled lines.
ax[0,0].legend()
ax[0,1].legend()
plt.show()
Your data is in a CSV file, and it's non-homogenous in type. Pandas is really the more appropriate tool for this.
I had to adapt your CSV slightly due to encoding errors, here is what it ended up looking like:
year,Month,Other Month,temperature_C
2003,Jan.,Some val,17.7
2004,Jan.,Some val,19.5
2005,Jan.,Some val,17.3
2006,Jan.,Some val,17.8
...
Here is a general sketch of what the code you shared could look like after the refactoring:
import matplotlib.pyplot as plt
import pandas as pd
plt.style.use('ggplot')
# csv1 = open('Data_5.1.csv')
# data = np.array(list(csv.reader(csv1,delimiter=',').astype("string")
df_1 = pd.read_csv('../resources/Data_5.1.csv', header=0, names=['year', 'month', 'some_col', 'temp'],
dtype={'some_col': str, 'temp': float, 'month': str, 'year': str})
year = df_1['year']
month = df_1['month']
temp = df_1['temp']
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(year, month, label='month/year')
ax.plot(year, temp, label='year/temp')
plt.show()
Let me know if you have any questions :)

Pandas plot: Assign Colors

I have many data frames that I am plotting for a presentation. These all have different columns, but all contain the same additional column foobar. At the moment, I am plotting these different data frames using
df.plot(secondary_y='foobar')
Unfortunately, since these data frames all have different additional columns with different ordering, the color of foobar is always different. This makes the presentation slides unnecessarily complicated. I would like to ensure that, throughout the different plots, foobar is always plotted bold and black.
Looking at the docs, the only thing coming close appears to be the parameter colormap - I would need to ensure that the xth color in the color map is always black, where x is the order of foobar in the data frame. Seems to be more complicated than it should be, also this wouldn't make it bold.
Is there a (better) approach?
I would suggest using matplotlib directly rather than the dataframe plotting methods. If df.plot returned the artists it added instead of an Axes object it wouldn't be too bad to change the color of the line after it was plotted.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
def pandas_plot(ax, df, callout_key):
    """
    Plot every column of ``df`` against its index, highlighting one column.

    Parameters
    ----------
    ax : mpl.Axes
        The axes to draw to
    df : DataFrame
        Data to plot
    callout_key : str
        key to highlight

    Returns
    -------
    dict
        Maps each column label to the line artist returned by ``ax.plot``.
    """
    artists = {}
    x = df.index.values
    # DataFrame.iteritems() was removed in pandas 2.0; items() yields the
    # same (label, Series) pairs and exists in older releases as well.
    for k, v in df.items():
        style_kwargs = {}
        if k == callout_key:
            # Highlighted column: black and thicker than the default.
            style_kwargs['c'] = 'k'
            style_kwargs['lw'] = 2
        # Label each line so that ax.legend() below has entries to show.
        ln, = ax.plot(x, v.values, label=k, **style_kwargs)
        artists[k] = ln
    ax.legend()
    ax.set_xlim(np.min(x), np.max(x))
    return artists
Usage:
fig, ax = plt.subplots()
ax2 = ax.twinx()
th = np.linspace(0, 2*np.pi, 1024)
df = pd.DataFrame({'cos': np.cos(th), 'sin': np.sin(th),
'foo': np.sin(th + 1), 'bar': np.cos(th +1)}, index=th)
df2 = pd.DataFrame({'cos': -np.cos(th), 'sin': -np.sin(th)}, index=th)
pandas_plot(ax, df, 'sin')
pandas_plot(ax2, df2, 'sin')
Perhaps you could define a function which handles the special column in a separate plot call:
def emphasize_plot(ax, df, col, **emphargs):
columns = [c for c in df.columns if c != col]
df[columns].plot(ax=ax)
df[col].plot(ax=ax, **emphargs)
Using code from tcaswell's example,
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def emphasize_plot(ax, df, col, **emphargs):
columns = [c for c in df.columns if c != col]
df[columns].plot(ax=ax)
df[col].plot(ax=ax, **emphargs)
fig, ax = plt.subplots()
th = np.linspace(0, 2*np.pi, 1024)
df = pd.DataFrame({'cos': np.cos(th), 'foobar': np.sin(th),
'foo': np.sin(th + 1), 'bar': np.cos(th +1)}, index=th)
df2 = pd.DataFrame({'cos': -np.cos(th), 'foobar': -np.sin(th)}, index=th)
emphasize_plot(ax, df, 'foobar', lw=2, c='k')
emphasize_plot(ax, df2, 'foobar', lw=2, c='k')
plt.show()
yields
I used #unutbut's answer and extended it to allow for a secondary y axis and correct legends:
def emphasize_plot(ax, df, col, **emphargs):
    """Plot ``df`` on ``ax`` with ``col`` emphasized on a secondary y axis.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes that receives every column except ``col``.
    df : DataFrame
        Data to plot.
    col : label
        Column drawn on a twin y axis created from ``ax``.
    **emphargs
        Extra keyword arguments forwarded to the plot call for ``col``
        (for example ``lw=2, c='k'``).
    """
    columns = [c for c in df.columns if c != col]
    ax2 = ax.twinx()
    # Suppress the automatic legend on the primary axes; a single combined
    # legend is built below, so leaving it on would draw two legends.
    df[columns].plot(ax=ax, legend=False)
    df[col].plot(ax=ax2, **emphargs)
    # Merge the handles of both axes into one legend.
    lines, labels = ax.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc=0)

Plot panda series in separate subplots using matplotlib

Hoping to get some help please. I'm trying to plot simulation data in separate subplots using pandas and matplotlib. My code so far is:
import matplotlib.pylab as plt
import pandas as pd
fig, ax = plt.subplots(2, 3)
for i in range(2):
for j in range(50, 101, 10):
for e in range(3):
Var=(700* j)/ 100
Names1 = ['ig','M_GZ']
Data1 = pd.read_csv('~/File/JTL_'+str(Var)+'/GZ.csv', names=Names1)
ig = Data1['ig']
M_GZ=Data1['M_GZ']
MGZ = Data1[Data1.M_GZ != 0]
ax[i, e].plot(MGZ['ig'][:4], MGZ['M_GZ'][:4], '--v', linewidth=1.75)
plt.tight_layout()
plt.show()
But the code gives me 6 duplicate copies of the same plot:
instead of each iteration of Var having its own plot, I've tried changing the loop and using different variations like:
fig = plt.figure()
for i in range(1, 7):
ax = fig.add_subplot(2, 3, i)
for j in range(50, 101, 10):
Var=(700* j)/ 100
Names1 = ['ig','M_GZ']
Data1 = pd.read_csv('~/File/JTL_'+str(Var)+'/GZ.csv', names=Names1)
ig = Data1['ig']
M_GZ=Data1['M_GZ']
MGZ = Data1[Data1.M_GZ != 0]
ax.plot(MGZ['ig'][:4], MGZ['M_GZ'][:4], '--v', linewidth=1.75)
plt.tight_layout()
plt.show()
but that changes nothing I still get the same plot as above. Any help would be appreciated, I'm hoping that each subplot contains one set of data instead of all six
This is a link to one of the DataFrames; each subdirectory ~/File/JTL_'+str(Var)+'/ contains a copy of this file, and there are 6 in total.
The problem is in your loop
for i in range(2): # Iterating rows of the plot
for j in range(50, 101, 10): # Iterating your file names
for e in range(3): # iterating the columns of the plot
The end result is that you iterate all the columns for each file name
For it to work, you should have only two nesting levels in your loop. Potential code (updated):
import matplotlib.pylab as plt
import pandas as pd
# Draw one data file per subplot on a 2x3 grid.
n_rows, n_cols = 2, 3
fig, ax = plt.subplots(n_rows, n_cols)
file_indices = range(50, 101, 10)  # six values, one per subplot
for row in range(n_rows):
    for col in range(n_cols):
        # Row-major position in the grid, so each subplot gets its own file.
        # (row + 1 * col reduces to row + col, which repeats indices.)
        f_index = file_indices[row * n_cols + col]
        print(row, col, f_index)
        # Floor division keeps Var an integer, so the directory name has
        # no trailing ".0" on Python 3.
        Var = (700 * f_index) // 100
        Names1 = ['ig','M_GZ']
        Data1 = pd.read_csv('~/File/JTL_'+str(Var)+'/GZ.csv', names=Names1)
        # Keep only the rows whose M_GZ value is non-zero.
        MGZ = Data1[Data1.M_GZ != 0]
        ax[row, col].plot(MGZ['ig'][:4], MGZ['M_GZ'][:4], '--v',linewidth=1.75)
plt.tight_layout()
plt.show()

Categories

Resources