I have a large dataframe where I calculate a p-value using a t-test for each row. I now want a boxplot of the rows with the ten lowest p-values.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

LeadSNPs = pd.unique(candidate_genes.LeadSNP)  # rs3184504 rs531612
gene_counts_per_snp_df = pd.DataFrame()
save_path = "../figures/SM5_gene_counts/"
for LeadSNP_cnt, LeadSNP in enumerate(LeadSNPs):
    print(LeadSNP)
    candidate_genes_per_SNP = candidate_genes.Target[np.where(candidate_genes.LeadSNP == LeadSNP)[0]]
    region = pd.unique(candidate_genes.Region[np.where(candidate_genes.LeadSNP == LeadSNP)[0]])
    first_gene_flag = 1
    for gene_cnt, target_gene in enumerate(candidate_genes_per_SNP):
        gene_indexes = candidate_genes_per_SNP.index
        PRE = candidate_genes['sumOfWeightedWeights (PRE)'][gene_indexes[gene_cnt]]
        print(target_gene)
        ensembl_id = get_ensembl_id(target_gene)
        print(ensembl_id)
        if pd.isnull(ensembl_id):
            pass
        else:
            gene_counts_df = get_gene_counts_df(ensembl_id)
            if gene_counts_df.shape[0] == 0:
                print('no ensembl id found in gene counts!')
            else:
                gene_counts_df = gene_counts_df.melt(id_vars=["Gene"], var_name='compartment', value_name='count')
                gene_counts_df = reshape_gene_counts_df(gene_counts_df)
                gene_counts_df['target_gene'] = target_gene
                gene_counts_df['PRE'] = PRE
                gene_counts_df['pval_ftest'] = np.nan
                pop3 = gene_counts_df.loc[(gene_counts_df['target_gene'] == target_gene) & (gene_counts_df['compartment'] == 'CSF_N')]['count']
                pop4 = gene_counts_df.loc[(gene_counts_df['target_gene'] == target_gene) & (gene_counts_df['compartment'] == 'PB_N')]['count']
                pval_ftest = stats.ttest_ind(pop3, pop4)[1]
                gene_counts_df.loc[(gene_counts_df['target_gene'] == target_gene) & (gene_counts_df['compartment'].isin(['CSF_N', 'PB_N'])), "pval_ftest"] = pval_ftest
                if first_gene_flag == 1:
                    gene_counts_per_snp_df = gene_counts_df
                    first_gene_flag = 0
                else:
                    gene_counts_per_snp_df = pd.concat([gene_counts_per_snp_df, gene_counts_df])
    gene_counts_per_snp_df['LeadSNP'] = LeadSNP
    if LeadSNP_cnt == 0:
        all_gene_counts = gene_counts_per_snp_df
    else:
        all_gene_counts = pd.concat([all_gene_counts, gene_counts_per_snp_df])
all_gene_counts = all_gene_counts.reset_index()
plot_top_genes_snps(all_gene_counts, 'target_gene')
and the plotting code is given here:
def plot_top_genes_snps(all_gene_counts_per_comp, x_label):
    # comp_order and comp_dict are assumed to be defined elsewhere in the script
    sns.set(style="white")
    sns.set_context("poster")
    palette = sns.color_palette("colorblind", 10)
    fig, ax = plt.subplots(figsize=(25, 4))
    g = sns.boxplot(ax=ax, y='count', x=x_label, data=all_gene_counts_per_comp, hue='compartment',
                    showfliers=False, palette=palette, hue_order=comp_order)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
    handles, _ = ax.get_legend_handles_labels()
    current_legends = []
    for str_ind in range(len(handles)):
        current_legends.append(comp_dict[handles[str_ind].get_label()])
    ax.legend(handles, current_legends, bbox_to_anchor=(1, 1), loc=2)
    ax.yaxis.grid()
    sns.set(font_scale=2)
    plt.xlabel('')
    plt.ylabel('Gene count')
    # plt.savefig(save_path + str(LeadSNP) + '.pdf', bbox_inches='tight')
    plt.show()
For context, I want the ten target_gene values with the lowest p-values. However, this is the plot I am getting:
[image: boxplot showing all genes and p-values]
How do I extract only the ten lowest p-values and boxplot them?
Update: The dataframe looks like this (the table is repeated for different SNPs):
[image: screenshot of the dataframe]
The dataframe in text format:
Gene compartment count patient_id target_gene PRE \
1 ENSG00000157870 CSF_N 0 1 FAM213B 7.5
11 ENSG00000157870 CSF_N 0 2 FAM213B 7.5
21 ENSG00000157870 CSF_N 0 3 FAM213B 7.5
31 ENSG00000157870 CSF_N 0 4 FAM213B 7.5
41 ENSG00000157870 CSF_N 0 5 FAM213B 7.5
.. ... ... ... ... ... ...
21 ENSG00000182866 CSF_N 18 3 LCK 2.0
31 ENSG00000182866 CSF_N 45 4 LCK 2.0
41 ENSG00000182866 CSF_N 0 5 LCK 2.0
51 ENSG00000182866 CSF_N 9 6 LCK 2.0
61 ENSG00000182866 CSF_N 0 7 LCK 2.0
pval_ftest LeadSNP
1 0.222523 rs6670198
11 0.222523 rs6670198
21 0.222523 rs6670198
31 0.222523 rs6670198
41 0.222523 rs6670198
all_gene_counts_per_comp.sort_values(by="pval_ftest").iloc[:10, :]
will give you the top 10 rows with the smallest "pval_ftest" value. (Note the positional .iloc: after sorting, label-based .loc slicing would pick rows by their original index labels rather than by their new order.)
Maybe this toy example will make it clearer how to sort and select subsets of a DataFrame.
>>> df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 3, 2, 1]})
>>> print(df)
a b
0 1 4
1 2 3
2 3 2
3 4 1
>>> df_sorted = df.sort_values(by="b")
>>> print(df_sorted)
a b
3 4 1
2 3 2
1 2 3
0 1 4
>>> print(df_sorted.iloc[:2, :])
a b
3 4 1
2 3 2
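Since each target_gene repeats across many rows (one per patient and compartment), taking the ten smallest rows may return fewer than ten distinct genes. Here is a minimal sketch of one way to pick the ten genes with the lowest p-values and boxplot only those, assuming the all_gene_counts frame and the plot_top_genes_snps function from the question:
# one p-value per gene (min skips the NaN rows), then the ten smallest
top_genes = (all_gene_counts.groupby('target_gene')['pval_ftest']
             .min()
             .nsmallest(10)
             .index)
# keep only the rows belonging to those genes and reuse the plotting function
top_df = all_gene_counts[all_gene_counts['target_gene'].isin(top_genes)]
plot_top_genes_snps(top_df, 'target_gene')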
I did try other solutions that are similar to my question, but I did not succeed:
python: how to plot one line in different colors
Color by Column Values in Matplotlib
pandas plot one line graph with color change on column
I want the plot to change color when the value changes: for instance, if the emotion is 0 it stays black, if the value changes to 1 the color becomes red, if the value is 2 the color becomes blue, and so on. The progress I've made so far is attached to this question; thank you in advance.
import numpy as np
import matplotlib.pyplot as plt

random_emotions = [0,0,0,0,0,0,0,1,2,3,2,1,2,1,
                   2,3,2,2,2,2,1,1,2,3,3,3,3,3,3,4,
                   4,4,4,4,2,1,2,2,1,2,3,4,0,0,0,0,0]
random_emotions = np.array(random_emotions)
EmotionsInNumber = random_emotions
x = np.array(list(range(0,len(EmotionsInNumber))))
Angry = np.ma.masked_where(EmotionsInNumber == 0,EmotionsInNumber)
Fear = np.ma.masked_where(EmotionsInNumber == 1,EmotionsInNumber)
Happy = np.ma.masked_where(EmotionsInNumber == 2,EmotionsInNumber)
Neutral = np.ma.masked_where(EmotionsInNumber == 3, EmotionsInNumber)
Sad = np.ma.masked_where(EmotionsInNumber == 4,EmotionsInNumber)
fig, ax = plt.subplots()
ax.plot(x, Angry,linewidth = 4, color = 'black')
ax.plot(x, Fear,linewidth = 4, color = 'red')
ax.plot(x, Happy,linewidth = 4, color = 'blue')
ax.plot(x, Neutral,linewidth = 4, color = 'yellow')
ax.plot(x, Sad,linewidth = 4, color = 'green')
ax.legend(['Angry','Fear','Happy','Neutral','Sad',])
ax.set_title("Emotion Report of ")
plt.show()
This is the result that I am getting
The color is not changed accordingly, the legends are wrong and I have no idea how to fix this.
matplotlib color line by "value" [duplicate]
This 'matplotlib color line by "value" [duplicate]' is the closest I got, but when the color changes to cyan at indexes 1 and 5, the blue should be empty, yet it keeps plotting both blue and cyan. This is because the dataframe is grouped by 'colors', but it should not plot blue at 1 and 5, or cyan at 2, 3 and 4, on the graph.
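For what it's worth, one concrete bug in the attempt above: np.ma.masked_where masks the points where the condition is True, so each series hides exactly the emotion it is meant to show. Inverting the comparisons keeps only the matching points; a minimal sketch of that fix (not the approach from the duplicate):
# mask everything that is NOT the emotion of interest
Angry = np.ma.masked_where(EmotionsInNumber != 0, EmotionsInNumber)
Fear = np.ma.masked_where(EmotionsInNumber != 1, EmotionsInNumber)
Happy = np.ma.masked_where(EmotionsInNumber != 2, EmotionsInNumber)
Neutral = np.ma.masked_where(EmotionsInNumber != 3, EmotionsInNumber)
Sad = np.ma.masked_where(EmotionsInNumber != 4, EmotionsInNumber)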
The main question will be closed as a duplicate of this answer to this question.
The code is explained in the duplicates.
When a question is marked as a duplicate and you don't agree, it is your responsibility to show, with code, exactly how you tried to incorporate the duplicate and what's not working.
SO is a repository of questions and answers, which can be used as a reference to answer new questions. When a question is answered by code in an existing question/answer, it is up to you to do the work.
Since it's a duplicate, this answer has been added as a community wiki.
from matplotlib.lines import Line2D
import pandas as pd
import matplotlib.pyplot as plt
# set up the dataframe to match the duplicate
random_emotions = [0,0,0,0,0,0,0,1,2,3,2,1,2,1, 2,3,2,2,2,2,1,1,2,3,3,3,3,3,3,4, 4,4,4,4,2,1,2,2,1,2,3,4,0,0,0,0,0]
df = pd.DataFrame({'val': random_emotions})
# map values is covered in duplicate
emotion_dict = {0: 'Angry', 1: 'Fear', 2: 'Happy', 3: 'Neutral', 4: 'Sad'}
color_dict = {0: 'k', 1: 'r', 2: 'b', 3: 'y', 4: 'g'}
df['emotion'] = df.val.map(emotion_dict)
df['color'] = df.val.map(color_dict)
# everything else from here is duplicated
df['change'] = df.val.ne(df.val.shift().bfill()).astype(int)
df['subgroup'] = df['change'].cumsum()
df.index += df['subgroup'].values
first_i_of_each_group = df[df['change'] == 1].index
for i in first_i_of_each_group:
    # Copy next group's first row to current group's last row
    df.loc[i-1] = df.loc[i]
    # But make this new row part of the current group
    df.loc[i-1, 'subgroup'] = df.loc[i-2, 'subgroup']
# Don't need the change col anymore
df.drop('change', axis=1, inplace=True)
df.sort_index(inplace=True)
# Create duplicate indexes at each subgroup border to ensure the plot is continuous.
df.index -= df['subgroup'].values
fig, ax = plt.subplots(figsize=(15, 4))
for k, g in df.groupby('subgroup'):
    g.plot(ax=ax, y='val', color=g['color'].values[0], marker='.', legend=False, xticks=df.index)
ax.margins(x=0)
# create custom legend is covered in duplicate
custom_lines = [Line2D([0], [0], color=color, lw=4) for color in color_dict.values()]
_ = ax.legend(title='Emotion', handles=custom_lines, labels=emotion_dict.values(), bbox_to_anchor=(1, 1.02), loc='upper left')
# display(df.T)
0 1 2 3 4 5 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 16 17 18 19 20 20 21 22 22 23 23 24 25 26 27 28 29 29 30 31 32 33 34 34 35 35 36 36 37 38 38 39 39 40 40 41 41 42 42 43 44 45 46
val 0 0 0 0 0 0 0 1 1 2 2 3 3 2 2 1 1 2 2 1 1 2 2 3 3 2 2 2 2 2 1 1 1 2 2 3 3 3 3 3 3 3 4 4 4 4 4 4 2 2 1 1 2 2 2 1 1 2 2 3 3 4 4 0 0 0 0 0 0
emotion Angry Angry Angry Angry Angry Angry Angry Fear Fear Happy Happy Neutral Neutral Happy Happy Fear Fear Happy Happy Fear Fear Happy Happy Neutral Neutral Happy Happy Happy Happy Happy Fear Fear Fear Happy Happy Neutral Neutral Neutral Neutral Neutral Neutral Neutral Sad Sad Sad Sad Sad Sad Happy Happy Fear Fear Happy Happy Happy Fear Fear Happy Happy Neutral Neutral Sad Sad Angry Angry Angry Angry Angry Angry
color k k k k k k k r r b b y y b b r r b b r r b b y y b b b b b r r r b b y y y y y y y g g g g g g b b r r b b b r r b b y y g g k k k k k k
subgroup 0 0 0 0 0 0 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 10 10 10 11 11 11 12 12 13 13 13 13 13 13 13 14 14 14 14 14 14 15 15 16 16 17 17 17 18 18 19 19 20 20 21 21 22 22 22 22 22
I want to sum up data across overlapping bins. Basically the question here, but instead of the bins being (0-8 years old), (9-17 years old), (18-26 years old), (27-35 years old), and (36-44 years old), I want them to be (0-8 years old), (1-9 years old), (2-10 years old), (3-11 years old), and (4-12 years old).
Starting with a df like this
id  awards  age
1   100     24
1   150     26
1   50      54
2   193     34
2   209     50
I am using the code from this answer to calculate summation across non-overlapping bins.
bins = [9 * i for i in range(0, df['age'].max() // 9 + 2)]
cuts = pd.cut(df['age'], bins, right=False)
print(cuts)
0 [18, 27)
1 [18, 27)
2 [54, 63)
3 [27, 36)
4 [45, 54)
Name: age, dtype: category
Categories (7, interval[int64, left]): [[0, 9) < [9, 18) < [18, 27) < [27, 36) < [36, 45) < [45, 54) < [54, 63)]
df_out = (df.groupby(['id', cuts])
            .agg(total_awards=('awards', 'sum'))
            .reset_index(level=0)
            .reset_index(drop=True)
          )
df_out['age_interval'] = df_out.groupby('id').cumcount()
Result
print(df_out)
id total_awards age_interval
0 1 0 0
1 1 0 1
2 1 250 2
3 1 0 3
4 1 0 4
5 1 0 5
6 1 50 6
7 2 0 0
8 2 0 1
9 2 0 2
10 2 193 3
11 2 0 4
12 2 209 5
13 2 0 6
Is it possible to work off the existing code to do this with overlapping bins?
First pivot_table your data to get one row per id with the ages as columns. Then reindex to get all possible ages, from 0 up to at least the max of the age column (here I use the max plus the interval length). Now you can use rolling along the columns. Rename the columns to create meaningful names. Finally, stack and reset_index to get a dataframe with the expected shape.
interval = 9  # include both bounds, e.g. 0 and 8 for the first interval
res = (
    df.pivot_table(index='id', columns='age', values='awards',
                   aggfunc=sum, fill_value=0)
      .reindex(columns=range(0, df['age'].max() + interval), fill_value=0)
      .rolling(interval, axis=1, min_periods=interval).sum()
      .rename(columns=lambda x: f'{x - interval + 1}-{x} y.o.')
      .stack()
      .reset_index(name='awards')
)
and with the input data provided in the question you get
print(res)
# id age awards
# 0 1 0-8 y.o. 0.0
# 1 1 1-9 y.o. 0.0
# ...
# 15 1 15-23 y.o. 0.0
# 16 1 16-24 y.o. 100.0
# 17 1 17-25 y.o. 100.0
# 18 1 18-26 y.o. 250.0
# 19 1 19-27 y.o. 250.0
# 20 1 20-28 y.o. 250.0
# 21 1 21-29 y.o. 250.0
# 22 1 22-30 y.o. 250.0
# 23 1 23-31 y.o. 250.0
# 24 1 24-32 y.o. 250.0
# 25 1 25-33 y.o. 150.0
# 26 1 26-34 y.o. 150.0
# 27 1 27-35 y.o. 0.0
# ...
# 45 1 45-53 y.o. 0.0
# 46 1 46-54 y.o. 50.0
# 47 1 47-55 y.o. 50.0
# 48 1 48-56 y.o. 50.0
# 49 1 49-57 y.o. 50.0
# ...
I think the best would be to first compute per-age sums, and then use a rolling window to get all 9-year intervals. This only works because all your intervals have the same size; otherwise it would be much harder.
>>> totals = df.groupby('age')['awards'].sum()
>>> totals = totals.reindex(np.arange(0, df['age'].max() + 9)).fillna(0, downcast='infer')
>>> totals
0 6
1 2
2 4
3 6
4 4
..
98 0
99 0
100 0
101 0
102 0
Name: age, Length: 103, dtype: int64
>>> totals.rolling(9).sum().dropna().astype(int).rename(lambda age: f'{age-8}-{age}')
0-8 42
1-9 43
2-10 45
3-11 47
4-12 47
..
90-98 31
91-99 27
92-100 20
93-101 13
94-102 8
Name: age, Length: 95, dtype: int64
This is slightly complicated by the fact you also want to group by id, but the idea stays the same:
>>> idx = pd.MultiIndex.from_product([df['id'].unique(), np.arange(0, df['age'].max() + 9)], names=['id', 'age'])
>>> totals = df.groupby(['id', 'age']).sum().reindex(idx).fillna(0, downcast='infer')
>>> totals
awards
1 0 128
1 204
2 136
3 367
4 387
... ...
2 98 0
99 0
100 0
101 0
102 0
[206 rows x 1 columns]
>>> totals.groupby('id').rolling(9).sum().droplevel(0).dropna().astype(int).reset_index('id')
id awards
age
8 1 3112
9 1 3390
10 1 3431
11 1 3609
12 1 3820
.. .. ...
98 2 1786
99 2 1226
100 2 900
101 2 561
102 2 317
[190 rows x 2 columns]
This is the same as @Ben.T's answer, except we keep the Series shape while his answer pivots it to a dataframe. At any step you could .stack('age') or .unstack('age') to switch between both answers' formats, as the sketch below illustrates.
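A hedged one-liner for each direction, assuming the totals frame from the grouped example above:
per_id_wide = totals['awards'].unstack('age')  # Ben.T-style: one row per id, one column per age
back_to_long = per_id_wide.stack()             # back to the long layout used in this answer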
IIUC, you can use pd.IntervalIndex with some list comprehension:
ii = pd.IntervalIndex.from_tuples(
    [
        (s, e)
        for e, s in pd.Series(np.arange(51)).rolling(9).agg(min).dropna().items()
    ]
)
df_out = pd.concat(
    [
        pd.Series(ii.contains(x["age"]) * x["awards"], index=ii)
        for i, x in df[["age", "awards"]].iterrows()
    ],
    axis=1,
).groupby(level=0).sum().T
df_out.stack()
Output:
0 (0.0, 8.0] 0
(1.0, 9.0] 0
(2.0, 10.0] 0
(3.0, 11.0] 0
(4.0, 12.0] 0
...
4 (38.0, 46.0] 0
(39.0, 47.0] 0
(40.0, 48.0] 0
(41.0, 49.0] 0
(42.0, 50.0] 209
Length: 215, dtype: int64
An old way without pd.cut, using a for loop and some masks.
import pandas as pd

max_age = df["age"].max()
interval_length = 8
values = []
for min_age in range(max_age - interval_length + 1):
    max_age = min_age + interval_length
    awards = df.query("@min_age <= age <= @max_age").loc[:, "awards"].sum()
    values.append([min_age, max_age, awards])
df_out = pd.DataFrame(values, columns=["min_age", "max_age", "awards"])
Let me know if this is what you want :)
Let df be a DataFrame:
import pandas as pd
import random
def r(b, e):
    return [random.randint(b, e) for _ in range(300)]
df = pd.DataFrame({'id': r(1, 3), 'awards': r(0, 400), 'age': r(1, 99)})
For binning by age, I would advise creating a new column since it is clearer (and faster):
df['bin'] = df['age'] // 9
print(df)
The number of awards per id per bin can be obtained using simply:
totals_separate = df.groupby(['id', 'bin'])['awards'].sum()
print(totals_separate)
If I understand correctly, you would like the sum for each window of size 9 rows:
totals_rolling = df.groupby(['id', 'bin'])['awards'].rolling(9, min_periods=1).sum()
print(totals_rolling)
Reference: https://pandas.pydata.org/docs/reference/api/pandas.Series.rolling.html
I'm working with a data frame like this, but bigger and with more zones. I am trying to sum the values of the rows by their names. The total sum of the R or C zones goes in the total column, while the total sum of the M zones goes in total1.
Input:
total, total1 are the desired output.
ID Zone1 CHC1 Value1 Zone2 CHC2 Value2 Zone3 CHC3 Value3 total total1
1 R5B 100 10 C2 0 20 R10A 2 5 35 0
1 C2 95 20 M2-6 5 6 R5B 7 3 23 6
3 C2 40 4 C4 60 6 0 6 0 10 0
3 C1 100 8 0 0 0 0 100 0 8 0
5 M1-5 10 6 M2-6 86 15 0 0 0 0 21
You can use filter to create separate DataFrames for the Zones and the Values:
z = df.filter(like='Zone')
v = df.filter(like='Value')
Then create boolean DataFrames with str.contains via apply if you want to check substrings:
m1 = z.apply(lambda x: x.str.contains('R|C'))
m2 = z.apply(lambda x: x.str.contains('M'))
#for check strings
#m1 = z == 'R2'
#m2 = z.isin(['C1', 'C4'])
Last, filter v with where and sum per row:
df['t'] = v.where(m1.values).sum(axis=1).astype(int)
df['t1'] = v.where(m2.values).sum(axis=1).astype(int)
print (df)
ID Zone1 CHC1 Value1 Zone2 CHC2 Value2 Zone3 CHC3 Value3 t t1
0 1 R5B 100 10 C2 0 20 R10A 2 5 35 0
1 1 C2 95 20 M2-6 5 6 R5B 7 3 23 6
2 3 C2 40 4 C4 60 6 0 6 0 10 0
3 3 C1 100 8 0 0 0 0 100 0 8 0
4 5 M1-5 10 6 M2-6 86 15 0 0 0 0 21
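One caveat worth noting: str.contains treats 'R|C' as a regular expression and matches anywhere in the string. If zones should be classified strictly by their leading letter, anchoring the pattern is safer; a hedged variant of the masks above:
m1 = z.apply(lambda x: x.str.contains('^[RC]'))  # zone names starting with R or C
m2 = z.apply(lambda x: x.str.contains('^M'))     # zone names starting with M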
Solution1 (simpler code but slower and less flexible)
total = []
total1 = []
for i in range(df.shape[0]):
    temp = df.iloc[i].tolist()
    if "R2" in temp:
        total.append(temp[temp.index("R2")+1])
    else:
        total.append(0)
    if ("C1" in temp) and ("C4" in temp):
        total1.append(temp[temp.index("C1")+1] + temp[temp.index("C4")+1])
    else:
        total1.append(0)
df["Total"] = total
df["Total1"] = total1
Solution2 (faster than solution1 and easier to customize but possibly memory intensive)
# columns to use
cols = df.columns.tolist()
zones = [x for x in cols if x.startswith('Zone')]
vals = [x for x in cols if x.startswith('Value')]
# you can customize here
bucket1 = ['R2']
bucket2 = ['C1', 'C4']
thresh = 2 # "OR": 1, "AND": 2
original = df.copy()
# bucket1 check
for zone in zones:
    df.loc[~df[zone].isin(bucket1), cols[cols.index(zone)+1]] = 0
original['Total'] = df[vals].sum(axis=1)
df = original.copy()
# bucket2 check
for zone in zones:
    df.loc[~df[zone].isin(bucket2), cols[cols.index(zone)+1]] = 0
df['Check_Bucket'] = df[zones].stack().reset_index().groupby('level_0')[0].apply(list)
df['Check_Bucket'] = df['Check_Bucket'].apply(lambda x: len([y for y in x if y in bucket2]))
df['Total1'] = df[vals].sum(axis=1)
df.loc[df.Check_Bucket < thresh, 'Total1'] = 0
df.drop('Check_Bucket', axis=1, inplace=True)
When I expanded the original dataframe to 100k rows, solution 1 took 11.4 s ± 82.1 ms per loop, while solution 2 took 3.53 s ± 29.8 ms per loop. The difference is because solution 2 does not loop over the rows.
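For reference, a hedged sketch of how such a comparison could be reproduced, assuming the two solutions are wrapped in hypothetical solution1/solution2 functions that each take and modify a copy of the frame:
import pandas as pd
import timeit

big_df = pd.concat([df] * 20000, ignore_index=True)  # roughly 100k rows
t1 = timeit.timeit(lambda: solution1(big_df.copy()), number=3)  # solution1/solution2 are
t2 = timeit.timeit(lambda: solution2(big_df.copy()), number=3)  # hypothetical wrappers
print(f'solution 1: {t1 / 3:.2f} s per loop, solution 2: {t2 / 3:.2f} s per loop')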
I want to plot factor plots for my data. I tried doing it the way shown below, but the chart values didn't turn out as expected.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.DataFrame({'subset_product': ['A', 'A', 'A', 'B', 'B', 'C', 'C'],
                   'subset_close': [1, 1, 0, 1, 1, 1, 0]})
prod_counts = df.groupby('subset_product').size().rename('prod_counts')
df['prod_count'] = df['subset_product'].map(prod_counts)
g = sns.factorplot(y='prod_count',x='subset_product',hue='subset_close',data=df,kind='bar',palette='muted',legend=False,ci=None)
plt.legend(loc='best')
However, my plots all have the same height, meaning it didn't separate the data into '1' and '0'.
Example: For A, the blue bar should have height = 1, and the green bar should have height = 2.
The problem is your 'prod_count'.
print(df)
# subset_close subset_product prod_count
# 0 1 A 3
# 1 1 A 3
# 2 0 A 3
# 3 1 B 2
# 4 1 B 2
# 5 1 C 2
# 6 0 C 2
You are telling seaborn that y is 3 when subset_close == 1 & subset_product == A and y is also 3 when subset_close == 0 & subset_product == A.
Below should do what you want.
# Count the number of each (`subset_close`, `subset_product`) combination.
df2 = df.groupby(['subset_product', 'subset_close']).size().reset_index(name='prod_count')
# Plot
g = sns.factorplot(y='prod_count', x='subset_product', hue='subset_close', data=df2,
                   kind='bar', palette='muted', legend=False, ci=None)
plt.legend(loc='best')
plt.show()
print(df2)
# subset_product subset_close prod_count
# 0 A 0 1
# 1 A 1 2
# 2 B 1 2
# 3 C 0 1
# 4 C 1 1
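As a side note, seaborn 0.9 renamed factorplot to catplot (with the default kind changing from 'point' to 'strip'), so on a newer install the equivalent call would look roughly like this, assuming your version still accepts ci=None:
g = sns.catplot(y='prod_count', x='subset_product', hue='subset_close', data=df2,
                kind='bar', palette='muted', legend=False, ci=None)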
Is there a way to group boxplots in matplotlib WITHOUT the use of seaborn or some other library?
e.g. in the following, I want to have blocks along the x axis, and plot values grouped by condition (so there will be 16 boxes). Like what seaborn's hue argument accomplishes.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
blocks = 4
conditions = 4
ndatapoints = blocks * conditions
blockcol = np.repeat(list(range(1, conditions+1)), blocks)
concol = np.repeat(np.arange(1, conditions+1, 1), blocks)
trialcol = np.arange(1, ndatapoints+1, 1)
valcol = np.random.normal(0, 1, ndatapoints)
raw_data = {'blocks': np.repeat(list(range(1, conditions+1)), blocks),
            'condition': list(range(1, conditions+1))*blocks,
            'trial': np.arange(1, ndatapoints+1, 1),
            'value': np.random.normal(0, 1, ndatapoints)}
df = pd.DataFrame(raw_data)
df
blocks condition trial value
0 1 1 1 1.306146
1 1 2 2 -0.024201
2 1 3 3 -0.374561
3 1 4 4 -0.093366
4 2 1 5 -0.548427
5 2 2 6 -1.205077
6 2 3 7 0.617165
7 2 4 8 -0.239830
8 3 1 9 -0.876789
9 3 2 10 0.656436
10 3 3 11 -0.471325
11 3 4 12 -1.465787
12 4 1 13 -0.495308
13 4 2 14 -0.266914
14 4 3 15 -0.305884
15 4 4 16 0.546730
I can't seem to find any examples.
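For completeness, here is a minimal matplotlib-only sketch of what seaborn's hue accomplishes, using position offsets so each block gets one box per condition; it assumes the df built above and takes colors from a standard colormap:
from matplotlib.patches import Patch

fig, ax = plt.subplots()
cond_levels = sorted(df['condition'].unique())
block_levels = sorted(df['blocks'].unique())
width = 0.8 / len(cond_levels)  # each x position gets a group 0.8 wide
colors = plt.cm.tab10.colors

for i, cond in enumerate(cond_levels):
    # one box per block for this condition, shifted within its group
    data = [df.loc[(df['blocks'] == b) & (df['condition'] == cond), 'value']
            for b in block_levels]
    positions = [b + (i - (len(cond_levels) - 1) / 2) * width for b in block_levels]
    bp = ax.boxplot(data, positions=positions, widths=width * 0.9, patch_artist=True)
    for box in bp['boxes']:
        box.set_facecolor(colors[i % len(colors)])

ax.set_xticks(block_levels)
ax.set_xticklabels(block_levels)
ax.set_xlabel('blocks')
ax.set_ylabel('value')
ax.legend(handles=[Patch(facecolor=colors[i % len(colors)], label=f'condition {c}')
                   for i, c in enumerate(cond_levels)])
plt.show()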
I think you just want a factor plot:
import numpy
import pandas
import seaborn
blocks = 3
conditions = 4
trials = 12
ndatapoints = blocks * conditions * trials
blockcol = list(range(1, blocks + 1)) * (conditions * trials)
concol = list(range(1, conditions + 1)) * (blocks * trials)
trialcol = list(range(1, trials + 1)) * (blocks * conditions)
valcol = numpy.random.normal(0, 1, ndatapoints)
fg = pandas.DataFrame({
    'blocks': blockcol,
    'condition': concol,
    'trial': trialcol,
    'value': valcol
}).pipe(
    (seaborn.factorplot, 'data'),
    x='blocks', y='value', hue='condition',
    kind='box'
)