I have the following DF:
import pandas as pd
df = pd.DataFrame({
    "a": ["a1", "a1", "a1", "a2", "a2", "a2", "a3", "a3", "a3"],
    "b": ["b1", "b2", "b3", "b1", "b2", "b3", "b1", "b2", "b3"],
    "c": [10, 20, 30, 40, 50, 60, 80, 90, 100],
    "d": [100, 200, 300, 400, 500, 600, 1000, 2000, 3000]
})
df
    a   b    c     d
0  a1  b1   10   100
1  a1  b2   20   200
2  a1  b3   30   300
3  a2  b1   40   400
4  a2  b2   50   500
5  a2  b3   60   600
6  a3  b1   80  1000
7  a3  b2   90  2000
8  a3  b3  100  3000
I want to create 3 figures, one for each value of column a: a1, a2 and a3. On the x-axis, each graph has the same categories b1, b2 and b3. I want to use a loop instead of the code below, which is repetitive because the x-axis is the same for all figures (b1, b2, b3) and the trace names are the same (ccc and ddd):
from plotly import graph_objects as go
df1 = df.query("a=='a1'")
fig1 = go.Figure(data=[
    go.Bar(name="ccc", x=df1.b, y=df1.c),
    go.Bar(name="ddd", x=df1.b, y=df1.d)
])
fig1.update_layout(barmode='group', title="fig1 for a1")

df2 = df.query("a=='a2'")
fig2 = go.Figure(data=[
    go.Bar(name="ccc", x=df2.b, y=df2.c),
    go.Bar(name="ddd", x=df2.b, y=df2.d)
])
fig2.update_layout(barmode='group', title="fig2 for a2")

df3 = df.query("a=='a3'")
fig3 = go.Figure(data=[
    go.Bar(name="ccc", x=df3.b, y=df3.c),
    go.Bar(name="ddd", x=df3.b, y=df3.d)
])
fig3.update_layout(barmode='group', title="fig3 for a3")
How do I create that loop?
Approach
There are a number of ways to do this. The real challenge is how to reference the figures later. Here's one way to do it:
Slice your dataframe using frames = df['a'].unique(),
loop through the subsets using for i, f in enumerate(frames):,
build each figure as usual with fig = go.Figure(), and
add each new figure to a dictionary using figs['fig' + str(i)] = fig.
Now you can reference and show, for example, figure 1 using:
figs['fig1'].show()
Complete code
import pandas as pd
from plotly import graph_objects as go
df = pd.DataFrame({
    "a": ["a1", "a1", "a1", "a2", "a2", "a2", "a3", "a3", "a3"],
    "b": ["b1", "b2", "b3", "b1", "b2", "b3", "b1", "b2", "b3"],
    "c": [10, 20, 30, 40, 50, 60, 80, 90, 100],
    "d": [100, 200, 300, 400, 500, 600, 1000, 2000, 3000]
})
frames = df['a'].unique()  # values of column a, used to slice the dataframe
figs = {}                  # container for the figures

for i, f in enumerate(frames, start=1):
    di = df[df['a'] == f]
    fig = go.Figure()
    fig.add_bar(name="ccc", x=di.b, y=di.c)
    fig.add_bar(name="ddd", x=di.b, y=di.d)
    fig.update_layout(barmode='group', title=f"fig{i} for {f}")
    figs['fig' + str(i)] = fig

figs['fig1'].show()
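If you'd rather display every figure instead of picking one by key, you can loop over the dictionary:
# show all figures in insertion order
for name, fig in figs.items():
    fig.show()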
Related
Say I have two dataframes with MultiIndexes, where one index is deeper than the other. I want to select only those rows of the deeper dataframe whose partial index appears in the other dataframe.
Example input:
import pandas

df = pandas.DataFrame(
    {
        "A": ["a1", "a1", "a1", "a2", "a2", "a2"],
        "B": ["b1", "b1", "b2", "b1", "b2", "b2"],
        "C": ["c1", "c2", "c1", "c1", "c1", "c2"],
        "V": [1, 2, 3, 4, 5, 6],
    }
).set_index(["A", "B", "C"])

df2 = pandas.DataFrame(
    {
        "A": ["a1", "a1", "a2", "a2"],
        "B": ["b1", "b3", "b1", "b3"],
        "X": [1, 2, 3, 4],
    }
).set_index(["A", "B"])
Visual:
          V
A  B  C
a1 b1 c1  1
      c2  2
   b2 c1  3
a2 b1 c1  4
   b2 c1  5
      c2  6
       X
A  B
a1 b1  1
   b3  2
a2 b1  3
   b3  4
Desired output:
result = pandas.DataFrame(
    {
        "A": ["a1", "a1", "a2"],
        "B": ["b1", "b1", "b1"],
        "C": ["c1", "c2", "c1"],
        "V": [1, 2, 4],
    }
).set_index(["A", "B", "C"])
Visual:
          V
A  B  C
a1 b1 c1  1
      c2  2
a2 b1 c1  4
I tried
df.loc[df2.index] and df.loc[df.index.intersection(df2.index)] but that does not work.
I guess I could do df.join(df2, how="inner") and afterwards drop all the columns that came from df2, but that is cumbersome. Or is there a way to drop all the columns of df2?
I would appreciate any help.
One option is to use isin on the specific labels common to both, and use the resulting boolean to filter df:
df.loc[df.index.droplevel('C').isin(df2.index)]
          V
A  B  C
a1 b1 c1  1
      c2  2
a2 b1 c1  4
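To see why this works: droplevel('C') reduces df's three-level index to an (A, B) MultiIndex that can be matched element-wise against df2's index, and isin produces the boolean row filter:
df.index.droplevel('C')
# MultiIndex([('a1', 'b1'), ('a1', 'b1'), ('a1', 'b2'),
#             ('a2', 'b1'), ('a2', 'b2'), ('a2', 'b2')],
#            names=['A', 'B'])
df.index.droplevel('C').isin(df2.index)
# array([ True,  True, False,  True, False, False])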
I would like to concatenate two incomplete data frames that (in theory) hold the same data, aligned on a shared index.
I tried pd.concat but I didn't manage to get what I need.
Here is a simple example of what I would like to do :
import numpy as np
import pandas as pd

df1 = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B4"],
        "C": ["C0", "C1", "C2", "B5"],
        "D": [np.nan, np.nan, np.nan, np.nan],
    },
    index=[0, 1, 2, 3],
)

df2 = pd.DataFrame(
    {
        "A": ["A0", "A1", "A5", "A6"],
        "B": ["B0", "B1", "B5", "B6"],
        "C": [np.nan, np.nan, np.nan, np.nan],
        "D": ["D0", "D1", "D5", "D6"],
    },
    index=[0, 1, 5, 6],
)

res_expected = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3", "A5", "A6"],
        "B": ["B0", "B1", "B2", "B4", "B5", "B6"],
        "C": ["C0", "C1", "C2", "B5", np.nan, np.nan],
        "D": ["D0", "D1", np.nan, np.nan, "D5", "D6"],
    },
    index=[0, 1, 2, 3, 5, 6],
)
Does someone have an idea?
Thanks!
You can use combine_first(), as follows:
df_result = df1.combine_first(df2)
combine_first() works as follows:
Combine two DataFrame objects by filling null values in one DataFrame
with non-null values from other DataFrame. The row and column indexes
of the resulting DataFrame will be the union of the two.
Result:
print(df_result)
    A   B    C    D
0  A0  B0   C0   D0
1  A1  B1   C1   D1
2  A2  B2   C2  NaN
3  A3  B4   B5  NaN
5  A5  B5  NaN   D5
6  A6  B6  NaN   D6
res_expected = df1.append(df2, ignore_index=True)
This should work, though note that DataFrame.append is deprecated (and removed in pandas 2.0); pd.concat([df1, df2], ignore_index=True) is the equivalent. Also note that this stacks the rows rather than merging the overlapping index labels 0 and 1.
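If you do want the merged result via concat, one option (a sketch, assuming the two frames agree wherever they overlap) is to stack them and then collapse duplicate index labels, keeping the first non-null value per column:
import pandas as pd

# rows with the same index label (here 0 and 1) collapse into one,
# each column taking its first non-null value
res = pd.concat([df1, df2]).groupby(level=0).first()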
I have two dataframes:
left = pd.DataFrame(
    {
        "Col": ["D", "C", "B", "A"],
    },
    index=[0, 1, 2, 3],
)

right = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    },
    index=[0, 1, 2, 3],
)
Is it possible to merge them based on the indices and Col of the left frame and the column names of the right frame?
I need to get the following result:
result = pd.DataFrame(
    {
        "Col": ["D", "C", "B", "A"],
        "Val": ["D0", "C1", "B2", "A3"],
    },
)
Try with:
import numpy as np

left['new'] = right.values[np.arange(len(left)), right.columns.get_indexer(left.Col)]
left
Out[129]:
  Col new
0   D  D0
1   C  C1
2   B  B2
3   A  A3
Note: pandas used to have DataFrame.lookup, but it is deprecated; the code above is a NumPy-based alternative to lookup. The reason I don't use the index here is that NumPy has no notion of an index, so we need positions to pick out the correct values. Most of the time the index is the same as the position, but the two can differ.
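To make the positional indexing explicit: get_indexer returns the column position of each label in left.Col, and np.arange supplies the matching row positions:
import numpy as np

right.columns.get_indexer(left.Col)  # array([3, 2, 1, 0])
np.arange(len(left))                 # array([0, 1, 2, 3])
# right.values[[0, 1, 2, 3], [3, 2, 1, 0]] picks 'D0', 'C1', 'B2', 'A3'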
Another solution, applying a lookup row by row (x.name is the row's index label, so left.loc[x.name, "Col"] gives the column of right to read):
left["new"] = right.apply(lambda x: x[left.loc[x.name, "Col"]], axis=1)
print(left)
Prints:
  Col new
0   D  D0
1   C  C1
2   B  B2
3   A  A3
Alternative approach (convert columns to index with melt and then merge):
left['id'] = left.index
m = right.melt(ignore_index=False, var_name="Col", value_name="Val")
m['id'] = m.index
result = pd.merge(left, m, on=["id", "Col"])[["Col", "Val"]]
It is faster than the apply-based solution but slower than the accepted answer.
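For reference, the first rows of the melted frame m look like this (the original index is kept because ignore_index=False, then copied into the id column):
print(m.head(4))
#   Col Val  id
# 0   A  A0   0
# 1   A  A1   1
# 2   A  A2   2
# 3   A  A3   3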
I have two dataframes:
left = pd.DataFrame(
    {
        "Col": ["D", "C", "B", "A"],
    },
)

right = pd.DataFrame(
    {
        "A": ["A0"],
        "B": ["B0"],
        "C": ["C0"],
        "D": ["D0"],
    },
)
Is it possible to merge them based on Col of the left frame and the column names of the right frame?
I need to get the following result:
result = pd.DataFrame(
    {
        "Col": ["D", "C", "B", "A"],
        "Val": ["D0", "C0", "B0", "A0"],
    },
)
You can do it with a pretty straightforward .map (right.T[0] is a Series whose index is the column names A–D and whose values come from right's single row):
In [319]: left['Val'] = left['Col'].map(right.T[0])
In [320]: left
Out[320]:
  Col Val
0   D  D0
1   C  C0
2   B  B0
3   A  A0
Try join with the transposition (T or transpose()):
import pandas as pd

left = pd.DataFrame({
    "Col": ["D", "C", "B", "A"],
})
right = pd.DataFrame({
    "A": ["A0"],
    "B": ["B0"],
    "C": ["C0"],
    "D": ["D0"],
})

# right.T makes the column names the index; join matches them against Col.
# The single transposed column is named 0, so rename it to Val.
new_df = left.join(right.T, on='Col').rename(columns={0: 'Val'})
print(new_df)
new_df:
  Col Val
0   D  D0
1   C  C0
2   B  B0
3   A  A0
index  print_type_solid  print_type_floral  cluster
A      10                10                 2
B      20                20                 2
A      10                10                 3
B      20                20                 3
C      25                30                 3
Can someone help me convert the above dataframe into the following nested dictionary, where cluster becomes the main key, print_type_x becomes the second-level key, and the values are as shown in the expected output below?
{
    "2": {
        "print_type_solid": {
            "A": 10,
            "B": 20
        },
        "print_type_floral": {
            "A": 10,
            "B": 20
        }
    },
    "3": {
        "print_type_solid": {
            "A": 10,
            "B": 20,
            "C": 25,
        },
        "print_type_floral": {
            "A": 10,
            "B": 20,
            "C": 30,
        }
    }
}
I tried this:
from collections import defaultdict

d = defaultdict()
d2 = {}
for k1, s in dct.items():
    for k2, v in s.items():
        for k3, r in v.items():
            d.setdefault(k3, {})[k2] = r
    d2[k1] = d
But I'm getting this:
{
    "2": {
        "print_type_solid": {
            "A": 10,
            "B": 20,
            "C": 25
        },
        "print_type_floral": {
            "A": 10,
            "B": 20,
            "C": 30
        }
    },
    "3": {
        "print_type_solid": {
            "A": 10,
            "B": 20,
            "C": 25,
        },
        "print_type_floral": {
            "A": 10,
            "B": 20,
            "C": 30,
        }
    }
}
And this is wrong, because C also shows up in the dictionary for cluster 2.
The problem in your attempt is that d is created once, outside the outer loop, so every cluster key in d2 ends up pointing to the same shared dictionary. You can use df.iterrows() to iterate your dataframe row-wise and build the dictionary like this:
import pandas as pd

df = pd.DataFrame({"index": list("ABABC"),
                   "print_type_solid": [10, 20, 10, 20, 25],
                   "print_type_floral": [10, 20, 10, 20, 30],
                   "cluster": [2, 2, 3, 3, 3]})
print(df)

d = {}
pts = "print_type_solid"
ptf = "print_type_floral"

for idx, row in df.iterrows():
    key = d.setdefault(row["cluster"], {})
    key_pts = key.setdefault(pts, {})
    key_pts[row["index"]] = row[pts]
    key_ptf = key.setdefault(ptf, {})
    key_ptf[row["index"]] = row[ptf]

from pprint import pprint
pprint(d)
Output:
# df
  index  print_type_solid  print_type_floral  cluster
0     A                10                 10        2
1     B                20                 20        2
2     A                10                 10        3
3     B                20                 20        3
4     C                25                 30        3

# dict
{2: {'print_type_floral': {'A': 10, 'B': 20},
     'print_type_solid': {'A': 10, 'B': 20}},
 3: {'print_type_floral': {'A': 10, 'B': 20, 'C': 30},
     'print_type_solid': {'A': 10, 'B': 20, 'C': 25}}}
You could also use collections.defaultdict, but for this few data points it is not needed.
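For completeness, a minimal sketch of the defaultdict variant (same logic as above, without the setdefault calls):
from collections import defaultdict

# each new cluster key gets a fresh pair of inner dicts
d = defaultdict(lambda: {pts: {}, ptf: {}})
for _, row in df.iterrows():
    d[row["cluster"]][pts][row["index"]] = row[pts]
    d[row["cluster"]][ptf][row["index"]] = row[ptf]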