I have a dataframe where one of the columns has a dictionary in it
import pandas as pd
import numpy as np
def generate_dict():
    """Build a dict with two uniform-random floats keyed 'var1' and 'var2'."""
    keys = ('var1', 'var2')
    return {name: np.random.rand() for name in keys}
data = {}
data[0] = {}
data[1] = {}
data[0]['A'] = generate_dict()
data[1]['A'] = generate_dict()
df = pd.DataFrame.from_dict(data, orient='index')
I would like to unpack the key/value pairs in the dictionary into a new dataframe, where each entry has its own row. I can do that by iterating over the rows and appending to a new DataFrame:
def expand_row(row):
    """Expand the dict stored in ``row.A`` into a long-format frame.

    Returns a DataFrame with columns ``['row', 'value', 'column']``: one
    output row per key/value pair of the dict, with 'column' fixed to 'A'.
    """
    expanded = pd.DataFrame({'value': row.A})
    expanded.index.name = 'row'
    expanded = expanded.reset_index()
    expanded['column'] = 'A'
    return expanded
df_expanded = pd.DataFrame([])
for _, row in df.iterrows():
T = expand_row(row)
df_expanded = df_expanded.append(T, ignore_index=True)
This is rather slow, and my application is performance critical. I think this is possible with df.apply. However, as my function returns a DataFrame instead of a Series, simply doing
df_expanded = df.apply(expand_row)
doesn't quite work. What would be the most performant way to do this?
Thanks in advance.
You can use nested list comprehension and then replace column 0 with constant A (column name):
d = df.A.to_dict()
df1 = pd.DataFrame([(key,key1,val1) for key,val in d.items() for key1,val1 in val.items()])
df1[0] = 'A'
df1.columns = ['columns','row','value']
print (df1)
columns row value
0 A var1 0.013872
1 A var2 0.192230
2 A var1 0.176413
3 A var2 0.253600
Another solution:
df1 = pd.DataFrame.from_records(df.A.values.tolist()).stack().reset_index()
df1['level_0'] = 'A'
df1.columns = ['columns','row','value']
print (df1)
columns row value
0 A var1 0.332594
1 A var2 0.118967
2 A var1 0.374482
3 A var2 0.263910
Related
I have data formatted as dict[tuple[str, str], list[float]]
I want to convert it into a pandas DataFrame
Example data:
{('A','B'): [-0.008035100996494293,0.008541940711438656]}
I tried some data manipulations using split functions.
Expected output:
import pandas as pd

# Example input: keys are (title, heading) tuples, values are [num_col1, num_col2].
data = {('A','B'): [-0.008035100996494293,0.008541940711438656], ('C','D'): [-0.008035100996494293,0.008541940711438656]}
title = []
heading = []
num_col1 = []
num_col2 = []
for key, val in data.items():
    title.append(key[0])
    heading.append(key[1])
    num_col1.append(val[0])
    num_col2.append(val[1])
# BUG FIX: 'num_col2' previously pointed at num_col1, silently duplicating the
# first numeric column in the resulting frame.
data_ = {'title':title, 'heading':heading, 'num_col1':num_col1, 'num_col2':num_col2}
pd.DataFrame(data_)
Your best bet will be to construct your Index manually. For this we can use pandas.MultiIndex.from_tuples since your dictionary keys are stored as tuples. From there we just need to store the values of the dictionary into the body of a DataFrame.
import pandas as pd
data = {('A','B'): [-0.008035100996494293,0.008541940711438656]}
index = pd.MultiIndex.from_tuples(data.keys(), names=['title', 'heading'])
df = pd.DataFrame(data.values(), index=index).reset_index()
print(df)
title heading 0 1
0 A B -0.008035 0.008542
If you want chained operation, you can do:
import pandas as pd
data = {('A','B'): [-0.008035100996494293,0.008541940711438656]}
df = (
pd.DataFrame.from_dict(data, orient='index')
.pipe(lambda d:
d.set_axis(pd.MultiIndex.from_tuples(d.index, names=['title', 'heading']))
)
.reset_index()
)
print(df)
title heading 0 1
0 A B -0.008035 0.008542
Another possible solution, which works also if the tuples and lists vary in length:
pd.concat([pd.DataFrame.from_records([x for x in d.keys()],
columns=['title', 'h1', 'h2']),
pd.DataFrame.from_records([x[1] for x in d.items()])], axis=1)
Output:
title h1 h2 0 1 2
0 A B None -0.008035 0.008542 NaN
1 C B D -0.010351 1.008542 5.0
Data input:
d = {('A','B'): [-0.008035100996494293,0.008541940711438656],
('C','B', 'D'): [-0.01035100996494293,1.008541940711438656, 5]}
You can expand the keys and values as you iterate the dictionary items. Pandas will see 4 values which it will make into a row.
>>> import pandas as pd
>>> data = {('A','B'): [-0.008035100996494293,0.008541940711438656]}
>>> pd.DataFrame(((*d[0], *d[1]) for d in data.items()), columns=("Title", "Heading", "Foo", "Bar"))
Title Heading Foo Bar
0 A B -0.008035 0.008542
I have 5 different data frames that are the output of different conditions or tables.
I want an output indicating whether these data frames are empty or not. Basically I will check len(df) for each data frame and produce a string for each one that has anything in it.
def(df1,df2,df3,df4,df5)
if len(df1) > 0:
"df1 not empty"
else: ""
if len(df2) > 0:
"df2 not empty"
else: ""
Then I want to append these strings to each other, ending up with a string like
**df1 not empty, df3 not empty**
try this :
import pandas as pd
dfs = {'milk': pd.DataFrame(['a']), 'bread': pd.DataFrame(['b']), 'potato': pd.DataFrame()}
print(''.join(
[f'{name} not empty. ' for name, df in dfs.items() if (not df.empty)])
)
output:
milk not empty. bread not empty.
data = [1, 2, 3]
df = pd.DataFrame(data, columns=['col1'])  # non-empty frame
data1 = []
# BUG FIX: the empty frame must be built from data1, not data.
df1 = pd.DataFrame(data1)  # create empty df
dfs = [df, df1]  # list them
# The "for loop" is replaced here by a list comprehension; enumerate() gives
# each frame an index so we can print its name ("df0", "df1", ...) instead of
# the entire dataframe contents.
print(' '.join([f'df{i} is not empty.' for i, df in enumerate(dfs) if not df.empty]))
Result:
df0 is not empty.
With a one-liner:
dfs = [df1,df2,df3,df4,df5]
output = ["your string here" for df in dfs if not df.empty]
You can then concatenate strings together, if you want:
final_string = "; ".join(output)
def Transformation_To_UpdateNex(df):
    """Reshape *df* into the fixed UpdateNex column layout and write Update.csv.

    Columns of *df* are renamed to the target names, then reindexed onto the
    full target layout (missing columns become empty); the result is written
    as semicolon-separated ISO-8859-1 CSV.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame; may contain TERMID/NAMECHANGE/TYP columns to rename.
    """
    s = 'TERM-ID,NAME,QUALIFIER,FACET1_ID,FACET2_ID,FACET3_ID,FACET4_ID,GROUP1_ID,GROUP2_ID,GROUP3_ID,GROUP4_ID,IS_VALID,IS_SELLABLE,IS_PRIMARY,IS_BRANCHABLE,HAS_RULES,FOR_SUGGESTION,IS_SAVED,S_NEG,SCORE,GOOGLE_SV,CPC,SINGULARTEXT,SING_PLU_VORGABE'
    df_Import = pd.DataFrame(columns=s.split(','))
    d = {'TERMID': 'TERM-ID', 'NAMECHANGE': 'NAME', 'TYP': 'QUALIFIER'}
    # Renaming e.g. NAMECHANGE -> NAME while a NAME column already exists would
    # create duplicate labels and make reindex raise
    # "ValueError: cannot reindex from a duplicate axis"; skip such renames.
    safe = {k: v for k, v in d.items() if v not in df.columns}
    df_Import = df.rename(columns=safe).reindex(columns=df_Import.columns)
    df_Import.to_csv("Update.csv", sep=";", index=False, encoding="ISO-8859-1")
ValueError: cannot reindex from a duplicate axis
I am trying to take values from a filled Dataframe and transfer these values keeping the same structure to my new Dataframe (empty one described first in the code).
Any ideas how to solve the value error?
So error:
ValueError: cannot reindex from a duplicate axis
means there are duplicated column names.
I think problem is with rename, because it create duplicated columns:
s = 'TERM-ID,NAME,QUALIFIER,FACET1_ID,NAMECHANGE,TYP'
df = pd.DataFrame(columns = s.split(','))
print (df)
Empty DataFrame
Columns: [TERM-ID, NAME, QUALIFIER, FACET1_ID, NAMECHANGE, TYP]
Index: []
Here after rename get duplicated NAME and QUALIFIER columns, because original columns are NAME and NAMECHANGE and also QUALIFIER and TYP pairs:
d = {'TERMID':'TERM-ID', 'NAMECHANGE':'NAME', 'TYP':'QUALIFIER'}
df1 = df.rename(columns = d)
print (df1)
Empty DataFrame
Columns: [TERM-ID, NAME, QUALIFIER, FACET1_ID, NAME, QUALIFIER]
Index: []
Possible solution is test, if exist column and filter dictionary:
d = {'TERMID':'TERM-ID', 'NAMECHANGE':'NAME', 'TYP':'QUALIFIER'}
d1 = {k: v for k, v in d.items() if v not in df.columns}
print (d1)
{}
df1 = df.rename(columns = d1)
print (df1)
Empty DataFrame
Columns: [TERM-ID, NAME, QUALIFIER, FACET1_ID, NAMECHANGE, TYP]
Index: []
I have a list of DataFrames that look like this,
dfs
[
var1 var1
14.171250 13.593813
13.578317 13.595329
10.301850 13.580139
9.930217 13.593278
6.192517 13.561943
7.738100 13.565149
6.197983 13.572509,
var1 var2
2.456183 5.907528
5.052017 5.955731
5.960000 5.972480
8.039317 5.984608
7.559217 5.985348
6.933633 5.979438
...
]
I want to rename var1 and var2 in each DataFrame to be Foo and Hoo.
I tried the following,
renames_dfs = []
for df in dfs:
renames_dfs.append(df.rename(columns={'var1':'Foo','var2':'Hoo'},inplace = True))
This returns a list of None values. What mistake am I making here when I rename the column names?
You can do it like this.
[df.rename(columns={'var1':'Foo','var2':'Hoo'},inplace=True) for df in dfs]
Output:
[None,None]
BUT....
dfs
Output:
[ Foo Hoo
0 0.463904 0.765987
1 0.473314 0.609793
2 0.505549 0.449539
3 0.508157 0.444993
4 0.604366 0.368044, Foo Hoo
0 0.241526 0.225990
1 0.609949 0.454891
2 0.523094 0.443431
3 0.525026 0.714601
4 0.002260 0.763454]
Your existing code collects None values because rename(..., inplace=True) mutates each DataFrame in place and returns None.
One efficient solution is to just assign to df.columns directly:
for df in dfs:
df.columns = ['foo', 'bar']
Will update all dataframes in the same list without having to create a new list.
Another option is using set_axis, if you are renaming all the columns:
df2 = [df.columns.set_axis(['foo', 'bar'], axis=1, inplace=False) for df in dfs]
If renaming only a subset, use rename instead.
With inplace=True, the method always returns None.
So you can use:
renames_dfs = []
for df in dfs:
df.rename(columns={'var1':'Foo','var2':'Hoo'},inplace = True)
renames_dfs.append(df)
But I think better:
renames_dfs = []
for df in dfs:
renames_dfs.append(df.rename(columns={'var1':'Foo','var2':'Hoo'}))
I created this dataframe:
import pandas as pd
columns = pd.MultiIndex.from_tuples([("x", "", ""), ("values", "a", "a.b"), ("values", "c", "")])
df0 = pd.DataFrame([(0,10,20),(1,100,200)], columns=columns)
df0
I unload df0 to excel:
df0.to_excel("test.xlsx")
and load it again:
df1 = pd.read_excel("test.xlsx", header=[0,1,2])
df1
And I have Unnamed :... column names.
To make df1 look like inital df0 I run:
def rename_unnamed(df, label=""):
    """Replace auto-generated ``Unnamed: ...`` labels in *df*'s MultiIndex
    columns with *label*, mutating *df* in place.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with MultiIndex columns, e.g. loaded via
        ``read_excel(..., header=[0, 1, 2])``.
    label : str, optional
        Replacement text for the placeholder names (default ``""``; the
        parameter was previously accepted but ignored).

    Returns
    -------
    pd.DataFrame
        The same frame, returned for chaining.
    """
    for level_no, level in enumerate(df.columns.levels):
        # set_levels(..., inplace=True) was removed in pandas 2.0; rebuild the
        # level values and assign the new columns index back instead.
        cleaned = [label if "Unnamed: " in name else name for name in level]
        df.columns = df.columns.set_levels(cleaned, level=level_no)
    return df
rename_unnamed(df1)
Well done. But is there any pandas way from box to do this?
Since pandas 0.21.0 the code should be like this
def rename_unnamed(df):
    """Rename unnamed column labels of a Pandas DataFrame.

    Replaces auto-generated ``Unnamed: ...`` entries in the MultiIndex
    columns with empty strings, using ``rename(..., level=i)`` which works on
    every pandas release since 0.21.0 — no version sniffing needed. (The old
    lexicographic ``pd.__version__ < "0.21.0"`` check was unreliable anyway,
    e.g. ``"10.0.0" < "9.0.0"`` is True as strings, and the
    ``set_levels(..., inplace=True)`` fallback it guarded was removed in
    pandas 2.0.)

    See https://stackoverflow.com/questions/41221079/rename-multiindex-columns-in-pandas

    Parameters
    ----------
    df : pd.DataFrame object
        Input dataframe.

    Returns
    -------
    pd.DataFrame
        Output dataframe with placeholder labels blanked; the input frame is
        left untouched.
    """
    for level_no, level in enumerate(df.columns.levels):
        # Map each placeholder label on this level to the empty string.
        mapping = {name: "" for name in level if "Unnamed: " in name}
        if mapping:
            df = df.rename(columns=mapping, level=level_no)
    return df
Mixing answers from #jezrael and #dinya, and limited for pandas above 0.21.0 (after 2017) an option to solve this will be:
for i, columns_old in enumerate(df.columns.levels):
columns_new = np.where(columns_old.str.contains('Unnamed'), '-', columns_old)
df.rename(columns=dict(zip(columns_old, columns_new)), level=i, inplace=True)
You can use numpy.where with condition by contains:
# set_levels(..., inplace=True) was removed in pandas 2.0; build the cleaned
# level values with numpy.where and assign the rebuilt index back instead.
for i, col in enumerate(df1.columns.levels):
    columns = np.where(col.str.contains('Unnamed'), '', col)
    df1.columns = df1.columns.set_levels(columns, level=i)
print (df1)
x values
a c
a.b
0 0 10 20
1 1 100 200