I have a DataFrame that contains a JSON column; it is quite huge and not very efficient, so I would like to store it as a nested DataFrame.
A sample DataFrame looks like:
id date ag marks
0 I2213 2022-01-01 13:28:05.448054 [{'type': 'A', 'values': {'X': {'F1': 0.1, 'F2': 0.2}, 'U': {'F1': 0.3, 'F2': 0.4}}}, {'type': 'B', 'results': {'Y': {'F1': 0.3, 'F2': 0.2}}}] [{'type': 'A', 'marks': {'X': 0.5, 'U': 0.7}}, {'type': 'B', 'marks': {'Y': 0.4}}]
1 I2213 2022-01-01 14:28:05.448054 [{'type': 'B', 'values': {'Z': {'F1': 0.4, 'F2': 0.2}}}] [{'type': 'A', 'marks': {'X': 0.4, 'U': 0.6}}, {'type': 'B', 'marks': {'Y': 0.3, 'Z': 0.4}}]
2 I2213 2022-01-03 15:28:05.448054 [{'type': 'A', 'values': {'X': {'F1': 0.2, 'F2': 0.1}}}] [{'type': 'A', 'marks': {'X': 0.2, 'U': 0.9}}, {'type': 'B', 'marks': {'Y': 0.2}}]
Expected output: the same data, but with ag and marks as nested frames, grouped by date. Sample code for generating the sample DataFrame:
import pandas as pd
from datetime import datetime, timedelta

def sample_data():
    ag_data = [
        "[{'type': 'A', 'values': {'X': {'F1': 0.1, 'F2': 0.2}, 'U': {'F1': 0.3, 'F2': 0.4}}}, {'type': 'B', 'results': {'Y': {'F1': 0.3, 'F2': 0.2}}}]",
        "[{'type': 'B', 'values': {'Z': {'F1': 0.4, 'F2': 0.2}}}]",
        "[{'type': 'A', 'values': {'X': {'F1': 0.2, 'F2': 0.1}}}]",
    ]
    marks_data = [
        "[{'type': 'A', 'marks': {'X': 0.5, 'U': 0.7}}, {'type': 'B', 'marks': {'Y': 0.4}}]",
        "[{'type': 'A', 'marks': {'X': 0.4, 'U': 0.6}}, {'type': 'B', 'marks': {'Y': 0.3, 'Z': 0.4}}]",
        "[{'type': 'A', 'marks': {'X': 0.2, 'U': 0.9}}, {'type': 'B', 'marks': {'Y': 0.2}}]",
    ]
    date_data = [
        datetime.now() - timedelta(3, seconds=7200),
        datetime.now() - timedelta(3, seconds=3600),
        datetime.now() - timedelta(1),
    ]
    df = pd.DataFrame()
    df['date'] = date_data
    df['ag'] = ag_data
    df['marks'] = marks_data
    df['id'] = 'I2213'
    return df
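For reference, a quick sanity check of what sample_data() returns (the comment is an assumption based on the string literals above):

df = sample_data()
print(df.dtypes)  # 'ag' and 'marks' are object columns holding JSON-like strings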
I tried JSON normalization, but it creates the DataFrame in columnar fashion:

d = df['ag'].apply(lambda x: pd.json_normalize(json.loads(x.replace("'", '"'))))

This gives a DataFrame with columns type, values.X.F1, values.X.F2, values.U.F1, values.U.F2, results.Y.F1, results.Y.F2. The issue is how to put the dict keys like X, Y, F1, F2 into rows instead of columns.
Is it possible to achieve the desired nested format?
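A minimal sketch of one way to get there (the function name and the index-level names here are illustrative): split the dotted column names into a MultiIndex and stack every level into rows.

import json
import pandas as pd

def ag_to_nested(cell):
    # normalize one cell's JSON-like string into a flat frame
    d = pd.json_normalize(json.loads(cell.replace("'", '"'))).set_index('type')
    # 'values.X.F1' -> ('values', 'X', 'F1') as column levels
    d.columns = d.columns.str.split('.', expand=True)
    # move all three column levels into the row index
    return d.stack([0, 1, 2]).rename_axis(['type', 'source', 'name', 'feature'])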
I tried this by creating helper functions.
import json
import pandas as pd

def ag_col_helper(ag_cell):
    s = pd.json_normalize(json.loads(ag_cell.replace("'", '"')))
    s.set_index('type', inplace=True)
    s1 = s.melt(ignore_index=False, var_name='feature')
    # 'values.X.F1' -> ['values', 'X', 'F1']
    split_vals = s1['feature'].str.split('.', n=2, expand=True)
    s1['name'] = split_vals[1]
    s1['feature'] = split_vals[2]
    return s1.groupby(['type', 'name', 'feature']).first().dropna()

def marks_col_helper(marks_cell):
    s = pd.json_normalize(json.loads(marks_cell.replace("'", '"')))
    s.set_index('type', inplace=True)
    s1 = s.melt(ignore_index=False, var_name='name', value_name='marks')
    # 'marks.X' -> ['marks', 'X']
    split_vals = s1['name'].str.split('.', n=2, expand=True)
    s1['name'] = split_vals[1]
    return s1.groupby(['type', 'name']).first().dropna()
These can then be applied to the ag and marks columns:

df['ag'] = df['ag'].apply(ag_col_helper)
df['marks'] = df['marks'].apply(marks_col_helper)
Then we would get:
df.iloc[0]['ag']
value
type name feature
A U F1 0.3
F2 0.4
X F1 0.1
F2 0.2
B Y F1 0.3
F2 0.2
df.iloc[0]['marks']
marks
type name
A U 0.7
X 0.5
B Y 0.4
I think this is what you are expecting.
For grouping by date, you can create another column using df['Date'] = df['date'].dt.date and perform a groupby.
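A minimal sketch of that grouping step (collecting the nested frames into per-day lists is just one choice of aggregation):

df['Date'] = df['date'].dt.date
nested_by_day = df.groupby('Date')[['ag', 'marks']].agg(list)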
It appears that you can set DataFrames as values within a DataFrame. This:
import pandas as pd
#creating outer df
df = pd.DataFrame([{'a':1, 'b':2, 'inner':None},{'a':3, 'b':4, 'inner':None}])
#creating inner dfs
inner_1 = pd.DataFrame([{'time': 0, 'e': 1}, {'time': 1, 'e': 2}])
inner_2 = pd.DataFrame([{'time': 0, 'e': 6}, {'time': 1, 'e': 7}])
inners = [inner_1, inner_2]
df['inner'] = inners
print(df)
results in this:
a b inner
0 1 2 time e
0 0 1
1 1 2
1 3 4 time e
0 0 6
1 1 7
The printout quickly gets confusing, but it seems to be what you want.
For your data specifically, take your lists of dicts and convert them to DataFrames with pd.DataFrame. If you want to turn all your lists into DataFrames, you can use something like this:
import pandas as pd

# creating outer df
df = pd.DataFrame([{'a': 1, 'b': 2, 'inner': None}, {'a': 3, 'b': 4, 'inner': None}])

# creating inner dfs
inner_1 = [{'time': 0, 'e': 1}, {'time': 1, 'e': 2}]
inner_2 = [{'time': 0, 'e': 6}, {'time': 1, 'e': 7}]
inners = [inner_1, inner_2]
df['inner'] = inners

print('un-transformed')
print(df)

# transforming all lists into DataFrames
for i in range(df.shape[0]):                 # iterate over rows
    for j in range(df.shape[1]):             # iterate over columns
        if type(df.iat[i, j]) == list:       # filter cells that are lists
            df.iat[i, j] = pd.DataFrame(df.iat[i, j])  # convert to df

print("transformed")
print(df)
which returns
un-transformed
a b inner
0 1 2 [{'time': 0, 'e': 1}, {'time': 1, 'e': 2}]
1 3 4 [{'time': 0, 'e': 6}, {'time': 1, 'e': 7}]
transformed
a b inner
0 1 2 time e
0 0 1
1 1 2
1 3 4 time e
0 0 6
1 1 7
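Each cell then holds a real DataFrame, so it can be pulled back out and used directly; a small usage sketch:

inner = df.at[0, 'inner']
print(type(inner))        # <class 'pandas.core.frame.DataFrame'>
print(inner.loc[1, 'e'])  # 2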
I have a pandas dataframe that can be represented like:
test_dict = {('a', 1) : {'shared':0,'x':1, 'y':2, 'z':3},
('a', 2) : {'shared':1,'x':2, 'y':4, 'z':6},
('b', 1) : {'shared':0,'x':10, 'y':20, 'z':30},
('b', 2) : {'shared':1,'x':100, 'y':200, 'z':300}}
example = pd.DataFrame.from_dict(test_dict).T
I am trying to figure out a way to turn this into a dataframe that looks like this dictionary representation:
res_dict = {1 : {'shared':0,'a':{'x':1, 'y':2, 'z':3}, 'b':{'x':10, 'y':20, 'z':30}},
2 : {'shared':1,'a':{'x':2, 'y':4, 'z':6},'b':{'x':100, 'y':200, 'z':300}}}
Any suggestions appreciated!
Thanks
A possible solution, which uses only DataFrame manipulations and then converts to a dictionary (note that it needs numpy and pandas imported):

import numpy as np
import pandas as pd

xyz = ['x', 'y', 'z']
out = (example.assign(xyz=example[xyz].apply(list, axis=1)).reset_index()
       .pivot(index='level_0', columns=['level_1', 'shared'], values='xyz')
       .applymap(lambda x: dict(zip(xyz, x))))
out.columns = out.columns.rename(None, level=0)
out.index = out.index.rename(None)

(pd.concat([out.droplevel(1, axis=1),
            out.columns.to_frame().reset_index(drop=True).iloc[:, 1]
            .to_frame().T.set_axis(out.columns.get_level_values(0), axis=1)])
 .iloc[np.arange(-1, len(out))].to_dict())
Output:
{
1: {
'shared': 0,
'a': {'x': 1, 'y': 2, 'z': 3},
'b': {'x': 10, 'y': 20, 'z': 30}
},
2: {
'shared': 1,
'a': {'x': 2, 'y': 4, 'z': 6},
'b': {'x': 100, 'y': 200, 'z': 300}
}
}
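For comparison, a plain-loop sketch that builds the same nested dict directly from the MultiIndexed example frame, with no pivot/concat (res is an illustrative name):

xyz = ['x', 'y', 'z']
res = {}
for (outer, inner), row in example.iterrows():
    # one entry per inner key (1, 2), carrying 'shared' once
    entry = res.setdefault(inner, {'shared': row['shared']})
    entry[outer] = row[xyz].to_dict()
# res now equals the res_dict shown above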
I have data such as:
[{'name': 'A', 'subsets': ['X_1', 'X_A', 'X_B'], 'cluster': 0},
{'name': 'B', 'subsets': ['B_1', 'B_A'], 'cluster': 2},
{'name': 'C', 'subsets': ['X_1', 'X_A', 'X_B'], 'cluster': 0},
{'name': 'D', 'subsets': ['D_1', 'D_2', 'D_3', 'D_4'], 'cluster': 1}]
I need to represent it as:
Cluster Number Subset Name
0 ['X_1', 'X_A', 'X_B'] A, C
1 ['D_1', 'D_2', 'D_3', 'D_4'] D
2 ['B_1', 'B_A'] B
For the sake of completeness, it is fair to mention that you can actually create the DataFrame without json_normalize in this case and apply groupby:
import pandas as pd

data = [{'name': 'A', 'subsets': ['X_1', 'X_A', 'X_B'], 'cluster': 0},
        {'name': 'B', 'subsets': ['B_1', 'B_A'], 'cluster': 2},
        {'name': 'C', 'subsets': ['X_1', 'X_A', 'X_B'], 'cluster': 0},
        {'name': 'D', 'subsets': ['D_1', 'D_2', 'D_3', 'D_4'], 'cluster': 1}]

df = (pd.DataFrame(data)
      .groupby('cluster')
      .agg({'subsets': 'first', 'name': ', '.join})
      .reset_index()
      .set_index('cluster')
      .rename_axis('Cluster Number'))
subsets name
Cluster Number
0 [X_1, X_A, X_B] A, C
1 [D_1, D_2, D_3, D_4] D
2 [B_1, B_A] B
You can use json_normalize + groupby "cluster", aggregating "name" with join and "subsets" with first:
df = pd.json_normalize(data).groupby('cluster').agg({'subsets':'first','name':', '.join}).reset_index()
Output:
cluster subsets name
0 0 [X_1, X_A, X_B] A, C
1 1 [D_1, D_2, D_3, D_4] D
2 2 [B_1, B_A] B
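Optionally, to match the exact headers in the question (names taken from the expected output above):

df = df.rename(columns={'cluster': 'Cluster Number', 'subsets': 'Subset', 'name': 'Name'})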
I have an object like
l = [
{'id': 1, 'name': 'a', 'obj2': [{'a': 3, 'b': 6}, {'a':4, 'b': 5}], 'obj': [{'x': 6, 'y': 'p'}, {'x': 10, 'y': 'q', 'z': 'qqq'}]},
{'id': 2, 'name': 'b', 'obj': [{'x': 10, 'y': 'r'}], 'obj2': [{'a': 9, 'i': 's'}]}
]
and I want to make it a dataframe like:
id name a i b x y z
1 a 3 6 6 p
1 a 3 6 10 q qqq
1 a 4 5 6 p
1 a 4 5 10 q qqq
2 b 9 s 10 r
Inside l, all records have the same keys, but I may have a different l with different key names and a different number of list-valued objects inside l[0].
Any help is much appreciated.
This is a perfect use case for pd.json_normalize:
l = [{'id': 1, 'name': 'a', 'obj': [{'x': 6, 'y': 'p'}, {'x': 10, 'y': 'q', 'z': 'qqq'}]},
{'id': 2, 'name': 'b', 'obj': [{'x': 10, 'y': 'r'}]}]
df = pd.json_normalize(l, 'obj', ['id', 'name'])
print(df)
# Output:
x y z id name
0 6 p NaN 1 a
1 10 q qqq 1 a
2 10 r NaN 2 b
Update:
I want to use the same code for every object that has that structure, but maybe id, name, and obj will be named differently
keys = list(l[0].keys())
# assumes the list-valued key is the last key in each record
df = pd.json_normalize(l, keys[-1], keys[:-1])
print(df)
# Output:
x y z id name
0 6 p NaN 1 a
1 10 q qqq 1 a
2 10 r NaN 2 b
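For the original l with two list columns (obj and obj2), a hedged sketch: normalize each list separately, then merge on the scalar keys, which produces the cross product shown in the expected output:

df1 = pd.json_normalize(l, 'obj2', ['id', 'name'])  # columns a, b, i + id, name
df2 = pd.json_normalize(l, 'obj', ['id', 'name'])   # columns x, y, z + id, name
df = df1.merge(df2, on=['id', 'name'])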
I have a pandas column of length n whose values are lists:

df['Size'][0] = [{'Name': 'Total', 'Value': 50, 'Unit': 'Units'}]
type(df['Size'][0])
list

I'd like to convert each list to a dictionary, i.e. type(df['Size'][0]) should give dict:

{'Name': 'Total',
 'Value': 50,
 'Unit': 'Units'}
For context, I am trying to parse out the dictionary into multiple columns.
# Unpack Size
for i, row in df.iterrows():
    if type(row['Size'][0]) is dict:
        dict_obj = row['Size'][0]
        for key, val in dict_obj.items():
            if key == 'Name':
                df.loc[i, 'Size_Name'] = val
            if key == 'Value':
                df.loc[i, 'Size_Value'] = val
            if key == 'Unit':
                df.loc[i, 'Size_Unit'] = val
There can be any number of dictionaries in each list.
When you have an arbitrary number of dictionaries in the list, use df.explode:
df = pd.DataFrame({'size':[[{'a':1},{'b':1}],[{'a':2}],[{'c':2},{'d':2},{'e':4}]]})
df
size
0 [{'a': 1}, {'b': 1}]
1 [{'a': 2}]
2 [{'c': 2}, {'d': 2}, {'e': 4}]
df.explode('size')
size
0 {'a': 1}
0 {'b': 1}
1 {'a': 2}
2 {'c': 2}
2 {'d': 2}
2 {'e': 4}
If it's always a list of one dictionary, i.e. df['size'][x] == [{...}], use itertools.chain.from_iterable:
from itertools import chain
df['size'] = list(chain.from_iterable(df['size']))
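Either way, to finish what the question's loop was doing, a sketch that expands the exploded dicts into their own columns (using the toy 'size' frame above; with the question's data the new columns would be Name, Value, Unit):

exploded = df.explode('size').reset_index(drop=True)
unpacked = pd.DataFrame(exploded['size'].tolist())   # one column per dict key
result = exploded.drop(columns='size').join(unpacked)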
If you have:
df['size'][0] = [{'Name': 'Total', 'Value': 50, 'Unit': 'Units'}]
type(df['Size'][0])
list
you should use:
type(df['Size'][0][0])
dict
And if there are several dictionaries in the list, increase the last index to access the rest of them.
I've got a list of daily values ordered into a list of dicts like so:
vals = [
{'date': '1-1-2014', 'a': 10, 'b': 33.5, 'c': 82, 'notes': 'high repeat rate'},
{'date': '2-1-2014', 'a': 5, 'b': 11.43, 'c': 182, 'notes': 'normal operations'},
{'date': '3-1-2014', 'a': 0, 'b': 0.5, 'c': 2, 'notes': 'high failure rate'},
...]
What I'd like to do is get an average of a, b & c for the month.
Is there a better way than doing something like:
val_points = {}
val_len = len(vals)
for day in vals:
    for p in ['a', 'b', 'c']:
        if p in val_points:
            val_points[p] += day[p]
        else:
            val_points[p] = day[p]
val_avg = {p: val_points[p] / val_len for p in val_points}

I haven't run the code above and it may have glitches, but I hope it gets the idea across. I know there's probably a better way using some combination of operator, itertools and collections.
{p: sum(map(lambda x: x[p], vals)) / len(vals) for p in ['a', 'b', 'c']}
output:
{'a': 5, 'c': 88, 'b': 15.143333333333333}
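Note that under Python 3 the same comprehension uses true division, so 'a' would come out as 5.0 rather than 5; a standard-library variant of the same idea (statistics.fmean, Python 3.8+):

from statistics import fmean

# average each key across all daily records
avg = {p: fmean(d[p] for d in vals) for p in ['a', 'b', 'c']}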
This might be slightly longer than Elisha's answer, but there are fewer intermediate data structures, so it might be faster:

from functools import reduce

KEYS = ['a', 'b', 'c']

def sum_and_count(sums_and_counts, item, key):
    # using get to fall back to (0, 0) if there is nothing in sums_and_counts yet
    prev_sum, prev_count = sums_and_counts.get(key, (0, 0))
    # using get to default to 0 for a key missing from item
    return (prev_sum + item.get(key, 0), prev_count + 1)

sums_and_counts = reduce(lambda sc, item: {key: sum_and_count(sc, item, key) for key in KEYS},
                         vals, {})
averages = {k: float(total) / no for (k, (total, no)) in sums_and_counts.items()}
print(averages)
output:
{'a': 5.0, 'c': 88.66666666666667, 'b': 15.143333333333333}
If you want to calculate the average by month (here assuming dates in 'dd-mm-yyyy' format):

vals = [
    {'date': '1-1-2014', 'a': 10, 'b': 33.5, 'c': 82, 'notes': 'high repeat rate'},
    {'date': '2-1-2014', 'a': 5, 'b': 11.43, 'c': 182, 'notes': 'normal operations'},
    {'date': '3-1-2014', 'a': 20, 'b': 0.5, 'c': 2, 'notes': 'high failure rate'},
    {'date': '3-2-2014', 'a': 0, 'b': 0.5, 'c': 2, 'notes': 'high failure rate'},
    {'date': '4-2-2014', 'a': 20, 'b': 0.5, 'c': 2, 'notes': 'high failure rate'}
]

month = {}
for x in vals:
    # month part of 'dd-mm-yyyy'
    newKey = x['date'].split('-')[1]
    if newKey not in month:
        month[newKey] = {}
    for k in 'abc':
        if k in month[newKey]:
            month[newKey][k].append(x[k])
        else:
            month[newKey][k] = [x[k]]

output = {}
for y in month:
    if y not in output:
        output[y] = {}
    for z in month[y]:
        output[y][z] = sum(month[y][z]) / float(len(month[y][z]))

print(output)
OUTPUT:
{'1': {'a': 11.666666666666666, 'c': 88.66666666666667, 'b': 15.143333333333333},
'2': {'a': 10.0, 'c': 2.0, 'b': 0.5}}
If you have multiple months' data, Pandas will make your life a lot easier:

import pandas

df = pandas.DataFrame(vals)
df['date'] = pandas.to_datetime(df.date, dayfirst=True)
df.set_index('date', inplace=True)
means = df[['a', 'b', 'c']].resample('M').mean()
Results in:
a b c
date
2014-01-31 5 15.143333 88.666667
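An equivalent sketch with groupby instead of resample, in case you prefer period labels (selecting the numeric columns keeps the 'notes' strings out of the mean):

means = df.groupby(df.index.to_period('M'))[['a', 'b', 'c']].mean()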