I am trying to clean the data with this code:
# Collect the non-null 'lop_diemquatrinh' values keyed by 'lop_id', then
# turn them into a one-column DataFrame named 'data'.
empty = {}  # NOTE(review): not used anywhere in this snippet
mess = lophoc_clean.query("lop_diemquatrinh.notnull()")[['lop_id', 'lop_diemquatrinh']]
keys = []
values = []
for index, rows in mess.iterrows():
    # Keep only values whose length exceeds 4; both lists are appended
    # together so that keys and values stay aligned for zip() below.
    if len(rows['lop_diemquatrinh']) > 4:
        values.append(rows['lop_diemquatrinh'])
        keys.append(rows['lop_id'])
# dict() keeps only the last value seen for a repeated lop_id.
# index=[0] builds a single row, which transpose() turns into one row per key.
df = pd.DataFrame(dict(zip(keys, values)), index=[0]).transpose()
df.columns = ['data']
The result is a dictionary like this
{'data': {37: '[{"date_update":"31-03-2022","diemquatrinh":"6.0"}]',
38: '[{"date_update":"11-03-2022","diemquatrinh":"6.25"}]',
44: '[{"date_update":"25-12-2021","diemquatrinh":"6.0"},{"date_update":"28-04-2022","diemquatrinh":"6.25"},{"date_update":"28-07-2022","diemquatrinh":"6.5"}]',
1095: '[{"date_update":null,"diemquatrinh":null}]'}}
However, I don't know how to make them into a DataFrame with 3 columns like this. Please help me. Thank you!
id
updated_at
diemquatrinh
38
11-03-2022
6.25
44
25-12-2021
6.0
44
28-04-2022
6.25
44
28-07-2022
6.5
1095
null
null
Here you go.
from json import loads
from pprint import pp
import pandas as pd
def get_example_data():
    """Return the example records as a list of dicts.

    Every record carries an ``id`` and an ``updated_at`` entry (an ISO date
    string, or ``None`` for the last record). The last record has no
    ``diemquatrinh`` key at all.
    """
    return [
        dict(id=38, updated_at="2022-03-11", diemquatrinh=6.25),
        dict(id=44, updated_at="2021-12-25", diemquatrinh=6),
        dict(id=44, updated_at="2022-04-28", diemquatrinh=6.25),
        dict(id=1095, updated_at=None),
    ]
# Build the frame from the example records and parse the date strings.
df = pd.DataFrame(get_example_data())
df["updated_at"] = pd.to_datetime(df["updated_at"])

# Show the column dtypes first.
print(df.dtypes, "\n")

# JSON with the default orient, decoded back into Python objects.
default_json = df.to_json()
pp(loads(default_json))
print()

# The frame itself, followed by its record-oriented JSON.
print(df, "\n")
records_json = df.to_json(orient="records")
pp(loads(records_json))
It produces this output:
id int64
updated_at datetime64[ns]
diemquatrinh float64
dtype: object
{'id': {'0': 38, '1': 44, '2': 44, '3': 1095},
'updated_at': {'0': 1646956800000,
'1': 1640390400000,
'2': 1651104000000,
'3': None},
'diemquatrinh': {'0': 6.25, '1': 6.0, '2': 6.25, '3': None}}
id updated_at diemquatrinh
0 38 2022-03-11 6.25
1 44 2021-12-25 6.00
2 44 2022-04-28 6.25
3 1095 NaT NaN
[{'id': 38, 'updated_at': 1646956800000, 'diemquatrinh': 6.25},
{'id': 44, 'updated_at': 1640390400000, 'diemquatrinh': 6.0},
{'id': 44, 'updated_at': 1651104000000, 'diemquatrinh': 6.25},
{'id': 1095, 'updated_at': None, 'diemquatrinh': None}]
Either of the JSON datastructures
would be acceptable input
for creating a new DataFrame from scratch.
Related
I'm trying to create a nested Json file from a pandas dataframe. I found a similar question here but when I tried to apply the answer, the output wasn't what I really wanted. I tried to adjust the code to get the desired answer but I haven't been able to.
Let me explain the problem first, then I will show you what I have done so far.
I have the following dataframe:
Region staff_id rate dep
1 300047 77 4
1 300048 45 3
1 300049 32 7
2 299933 63 8
2 299938 86 7
Now I want the json object to look like this:
{'region': 1 :
{ 'Info': [
{'ID': 300047, 'Rate': 77, 'Dept': 4},
{'ID': 300048, 'Rate': 45, 'Dept': 3},
{'ID': 300049, 'Rate': 32, 'Dept': 7}
]
},
'region': 2 :
{ 'Info': [
{'ID': 299933, 'Rate': 63, 'Dept': 8},
{'ID': 299938, 'Rate': 86, 'Dept': 7}
]
}
}
So for every region, there is a tag called info and inside info there is all the rows of that region.
I tried this code from the previous answer:
# One dict per DataFrame row: each row gets its own single-element "Info"
# list, so rows are not grouped by region.
json_output = [
    {
        "region": row["Region"],
        "Info": [
            {"ID": row["staff_id"], "Rate": row["rate"], "Dept": row["dep"]}
        ],
    }
    for _, row in df.iterrows()
]
Which will give me every row in the dataframe and not grouped by the region.
Sorry because this seems repetitive, but I have been trying to change that answer to fit mine and I would really appreciate your help.
As mentioned by Nick ODell, you can loop through the groupby elements:
# Example frame: one row per staff member, tagged with its region.
df = pd.DataFrame({"REGION": [1, 1, 1, 2, 2],
                   "staff_id": [1, 2, 3, 4, 5],
                   "rate": [77, 45, 32, 63, 86],
                   "dep": [4, 3, 7, 8, 7]})

desired_op = []
grp_element = list(df.groupby(["REGION"]))
for _, group in grp_element:
    # to_dict(orient="records") yields the list of row dicts directly.
    # Round-tripping through to_json() and eval() breaks as soon as the JSON
    # contains null/true/false, and evaluates arbitrary text as code.
    lst_info = group[["staff_id", "rate", "dep"]].to_dict(orient="records")
    desired_op.append({
        "REGION": group["REGION"].values[0],  # every row in a group shares the region
        "info": lst_info,
    })
print(desired_op)
[{'REGION': 1,
'info': [{'staff_id': 1, 'rate': 77, 'dep': 4},
{'staff_id': 2, 'rate': 45, 'dep': 3},
{'staff_id': 3, 'rate': 32, 'dep': 7}]},
{'REGION': 2,
'info': [{'staff_id': 4, 'rate': 63, 'dep': 8},
{'staff_id': 5, 'rate': 86, 'dep': 7}]}]
I have created a DataFrame; see the snippet below.
# Column-oriented sample data; the explicit column list fixes the column order.
# NOTE(review): the printed table that follows shows 'xyy' for id 102, while
# both names here are 'xyz' - confirm which one is intended.
data = dict(
    id=[101, 102],
    name=['xyz', 'xyz'],
    value1=[41, 42],
    value2=[42, 32],
)
column_order = ['id', 'name', 'value1', 'value2']
df = pd.DataFrame(data, columns=column_order)
print(df)
Output of Dataframe
id name value1 value2
101 xyz 41 42
102 xyy 42 32
Here I just want to create nested dictionary from this Data Frame.
Expected output
{'101':{'name':'xyz'
'data' : [{'value1' : 41,'value2':42},
{'value1': 42,'value2':32}]}}
I tried to do it with the following code, but it doesn't work. Could you please help me solve this?
#Tried Snippet
code
# Build {id: {name: {column: value}}} from the frame indexed by (id, name).
# The method is spelled `groupby`; `group by` is a syntax error.
print({n: grp.loc[n].to_dict('index') for n, grp in df.set_index(['id', 'name']).groupby(level='id')})
output
{101: {'xyz': {'value1': 41, 'value2': 42}}, 102: {'xyz': {'value1': 42, 'value2': 32}}}
code
# Build {id: {name: [value1, ...]}}; only the 'value1' column is collected.
value1_by_id = {}
for k, f in df.groupby('id'):
    value1_by_id[k] = f.groupby('name')['value1'].apply(list).to_dict()
print(value1_by_id)
output
{101: {'xyz': [41]}, 102: {'xyz': [42]}}
required output
{'101':{'name':'xyz'
'data' : [{'value1' : 41,'value2':42},
{'value1': 42,'value2':32},
]}}
Let's say df is:
df:
id name value1 value2
0 101 xyz 41 42
1 102 xyy 42 32
2 101 xyz 46 46
3 102 xyy 40 39
# For each (id, name) pair, gather its value1/value2 rows as a list of dicts.
records_per_key = df.groupby(['id', 'name'])[['value1', 'value2']].apply(
    lambda x: x.to_dict(orient='records')
)
# Name that list column 'data', key the rows by id, and emit a nested dict.
records_per_key.reset_index(name='data').set_index('id').to_dict(orient='index')
{101: {'name': 'xyz', 'data': [{'value1': 41, 'value2': 42}, {'value1': 46, 'value2': 46}]}, 102: {'name': 'xyy', 'data': [{'value1': 42, 'value2': 32}, {'value1': 40, 'value2': 39}]}}
I have the following data frame:
3 10 6 4 timestamp
462 75.768780 21.47490 20.725380 100.00000 2020-04-08 08:30:05
463 77.612500 21.47490 21.025310 100.00000 2020-04-08 08:30:35
464 77.612500 18.41914 21.025310 100.00000 2020-04-08 08:30:40
465 75.851290 18.41914 19.776660 100.00000 2020-04-08 08:31:05
466 2.908084 16.36317 4.460456 4.48286 2020-04-08 08:31:25
I want to convert this into a format like below if the df contains a time stamp column:
{datetime.datetime(2020, 4, 8, 8, 30, 5, tzinfo=<UTC>): {'3': 75.76878,
'10': 21.474900000000005,
'6': 20.72538,
'4': 100.0},
datetime.datetime(2020, 4, 8, 8, 30, 35, tzinfo=<UTC>): {'3': 77.6125,
'10': 21.474900000000005,
'6': 21.02531,
'4': 100.0},
}
If the data frame is in the format without timestamp column:
1 2 3
36026 7.2246 5.4106 0.0
36027 7.2154 5.4115 0.0
36028 7.1923 5.4111 0.0
36029 7.2028 5.4106 0.0
36030 7.2141 5.4123 0.0
I want to convert this data frame into a format like below:
{36026: {'1': 7.2246, '2': 5.4106, '3': 0.0},
36027: {'1': 7.2154, '2': 5.4115, '3': 0.0},
36028: {'1': 7.1923, '2': 5.4111, '3': 0.0},
36029: {'1': 7.2028, '2': 5.4106, '3': 0.0},
36030: {'1': 7.2141, '2': 5.4123, '3': 0.0}}
Currently I am achieving this by using df.iterrows() in a for loop and then converting each row to the corresponding format.
Any Efficient solution would be great.
Thanks in advance.
I have my dictionary as
{'id': '6576_926_1',
'name': 'xyz',
'm': 926,
0: {'id': '2896_926_2',
'name': 'lmn',
'm': 926},
1: {'id': '23_926_3',
'name': 'abc',
'm': 928}}
And I want to convert it into dataframe like
Id Name M
6576_926_1 Xyz 926
2896_926_2 Lmn 926
23_926_3 Abc 928
I am fine even if first row is not available as it doesn't have index. There are around 1.3 MN records and so speed is very important. I tried using a for loop and append statement and it takes forever
As you mentioned that the first row is not mandatory for you, I've tried the following. I hope this solves your problem.
import pandas as pd
lis = []
data = {
    0: {'id': '2896_926_2', 'name': 'lmn', 'm': 926},
    1: {'id': '23_926_3', 'name': 'abc', 'm': 928}
}
# Only the nested record dicts are needed, so take the values directly.
# dict.iteritems() and the print statement exist only in Python 2.
lis.extend(data.values())
d = pd.DataFrame(lis)
print(d)
Output--
id m name
0 2896_926_2 926 lmn
1 23_926_3 928 abc
And if you want id as your index, then add set_index:
# Python 3 port: dict.items() replaces iteritems(), and print is a function.
# NOTE(review): assumes `lis` is empty at this point; if it still holds rows
# from an earlier loop, they will be duplicated - confirm.
for i, j in data.items():
    lis.append(j)
d = pd.DataFrame(lis)
# Use the 'id' column as the row index.
d = d.set_index('id')
print(d)
Output-
m name
id
2896_926_2 926 lmn
23_926_3 928 abc
You can use a loop to convert each dictionary's entries into a list, and then use pandas' `DataFrame.from_dict` to convert it to a DataFrame. Here's the example given:
>>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
>>> pd.DataFrame.from_dict(data)
col_1 col_2
0 3 a
1 2 b
2 1 c
3 0 d
Use the following approach
import pandas as pd
# NOTE(review): `dict` stands for the question's dictionary variable here and
# shadows the builtin of the same name.
# The class is spelled DataFrame; pd.Dataframe raises AttributeError.
data = pd.DataFrame(dict)
# NOTE(review): columns 0 and 1 presumably hold the nested records - confirm
# that dropping them (rather than the scalar columns) is what is intended.
data = data.drop(0, axis=1)
data = data.drop(1, axis=1)
You can also try this
import pandas as pd
# Remove the three top-level scalar entries before handing the remaining
# entries to the DataFrame constructor.
# NOTE(review): `dict` is the question's variable and shadows the builtin.
for scalar_key in ('id', 'name', 'm'):
    del dict[scalar_key]
pd.DataFrame(dict)
Try this code!! Still, complexity is O(n)
# Discard the top-level scalar entries; only the nested records remain.
for scalar_key in ('id', 'name', 'm'):
    my_dict.pop(scalar_key)
columns = ['id', 'name', 'm']
# Pick each field by key so a row cannot be mislabelled when a nested dict
# stores its keys in a different order than the column list.
data = [[row[col] for col in columns] for row in my_dict.values()]
pd.DataFrame(data=data, columns=columns)
import pandas as pd
data = {'id': '6576_926_1', 'name': 'xyz', 'm': 926,
        0: {'id': '2896_926_2', 'name': 'lmn', 'm': 926},
        1: {'id': '23_926_3', 'name': 'abc', 'm': 928}}
Id = []
Name = []
M = []
# Only the nested dicts are records; the top-level scalar entries are skipped.
for val in data.values():
    if isinstance(val, dict):
        Id.append(val['id'])
        Name.append(val['name'])
        M.append(val['m'])
df = pd.DataFrame({'Name': Name, 'Id': Id, 'M': M})
print(df)
# Source dictionary: three top-level scalar entries plus two nested records
# stored under the integer keys 0 and 1.
mydict = {
    'id': '6576_926_1',
    'name': 'xyz',
    'm': 926,
    0: {'id': '2896_926_2', 'name': 'lmn', 'm': 926},
    1: {'id': '23_926_3', 'name': 'abc', 'm': 928},
}
import pandas as pd

# Remove the scalar entries so only the nested records are left.
for _scalar_key in ('id', 'name', 'm'):
    del mydict[_scalar_key]

# The remaining keys (0, 1) become columns; transposing makes each record a row.
d = pd.DataFrame(mydict).transpose()
One of the columns of my pandas dataframe looks like this
>> df
Item
0 [{"id":A,"value":20},{"id":B,"value":30}]
1 [{"id":A,"value":20},{"id":C,"value":50}]
2 [{"id":A,"value":20},{"id":B,"value":30},{"id":C,"value":40}]
I want to expand it as
A B C
0 20 30 NaN
1 20 NaN 50
2 20 30 40
I tried
# Collect one small frame per row and concatenate once at the end.
# DataFrame.append copied the accumulated frame on every call (quadratic
# overall) and was removed in pandas 2.0.
frames = []
for i in range(df.shape[0]):
    df1 = pd.DataFrame(df.item[i]).T
    header = df1.iloc[0]  # first transposed row supplies the new column names
    df1 = df1[1:]         # keep everything after that header row
    df1 = df1.rename(columns=header)
    frames.append(df1)
# pd.concat raises on an empty list, so fall back to an empty frame.
dfx = pd.concat(frames) if frames else pd.DataFrame()
But this takes a lot of time as my data is huge. What is the best way to do this?
My original json data looks like this:
{
{
'_id': '5b1284e0b840a768f5545ef6',
'device': '0035sdf121',
'customerId': '38',
'variantId': '31',
'timeStamp': datetime.datetime(2018, 6, 2, 11, 50, 11),
'item': [{'id': A, 'value': 20},
{'id': B, 'value': 30},
{'id': C, 'value': 50}
},
{
'_id': '5b1284e0b840a768f5545ef6',
'device': '0035sdf121',
'customerId': '38',
'variantId': '31',
'timeStamp': datetime.datetime(2018, 6, 2, 11, 50, 11),
'item': [{'id': A, 'value': 20},
{'id': B, 'value': 30},
{'id': C, 'value': 50}
},
.............
}
I agree with @JeffH, you should really look at how you are constructing the DataFrame.
Assuming you are getting this from somewhere outside your control, you can convert it to your desired DataFrame with:
In []:
# Turn each row's list of {'id': ..., 'value': ...} dicts into one
# {id: value} mapping; the DataFrame constructor aligns the ids as columns.
pd.DataFrame([{d['id']: d['value'] for d in r} for r in df['Item']])
Out[]:
A B C
0 20 30.0 NaN
1 20 NaN 50.0
2 20 30.0 40.0