pandas dataframe to custom nested json - python

I have a pandas dataframe that looks like this:
   user_id  cat_id  prod_id     score  pref_prod
     29762       9     3115  1.000000      335.0
     29762      58     1335  1.000000      335.0
    234894      58     1335  1.000000      335.0
    413276      43     1388  1.000000      335.0
    413276      58     1335  1.000000      335.0
    413276      73       26  1.000000      335.0
   9280593       9      137  1.000000      335.0
   9280593      58     1335  1.000000      335.0
   9280593      74      160  1.000000      335.0
   4554542      66     1612  0.166667      197.0
   4554542      66     1406  0.166767      197.0
   4554542      66     2021  1.000000      197.0
I want to group this df by user_id & cat_id and convert it to JSON so that it looks something like this:
{
    29762: {
        'cat_id': {
            9: [{
                'prod_id': 3115,
                'score': 1.0
            }],
            58: [{
                'prod_id': 1335,
                'score': 1.0
            }]
        },
        'pref_prod': 335.0
    },
    234894: {
        'cat_id': {
            58: [{
                'prod_id': 1335,
                'score': 1.0
            }]
        },
        'pref_prod': 335.0
    },
    413276: {
        'cat_id': {
            43: [{
                'prod_id': 1388,
                'score': 1.0
            }],
            58: [{
                'prod_id': 1335,
                'score': 1.0
            }],
            73: [{
                'prod_id': 26,
                'score': 1.0
            }]
        },
        'pref_prod': 335.0
    },
    4554542: {
        'cat_id': {
            66: [{
                'prod_id': 1612,
                'score': 0.166667
            }, {
                'prod_id': 1406,
                'score': 0.166767
            }, {
                'prod_id': 2021,
                'score': 1.0
            }]
        },
        'pref_prod': 197.0
    }
}
As of now I can do
gb = df.groupby(['user_id', 'cat_id']).apply(lambda g: g.drop(['user_id', 'cat_id'], axis=1).to_dict(orient='records')).to_dict()
which gives me user_id and cat_id as tuple keys:
{
    (29762, 9): [{
        'prod_id': 3115,
        'score': 1.0,
        'pref_prod': 335.0
    }],
    (29762, 58): [{
        'prod_id': 1335,
        'score': 1.0,
        'pref_prod': 335.0
    }],
    (234894, 58): [{
        'prod_id': 1335,
        'score': 1.0,
        'pref_prod': 335.0
    }],
    (413276, 43): [{
        'prod_id': 1388,
        'score': 1.0,
        'pref_prod': 335.0
    }],
    (413276, 58): [{
        'prod_id': 1335,
        'score': 1.0,
        'pref_prod': 335.0
    }],
    (413276, 73): [{
        'prod_id': 26,
        'score': 1.0,
        'pref_prod': 335.0
    }],
    (9280593, 9): [{
        'prod_id': 137,
        'score': 1.0,
        'pref_prod': 335.0
    }],
    (9280593, 58): [{
        'prod_id': 1335,
        'score': 1.0,
        'pref_prod': 335.0
    }],
    (9280593, 74): [{
        'prod_id': 160,
        'score': 1.0,
        'pref_prod': 335.0
    }],
    (4554542, 66): [{
        'prod_id': 1612,
        'score': 0.16666666666666666,
        'pref_prod': 197.0
    }, {
        'prod_id': 1406,
        'score': 0.16676666666666665,
        'pref_prod': 197.0
    }, {
        'prod_id': 2021,
        'score': 1.0,
        'pref_prod': 197.0
    }]
}
How can I get the json in the desired format

I can't think of any direct way to do it with pandas only, but you can construct a new dictionary in the desired format from gb, using a defaultdict:
from collections import defaultdict
import json # just to prettyprint the resulting dictionary
gb = df.groupby(['user_id', 'cat_id']).apply(lambda g: g.drop(['user_id', 'cat_id'], axis=1).to_dict(orient='records')).to_dict()
d = defaultdict(lambda: {'cat_id': {}})
for (user_id, cat_id), records in gb.items():
    for record in records:
        # drop the 'pref_prod' key of each record;
        # I'm assuming it's unique for each (user_id, cat_id) group
        pref_prod = record.pop('pref_prod')
    d[user_id]['cat_id'][cat_id] = records
    d[user_id]['pref_prod'] = pref_prod
>>> print(json.dumps(d, indent=4))
{
    "29762": {
        "cat_id": {
            "9": [
                {
                    "prod_id": 3115,
                    "score": 1.0
                }
            ],
            "58": [
                {
                    "prod_id": 1335,
                    "score": 1.0
                }
            ]
        },
        "pref_prod": 335.0
    },
    "234894": {
        "cat_id": {
            "58": [
                {
                    "prod_id": 1335,
                    "score": 1.0
                }
            ]
        },
        "pref_prod": 335.0
    },
    "413276": {
        "cat_id": {
            "43": [
                {
                    "prod_id": 1388,
                    "score": 1.0
                }
            ],
            "58": [
                {
                    "prod_id": 1335,
                    "score": 1.0
                }
            ],
            "73": [
                {
                    "prod_id": 26,
                    "score": 1.0
                }
            ]
        },
        "pref_prod": 335.0
    },
    "4554542": {
        "cat_id": {
            "66": [
                {
                    "prod_id": 1612,
                    "score": 0.166667
                },
                {
                    "prod_id": 1406,
                    "score": 0.166767
                },
                {
                    "prod_id": 2021,
                    "score": 1.0
                }
            ]
        },
        "pref_prod": 197.0
    },
    "9280593": {
        "cat_id": {
            "9": [
                {
                    "prod_id": 137,
                    "score": 1.0
                }
            ],
            "58": [
                {
                    "prod_id": 1335,
                    "score": 1.0
                }
            ],
            "74": [
                {
                    "prod_id": 160,
                    "score": 1.0
                }
            ]
        },
        "pref_prod": 335.0
    }
}
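One caveat: json.dumps converts the integer keys to strings, since JSON object keys must be strings. If you need the integer-keyed structure exactly as shown in the question, use the dictionary itself rather than its JSON serialization:
result = dict(d)  # plain dict, integer keys preserved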

I used a namedtuple built from the dataframe rows to create the JSON tree. If the tree had more than one level, I would use recursion to build it; the dataframe did not contain lists of lists, so recursion was not required.
import io
from collections import namedtuple

import pandas as pd

data = """user_id,cat_id,prod_id,score,pref_prod
29762,9,3115,1.000000,335.0
29762,58,1335,1.000000,335.0
234894,58,1335,1.000000,335.0
413276,43,1388,1.000000,335.0
413276,58,1335,1.000000,335.0
413276,73,26,1.000000,335.0
9280593,9,137,1.000000,335.0
9280593,58,1335,1.000000,335.0
9280593,74,160,1.000000,335.0
4554542,66,1612,0.166667,197.0
4554542,66,1406,0.166767,197.0
4554542,66,2021,1.000000,197.0"""
df = pd.read_csv(io.StringIO(data), sep=',')

Record = namedtuple('Generic', ['user_id', 'cat_id', 'prod_id', 'score', 'pref_prod'])

def map_to_record(row):
    return Record(row.user_id, row.cat_id, row.prod_id, row.score, row.pref_prod)

my_list = list(map(map_to_record, df.itertuples()))

def named_tuple_to_json(named_tuple):
    """
    Convert a list of named tuples to a JSON-like tree structure.
    """
    json_string = "records:["
    for record in named_tuple:
        json_string += "{"
        json_string += "'user_id': {},'cat_id': {},'prod_id': {},'score': {},'pref_prod': {},".format(
            record.user_id, record.cat_id, record.prod_id, record.score, record.pref_prod)
        json_string += "},"
    json_string += "]"
    return json_string

# convert the list of named tuples to a json tree structure
json_tree = named_tuple_to_json(my_list)
print(json_tree)
Output:
records:[{'user_id': 29762,'cat_id': 9,'prod_id': 3115,'score': 1.0,'pref_prod': 335.0,},{'user_id': 29762,'cat_id': 58,'prod_id': 1335,'score': 1.0,'pref_prod': 335.0,},{'user_id': 234894,'cat_id': 58,'prod_id': 1335,'score': 1.0,'pref_prod': 335.0,},{'user_id': 413276,'cat_id': 43,'prod_id': 1388,'score': 1.0,'pref_prod': 335.0,},{'user_id': 413276,'cat_id': 58,'prod_id': 1335,'score': 1.0,'pref_prod': 335.0,},{'user_id': 413276,'cat_id': 73,'prod_id': 26,'score': 1.0,'pref_prod': 335.0,},{'user_id': 9280593,'cat_id': 9,'prod_id': 137,'score': 1.0,'pref_prod': 335.0,},{'user_id': 9280593,'cat_id': 58,'prod_id': 1335,'score': 1.0,'pref_prod': 335.0,},{'user_id': 9280593,'cat_id': 74,'prod_id': 160,'score': 1.0,'pref_prod': 335.0,},{'user_id': 4554542,'cat_id': 66,'prod_id': 1612,'score': 0.166667,'pref_prod': 197.0,},{'user_id': 4554542,'cat_id': 66,'prod_id': 1406,'score': 0.166767,'pref_prod': 197.0,},{'user_id': 4554542,'cat_id': 66,'prod_id': 2021,'score': 1.0,'pref_prod': 197.0,},]
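Note that the string built above is JSON-like but not valid JSON (keys are single-quoted and there are trailing commas). If a flat array of records is acceptable, pandas can emit valid JSON directly; a minimal sketch:
print(df.to_json(orient='records', indent=2))  # indent requires pandas >= 1.0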

Related

Returning data that is not in ElasticSearch as 0 in doc_count

I am filtering in ElasticSearch. I want doc_count to return 0 for dates with no data, but those dates aren't printed at all; only dates with data are returned to me. Do you know how I can do it? Here is the Python output:
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
...
33479 {'date': '2022-04-13T08:08:00.000Z', 'value': 7}
33480 {'date': '2022-04-13T08:08:00.000Z', 'value': 7}
33481 {'date': '2022-04-13T08:08:00.000Z', 'value': 7}
33482 {'date': '2022-04-13T08:08:00.000Z', 'value': 7}
33483 {'date': '2022-04-13T08:08:00.000Z', 'value': 7}
And here is my ElasticSearch filter:
"from": 0,
"size": 0,
"query": {
"bool": {
"must":
[
{
"range": {
"#timestamp": {
"gte": "now-1M",
"lt": "now"
}
}
}
]
}
},
"aggs": {
"continent": {
"terms": {
"field": "source.geo.continent_name.keyword"
},
"aggs": {
"_source": {
"date_histogram": {
"field": "#timestamp", "interval": "8m"
}}}}}}
You need to set min_doc_count to 0 on each aggregation where you want buckets with a zero doc_count returned.
{
    "from": 0,
    "size": 0,
    "query": {
        "bool": {
            "must": [
                {
                    "range": {
                        "#timestamp": {
                            "gte": "now-1M",
                            "lt": "now"
                        }
                    }
                }
            ]
        }
    },
    "aggs": {
        "continent": {
            "terms": {
                "field": "source.geo.continent_name.keyword",
                "min_doc_count": 0
            },
            "aggs": {
                "_source": {
                    "date_histogram": {
                        "field": "#timestamp",
                        "interval": "8m",
                        "min_doc_count": 0
                    }
                }
            }
        }
    }
}
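If you are sending this from Python, here is a minimal sketch using the official elasticsearch client; the host, the index name, and the query variable (holding the dict above) are placeholders:
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")     # placeholder host
resp = es.search(index="my-index", body=query)  # "my-index" is a placeholder; query = the dict above
for bucket in resp["aggregations"]["continent"]["buckets"]:
    print(bucket["key"], bucket["doc_count"])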

drop selective columns pandas dataframe while flattening

I have created a dataframe from a JSON but want to keep only the first 5 columns of the result.
Here is a part of the JSON:
{
    "lat": 52.517,
    "lon": 13.3889,
    "timezone": "Europe/Berlin",
    "timezone_offset": 7200,
    "current": {
        "dt": 1628156947,
        "sunrise": 1628134359,
        "sunset": 1628189532,
        "temp": 295.54,
        "feels_like": 295.43,
        "pressure": 1009,
        "humidity": 61,
        "dew_point": 287.66,
        "uvi": 4.53,
        "clouds": 20,
        "visibility": 10000,
        "wind_speed": 3.58,
        "wind_deg": 79,
        "wind_gust": 4.92,
        "weather": [
            {
                "id": 801,
                "main": "Clouds",
                "description": "few clouds",
                "icon": "02d"
            }
        ]
    },
    "hourly": [
        {
            "dt": 1628154000,
            "temp": 295.26,
            "feels_like": 295.09,
            "pressure": 1009,
            "humidity": 60,
            "dew_point": 287.14,
            "uvi": 4.01,
            "clouds": 36,
            "visibility": 10000,
            "wind_speed": 3.6,
            "wind_deg": 83,
            "wind_gust": 4.76,
            "weather": [
                {
                    "id": 500,
                    "main": "Rain",
                    "description": "light rain",
                    "icon": "10d"
                }
            ],
            "pop": 0.49,
            "rain": {
                "1h": 0.52
            }
        },
        {
            "dt": 1628157600,
            "temp": 295.54,
            "feels_like": 295.43,
            "pressure": 1009,
            "humidity": 61,
            "dew_point": 287.66,
            "uvi": 4.53,
            "clouds": 20,
            "visibility": 10000,
            "wind_speed": 3.76,
            "wind_deg": 85,
            "wind_gust": 4.91,
            "weather": [
                {
                    "id": 801,
                    "main": "Clouds",
                    "description": "few clouds",
                    "icon": "02d"
                }
            ],
            "pop": 0.55
        },
        {
            "dt": 1628161200,
            "temp": 295.58,
            "feels_like": 295.42,
            "pressure": 1009,
            "humidity": 59,
            "dew_point": 287.18,
            "uvi": 4.9,
            "clouds": 36,
            "visibility": 10000,
            "wind_speed": 3.58,
            "wind_deg": 95,
            "wind_gust": 4.73,
            "weather": [
                {
                    "id": 802,
                    "main": "Clouds",
                    "description": "scattered clouds",
                    "icon": "03d"
                }
            ],
            "pop": 0.59
        }
    ]
}
I have flattened the JSON first like this:
df_history = pd.json_normalize(data_history, max_level=1)
That gave me this structure:
lat lon timezone timezone_offset hourly current.dt current.sunrise current.sunset current.temp current.feels_like ... current.humidity current.dew_point current.uvi current.clouds current.visibility current.wind_speed current.wind_deg current.wind_gust current.weather current.rain
0 52.517 13.3889 Europe/Berlin 7200 [{'dt': 1627776000, 'temp': 17.82, 'feels_like... 1627855200 1627874869 1627930649 16.36 16.4 ... 90 14.72 0 0 10000 3.13 254 11.18 [{'id': 500, 'main': 'Rain', 'description': 'l... {'1h': 0.17}
But I want to keep only the columns up to the column "hourly" and then flatten it.
I have tried this but to no avail:
df_history_small = pd.json_normalize(data_history, record_path='hourly',meta=['dt','temp', 'humidity'], errors='ignore')
What am I doing wrong? How can I achieve my goal?
My final goal is to have a dataframe that looks like this:
      lat      lon       timezone  timezone_offset            timestamp   temp  feels_like  humidity  pressure
0  52.517  13.3889  Europe/Berlin             7200  08/01/2021 00:00:00  17.82       17.46        69      1005
Try:
cols = ['lat', 'lon', 'timezone', 'timezone_offset',
'dt', 'temp', 'feels_like', 'humidity']
out = pd.json_normalize(data_history, ['hourly'], meta=cols[:4])[cols]
>>> out
      lat      lon       timezone  timezone_offset          dt    temp  feels_like  humidity
0  52.517  13.3889  Europe/Berlin             7200  1628154000  295.26      295.09        60
1  52.517  13.3889  Europe/Berlin             7200  1628157600  295.54      295.43        61
2  52.517  13.3889  Europe/Berlin             7200  1628161200  295.58      295.42        59
Feel free to convert dt to a timestamp with:
out['timestamp'] = pd.to_datetime(out['dt'], unit='s')
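If you also want the formatted timestamp string from the goal layout, a possible follow-up; the format string is an assumption based on the sample row:
out['timestamp'] = out['timestamp'].dt.strftime('%m/%d/%Y %H:%M:%S')  # e.g. '08/05/2021 09:00:00'
out = out.drop(columns=['dt'])  # drop the raw epoch column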

flattened dictionary into nested dictionary of dictionaries of lists

So I can't seem to figure out how to implement this effectively. I'm looking to nest a flattened dictionary into a dictionary of dictionaries of lists, based on specific keys given as inputs. Trying so desperately to learn.
Given that my data looks like this:
data = [
    {"player": "Kevin Durant", "team": "Thunder", "location": "Oklahoma City", "points": 15},
    {"player": "Jeremy Lin", "team": "Lakers", "location": "Los Angeles", "points": 22},
    {"player": "Kobe Bryant", "team": "Lakers", "location": "Los Angeles", "points": 51},
    {"player": "Blake Griffin", "team": "Clippers", "location": "Los Angeles", "points": 26}
]
I'd want to return something like this if I give it parameters like reorder(data, ['location','team','player']), for example:
result = {
    "Los Angeles": {
        "Clippers": {
            "Blake Griffin": [{"points": 26}]
        },
        "Lakers": {
            "Kobe Bryant": [{"points": 51}],
            "Jeremy Lin": [{"points": 22}]
        }
    },
    "Oklahoma City": {
        "Thunder": {
            "Kevin Durant": [{"points": 15}]
        }
    }
}
You can use the setdefault function to automatically build the nesting levels as you go through the data:
data = [
    {"player": "Kevin Durant", "team": "Thunder", "location": "Oklahoma City", "points": 15},
    {"player": "Jeremy Lin", "team": "Lakers", "location": "Los Angeles", "points": 22},
    {"player": "Kobe Bryant", "team": "Lakers", "location": "Los Angeles", "points": 51},
    {"player": "Blake Griffin", "team": "Clippers", "location": "Los Angeles", "points": 26}
]
nested = dict()
for d in data:
    nested.setdefault(d["location"], dict()) \
          .setdefault(d["team"], dict()) \
          .setdefault(d["player"], list()) \
          .append({"points": d["points"]})
output:
print(nested)
{ 'Oklahoma City':
{
'Thunder':
{ 'Kevin Durant': [{'points': 15}] }
},
'Los Angeles':
{
'Lakers':
{
'Jeremy Lin': [{'points': 22}],
'Kobe Bryant': [{'points': 51}]
},
'Clippers':
{ 'Blake Griffin': [{'points': 26}] }
}
}
[EDIT] Generalizing the approach
If you have to do this kind of thing often and on different types of dictionaries or hierarchies, you could generalize it in a function:
def dictNesting(data, *levels):
    result = dict()
    for d in data:
        r = result
        for level in levels[:-1]:
            r = r.setdefault(d[level], dict())
        r = r.setdefault(d[levels[-1]], list())
        r.append({k: v for k, v in d.items() if k not in levels})
    return result
You would then give the function a list of dictionaries followed by the names of the keys you want to nest:
byLocation = dictNesting(data,"location","team")
{ 'Oklahoma City':
{ 'Thunder': [
{'player': 'Kevin Durant', 'points': 15}]
},
'Los Angeles':
{'Lakers': [
{'player': 'Jeremy Lin', 'points': 22},
{'player': 'Kobe Bryant', 'points': 51}],
'Clippers': [
{'player': 'Blake Griffin', 'points': 26}]
}
}
If you want to group the same data in a different way, you just need to change the order of the field names:
byPlayer = dictNesting(data,"player","location","team")
{ 'Kevin Durant':
{ 'Oklahoma City':
{ 'Thunder': [{'points': 15}] }
},
'Jeremy Lin':
{ 'Los Angeles':
{'Lakers': [{'points': 22}]}
},
'Kobe Bryant':
{ 'Los Angeles':
{'Lakers': [{'points': 51}]}
},
'Blake Griffin':
{ 'Los Angeles':
{'Clippers': [{'points': 26}]}
}
}
From there you can have some fun with the function and improve it to aggregate the data at the lowest nesting level:
def dictNesting(data, *levels, aggregate=False):
    result = dict()
    for d in data:
        r = result
        for level in levels[:-1]:
            r = r.setdefault(d[level], dict())
        r = r.setdefault(d[levels[-1]], [list, dict][aggregate]())
        content = ((k, v) for k, v in d.items() if k not in levels)
        if aggregate:
            for k, v in content:
                r.setdefault(k, list()).append(v)
        else:
            r.append(dict(content))
    return result
output:
byCity = dictNesting(data,"location","team",aggregate=True)
{ 'Oklahoma City':
{'Thunder':
{'player': ['Kevin Durant'], 'points': [15]}},
'Los Angeles':
{'Lakers':
{'player': ['Jeremy Lin', 'Kobe Bryant'], 'points': [22, 51]},
'Clippers':
{'player': ['Blake Griffin'], 'points': [26]}
}
}
lakersPlayers = byCity["Los Angeles"]["Lakers"]["player"]
# ['Jeremy Lin', 'Kobe Bryant']
lakersPoints = sum(byCity["Los Angeles"]["Lakers"]["points"])
# 73

How can I replace comma separated ids in a string by doing a lookup in json response in python

I would like to replace the ids in the following string with the corresponding labels from a JSON response.
target_country = "214,216,278,418"
Here is the JSON with the values:
{
    "data": [
        {"label": "BE", "id": 214},
        {"label": "CH", "id": 215},
        {"label": "DE", "id": 274},
        {"label": "ES", "id": 216},
        {"label": "EU", "id": 416},
        {"label": "GB", "id": 218},
        {"label": "HR", "id": 278},
        {"label": "US", "id": 418}
    ]
}
I want the result to be
target_country = 'BE, ES, HR, US'
What would be the best way to do it?
You can use pandas with isin:
>>> import pandas as pd
>>> df = pd.DataFrame(dealfields)  # dealfields is the list under the "data" key of the JSON
>>> df
    id label
0  214    BE
1  215    CH
2  274    DE
3  216    ES
4  416    EU
5  218    GB
6  278    HR
7  418    US
>>> data = [{"target_country": "214,216,278,418"}]
>>> ids = [int(i) for i in data[0]["target_country"].split(',')]
>>> fields = df[df['id'].isin(ids)]['label'].tolist()
>>> target_country = ', '.join(fields)
>>> target_country
'BE, ES, HR, US'
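One caveat: isin keeps the dataframe's row order, not the order of the ids in target_country (here the two happen to coincide). If the input order matters, a lookup dict preserves it:
>>> lookup = dict(zip(df['id'], df['label']))
>>> ', '.join(lookup[int(i)] for i in data[0]["target_country"].split(','))
'BE, ES, HR, US'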
The easiest way is to create a dictionary from your list of dicts:
lst = [{'label': 'BE', 'id': 214},
{'label': 'CH', 'id': 215},
{'label': 'DE', 'id': 274},
{'label': 'ES', 'id': 216},
{'label': 'EU', 'id': 416},
{'label': 'GB', 'id': 218},
{'label': 'HR', 'id': 278},
{'label': 'US', 'id': 418}]
dct = {str(x['id']):x['label'] for x in lst}
>>> print(dct)
{'214': 'BE',
'215': 'CH',
'274': 'DE',
'216': 'ES',
'416': 'EU',
'218': 'GB',
'278': 'HR',
'418': 'US'}
then use a list comprehension to convert your string:
data = [{"target_country": "214,216,278,418"}]
for item in data:
    if item.get('target_country', None):
        item['target_country'] = ','.join([dct.get(y) for y in item['target_country'].split(',')])
>>> data
[{'target_country': 'BE,ES,HR,US'}]
Create a dict with the JSON data:
d = {
    "data": [
        {"label": "BE", "id": 214},
        {"label": "CH", "id": 215},
        {"label": "DE", "id": 274},
        {"label": "ES", "id": 216},
        {"label": "EU", "id": 416},
        {"label": "GB", "id": 218},
        {"label": "HR", "id": 278},
        {"label": "US", "id": 418}
    ]
}
then the following code should work:
result = [s["label"] for s in d['data'] if str(s["id"]) in target_country]
This loops through the "data" list inside your dictionary and extracts the label of each entry whose id (as a string) appears in target_country. Note that this is a substring test; with ids like 21 and 214 in play, you would want to split target_country on commas and compare exact values instead.
>>> print(result)
['BE', 'ES', 'HR', 'US']
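To get the single comma-separated string from the question, join the list:
>>> target_country = ', '.join(result)
>>> target_country
'BE, ES, HR, US'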

Group documents based on dynamic keys and convert keys to values

I have data in MongoDB that looks like this:
{
    "good": {"d1": 2, "d2": 56, "d3": 3},
    "school": {"d1": 4, "d3": 5, "d4": 12}
},
{
    "good": {"d5": 4, "d6": 5},
    "spark": {"d5": 6, "d6": 11, "d7": 10},
    "school": {"d5": 8, "d8": 7}
}
and I want to use pymongo's mapReduce to generate data like this:
{
    'word': 'good',
    'info': [
        {'tbl_id': 'd1', 'term_freq': 2},
        {'tbl_id': 'd2', 'term_freq': 56},
        {'tbl_id': 'd3', 'term_freq': 3},
        {'tbl_id': 'd5', 'term_freq': 4},
        {'tbl_id': 'd6', 'term_freq': 5}
    ]
}
{
    'word': 'school',
    'info': [
        {'tbl_id': 'd1', 'term_freq': 4},
        {'tbl_id': 'd3', 'term_freq': 5},
        {'tbl_id': 'd4', 'term_freq': 12},
        {'tbl_id': 'd5', 'term_freq': 8},
        {'tbl_id': 'd8', 'term_freq': 7}
    ]
}
{
    'word': 'spark',
    'info': [
        {'tbl_id': 'd5', 'term_freq': 6},
        {'tbl_id': 'd6', 'term_freq': 11},
        {'tbl_id': 'd7', 'term_freq': 10}
    ]
}
What should I do? Or is there another solution?
You don't need mapReduce here. The aggregation framework can handle this beautifully.
As for how this works, I suggest you have a look at each operator in the documentation.
_filter = {
    "input": {"$objectToArray": "$$ROOT"},
    "cond": {"$ne": ["$$this.k", "_id"]}
}

_map = {
    "$map": {
        "input": {"$filter": _filter},
        "in": {
            "k": "$$this.k",
            "info": {
                "$map": {
                    "input": {"$objectToArray": "$$this.v"},
                    "in": {"tbl_id": "$$this.k", "term_freq": "$$this.v"}
                }
            }
        }
    }
}

pipeline = [
    {"$project": {"word": _map}},
    {"$unwind": "$word"},
    {
        "$group": {
            "_id": "$word.k",
            "info": {"$push": "$word.info"}
        }
    },
    {
        "$project": {
            "_id": 0,
            "word": "$_id",
            "info": {
                "$reduce": {
                    "input": "$info",
                    "initialValue": [],
                    "in": {"$concatArrays": ["$$value", "$$this"]}
                }
            }
        }
    }
]
Then run with the .aggregate() method.
collection.aggregate(pipeline)
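For completeness, a minimal end-to-end sketch with pymongo; the connection string, database, and collection names are placeholders:
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # placeholder connection string
collection = client["mydb"]["words"]               # placeholder database/collection names

# run the aggregation pipeline defined above and print each grouped document
for doc in collection.aggregate(pipeline):
    print(doc)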
