parsing json file with function into dataframe for analysis - python

Hi am working with two json files , and im having problem with the data cleaning.
Suppose a record in g1j or g2j looks like this:
{
'cls_loc': 'QOEBBG_K0101',
'date': 1584957443013,
'dur': 32,
'exp': [
{
'm': 'spot_excited',
's': 8.5,
't': 8.5,
'w': 'spot_bored',
'x': 'A'
},
{
's': 1.1,
't': 11.4,
'w': 'spot_scared',
'x': 'A'
}
],
'mod': 'Poster',
'pre': False,
'scr': 67,
'usr': 'QOGOBN',
'ver': '20.5.3'
}
What we want per row in our DataFrame is this:
{
'student_pin': 'QOGOBN', # from `usr`
'date': datetime.date(2020 3, 23), # from `date`, but parsed
'duration': 32, # from `dur`
'level': 3, # the "K" from `cls_loc`, mapped to int
'unit': 1, # from `cls_loc`, mapped to int
'module': 1, # from `cls_loc`, mapped to int
'accuracy': 0.5, # calcualted from `exp`
}
my code so far:
from datetime import datetime
import json
import numpy as np
import pandas as pd
from scipy import stats
with open('/content/drive/MyDrive/group1_exp_2020-04-08.json', 'r') as f:
g1j = json.loads(f.read())
with open('/content/drive/MyDrive/group2_exp_2020-04-22.json', 'r') as f:
g2j = json.loads(f.read())
#convert the integer timestamp to a datetime.date
def timestamp_to_date():
l =[]
for item in g1j:
timestamp =item['date']
timestamp = timestamp/1000
dt_obj = datetime.fromtimestamp(timestamp).strftime('%Y, %m, %d ')
l.append(dt_obj)
return l
timestamp_to_date()
def timestamp_to_date():
l =[]
for item in g2j:
timestamp =item['date']
timestamp = timestamp/1000
dt_obj = datetime.fromtimestamp(timestamp).strftime('%Y, %m, %d ')
l.append(dt_obj)
return l
#extract the level, unit, module, and accuracy here
def get_level(x):
loc = x['cls_loc'].split('_')[-1]
return level_map[loc[0]]
def get_unit(x):
loc = x['cls_loc'].split('_')[-1]
unit = loc[1:3]
return int(unit)
def get_module(x):
loc = x['cls_loc'].split('_')[-1]
module = loc[3:]
return int(module)
def get_accuracy(x):
challenges = [x for x in x['exp'] if x['x'] == 'A']
n = len(challenges)
if n == 0:
return 'N/A'
mistakes = [x for x in challenges if 'm' in x.keys()]
correct = n - len(mistakes)
return correct / n
#create the function to convert experience records to the pandas.DataFrame
def exp_to_df(g1j):
df = pd.DataFrame(f, columns=['exp'])
return df
def exp_to_df(g2j):
df = pd.DataFrame(f, columns=['exp'])
return df
#uses the function you just implemented, and checks that your function keeps the records and uses the right column names
g1 = exp_to_df(g1j)
g2 = exp_to_df(g2j)
assert len(g1) == len(g1j)
assert len(g2) == len(g2j)
columns = ['student_pin', 'date', 'level', 'unit', 'module', 'accuracy']
assert all(c in g1.columns for c in columns)
assert all(c in g2.columns for c in columns)
What am I doing wrong? It seems like def exp_to_df(g1j) and def exp_to_df(g2j) are wrong. Any suggestions?
Also is my def timestamp_to_date() also wrong?

I suggest using the pandas read_json() function to load your json directly into a dataframe (I added a couple dummy records):
g1 = pd.read_json('/content/drive/MyDrive/group1_exp_2020-04-08.json')
# cls_loc date dur exp mod pre scr usr ver
# 0 QOEBBG_K0101 2020-03-23 09:57:23.013 32 [{'m': 'spot_excited', 's': 8.5, 't': 8.5, 'w'... Poster False 67 QOGOBN 20.5.3
# 1 QOEBBG_K0102 2020-03-23 09:57:23.013 32 [{'m': 'spot_excited', 's': 8.5, 't': 8.5, 'w'... Poster False 67 QOGOBN 20.5.3
# 2 QOEBBG_K0103 2020-03-23 09:57:23.013 32 [{'s': 1.1, 't': 11.4, 'x': 'C'}] Poster False 67 QOGOBN 20.5.3
Then you can do all the data wrangling with pandas functions like
str.extract(),
assign(),
to_datetime(),
map(), and
apply():
# extract level, unit, module as columns
g1 = g1.assign(**g1.cls_loc
.str.extract(r'_([a-zA-Z])([0-9]{2})([0-9]{2})')
.rename({0: 'level', 1: 'unit', 2: 'module'}, axis=1))
# convert date to datetime
g1.date = pd.to_datetime(g1.date, unit='ms')
# map level to int
level_map = {'K': 3}
g1.level = g1.level.map(level_map)
# compute accuracy
def accuracy(exp):
challenges = [e for e in exp if e['x'] == 'A']
n = len(challenges)
if n == 0:
return np.nan
mistakes = [c for c in challenges if 'm' in c.keys()]
correct = n - len(mistakes)
return correct / n
g1['accuracy'] = g1.exp.apply(accuracy)
# rename usr -> student_pin
g1 = g1.rename({'usr': 'student_pin'}, axis=1)
# keep desired columns
columns = ['student_pin', 'date', 'level', 'unit', 'module', 'accuracy']
g1 = g1[columns]
Output:
student_pin date level unit module accuracy
0 QOGOBN 2020-03-23 09:57:23.013 3 01 01 0.500000
1 QOGOBN 2020-03-23 09:57:23.013 3 01 02 0.333333
2 QOGOBN 2020-03-23 09:57:23.013 3 01 03 NaN

Related

How to manipulate data from binance stream

I am trying to manipulate the following data from a websocket.
Here is the data:
{'e': 'kline', 'E': 1659440374345, 's': 'MATICUSDT', 'k': {'t': 1659440100000, 'T': 1659440399999, 's': 'MATICUSDT', 'i': '5m', 'f': 274454614, 'L': 274455188, 'o': '0.87210000', 'c': '0.87240000', 'h': '0.87240000', 'l': '0.87000000', 'v': '145806.50000000', 'n': 575, 'x': False, 'q': '127036.96453000', 'V': '76167.60000000', 'Q': '66365.16664000', 'B': '0'}}
I am trying to extract following: 'E', 's' AND 'c'. To manipulate to: 'E' = time, 's' = symbol and 'c' = PRICE
def createframe(msg):
df = pd.DataFrame([msg])
df = df.loc[:,['s','E','c']
df.columns = ['symbol', 'Time', 'Price']
df.Price = df.Price.astype(float)
df.Time = pd.to_datetime(df.Time, unit = 'ms')
return df
When I run the next line of code to pull data:
async with stream as receiver:
while True:
data = await receiver.recv()
data = json.loads(data)['data']
df = createframe(data)
print(df)
I am getting error that 'c' is not defined.
PLEASE HELP. THANK YOU
If you look at the data frame, you'll see that in column "k" you have a whole dictionary's worth of data. That's because the value of k is itself a dictionary. You're getting the error that c is not defined because it is not a column itself, just a piece of data in column "k".
In order to get all this data into individual columns, you'll have to "flatten" the data. You can do something like this:
def createframe(msg):
df = pd.DataFrame([msg])
df = df.loc[:,['s','E','c']]
df.columns = ['symbol', 'Time', 'Price']
df.Price = df.Price.astype(float)
df.Time = pd.to_datetime(df.Time, unit = 'ms')
return df
def flatten(data):
newdict = {}
for each in msg:
if isinstance(msg[each], dict):
for i in msg[each]:
newdict[i] = msg[each][i]
else:
newdict[each] = msg[each]
return newdict
async with stream as receiver:
while True:
data = await receiver.recv()
data = json.loads(data)['data']
data = flatten(data)
df = createframe(data)
print(df)
Hope this helps! If you have questions just comment on this answer.

How create dataframe from list of dictionary of multi level json

So I have a json file that have multiple level,by using pandas I can read the first level and put it in a dataframe,but the problem is as you can see in the dataframe Column Comments and hastags the second level is inside a column have format of list of dictionary,is there any solution for make the second level dictionary into dataframe. I try to use for loop and json_normalize but it always throw an error. Any suggestion? My code is like this
import pandas as pd
df2 = pd.read_json("data.json")
cid = []
for x in df2["comments"]:
cid.append(x.get('cid'))
data = pd.DataFrame({'cid':cid})
If i use the code it throw an error since I try access list by string not index.
AttributeError: 'list' object has no attribute 'get'
Even I change it into integer it got dictionary inside a column.How to change the dict inside the column or is there another easier way to do this? Dictionary in column
for x in df2["comments"]:
cid.append(x[0])
data = pd.DataFrame({'cid':cid})
for y in data:
print(y.get('cid'))
Example of first row of the data frame
[{'cid': '7000061798167266075', 'createTime': 1629828926, 'text': 'Done 🥰', 'diggCount': 1, 'replyCommentTotal': 0, 'uid': '6529379534374092801', 'uniqueId': 'alikayanti'}, {'cid': '6999869922566783771', 'createTime': 1629784228, 'text': 'haloo min, yg udah ikutan di misi sebelumnya boleh ikutan lagi gaa?', 'diggCount': 1, 'replyCommentTotal': 1, 'uid': '6842932775562642433', 'uniqueId': 'fia_654'}, {'cid': '7000248857603588891', 'createTime': 1629872457, 'text': 'bell bottoms maksudnya apa kak?\napakah benar artinya bel bawah?', 'diggCount': 0, 'replyCommentTotal': 2, 'uid': '6960940768727417857', 'uniqueId': 'keterimadiptn1'}, {'cid': '7000322023545455387', 'createTime': 1629889491, 'text': 'syudah🥰', 'diggCount': 0, 'replyCommentTotal': 0, 'uid': '6806645499672839170', 'uniqueId': 'miftahulhaqqu'}, {'cid': '7001271117180977947', 'createTime': 1630110475, 'text': 'kak, perpanjang dong waktu posting videonya :)', 'diggCount': 1, 'replyCommentTotal': 0, 'uid': '6921267441846830082', 'uniqueId': 'elisabetkst'}]
Maybe this solves your problem:
Defined the following function which unnests any json:
import json
import pandas as pd
def flatten_nested_json_df(df):
df = df.reset_index()
s = (df.applymap(type) == list).all()
list_columns = s[s].index.tolist()
s = (df.applymap(type) == dict).all()
dict_columns = s[s].index.tolist()
while len(list_columns) > 0 or len(dict_columns) > 0:
new_columns = []
for col in dict_columns:
horiz_exploded = pd.json_normalize(df[col]).add_prefix(f'{col}.')
horiz_exploded.index = df.index
df = pd.concat([df, horiz_exploded], axis=1).drop(columns=[col])
new_columns.extend(horiz_exploded.columns) # inplace
for col in list_columns:
#print(f"exploding: {col}")
df = df.drop(columns=[col]).join(df[col].explode().to_frame())
new_columns.append(col)
s = (df[new_columns].applymap(type) == list).all()
list_columns = s[s].index.tolist()
s = (df[new_columns].applymap(type) == dict).all()
dict_columns = s[s].index.tolist()
return df
and do this:
results = pd.json_normalize(data)
df = pd.DataFrame(results)
outdf = flatten_nested_json_df(df)
which returns:
index cid createTime \
0 0 7000061798167266075 1629828926
1 1 6999869922566783771 1629784228
2 2 7000248857603588891 1629872457
3 3 7000322023545455387 1629889491
4 4 7001271117180977947 1630110475
text diggCount \
0 Done 🥰 1
1 haloo min, yg udah ikutan di misi sebelumnya b... 1
2 bell bottoms maksudnya apa kak?\napakah benar ... 0
3 syudah🥰 0
4 kak, perpanjang dong waktu posting videonya :) 1
replyCommentTotal uid uniqueId
0 0 6529379534374092801 alikayanti
1 1 6842932775562642433 fia_654
2 2 6960940768727417857 keterimadiptn1
3 0 6806645499672839170 miftahulhaqqu
4 0 6921267441846830082 elisabetkst
I found the solution is through multiple for loop after we looping the row of the targeted column,and after that append it as a list
unique_id = []
cid = []
createTime = []
text = []
diggCount = []
replyCommentTotal = []
uid = []
for items in df2["comments"]:
for x in items:
unique_id.append(x["uniqueId"])
cid.append(x["cid"])
createTime.append(x["createTime"])
text.append(x["text"])
diggCount.append(x["diggCount"])
replyCommentTotal.append(x["replyCommentTotal"])
uid.append(x["uid"])
data = pd.DataFrame({'unique_id':unique_id,
'cid':cid,
'createTime':createTime,
'text':text,
'diggCount':diggCount,
'replyCommentTotal':replyCommentTotal,
'uid':uid})

Loop to match dictionary keys from list append dictionary with associated data in other columns

I want to loop through my data and population my dictonairies with 'event' value and their corresponding 'xCordAdjusted' and 'yCordAdjusted'
Dataframe:
season period teamCode event goal xCord xCordAdjusted yCord yCordAdjusted shotType playerPositionThatDidEvent playerNumThatDidEvent shooterPlayerId shooterName shooterLeftRight
2014 1 MTL MISS 0 61 61 29 29 WRIST C 51 8471976.0 David Desharnais L
2014 1 TOR SHOT 0 -54 54 29 -29 BACK C 42 8475098.0 Tyler Bozak R
2014 1 TOR SHOT 0 -40 40 32 -32 WRIST D 46 8471392.0 Roman Polak R
My work:
league_data = {};
league_data['SHOT'] = {};
league_data['SHOT']['x'] = [];
league_data['SHOT']['y'] = [];
league_data['GOAL'] = {};
league_data['GOAL']['x'] = [];
league_data['GOAL']['y'] = [];
league_data['MISS'] = {};
league_data['MISS']['x'] = [];
league_data['MISS']['y'] = [];
event_types = ['SHOT','GOAL','MISS']
for data in season_df:
for event in event_types:
if data in event_types:
if 'x' in range(0,100):
league_data[event]['x'].append(['xCordAdjusted'])
league_data[event]['y'].append(['yCordAdjusted'])
league_data
Output:
{'SHOT': {'x': [], 'y': []},
'GOAL': {'x': [], 'y': []},
'MISS': {'x': [], 'y': []}}
You can extract the desired information directly from the DataFrame in a vectorized fashion, instead of looping over it repeatedly:
league_data = {
'SHOT': {},
'GOAL': {},
'MISS': {},
}
for event in event_types:
mask = (season_df['event'] == event) & season_df['xCord'].between(0, 100)
x_adjusted = season_df.loc[mask, 'xCordAdjusted'].tolist()
y_adjusted = season_df.loc[mask, 'yCordAdjusted'].tolist()
league_data[event]['x'] = x_adjusted
league_data[event]['y'] = y_adjusted
gives
{'GOAL': {'x': [], 'y': []},
'MISS': {'x': [61], 'y': [-29]},
'SHOT': {'x': [], 'y': []}
}
Note that I adjusted the range condition since your original code if 'x' in range(0,100) doesn't do what you intend because it doesn't reference your DataFrame at all.
for data in season_df: iterate on columns, not rows.
Instead, use for index, row in season_df.iterrows()
However, iteration on rows is quite slow, so if your data is quite big, you can utilize vectorization.
Also, your code looks not working as you expected.. like if 'x' in range(0, 100). I re-code it on my assumption, try this.
for event in event_types:
matched_df = season_df[season_df['event'] == event]
x_matched_list = matched_df[(0 <= matched_df['xCordAdjusted']) & (matched_df['xCordAdjusted'] <= 100)]['xCordAdjusted'].tolist()
league_data[event]['x'] = x_matched_list # or extend
y_matched_list = matched_df[(0 <= matched_df['yCordAdjusted']) & (matched_df['yCordAdjusted'] <= 100)]['yCordAdjusted'].tolist()
league_data[event]['y'] = y_matched_list # or extend
But be careful with possibility of length 'xCordAdjusted' not matching with 'yCordAdjusted'

Python - reduce number of if statements

I am filtering a pandas dataframe based on one or more conditions, like so:
def filter_dataframe(dataframe, position=None, team_id=None, home=None, window=None, min_games=0):
df = dataframe.copy()
if position:
df = df[df['position_id'] == position]
if clube_id:
df = df[df['team_id'] == team_id]
if home:
if home == 'home':
df = df[df['home_dummy'] == 1.0]
elif home == 'away':
df = df[df['home_dummy'] == 0.0]
if window:
df = df[df['round_id'].between(1, window)]
if min_games:
df = df[df['games_num'] >= min_games]
return df
But I don't think this is elegant.
Is there a simpler way of achieving the same result?
I though of creating rules for conditions like in this SO answer and then use the method any(rules) in order to apply the filtering, if any, but I don't know how to approach this. Any ideas?
You could try something like this:
def filter_dataframe(dataframe, position=None, clube_id=None, team_id=None, home=None, window=None, min_games=0):
df = dataframe.copy()
masks = {
"mask1": [position is not None, df[df["position_id"] == position]],
"mask2": [clube_id is not None, df[df["team_id"] == team_id]],
"mask3": [home == "home", df[df["home_dummy"] == 1.0]],
"mask4": [home == "away", df[df["home_dummy"] == 0.0]],
"mask5": [window is not None, df[df["round_id"].between(1, window)]],
"mask6": [min_games is not None, df[df["games_num"] >= min_games]],
}
for value in masks.values():
if value[0]:
df = value[1]
return df

When attempting to pass a dictionary to a pandas dataframe, how to resolve: ValueError: If using all scalar values, you must pass an index

I am extracting data from a REST API that I need to write to a SQL table.
My approach is adding the JSON data to a dictionary, pass the dictionary to a dataframe and write the dataframe to SQL.
I get the following error when passing the balances() function to the dataframe:
ValueError: If using all scalar values, you must pass an index
What am I doing wrong? Also feel free to provide feedback on the structure of my code, I feel like there are easier ways of extracting the data with less code.
def balances():
for b in get_balances["balances"]:
result = {}
result["employeeID"] = int(b.get("employeeID"))
result["resourceID"] = int(b.get("resourceID"))
result["resourceType"] = int(b.get("resourceType"))
if b.get("startDate") is None:
pass
else:
result["startDate"] = b.get("startDate").split("#")[0]
if b.get("endDate") is None:
pass
else:
result["endDate"] = b.get("endDate").split("#")[0]
result["minutesLeft"] = b.get("minutesLeft")
result["minutestoTake"] = b.get("minutestoTake")
result["minutesTaken"] = b.get("minutesTaken")
result["minutesTakenPast"] = b.get("minutesTakenPast")
result["minutestakenFuture"] = b.get("minutesTakenFuture")
result["periodMinutesToTake"] = b.get("periodMinutesToTake")
result["periodMinutesTaken"] = b.get("periodMinutesTaken")
for h in b.get("history"):
if h.get("planningDate") is None:
pass
else:
result["planningDate"] = h.get("planningDate").split("#")[0]
result["resourceTypeHistory"] = h.get("resourceType")
result["resourceIDHistory"] = h.get("resourceID")
result["minutes"] = h.get("minutes")
result["balanceMinutes"] = h.get("balanceMinutes")
result["remark"] = h.get("remark")
yield result
print(pd.DataFrame(balances()))
#ValueError: If using all scalar values, you must pass an index
Sample output data of 2 rows:
{'employeeID': 569, 'resourceID': 230, 'resourceType': 144, 'startDate': '2020-01-01', 'endDate': '2020-12-31', 'minutesLeft': 11281, 'minutestoTake': None, 'minutesTaken': 960, 'minutesTakenPast': 0, 'minutestakenFuture': -960, 'periodMinutesToTake': 0, 'periodMinutesTaken': 0, 'planningDate': '2020-01-01', 'resourceTypeHistory': 15, 'resourceIDHistory': 3, 'minutes': 12000, 'balanceMinutes': 12000, 'remark': ''}
{'employeeID': 877, 'resourceID': 33, 'resourceType': 125, 'startDate': '2020-01-01', 'endDate': '2020-12-31', 'minutesLeft': 11281, 'minutestoTake': None, 'minutesTaken': 960, 'minutesTakenPast': 0, 'minutestakenFuture': -960, 'periodMinutesToTake': 0, 'periodMinutesTaken': 0, 'planningDate': '2020-06-05', 'resourceTypeHistory': 2, 'resourceIDHistory': 3, 'minutes': -480, 'balanceMinutes': 11281, 'remark': ''}
Works using json_normalize for your sample JSON:
import pandas as pd
import json
with open('1.json', 'r+') as f:
data = json.load(f)
df = pd.json_normalize(data)
print(df)
employeeID resourceID resourceType startDate endDate ... resourceTypeHistory resourceIDHistory minutes balanceMinutes remark
0 569 230 144 2020-01-01 2020-12-31 ... 15 3 12000 12000
1 877 33 125 2020-01-01 2020-12-31 ... 2 3 -480 11281

Categories

Resources