how is the output of this nested loop being calculated? - python

Hi, I have this calculation, but I am failing to understand how the line array([1050885., 1068309., 1085733., 1103157., 1120581.]) of the output is calculated — please explain.
creating sample data:
#creating sample data:
# --- sample data --------------------------------------------------------
# Two clients, each with a category, amount, starting time step and group.
data1 = pd.DataFrame({"client": ['x1', 'x2'],
                      "cat": ['Bb', 'Ee'],
                      "amt": [1000, 300],
                      "time": [2, 3],
                      "group": [10, 25]})
# Ordered universe of categories; 'cat' below becomes its integer code
# (position in this list: 'Bb' -> 1, 'Ee' -> 4), used to index `mat`.
listc = ['Aa', 'Bb', 'Cc', 'Dd', 'Ee']
# Per-time scalar lookup table (no group dimension).
val1 = pd.DataFrame({'time': [1, 2, 3],
                     'lim %': [0.1, 0.11, 0.112]})
# One row per unique group whose 'time' cell holds range(1, 11);
# .explode('time') then yields one (group, perc, time) row per step 1..10.
val2 = pd.concat([pd.DataFrame({'group': g, 'perc': 0.99, 'time': range(1, 11)}
                               for g in data1['group'].unique())]).explode('time')
# Three 5x5 time slices: mat[t] holds the values 25*t .. 25*t + 24.
mat = np.arange(75).reshape(3, 5, 5)
vals = [val1, val2]
data1['cat'] = pd.Categorical(data1['cat'],
                              categories=listc,
                              ordered=True).codes
# Index each lookup table by time (and group, when it has one) so the
# nested loop below can fetch scalars with .loc.
for i in range(len(vals)):
    if 'group' in vals[i].columns:
        vals[i] = vals[i].set_index(['time', 'group'])
    else:
        vals[i] = vals[i].set_index(['time'])
#nested loop calculation
# --- nested loop calculation --------------------------------------------
# For each client: seed with amt * the client's category row of the first
# relevant time slice, then roll the last vector forward through each later
# slice with a matrix product, appending the scalar-scaled variants.
calc = {}
for client, cat, amt, start, group in data1.itertuples(name=None, index=False):
    for time in range(start, len(mat) + 1):
        if time == start:
            # First period: scale the category row of slice `start-1`.
            calc[client] = [[amt * mat[time - 1, cat, :]]]
        else:
            # Later periods: previous result vector times the whole 5x5
            # slice for this time step.  Bug fix: the `@` (matmul)
            # operator had been mangled into `#`, which commented out the
            # rest of the line and broke the syntax.
            calc[client].append([calc[client][-1][-1] @ mat[time - 1]])
        for valcal in vals:
            # Scalar for this time (and group, when the table has one).
            if isinstance(valcal.index, pd.MultiIndex):
                value = valcal.loc[(time, group)].iat[0]
            else:
                value = valcal.loc[time].iat[0]
            calc[client][-1].append(value * calc[client][-1][-1])
output:
{'x1': [[array([30000, 31000, 32000, 33000, 34000]),
array([3300., 3410., 3520., 3630., 3740.]),
array([3267. , 3375.9, 3484.8, 3593.7, 3702.6])],
[array([1050885., 1068309., 1085733., 1103157., 1120581.]), #how is this line calculated?
array([117699.12 , 119650.608, 121602.096, 123553.584, 125505.072]),
array([116522.1288 , 118454.10192, 120386.07504, 122318.04816,
124250.02128])]],
'x2': [[array([21000, 21300, 21600, 21900, 22200]),
array([2352. , 2385.6, 2419.2, 2452.8, 2486.4]),
array([2328.48 , 2361.744, 2395.008, 2428.272, 2461.536])]]}
what I need the calc for this line to be is:
[array([1050885., 1068309., 1085733., 1103157., 1120581.])
it should take array([3267. , 3375.9, 3484.8, 3593.7, 3702.6]) multiplied by mat at time 3, how can I get it to do this?

Related

parsing json file with function into dataframe for analysis

Hi, I am working with two JSON files, and I'm having a problem with the data cleaning.
Suppose a record in g1j or g2j looks like this:
{
'cls_loc': 'QOEBBG_K0101',
'date': 1584957443013,
'dur': 32,
'exp': [
{
'm': 'spot_excited',
's': 8.5,
't': 8.5,
'w': 'spot_bored',
'x': 'A'
},
{
's': 1.1,
't': 11.4,
'w': 'spot_scared',
'x': 'A'
}
],
'mod': 'Poster',
'pre': False,
'scr': 67,
'usr': 'QOGOBN',
'ver': '20.5.3'
}
What we want per row in our DataFrame is this:
{
'student_pin': 'QOGOBN', # from `usr`
'date': datetime.date(2020, 3, 23), # from `date`, but parsed
'duration': 32, # from `dur`
'level': 3, # the "K" from `cls_loc`, mapped to int
'unit': 1, # from `cls_loc`, mapped to int
'module': 1, # from `cls_loc`, mapped to int
'accuracy': 0.5, # calculated from `exp`
}
my code so far:
from datetime import datetime
import json
import numpy as np
import pandas as pd
from scipy import stats
with open('/content/drive/MyDrive/group1_exp_2020-04-08.json', 'r') as f:
g1j = json.loads(f.read())
with open('/content/drive/MyDrive/group2_exp_2020-04-22.json', 'r') as f:
g2j = json.loads(f.read())
#convert the integer timestamp to a datetime.date
def timestamp_to_date(records=None):
    """Convert each record's epoch-millisecond 'date' to a date string.

    `records` defaults to the module-level `g1j`, so the original
    zero-argument call keeps working; passing a list removes the need
    for the duplicated g2j copy of this function.

    Returns a list of strings formatted as 'YYYY, MM, DD '.
    """
    if records is None:
        records = g1j
    dates = []
    for item in records:
        # 'date' is epoch milliseconds -> seconds for fromtimestamp().
        ts = item['date'] / 1000
        # NOTE: fromtimestamp() converts to *local* time.
        dates.append(datetime.fromtimestamp(ts).strftime('%Y, %m, %d '))
    return dates
timestamp_to_date()
def timestamp_to_date(records=None):
    """Convert each record's epoch-millisecond 'date' to a date string.

    `records` defaults to the module-level `g2j`, preserving the original
    zero-argument behavior of this (second) definition.

    Returns a list of strings formatted as 'YYYY, MM, DD '.
    """
    if records is None:
        records = g2j
    dates = []
    for item in records:
        # 'date' is epoch milliseconds -> seconds for fromtimestamp().
        ts = item['date'] / 1000
        # NOTE: fromtimestamp() converts to *local* time.
        dates.append(datetime.fromtimestamp(ts).strftime('%Y, %m, %d '))
    return dates
#extract the level, unit, module, and accuracy here
def get_level(x, mapping=None):
    """Map the level letter of `cls_loc` (the 'K' in 'QOEBBG_K0101') to an int.

    `mapping` defaults to the module-level `level_map` (looked up lazily),
    so existing one-argument calls behave exactly as before.
    """
    if mapping is None:
        mapping = level_map
    loc = x['cls_loc'].split('_')[-1]
    return mapping[loc[0]]
def get_unit(x):
    """Return the two-digit unit number embedded in `cls_loc` as an int."""
    code = x['cls_loc'].split('_')[-1]
    return int(code[1:3])
def get_module(x):
    """Return the trailing module number of `cls_loc` as an int."""
    code = x['cls_loc'].split('_')[-1]
    return int(code[3:])
def get_accuracy(x):
    """Fraction of challenge entries ('x' == 'A') answered without a
    mistake (no 'm' key); returns the string 'N/A' when the record has
    no challenges at all."""
    challenges = [entry for entry in x['exp'] if entry['x'] == 'A']
    n = len(challenges)
    if n == 0:
        return 'N/A'
    mistakes = [entry for entry in challenges if 'm' in entry]
    return (n - len(mistakes)) / n
#create the function to convert experience records to the pandas.DataFrame
def exp_to_df(g1j):
    """Build a DataFrame from a list of experience records.

    Bug fix: the original referenced the undefined name `f` (the long-
    closed file handle) instead of the `g1j` parameter, so every call
    raised NameError.
    NOTE(review): the later asserts also expect wrangled columns
    (student_pin, date, level, ...); producing just 'exp' is not enough —
    that processing still has to happen somewhere.
    """
    return pd.DataFrame(g1j, columns=['exp'])
def exp_to_df(g2j):
    """Build a DataFrame from a list of experience records.

    Bug fix: the original referenced the undefined name `f` instead of
    the `g2j` parameter, so every call raised NameError.
    """
    return pd.DataFrame(g2j, columns=['exp'])
# uses the function you just implemented, and checks that your function
# keeps the records and uses the right column names
# NOTE(review): these asserts pass only if exp_to_df returns one row per
# input record AND produces every wrangled column listed below — a frame
# holding just the raw 'exp' column will fail the column checks.
g1 = exp_to_df(g1j)
g2 = exp_to_df(g2j)
assert len(g1) == len(g1j)
assert len(g2) == len(g2j)
columns = ['student_pin', 'date', 'level', 'unit', 'module', 'accuracy']
assert all(c in g1.columns for c in columns)
assert all(c in g2.columns for c in columns)
What am I doing wrong? It seems like def exp_to_df(g1j) and def exp_to_df(g2j) are wrong. Any suggestions?
Also is my def timestamp_to_date() also wrong?
I suggest using the pandas read_json() function to load your json directly into a dataframe (I added a couple dummy records):
# read_json parses the whole file straight into a DataFrame; the integer
# 'date' column is recognised as epoch milliseconds and converted to
# datetimes (visible in the echoed frame below).
g1 = pd.read_json('/content/drive/MyDrive/group1_exp_2020-04-08.json')
# cls_loc date dur exp mod pre scr usr ver
# 0 QOEBBG_K0101 2020-03-23 09:57:23.013 32 [{'m': 'spot_excited', 's': 8.5, 't': 8.5, 'w'... Poster False 67 QOGOBN 20.5.3
# 1 QOEBBG_K0102 2020-03-23 09:57:23.013 32 [{'m': 'spot_excited', 's': 8.5, 't': 8.5, 'w'... Poster False 67 QOGOBN 20.5.3
# 2 QOEBBG_K0103 2020-03-23 09:57:23.013 32 [{'s': 1.1, 't': 11.4, 'x': 'C'}] Poster False 67 QOGOBN 20.5.3
Then you can do all the data wrangling with pandas functions like
str.extract(),
assign(),
to_datetime(),
map(), and
apply():
# extract level, unit, module as columns
# extract level, unit, module as columns
# Capture the letter + 2 digits + 2 digits after the '_' in cls_loc into
# three new string columns via regex groups, renamed before assign().
g1 = g1.assign(**g1.cls_loc
               .str.extract(r'_([a-zA-Z])([0-9]{2})([0-9]{2})')
               .rename({0: 'level', 1: 'unit', 2: 'module'}, axis=1))
# convert date to datetime (epoch milliseconds)
g1.date = pd.to_datetime(g1.date, unit='ms')
# map level letter to int; letters missing from the map become NaN
level_map = {'K': 3}
g1.level = g1.level.map(level_map)
# compute accuracy
def accuracy(exp):
    """Share of challenge entries ('x' == 'A') that carry no mistake key
    'm'; NaN when the record has no challenges."""
    challenges = [entry for entry in exp if entry['x'] == 'A']
    if not challenges:
        return np.nan
    clean = [entry for entry in challenges if 'm' not in entry.keys()]
    return len(clean) / len(challenges)
# per-row accuracy computed from the list of experience dicts
g1['accuracy'] = g1.exp.apply(accuracy)
# rename usr -> student_pin
g1 = g1.rename({'usr': 'student_pin'}, axis=1)
# keep desired columns, in the desired order
columns = ['student_pin', 'date', 'level', 'unit', 'module', 'accuracy']
g1 = g1[columns]
Output:
student_pin date level unit module accuracy
0 QOGOBN 2020-03-23 09:57:23.013 3 01 01 0.500000
1 QOGOBN 2020-03-23 09:57:23.013 3 01 02 0.333333
2 QOGOBN 2020-03-23 09:57:23.013 3 01 03 NaN

dict object returned inf values, but it should have been numeric

I tried applying the following code:
df = pd.read_csv('data_sample_ltv.csv')
# Convert date to year
# Map each distinct date string to its year once, then apply the map —
# cheaper than parsing every row individually.
date_mapper = {date: pd.to_datetime(date).year for date in df['transaction_date'].unique()}
year = df['transaction_date'].map(date_mapper)
df['year'] = year
# Convert tier to categorical
# Categories are listed best-first, so sorting the column puts the
# highest tier first (category order is used even with ordered=False).
tiers = pd.Categorical(df['customer_tier'],
                       categories=['Gold', 'Silver', 'Bronze', 'Free-Trial'],
                       ordered=False)
df['customer_tier'] = tiers
def highest_tier(c_id, df=None):
    """Return the best tier ever held by customer `c_id`.

    Relies on 'customer_tier' being categorical with categories ordered
    best-first (['Gold', 'Silver', 'Bronze', 'Free-Trial']), so the first
    row after sort_values() is the highest tier.

    Fix: the original default `df=df` evaluated the global `df` once at
    definition time (and would raise NameError if the global did not
    exist yet); resolve it lazily at call time instead.  Existing
    single-argument calls behave identically.
    """
    if df is None:
        df = globals()['df']
    tier = df.loc[df.customer_id == c_id]['customer_tier'].sort_values().iloc[0]
    return tier
# Highest tier per customer, computed once per unique id.
tier_mapper = {
    cust_id: highest_tier(cust_id) for cust_id in df['customer_id'].unique()
}
# Aggregate the data: total/count of spend and number of distinct years.
customer_df = df.groupby(['customer_id']).agg({
    'transaction_amount': ['sum', 'count'],
    'year': [pd.Series.nunique]
})
customer_df['highest_tier'] = customer_df.index.map(tier_mapper)
# lifespan = number of distinct years with activity
customer_df['lifespan'] = customer_df[('year', 'nunique')]
customer_df['avg_trn_amt'] = customer_df[('transaction_amount', 'sum')] / customer_df[('transaction_amount', 'count')]
# NOTE(review): this is count / lifespan; if 'year' is missing for every
# row of a customer the nunique is 0 and the division yields inf — the
# reported symptom. Verify how 'transaction_date' parses upstream.
customer_df['avg_trn_per_yr'] = customer_df[('transaction_amount', 'count')] / customer_df['lifespan']
# Create the LTV function
def ltv(df, tier=None):
    """Lifetime-value summary for a customer table.

    When `tier` is given, restrict to customers whose 'highest_tier'
    matches it.  Returns a dict of the mean lifespan, transactions per
    year and transaction amount, plus their product under 'ltv'.
    """
    if tier:
        df = df.loc[df['highest_tier'] == tier]
    avg_lifespan = round(df['lifespan'].mean(), 1)
    avg_trn_per_yr = round(df['avg_trn_per_yr'].mean(), 1)
    avg_trn_amt = round(df['avg_trn_amt'].mean(), 2)
    return {
        'avg_lifespan': avg_lifespan,
        'avg_trn_per_yr': avg_trn_per_yr,
        'avg_trn_amt': avg_trn_amt,
        'ltv': round(avg_lifespan * avg_trn_per_yr * avg_trn_amt, 2),
    }
# Calculate the LTVs for each of our customer segments
ltv_all = ltv(customer_df)
ltv_gold = ltv(customer_df, 'Gold')
ltv_silver = ltv(customer_df, 'Silver')
ltv_bronze = ltv(customer_df, 'Bronze')
print(f"The lifetime value of our Gold tier is: {ltv_gold['ltv']} while the ltv of bronze is {ltv_bronze['ltv']}")
But the ltv results were inf, and not a numeric value:
The lifetime value of our Gold tier is: inf while the ltv of bronze is inf
The results from each ltv were all inf:
{'avg_lifespan': 1.2, 'avg_trn_per_yr': inf, 'avg_trn_amt': 82.23, 'ltv': inf}
{'avg_lifespan': 1.1, 'avg_trn_per_yr': inf, 'avg_trn_amt': 39.9, 'ltv': inf}
{'avg_lifespan': 1.3, 'avg_trn_per_yr': inf, 'avg_trn_amt': 128.13, 'ltv': inf}
Can someone help me understand what went wrong and what should I do to transform the inf into a numeric value? Thanks.

Row comparison and append loop by columns

I have a bunch of school data that I maintain on a master list for monthly testing scores. Every time a child takes a test and there is an update to 'Age', 'Score', or 'School', I insert a new row with the updated data and keep track of all the changes. I am trying to figure out a Python script to do this, but since I am a newbie, I keep running into issues.
I tried writing a loop but keep getting errors, including "False", "The truth value of a Series is ambiguous", and "tuple indices must be integers, not str".
# Current master list and this month's updates.
master_df = pd.DataFrame({'ID': ['A', 'B', 'C', 'D'],
                          'Age': [15, 14, 17, 13],
                          'School': ['AB', 'CD', 'EF', 'GH'],
                          'Score': [80, 75, 62, 100],
                          'Date': ['3/1/2019', '3/1/2019', '3/1/2019', '3/1/2019']})
updates_df = pd.DataFrame({'ID': ['A', 'B', 'C', 'D'],
                           'Age': [16, 14, 17, 13],
                           'School': ['AB', 'ZX', 'EF', 'GH'],
                           'Score': [80, 90, 62, 100],
                           'Date': ['4/1/2019', '4/1/2019', '4/1/2019', '4/1/2019']})
# What I am trying to get is:
updated_master = pd.DataFrame({'ID': ['A', 'A', 'B', 'B', 'C', 'D'],
                               'Age': [15, 16, 14, 14, 17, 13],
                               'School': ['AB', 'AB', 'CD', 'ZX', 'EF', 'GH'],
                               'Score': [80, 80, 75, 90, 62, 100],
                               'Date': ['3/1/2019', '4/1/2019', '3/1/2019', '4/1/2019', '3/1/2019', '3/1/2019']})
temp_delta_list = []
# NOTE(review): iloc[1:, ...] skips row 0, so student 'A' never appears
# in these Series — probably unintended.
m_score = master_df.iloc[1:, master_df.columns.get_loc('Score')]
m_age = master_df.iloc[1:, master_df.columns.get_loc('Age')]
m_school = master_df.iloc[1:, master_df.columns.get_loc('School')]
u_score = updates_df.iloc[1:, updates_df.columns.get_loc('Score')]
u_age = updates_df.iloc[1:, updates_df.columns.get_loc('Age')]
u_school = updates_df.iloc[1:, updates_df.columns.get_loc('School')]
for i in updates_df['ID'].values:
    # NOTE(review): the trailing ', u_score' makes each of these a
    # (DataFrame, Series) *tuple* — the comma is the tuple constructor.
    # Comparing two such tuples with == ends up comparing pandas objects
    # element-wise, which is what raises "The truth value of a Series is
    # ambiguous".
    updated_temp_score = updates_df[updates_df['ID'] == i], u_score
    updated_temp_age = updates_df[updates_df['ID'] == i], u_age
    updated_temp_school = updates_df[updates_df['ID'] == i], u_school
    master_temp_score = master_df[master_df['ID'] == i], m_score
    master_temp_age = master_df[master_df['ID'] == i], m_age
    # NOTE(review): filters updates_df with a mask built on master_df —
    # looks like a copy/paste slip (master_df was presumably intended).
    master_temp_school = updates_df[master_df['ID'] == i], m_school
    # NOTE(review): to keep rows where ANY field changed, the intended
    # test is *inequality* combined with and/& — this "equal OR equal"
    # only keeps a row when all three fields differ.
    if (updated_temp_score == master_temp_score) | (updated_temp_age == master_temp_age) | (updated_temp_school == master_temp_school):
        pass
    else:
        temp_deltas = updates_df[(updates_df['ID'] == i)]
        temp_delta_list.append(temp_deltas)
I ultimately want to have the loop compare each row values for each ID and return rows that have any difference and then append the master_df

trouble with my nested for loops achieve results similar to a sumif

I'm trying to cycle through 2 lists using for loops to calculate the sum for each unique reference. I suppose I'm looking for a pythonic sumif!
# list of data ("user_ID", "contract_Number", "weight", "type")
list1 = [
    ('1', '261', '6.2', 'Input'),
    ('1', '262', '7.2', 'Input'),
    ('1', '263', '5.2', 'Input'),
    ('1', '264', '8.2', 'Input'),
    ('1', '261', '3.2', 'Input'),
    ('1', '262', '2.2', 'Input'),
    ('1', '262', '7.2', 'Input'),
    ('1', '263', '4.2', 'Input'),
    ('1', '264', '6.2', 'Input'),
    ('1', '265', '6.2', 'Input'),
    ('1', '261', '9.2', 'Input'),
    ('1', '261', '10.2', 'Input')
]
contract_list = []
# create a list of contract numbers
for data_row in list1:
    if data_row[0] == "1" and data_row[3] == "Input":
        contract_list.append(data_row[1])
# remove duplication - left with a list of unique contract numbers
# (dict.fromkeys keeps first-seen order, unlike a set)
contract_list = list(dict.fromkeys(contract_list))
print(contract_list)
# I'm trying this...[28.6, 16.6, 9.4, 14.4, 6.2]
# Sum the weights per unique contract number (a "sumif").
# Bug fixes (confirmed by the accepted answer below): the inner loop must
# bind `data_row` (the old code tested the stale loop variable left over
# from the earlier loop), compare against the *current* contract `c`
# rather than contract_list[0], and reset `tally` for every contract.
tally_list = []
for c in contract_list:
    tally = 0
    for data_row in list1:
        if data_row[0] == '1' and data_row[1] == c:
            tally = tally + float(data_row[2])
    tally_list.append(tally)
print(tally_list)
I'm expecting...
['261', '262', '263', '264', '265']
[28.6, 16.6, 9.4, 14.4, 6.2]
I'm getting...
['261', '262', '263', '264', '265']
[122.40000000000002, 244.7999999999999, 367.19999999999976, 489.5999999999996, 612.0]
# I'm trying this...[28.6, 16.6, 9.4, 14.4, 6.2]
tally_list = []
tally = 0
for c in contract_list:
for l in list1: #<----------
if data_row[0] == '1' and data_row[1] == contract_list[0]:
tally = tally + float(data_row[2])
tally_list.append(tally)
In the marked row, it looks like you want to use the data_row variable instead of l
Actually, try this, you need to additionally reset tally and also use c instead of contract_list[0] in the final if statement.
# I'm trying this...[28.6, 16.6, 9.4, 14.4, 6.2]
tally_list = []
tally = 0
for c in contract_list:
for data_row in list1:
if data_row[0] == '1' and data_row[1] == c: #<----
tally = tally + float(data_row[2])
tally_list.append(tally)
tally=0 #<---
print(tally_list)
Just another approach using a defaultdict
from collections import defaultdict

# Same data, summed in a single pass: one running total per contract.
list1 = [
    ('1', '261', '6.2', 'Input'),
    ('1', '262', '7.2', 'Input'),
    ('1', '263', '5.2', 'Input'),
    ('1', '264', '8.2', 'Input'),
    ('1', '261', '3.2', 'Input'),
    ('1', '262', '2.2', 'Input'),
    ('1', '262', '7.2', 'Input'),
    ('1', '263', '4.2', 'Input'),
    ('1', '264', '6.2', 'Input'),
    ('1', '265', '6.2', 'Input'),
    ('1', '261', '9.2', 'Input'),
    ('1', '261', '10.2', 'Input')
]
d = defaultdict(int)
for user_id, contract, weight, kind in list1:
    if user_id == '1' and kind == 'Input':
        d[contract] += float(weight)
# Insertion order of the dict gives the contracts in first-seen order.
contract_list = list(d)
print(contract_list)
tally_list = [format(total, '.1f') for total in d.values()]
print(tally_list)
Output:
['261', '262', '263', '264', '265']
['28.8', '16.6', '9.4', '14.4', '6.2']

Trying to get a weighted average out of a dictionary of grades data

I am trying to return the weighted average of the student's grades based on the last definition. I have the dictionaries defined, but think my attempt to pull the numbers out is incorrect.
def Average(lst):
    """Return the arithmetic mean of the non-empty sequence *lst*."""
    total, count = sum(lst), len(lst)
    return total / count
# Driver Code
lst = [1,2,3,4,5]
average = Average(lst)
print("Average of the list =", average)
def get_weighted_average(student):
    """Weighted mean of one student's marks: homework 10%, quizzes 30%,
    tests 60%.

    `student` is a mapping with list-of-number values under the keys
    'homework', 'quizzes' and 'tests'.

    Bug fix: the original called an undefined lowercase `average` on the
    bare strings 'homework'/'quizzes'/'tests'; it has to look the mark
    lists up in the student record and average those.
    """
    def _mean(marks):
        # Local helper so this block does not depend on a sibling def.
        return sum(marks) / len(marks)

    return (_mean(student['homework']) * 0.10
            + _mean(student['quizzes']) * 0.30
            + _mean(student['tests']) * 0.60)
#driver code
students = [steve, alice, tyler]
print(get_weighted_average('steve'))
How to get a weighted average out of a dictionary of grades above?
What is the primary source of your data? Text? Anyway, it looks like you have something like this in mind.
Imperative approach
1 - Your "database"
students_marks = {
'steve':{
'homework':[1,2,3,4,5],
'quizzes' :[5,4,3,2,1],
'tests' :[0,0,0,0,0],
},
'alice':{
'homework':[5,4,3,2,1],
'quizzes' :[0,0,0,0,0],
'tests' :[1,2,3,4,5],
},
}
use case:
>>> students_marks['steve']
{'homework': [1, 2, 3, 4, 5], 'quizzes': [5, 4, 3, 2, 1], 'tests': [0, 0, 0, 0, 0]}
>>> students_marks['steve']['homework']
[1, 2, 3, 4, 5]
2 - The definition of average and get_weighted_average
def average(lst):
    """Arithmetic mean of a non-empty list (Python 3 true division)."""
    n = len(lst)
    return sum(lst) / n
def get_weighted_average(student_name):
    """Weighted GPA for `student_name`, read from the module-level
    `students_marks`: homework 10%, quizzes 30%, tests 60%; rounded to
    two decimal places."""
    marks = students_marks[student_name]
    weighted = (average(marks['homework']) * .1
                + average(marks['quizzes']) * .3
                + average(marks['tests']) * .6)
    return round(weighted, 2)
use case:
>>> get_weighted_average('steve')
1.2
>>> get_weighted_average('alice')
2.1
or using list
>>> students_names = ['steve', 'alice']
>>> [get_weighted_average(name) for name in students_names]
[1.2, 2.1]
or using dict
>>> {name:get_weighted_average(name) for name in students_names}
{'steve': 1.2, 'alice': 2.1}
Object-Oriented (OO) approach
All this being shown, what you want to do would probably be better done by programming in an OO manner. A quick example
class Student(object):
    """A student whose GPA weights homework 10%, quizzes 30%, tests 60%."""

    homeworks_weight = .1
    quizzes_weight = .3
    tests_weight = .6

    def __init__(self, name, homeworks_marks, quizzes_marks, tests_marks):
        self.name = name
        self.homeworks_marks = homeworks_marks
        self.quizzes_marks = quizzes_marks
        self.tests_marks = tests_marks

    # Bug fix: the decorator had been mangled into the comment
    # `#staticmethod`, which left `average` an instance method whose
    # first positional argument swallowed `self`.
    @staticmethod
    def average(marks):
        """Arithmetic mean of a non-empty mark list."""
        return sum(marks) / len(marks)

    def get_gpa(self, rd=2):
        """Weighted GPA rounded to `rd` decimal places.

        Bug fix: two of the three terms called a *global* `average`
        instead of the class's own static method; all three now go
        through self.average for consistency.
        """
        return round(
            self.average(self.homeworks_marks) * self.homeworks_weight
            + self.average(self.quizzes_marks) * self.quizzes_weight
            + self.average(self.tests_marks) * self.tests_weight
            , rd)
use case:
>>> steve = Student(
name = 'Steve',
homeworks_marks = [1,2,3,4,5],
quizzes_marks = [5,4,3,2,1],
tests_marks = [0,0,0,0,0]
)
>>> steve.get_gpa()
1.2
>>> steve.homeworks_marks
[1, 2, 3, 4, 5]

Categories

Resources