how is the output of this nested loop being calculated? - python

Hi, I have this calculation, but I am failing to understand how the line array([1050885., 1068309., 1085733., 1103157., 1120581.]) of the output is calculated — please explain.
creating sample data:
#creating sample data:
# --- sample data --------------------------------------------------------
# Two clients, each with a category, amount, starting time step and group.
data1 = pd.DataFrame({"client": ['x1', 'x2'],
                      "cat": ['Bb', 'Ee'],
                      "amt": [1000, 300],
                      "time": [2, 3],
                      "group": [10, 25]})
# Ordered universe of categories; 'cat' below becomes its integer code
# (position in this list: 'Bb' -> 1, 'Ee' -> 4), used to index `mat`.
listc = ['Aa', 'Bb', 'Cc', 'Dd', 'Ee']
# Per-time scalar lookup table (no group dimension).
val1 = pd.DataFrame({'time': [1, 2, 3],
                     'lim %': [0.1, 0.11, 0.112]})
# One row per unique group whose 'time' cell holds range(1, 11);
# .explode('time') then yields one (group, perc, time) row per step 1..10.
val2 = pd.concat([pd.DataFrame({'group': g, 'perc': 0.99, 'time': range(1, 11)}
                               for g in data1['group'].unique())]).explode('time')
# Three 5x5 time slices: mat[t] holds the values 25*t .. 25*t + 24.
mat = np.arange(75).reshape(3, 5, 5)
vals = [val1, val2]
data1['cat'] = pd.Categorical(data1['cat'],
                              categories=listc,
                              ordered=True).codes
# Index each lookup table by time (and group, when it has one) so the
# nested loop below can fetch scalars with .loc.
for i in range(len(vals)):
    if 'group' in vals[i].columns:
        vals[i] = vals[i].set_index(['time', 'group'])
    else:
        vals[i] = vals[i].set_index(['time'])
#nested loop calculation
# --- nested loop calculation --------------------------------------------
# For each client: seed with amt * the client's category row of the first
# relevant time slice, then roll the last vector forward through each later
# slice with a matrix product, appending the scalar-scaled variants.
calc = {}
for client, cat, amt, start, group in data1.itertuples(name=None, index=False):
    for time in range(start, len(mat) + 1):
        if time == start:
            # First period: scale the category row of slice `start-1`.
            calc[client] = [[amt * mat[time - 1, cat, :]]]
        else:
            # Later periods: previous result vector times the whole 5x5
            # slice for this time step.  Bug fix: the `@` (matmul)
            # operator had been mangled into `#`, which commented out the
            # rest of the line and broke the syntax.
            calc[client].append([calc[client][-1][-1] @ mat[time - 1]])
        for valcal in vals:
            # Scalar for this time (and group, when the table has one).
            if isinstance(valcal.index, pd.MultiIndex):
                value = valcal.loc[(time, group)].iat[0]
            else:
                value = valcal.loc[time].iat[0]
            calc[client][-1].append(value * calc[client][-1][-1])
output:
{'x1': [[array([30000, 31000, 32000, 33000, 34000]),
array([3300., 3410., 3520., 3630., 3740.]),
array([3267. , 3375.9, 3484.8, 3593.7, 3702.6])],
[array([1050885., 1068309., 1085733., 1103157., 1120581.]), #how is this line calculated?
array([117699.12 , 119650.608, 121602.096, 123553.584, 125505.072]),
array([116522.1288 , 118454.10192, 120386.07504, 122318.04816,
124250.02128])]],
'x2': [[array([21000, 21300, 21600, 21900, 22200]),
array([2352. , 2385.6, 2419.2, 2452.8, 2486.4]),
array([2328.48 , 2361.744, 2395.008, 2428.272, 2461.536])]]}
what I need the calc for this line to be is:
[array([1050885., 1068309., 1085733., 1103157., 1120581.])
it should take array([3267. , 3375.9, 3484.8, 3593.7, 3702.6]) multiplied by mat at time 3, how can I get it to do this?

Related

parsing json file with function into dataframe for analysis

Hi, I am working with two JSON files, and I'm having a problem with the data cleaning.
Suppose a record in g1j or g2j looks like this:
{
'cls_loc': 'QOEBBG_K0101',
'date': 1584957443013,
'dur': 32,
'exp': [
{
'm': 'spot_excited',
's': 8.5,
't': 8.5,
'w': 'spot_bored',
'x': 'A'
},
{
's': 1.1,
't': 11.4,
'w': 'spot_scared',
'x': 'A'
}
],
'mod': 'Poster',
'pre': False,
'scr': 67,
'usr': 'QOGOBN',
'ver': '20.5.3'
}
What we want per row in our DataFrame is this:
{
'student_pin': 'QOGOBN', # from `usr`
'date': datetime.date(2020, 3, 23), # from `date`, but parsed
'duration': 32, # from `dur`
'level': 3, # the "K" from `cls_loc`, mapped to int
'unit': 1, # from `cls_loc`, mapped to int
'module': 1, # from `cls_loc`, mapped to int
'accuracy': 0.5, # calculated from `exp`
}
my code so far:
from datetime import datetime
import json
import numpy as np
import pandas as pd
from scipy import stats
with open('/content/drive/MyDrive/group1_exp_2020-04-08.json', 'r') as f:
g1j = json.loads(f.read())
with open('/content/drive/MyDrive/group2_exp_2020-04-22.json', 'r') as f:
g2j = json.loads(f.read())
#convert the integer timestamp to a datetime.date
def timestamp_to_date(records=None):
    """Convert each record's epoch-millisecond 'date' to a date string.

    `records` defaults to the module-level `g1j`, so the original
    zero-argument call keeps working; passing a list removes the need
    for the duplicated g2j copy of this function.

    Returns a list of strings formatted as 'YYYY, MM, DD '.
    """
    if records is None:
        records = g1j
    dates = []
    for item in records:
        # 'date' is epoch milliseconds -> seconds for fromtimestamp().
        ts = item['date'] / 1000
        # NOTE: fromtimestamp() converts to *local* time.
        dates.append(datetime.fromtimestamp(ts).strftime('%Y, %m, %d '))
    return dates
timestamp_to_date()
def timestamp_to_date(records=None):
    """Convert each record's epoch-millisecond 'date' to a date string.

    `records` defaults to the module-level `g2j`, preserving the original
    zero-argument behavior of this (second) definition.

    Returns a list of strings formatted as 'YYYY, MM, DD '.
    """
    if records is None:
        records = g2j
    dates = []
    for item in records:
        # 'date' is epoch milliseconds -> seconds for fromtimestamp().
        ts = item['date'] / 1000
        # NOTE: fromtimestamp() converts to *local* time.
        dates.append(datetime.fromtimestamp(ts).strftime('%Y, %m, %d '))
    return dates
#extract the level, unit, module, and accuracy here
def get_level(x, mapping=None):
    """Map the level letter of `cls_loc` (the 'K' in 'QOEBBG_K0101') to an int.

    `mapping` defaults to the module-level `level_map` (looked up lazily),
    so existing one-argument calls behave exactly as before.
    """
    if mapping is None:
        mapping = level_map
    loc = x['cls_loc'].split('_')[-1]
    return mapping[loc[0]]
def get_unit(x):
    """Return the two-digit unit number embedded in `cls_loc` as an int."""
    code = x['cls_loc'].split('_')[-1]
    return int(code[1:3])
def get_module(x):
    """Return the trailing module number of `cls_loc` as an int."""
    code = x['cls_loc'].split('_')[-1]
    return int(code[3:])
def get_accuracy(x):
    """Fraction of challenge entries ('x' == 'A') answered without a
    mistake (no 'm' key); returns the string 'N/A' when the record has
    no challenges at all."""
    challenges = [entry for entry in x['exp'] if entry['x'] == 'A']
    n = len(challenges)
    if n == 0:
        return 'N/A'
    mistakes = [entry for entry in challenges if 'm' in entry]
    return (n - len(mistakes)) / n
#create the function to convert experience records to the pandas.DataFrame
def exp_to_df(g1j):
    """Build a DataFrame from a list of experience records.

    Bug fix: the original referenced the undefined name `f` (the long-
    closed file handle) instead of the `g1j` parameter, so every call
    raised NameError.
    NOTE(review): the later asserts also expect wrangled columns
    (student_pin, date, level, ...); producing just 'exp' is not enough —
    that processing still has to happen somewhere.
    """
    return pd.DataFrame(g1j, columns=['exp'])
def exp_to_df(g2j):
    """Build a DataFrame from a list of experience records.

    Bug fix: the original referenced the undefined name `f` instead of
    the `g2j` parameter, so every call raised NameError.
    """
    return pd.DataFrame(g2j, columns=['exp'])
# uses the function you just implemented, and checks that your function
# keeps the records and uses the right column names
# NOTE(review): these asserts pass only if exp_to_df returns one row per
# input record AND produces every wrangled column listed below — a frame
# holding just the raw 'exp' column will fail the column checks.
g1 = exp_to_df(g1j)
g2 = exp_to_df(g2j)
assert len(g1) == len(g1j)
assert len(g2) == len(g2j)
columns = ['student_pin', 'date', 'level', 'unit', 'module', 'accuracy']
assert all(c in g1.columns for c in columns)
assert all(c in g2.columns for c in columns)
What am I doing wrong? It seems like def exp_to_df(g1j) and def exp_to_df(g2j) are wrong. Any suggestions?
Also is my def timestamp_to_date() also wrong?
I suggest using the pandas read_json() function to load your json directly into a dataframe (I added a couple dummy records):
# read_json parses the whole file straight into a DataFrame; the integer
# 'date' column is recognised as epoch milliseconds and converted to
# datetimes (visible in the echoed frame below).
g1 = pd.read_json('/content/drive/MyDrive/group1_exp_2020-04-08.json')
# cls_loc date dur exp mod pre scr usr ver
# 0 QOEBBG_K0101 2020-03-23 09:57:23.013 32 [{'m': 'spot_excited', 's': 8.5, 't': 8.5, 'w'... Poster False 67 QOGOBN 20.5.3
# 1 QOEBBG_K0102 2020-03-23 09:57:23.013 32 [{'m': 'spot_excited', 's': 8.5, 't': 8.5, 'w'... Poster False 67 QOGOBN 20.5.3
# 2 QOEBBG_K0103 2020-03-23 09:57:23.013 32 [{'s': 1.1, 't': 11.4, 'x': 'C'}] Poster False 67 QOGOBN 20.5.3
Then you can do all the data wrangling with pandas functions like
str.extract(),
assign(),
to_datetime(),
map(), and
apply():
# extract level, unit, module as columns
# extract level, unit, module as columns
# Capture the letter + 2 digits + 2 digits after the '_' in cls_loc into
# three new string columns via regex groups, renamed before assign().
g1 = g1.assign(**g1.cls_loc
               .str.extract(r'_([a-zA-Z])([0-9]{2})([0-9]{2})')
               .rename({0: 'level', 1: 'unit', 2: 'module'}, axis=1))
# convert date to datetime (epoch milliseconds)
g1.date = pd.to_datetime(g1.date, unit='ms')
# map level letter to int; letters missing from the map become NaN
level_map = {'K': 3}
g1.level = g1.level.map(level_map)
# compute accuracy
def accuracy(exp):
    """Share of challenge entries ('x' == 'A') that carry no mistake key
    'm'; NaN when the record has no challenges."""
    challenges = [entry for entry in exp if entry['x'] == 'A']
    if not challenges:
        return np.nan
    clean = [entry for entry in challenges if 'm' not in entry.keys()]
    return len(clean) / len(challenges)
# per-row accuracy computed from the list of experience dicts
g1['accuracy'] = g1.exp.apply(accuracy)
# rename usr -> student_pin
g1 = g1.rename({'usr': 'student_pin'}, axis=1)
# keep desired columns, in the desired order
columns = ['student_pin', 'date', 'level', 'unit', 'module', 'accuracy']
g1 = g1[columns]
Output:
student_pin date level unit module accuracy
0 QOGOBN 2020-03-23 09:57:23.013 3 01 01 0.500000
1 QOGOBN 2020-03-23 09:57:23.013 3 01 02 0.333333
2 QOGOBN 2020-03-23 09:57:23.013 3 01 03 NaN

dict object returned inf values, but it should have been numeric

I tried applying the following code:
df = pd.read_csv('data_sample_ltv.csv')
# Convert date to year
# Map each distinct date string to its year once, then apply the map —
# cheaper than parsing every row individually.
date_mapper = {date: pd.to_datetime(date).year for date in df['transaction_date'].unique()}
year = df['transaction_date'].map(date_mapper)
df['year'] = year
# Convert tier to categorical
# Categories are listed best-first, so sorting the column puts the
# highest tier first (category order is used even with ordered=False).
tiers = pd.Categorical(df['customer_tier'],
                       categories=['Gold', 'Silver', 'Bronze', 'Free-Trial'],
                       ordered=False)
df['customer_tier'] = tiers
def highest_tier(c_id, df=None):
    """Return the best tier ever held by customer `c_id`.

    Relies on 'customer_tier' being categorical with categories ordered
    best-first (['Gold', 'Silver', 'Bronze', 'Free-Trial']), so the first
    row after sort_values() is the highest tier.

    Fix: the original default `df=df` evaluated the global `df` once at
    definition time (and would raise NameError if the global did not
    exist yet); resolve it lazily at call time instead.  Existing
    single-argument calls behave identically.
    """
    if df is None:
        df = globals()['df']
    tier = df.loc[df.customer_id == c_id]['customer_tier'].sort_values().iloc[0]
    return tier
# Highest tier per customer, computed once per unique id.
tier_mapper = {
    cust_id: highest_tier(cust_id) for cust_id in df['customer_id'].unique()
}
# Aggregate the data: total/count of spend and number of distinct years.
customer_df = df.groupby(['customer_id']).agg({
    'transaction_amount': ['sum', 'count'],
    'year': [pd.Series.nunique]
})
customer_df['highest_tier'] = customer_df.index.map(tier_mapper)
# lifespan = number of distinct years with activity
customer_df['lifespan'] = customer_df[('year', 'nunique')]
customer_df['avg_trn_amt'] = customer_df[('transaction_amount', 'sum')] / customer_df[('transaction_amount', 'count')]
# NOTE(review): this is count / lifespan; if 'year' is missing for every
# row of a customer the nunique is 0 and the division yields inf — the
# reported symptom. Verify how 'transaction_date' parses upstream.
customer_df['avg_trn_per_yr'] = customer_df[('transaction_amount', 'count')] / customer_df['lifespan']
# Create the LTV function
def ltv(df, tier=None):
    """Lifetime-value summary for a customer table.

    When `tier` is given, restrict to customers whose 'highest_tier'
    matches it.  Returns a dict of the mean lifespan, transactions per
    year and transaction amount, plus their product under 'ltv'.
    """
    if tier:
        df = df.loc[df['highest_tier'] == tier]
    avg_lifespan = round(df['lifespan'].mean(), 1)
    avg_trn_per_yr = round(df['avg_trn_per_yr'].mean(), 1)
    avg_trn_amt = round(df['avg_trn_amt'].mean(), 2)
    return {
        'avg_lifespan': avg_lifespan,
        'avg_trn_per_yr': avg_trn_per_yr,
        'avg_trn_amt': avg_trn_amt,
        'ltv': round(avg_lifespan * avg_trn_per_yr * avg_trn_amt, 2),
    }
# Calculate the LTVs for each of our customer segments
ltv_all = ltv(customer_df)
ltv_gold = ltv(customer_df, 'Gold')
ltv_silver = ltv(customer_df, 'Silver')
ltv_bronze = ltv(customer_df, 'Bronze')
print(f"The lifetime value of our Gold tier is: {ltv_gold['ltv']} while the ltv of bronze is {ltv_bronze['ltv']}")
But the ltv results were inf, and not a numeric value:
The lifetime value of our Gold tier is: inf while the ltv of bronze is inf
The results from each ltv were all inf:
{'avg_lifespan': 1.2, 'avg_trn_per_yr': inf, 'avg_trn_amt': 82.23, 'ltv': inf}
{'avg_lifespan': 1.1, 'avg_trn_per_yr': inf, 'avg_trn_amt': 39.9, 'ltv': inf}
{'avg_lifespan': 1.3, 'avg_trn_per_yr': inf, 'avg_trn_amt': 128.13, 'ltv': inf}
Can someone help me understand what went wrong and what should I do to transform the inf into a numeric value? Thanks.

Row comparison and append loop by columns

I have a bunch of school data that I maintain on a master list for monthly testing scores. Every time a child takes a test and there is an update to 'Age', 'Score', or 'School', I insert a new row with the updated data and keep track of all the changes. I am trying to figure out a Python script to do this, but since I am a newbie, I keep running into issues.
I tried writing a loop but keep getting errors, including "False", "The truth value of a Series is ambiguous", and "tuple indices must be integers, not str".
# Current master list and this month's updates.
master_df = pd.DataFrame({'ID': ['A', 'B', 'C', 'D'],
                          'Age': [15, 14, 17, 13],
                          'School': ['AB', 'CD', 'EF', 'GH'],
                          'Score': [80, 75, 62, 100],
                          'Date': ['3/1/2019', '3/1/2019', '3/1/2019', '3/1/2019']})
updates_df = pd.DataFrame({'ID': ['A', 'B', 'C', 'D'],
                           'Age': [16, 14, 17, 13],
                           'School': ['AB', 'ZX', 'EF', 'GH'],
                           'Score': [80, 90, 62, 100],
                           'Date': ['4/1/2019', '4/1/2019', '4/1/2019', '4/1/2019']})
# What I am trying to get is:
updated_master = pd.DataFrame({'ID': ['A', 'A', 'B', 'B', 'C', 'D'],
                               'Age': [15, 16, 14, 14, 17, 13],
                               'School': ['AB', 'AB', 'CD', 'ZX', 'EF', 'GH'],
                               'Score': [80, 80, 75, 90, 62, 100],
                               'Date': ['3/1/2019', '4/1/2019', '3/1/2019', '4/1/2019', '3/1/2019', '3/1/2019']})
temp_delta_list = []
# NOTE(review): iloc[1:, ...] skips row 0, so student 'A' never appears
# in these Series — probably unintended.
m_score = master_df.iloc[1:, master_df.columns.get_loc('Score')]
m_age = master_df.iloc[1:, master_df.columns.get_loc('Age')]
m_school = master_df.iloc[1:, master_df.columns.get_loc('School')]
u_score = updates_df.iloc[1:, updates_df.columns.get_loc('Score')]
u_age = updates_df.iloc[1:, updates_df.columns.get_loc('Age')]
u_school = updates_df.iloc[1:, updates_df.columns.get_loc('School')]
for i in updates_df['ID'].values:
    # NOTE(review): the trailing ', u_score' makes each of these a
    # (DataFrame, Series) *tuple* — the comma is the tuple constructor.
    # Comparing two such tuples with == ends up comparing pandas objects
    # element-wise, which is what raises "The truth value of a Series is
    # ambiguous".
    updated_temp_score = updates_df[updates_df['ID'] == i], u_score
    updated_temp_age = updates_df[updates_df['ID'] == i], u_age
    updated_temp_school = updates_df[updates_df['ID'] == i], u_school
    master_temp_score = master_df[master_df['ID'] == i], m_score
    master_temp_age = master_df[master_df['ID'] == i], m_age
    # NOTE(review): filters updates_df with a mask built on master_df —
    # looks like a copy/paste slip (master_df was presumably intended).
    master_temp_school = updates_df[master_df['ID'] == i], m_school
    # NOTE(review): to keep rows where ANY field changed, the intended
    # test is *inequality* combined with and/& — this "equal OR equal"
    # only keeps a row when all three fields differ.
    if (updated_temp_score == master_temp_score) | (updated_temp_age == master_temp_age) | (updated_temp_school == master_temp_school):
        pass
    else:
        temp_deltas = updates_df[(updates_df['ID'] == i)]
        temp_delta_list.append(temp_deltas)
I ultimately want to have the loop compare each row values for each ID and return rows that have any difference and then append the master_df

trouble with my nested for loops achieve results similar to a sumif

I'm trying to cycle through 2 lists using for loops to calculate the sum for each unique reference. I suppose I'm looking for a pythonic sumif!
# list of data ("user_ID", "contract_Number", "weight", "type")
list1 = [
    ('1', '261', '6.2', 'Input'),
    ('1', '262', '7.2', 'Input'),
    ('1', '263', '5.2', 'Input'),
    ('1', '264', '8.2', 'Input'),
    ('1', '261', '3.2', 'Input'),
    ('1', '262', '2.2', 'Input'),
    ('1', '262', '7.2', 'Input'),
    ('1', '263', '4.2', 'Input'),
    ('1', '264', '6.2', 'Input'),
    ('1', '265', '6.2', 'Input'),
    ('1', '261', '9.2', 'Input'),
    ('1', '261', '10.2', 'Input')
]
contract_list = []
# create a list of contract numbers
for data_row in list1:
    if data_row[0] == "1" and data_row[3] == "Input":
        contract_list.append(data_row[1])
# remove duplication - left with a list of unique contract numbers
# (dict.fromkeys keeps first-seen order, unlike a set)
contract_list = list(dict.fromkeys(contract_list))
print(contract_list)
# I'm trying this...[28.6, 16.6, 9.4, 14.4, 6.2]
# Sum the weights per unique contract number (a "sumif").
# Bug fixes (confirmed by the accepted answer below): the inner loop must
# bind `data_row` (the old code tested the stale loop variable left over
# from the earlier loop), compare against the *current* contract `c`
# rather than contract_list[0], and reset `tally` for every contract.
tally_list = []
for c in contract_list:
    tally = 0
    for data_row in list1:
        if data_row[0] == '1' and data_row[1] == c:
            tally = tally + float(data_row[2])
    tally_list.append(tally)
print(tally_list)
I'm expecting...
['261', '262', '263', '264', '265']
[28.6, 16.6, 9.4, 14.4, 6.2]
I'm getting...
['261', '262', '263', '264', '265']
[122.40000000000002, 244.7999999999999, 367.19999999999976, 489.5999999999996, 612.0]
# I'm trying this...[28.6, 16.6, 9.4, 14.4, 6.2]
tally_list = []
tally = 0
for c in contract_list:
for l in list1: #<----------
if data_row[0] == '1' and data_row[1] == contract_list[0]:
tally = tally + float(data_row[2])
tally_list.append(tally)
In the marked row, it looks like you want to use the data_row variable instead of l
Actually, try this, you need to additionally reset tally and also use c instead of contract_list[0] in the final if statement.
# I'm trying this...[28.6, 16.6, 9.4, 14.4, 6.2]
tally_list = []
tally = 0
for c in contract_list:
for data_row in list1:
if data_row[0] == '1' and data_row[1] == c: #<----
tally = tally + float(data_row[2])
tally_list.append(tally)
tally=0 #<---
print(tally_list)
Just another approach using a defaultdict
from collections import defaultdict

# Same data, summed in a single pass: one running total per contract.
list1 = [
    ('1', '261', '6.2', 'Input'),
    ('1', '262', '7.2', 'Input'),
    ('1', '263', '5.2', 'Input'),
    ('1', '264', '8.2', 'Input'),
    ('1', '261', '3.2', 'Input'),
    ('1', '262', '2.2', 'Input'),
    ('1', '262', '7.2', 'Input'),
    ('1', '263', '4.2', 'Input'),
    ('1', '264', '6.2', 'Input'),
    ('1', '265', '6.2', 'Input'),
    ('1', '261', '9.2', 'Input'),
    ('1', '261', '10.2', 'Input')
]
d = defaultdict(int)
for user_id, contract, weight, kind in list1:
    if user_id == '1' and kind == 'Input':
        d[contract] += float(weight)
# Insertion order of the dict gives the contracts in first-seen order.
contract_list = list(d)
print(contract_list)
tally_list = [format(total, '.1f') for total in d.values()]
print(tally_list)
Output:
['261', '262', '263', '264', '265']
['28.8', '16.6', '9.4', '14.4', '6.2']

Trying to get a weighted average out of a dictionary of grades data

I am trying to return the weighted average of the student's grades based on the last definition. I have the dictionaries defined, but think my attempt to pull the numbers out is incorrect.
def Average(lst):
    """Return the arithmetic mean of the non-empty sequence *lst*."""
    total, count = sum(lst), len(lst)
    return total / count
# Driver Code
lst = [1,2,3,4,5]
average = Average(lst)
print("Average of the list =", average)
def get_weighted_average(student):
    """Weighted mean of one student's marks: homework 10%, quizzes 30%,
    tests 60%.

    `student` is a mapping with list-of-number values under the keys
    'homework', 'quizzes' and 'tests'.

    Bug fix: the original called an undefined lowercase `average` on the
    bare strings 'homework'/'quizzes'/'tests'; it has to look the mark
    lists up in the student record and average those.
    """
    def _mean(marks):
        # Local helper so this block does not depend on a sibling def.
        return sum(marks) / len(marks)

    return (_mean(student['homework']) * 0.10
            + _mean(student['quizzes']) * 0.30
            + _mean(student['tests']) * 0.60)
#driver code
students = [steve, alice, tyler]
print(get_weighted_average('steve'))
How to get a weighted average out of a dictionary of grades above?
What is the primary source of your data? Text? Anyway, it looks like you have something like this in mind.
Imperative approach
1 - Your "database"
students_marks = {
'steve':{
'homework':[1,2,3,4,5],
'quizzes' :[5,4,3,2,1],
'tests' :[0,0,0,0,0],
},
'alice':{
'homework':[5,4,3,2,1],
'quizzes' :[0,0,0,0,0],
'tests' :[1,2,3,4,5],
},
}
use case:
>>> students_marks['steve']
{'homework': [1, 2, 3, 4, 5], 'quizzes': [5, 4, 3, 2, 1], 'tests': [0, 0, 0, 0, 0]}
>>> students_marks['steve']['homework']
[1, 2, 3, 4, 5]
2 - The definition of average and get_weighted_average
def average(lst):
    """Arithmetic mean of a non-empty list (Python 3 true division)."""
    n = len(lst)
    return sum(lst) / n
def get_weighted_average(student_name):
    """Weighted GPA for `student_name`, read from the module-level
    `students_marks`: homework 10%, quizzes 30%, tests 60%; rounded to
    two decimal places."""
    marks = students_marks[student_name]
    weighted = (average(marks['homework']) * .1
                + average(marks['quizzes']) * .3
                + average(marks['tests']) * .6)
    return round(weighted, 2)
use case:
>>> get_weighted_average('steve')
1.2
>>> get_weighted_average('alice')
2.1
or using list
>>> students_names = ['steve', 'alice']
>>> [get_weighted_average(name) for name in students_names]
[1.2, 2.1]
or using dict
>>> {name:get_weighted_average(name) for name in students_names}
{'steve': 1.2, 'alice': 2.1}
Object-Oriented (OO) approach
All this being shown, what you want to do would probably be better done by programming in an OO manner. A quick example
class Student(object):
    """A student whose GPA weights homework 10%, quizzes 30%, tests 60%."""

    homeworks_weight = .1
    quizzes_weight = .3
    tests_weight = .6

    def __init__(self, name, homeworks_marks, quizzes_marks, tests_marks):
        self.name = name
        self.homeworks_marks = homeworks_marks
        self.quizzes_marks = quizzes_marks
        self.tests_marks = tests_marks

    # Bug fix: the decorator had been mangled into the comment
    # `#staticmethod`, which left `average` an instance method whose
    # first positional argument swallowed `self`.
    @staticmethod
    def average(marks):
        """Arithmetic mean of a non-empty mark list."""
        return sum(marks) / len(marks)

    def get_gpa(self, rd=2):
        """Weighted GPA rounded to `rd` decimal places.

        Bug fix: two of the three terms called a *global* `average`
        instead of the class's own static method; all three now go
        through self.average for consistency.
        """
        return round(
            self.average(self.homeworks_marks) * self.homeworks_weight
            + self.average(self.quizzes_marks) * self.quizzes_weight
            + self.average(self.tests_marks) * self.tests_weight
            , rd)
use case:
>>> steve = Student(
name = 'Steve',
homeworks_marks = [1,2,3,4,5],
quizzes_marks = [5,4,3,2,1],
tests_marks = [0,0,0,0,0]
)
>>> steve.get_gpa()
1.2
>>> steve.homeworks_marks
[1, 2, 3, 4, 5]

Categories

Resources