How to create a list of totals for durations? - python

I want to calculate a bonus based on the two consecutive months where sales were the highest. So I can iterate over the totals for every two consecutive months to find the max value, i.e. get
value = Max[total_between_firstdayMonth1_and_lastDayMonth2, total_between_firstdayMonth2_and_lastDayMonth3, ... , total_between_firstdaySecondToLastMonth_andlastDayLastMonth]
So I might need a list of pairs of datetime objects or something similar.
start= model.Order.order('created').get().created # get the oldest order
end = model.Order.order('-created').get().created # get the newest order
So inbetween start and end I must partition the time in overlapping pairs of consecutive 2 months eg. if first order was in december 2008 and the last order was in november 2011 then the list from where to pick the max should be [total_december2008 + total_january2009, total_january2009 + total_february2009, ... , total_october2011 + total_november2011]
But then how do I get the last day of the second month if I know the start like above? How can I create the list of times and totals?
I might not be able to create the list of totals right away but if I can create the list of starts and ends then I can call a helper function we can assume eg.
total(start_datetime, end_datetime)
Thanks for any help
Update
I think I found how to calculate the time for an example interval where the timeline is from any date to last day next month:
>>> d = date(2007,12,18)
>>> print d
2007-12-18
>>> d + relativedelta(months=2) - timedelta(days=d.day)
datetime.date(2008, 1, 31)
Update 2
I can calculate upto the first level the first duration. Now I only have to generalize it to loop through all the durations and check which was the highest level:
def level(self):
startdate = model.Order.all().filter('status =', 'PAID').filter('distributor_id =' , self._key.id()).get().created.date()
last_day_nextmonth =startdate + relativedelta(months=2) - timedelta(days=1)
if self.personal_silver(startdate, last_day_nextmonth) + self.non_manager_silver(startdate, last_day_nextmonth) < 25:
maxlevel = _('New distributor')
elif self.personal_silver(startdate, last_day_nextmonth) + self.non_manager_silver(startdate, last_day_nextmonth) > 25:
maxlevel = _('Assistant Teamleader')
return maxlevel
Update 3
Closer to what I mean is taking the max of some function values from the beginning up to now. The base case can be that the last day of next month is in the future, and the helper function can be recursive, but I didn't have the time or help to make it recursive, so it only works for the first 2 periods now, i.e. 4 months from the start:
def level(self):
startdate = model.Order.all().filter('status =', 'PAID'
).filter('distributor_id =',
self._key.id()).get().created.date()
last_day_nextmonth = startdate + relativedelta(months=2) \
- timedelta(days=1)
total = self.personal_silver(startdate, last_day_nextmonth) + self.non_manager_silver(startdate, last_day_nextmonth)
if total >= 125:
level = 5
elif total >= 75:
level = 4
elif total >= 25:
level = 3
elif total >= 2:
level = 2
else:
level = 1
return self.levelHelp(level, last_day_nextmonth + timedelta(days=1))
def levelHelp(self, level, startdate):
#if startdate in future return level
last_day_nextmonth = startdate + relativedelta(months=2) \
- timedelta(days=1)
total = self.personal_silver(startdate, last_day_nextmonth) + self.non_manager_silver(startdate, last_day_nextmonth)
if total >= 125:
newlevel = 5
elif total >= 75:
newlevel = 4
elif total >= 25:
newlevel = 3
elif total >= 2:
newlevel = 2
else:
newlevel = 1
return level if level > newlevel else newlevel
Update 4
I added the recursion where base case is that next step is in the future, if so it will return the max level:
def level(self):
startdate = model.Order.all().filter('status =', 'PAID'
).filter('distributor_id =',
self._key.id()).get().created.date()
last_day_nextmonth = startdate + relativedelta(months=2) \
- timedelta(days=1)
total = self.personal_silver(startdate, last_day_nextmonth) + self.non_manager_silver(startdate, last_day_nextmonth)
if total >= 125:
level = 5
elif total >= 75:
level = 4
elif total >= 25:
level = 3
elif total >= 2:
level = 2
else:
level = 1
return self.levelHelp(level, last_day_nextmonth + timedelta(days=1))
def levelHelp(self, level, startdate):
last_day_nextmonth = startdate + relativedelta(months=2) \
- timedelta(days=1)
total = self.personal_silver(startdate, last_day_nextmonth) + self.non_manager_silver(startdate, last_day_nextmonth)
if total >= 125:
newlevel = 5
elif total >= 75:
newlevel = 4
elif total >= 25:
newlevel = 3
elif total >= 2:
newlevel = 2
else:
newlevel = 1
maxlevel = level if level > newlevel else newlevel
nextstart = last_day_nextmonth + timedelta(days=1)
now = datetime.now().date()
if nextstart > now: #next start in is the future
return maxlevel
else: return self.levelHelp(maxlevel, nextstart)

This sounds like a fine job for functional approach. At the end there is a full working example, but I just want to emphasize the elegance and simplicity of the core function, written in FP style:
def find_best_two_months(orders):
    """Return the best window of two consecutive calendar months.

    ``orders`` is an iterable of ``(date_of_order, amount)`` tuples.  The
    result is ``("YYYY-MM - YYYY-MM", total)`` for the two-month window with
    the highest combined amount; on a tie the earliest window is returned.
    A month without any order counts as 0, so every window really covers
    two consecutive calendar months.  Raises ValueError when the orders
    cover fewer than two distinct months.
    """
    first = lambda x: x[0]
    second = lambda x: x[1]
    # Number months linearly (year * 12 + zero-based month) so that
    # "next month" is simply index + 1, also across a year boundary.
    orders_by_month = sorted(
        ((d.year * 12 + d.month - 1, amount) for d, amount in orders),
        key=first)
    totals_by_month = dict(
        (month, sum(map(second, grouped_orders)))
        for month, grouped_orders in groupby(orders_by_month, key=first))
    if len(totals_by_month) < 2:
        raise ValueError("orders must span at least two calendar months")
    label = lambda m: "%04d-%02d" % (m // 12, m % 12 + 1)
    totals_two_months = [
        ("%s - %s" % (label(m), label(m + 1)),
         totals_by_month.get(m, 0) + totals_by_month.get(m + 1, 0))
        for m in range(min(totals_by_month), max(totals_by_month))]
    return max(totals_two_months, key=second)
Here is a full working example with comments:
#!/usr/bin/python
from random import randint
from datetime import date, timedelta
from itertools import groupby
""" finding best two months the functional way """
def find_best_two_months(orders):
    """Return the best window of two consecutive calendar months.

    Expects a list of tuples of the form (date_of_order, amount):
        [(date1, amount1), (date2, amount2), ...]

    Returns ``("YYYY-MM - YYYY-MM", total)`` for the two-month window with
    the highest combined amount.  If several windows share the maximum,
    only the first one is returned.  A month without any order counts as
    0, so a window never skips over an empty month.  Raises ValueError
    when the orders cover fewer than two distinct months.
    """
    # Helper functions for extracting the first or second tuple element.
    first = lambda x: x[0]
    second = lambda x: x[1]

    # Converts [(date, amount)] -> [(month_index, amount)], sorted by month.
    # month_index = year * 12 + zero-based month, so the month following
    # index m is always m + 1, also across a year boundary.
    orders_by_month = sorted(
        ((d.year * 12 + d.month - 1, amount) for d, amount in orders),
        key=first)

    # Compresses orders from the same month: {month_index: total_amount}.
    totals_by_month = dict(
        (month, sum(map(second, grouped_orders)))
        for month, grouped_orders in groupby(orders_by_month, key=first))
    if len(totals_by_month) < 2:
        raise ValueError("orders must span at least two calendar months")

    # Formats a month index back to "YYYY-MM".
    label = lambda m: "%04d-%02d" % (m // 12, m % 12 + 1)

    # Builds every window (m, m + 1) between the first and last month that
    # has orders; months missing from the dict contribute 0.
    totals_two_months = [
        ("%s - %s" % (label(m), label(m + 1)),
         totals_by_month.get(m, 0) + totals_by_month.get(m + 1, 0))
        for m in range(min(totals_by_month), max(totals_by_month))]

    # max() keeps the first of several equal maxima.
    return max(totals_two_months, key=second)
"""
this code is for generating random list of orders
and is not a part of the solution
"""
MIN_AMOUNT=70
MAX_AMOUNT=500
MAX_DAY_SPREAD=5
def gen_order(last_date):
    """Return a random ``(order_date, amount)`` tuple.

    The order date is ``last_date`` plus a random 0..MAX_DAY_SPREAD days
    and the amount is a random integer in MIN_AMOUNT..MAX_AMOUNT (both
    ranges inclusive, as with ``random.randint``).
    """
    order_date = last_date + timedelta(days=randint(0, MAX_DAY_SPREAD))
    amount = randint(MIN_AMOUNT, MAX_AMOUNT)
    return (order_date, amount)
def gen_orders(total, start_date):
    """Build a list of ``total`` random orders.

    The first order is generated from ``start_date``; every following
    order is generated from the date of the order before it.
    """
    generated = []
    previous_date = start_date
    for _ in range(total):
        new_order = gen_order(previous_date)
        # The next order is dated relative to this one.
        previous_date = new_order[0]
        generated.append(new_order)
    return generated
if __name__ == "__main__":
    # Demo run: 300 random orders starting on 2010-01-01.
    orders = gen_orders(300, date(2010, 1, 1))
    # print(...) with a single argument behaves the same on Python 2 and 3,
    # whereas the print statement is a syntax error on Python 3.
    print(find_best_two_months(orders))

Related

Finding weekday with specific date

I am learning python and going through some interactive exercises. Specifically, I'm working on Friday the 13th.
I have rewritten several iterations of this but can never seem to lock it down. With this version, it seems to get hung up when run with the simulated start date of 2025-06-12 which means there's a problem with the "this month" section. Since it returns an accurate Friday the 13th except not 2025-06-13, I suspect it's a problem with the elif statement, particularly the
and date.fromisoformat(current_year + '-' + current_month + '-13').weekday == 4:
Here's the most recent iteration of this.
def friday_the_13th():
from datetime import date
current_year = str(date.today().year)
current_month = str(date.today().month)
if len(current_month) == 1: current_month = '0' + current_month
#Function to increment to the 13th of next month
def NextMonth13(startdate):
lst_date = str(startdate)
lst_date = lst_date.split('-')
month = int(lst_date[1])
if month == 12:
year = str(int(lst_date[0]) + 1)
month = '01'
return str(year + '-' + month + '-' + '13')
else:
year = lst_date[0]
month = str(month + 1)
if len(month) == 1: month = '0' + month
return str(year + '-' + month + '-' + '13')
# Return today if today is Friday the 13th
if date.today().weekday() == 4 and date.today().day == 13:
return date.today()
# Check if this month's 13th is in the future and if it's a Friday
elif date.today().day < 13 and date.fromisoformat(current_year + '-' + current_month + '-13').weekday == 4:
return str(date.fromisoformat(current_year + '-' + current_month + '-13'))
#Check next month and return result if Friday 13
else:
result = NextMonth13(date.today())
while not (date.fromisoformat(result).weekday() == 4):
result = NextMonth13(result)
if date.fromisoformat(result).weekday() == 4:
return result
Would someone mind giving me some guidance on what I might be doing wrong?
First, your error is that you forgot the parentheses after the weekday method call: date.fromisoformat(current_year + '-' + current_month + '-13').weekday() == 4 (FYI, without the parentheses, date.fromisoformat(current_year + '-' + current_month + '-13').weekday evaluates to the bound method object itself, whose repr looks like <built-in method weekday of datetime.date object at 0x7fa4e36058f0>, rather than to the weekday number. A method object never compares equal to 4, so that elif branch was silently skipped every time, which is why your program behaved this way.)
Second, you are needlessly complicating yourself by doing str conversions all the time:
def friday_the_13th():
    """Return the next Friday the 13th as an ISO ``YYYY-MM-DD`` string.

    Today itself is returned when today is a Friday the 13th.
    """
    from datetime import date, timedelta
    candidate = date.today()
    one_day = timedelta(days=1)
    # Walk forward one day at a time; weekday() == 4 is Friday (Monday is 0).
    while not (candidate.day == 13 and candidate.weekday() == 4):
        candidate += one_day
    # Convert to a string only at the very end.
    return candidate.isoformat()
This is more readable and less prone to error as you only convert to string at the end, after you've handled all your calculations.
Not sure if it helps but to calculate the future Friday 13th you can do something like:
import datetime
def get_13th_future(startdate, months):
    """List the Friday-the-13th dates within ``months`` months.

    Checks the 13th of ``months`` consecutive calendar months, beginning
    with the month of ``startdate``, and returns the ISO ``YYYY-MM-DD``
    strings of those that fall on a Friday, in chronological order.

    Note: the 13th of the starting month is checked even when
    ``startdate`` itself lies after the 13th.
    """
    result = []
    year = startdate.year
    month = startdate.month
    # Iterate over the requested number of months (not the start month's
    # number, which would ignore the ``months`` argument).
    for _ in range(months):
        checkdate = datetime.date(year=year, month=month, day=13)
        if checkdate.weekday() == 4:  # Monday is 0, so 4 is Friday
            result.append(checkdate.isoformat())
        month += 1
        if month == 13:
            # Roll over into January of the next year.
            year += 1
            month = 1
    return result
startdate=datetime.datetime.now()
print(get_13th_future(startdate,1000))
If you like to search for a specific date you might construct a set instead of the list.

How do I create a time series with 15min buckets in pyspark?

I'm trying to create a report that shows the total number of minutes worked by a group of employees in 15 minute increments.
The source table has the time in/out and total minutes worked, one record for each employee.
I've created an RDD row-wise mapping function that loops through the hours in a day, with an inner loop for each 15-minute increment.
Each loop should add a column to the RDD row dictionary.
I've confirmed the resulting schema contains these new columns, but I'm missing lots of data in the final output.
I'm not sure if it's a problem with the row iteration or the stacking.
This is the starting schema -
Any ideas?
final schema -
Updated code -
def create_time_block_columns(row_dict):
inhour = row_dict['inhour']
outhour = row_dict['outhour']
inminute = row_dict['inminute']
outminute = row_dict['outminute']
# loop through hours of day
for i in range(24):
# loop through quarter hour blocks
for j in range(1,5):
lowerBound = (j-1)*15
upperBound = j*15
# create column names like 't_0_0', 't_0_15', t_0_30', 't_0_45', 't_1_0', etc...
timeBlockColumnName = F't_{i}_{lowerBound}'
# Add a new key in the dictionary with the new column name and value.
# initialized to 0
row_dict[timeBlockColumnName] = 0
# if the employee was currently clocked in
if (inhour <= i) & (outhour >= i):
# if the inhour is the current time block hour and the outhour is in a future time block
# this means they worked the rest of the hour
# start_during_end_after
if (i == inhour) & (outhour > i):
if (inminute >= lowerBound):
row_dict[timeBlockColumnName] = (upperBound - inminute)
else:
row_dict[timeBlockColumnName] = 15
# if the current row is completely within the current time block [hour and minutes]
# this means they worked all 15 minutes of each hour quarter
elif (i < inhour) & (i > outhour):
row_dict[timeBlockColumnName] = 15
# if the inhour is before the current timeblock hour, and outhour is the current hour
# this means they worked all minutes in the current block up-to the outminute
elif (i < inhour) & (i == outhour):
if (outminute < lowerBound):
row_dict[timeBlockColumnName] = outminute - lowerBound
else:
row_dict[timeBlockColumnName] = 15
# if the inhour and outhour are the current timeblock hour, and they are the same hour,
# we'll calculated the difference between minutes
elif (i == inhour) & (i == outhour):
if (inminute >= lowerBound) & (outminute <= upperBound):
row_dict[timeBlockColumnName] = outminute - inminute
elif (inminute < lowerBound) & (outminute >= upperBound):
row_dict[timeBlockColumnName] = 15
elif (inminute >= lowerBound) & (outminute >= upperBound):
row_dict[timeBlockColumnName] = upperBound - inminute
elif (inminute < lowerBound) & (outminute <= upperBound):
row_dict[timeBlockColumnName] = outminute - lowerBound
# else: we don't do anything because the employee wasnt clocked in
return row_dict
mappedDF = Map.apply(frame = dyF, f = create_time_block_columns).toDF()
# output some interesting logs for debugging
mappedDF.printSchema()
# Build expression to stack new columns as rows
stack_expression = F"stack({24*4}"
for i in range(24):
for j in range(1,5):
stack_expression += F", 't_{i}_{(j-1)*15}', t_{i}_{(j-1)*15}"
stack_expression += ') as (time_block, minutes_worked)'
timeBlockDF = mappedDF.select('pos_key', 'p_dob', 'dob', 'employee', 'rate', 'jobcode', 'pay', 'overpay', 'minutes', F.expr(stack_expression))
timeBlockDF = timeBlockDF.filter('minutes_worked > 0') \
.withColumn("dob",F.col("dob").cast(DateType()))
# create time block identifier column
time_pattern = r't_(\d+)_(\d+)'
timeBlockDF = timeBlockDF.withColumn('time_block_hour', F.regexp_extract('time_block', time_pattern, 1).cast(IntegerType())) \
.withColumn('time_block_min', F.regexp_extract('time_block', time_pattern, 2).cast(IntegerType())) \
.drop('time_block') \
.withColumn('time_block_time', F.concat_ws(':', F.format_string("%02d", F.col('time_block_hour')), F.format_string("%02d", F.col('time_block_min')))) \
.withColumn('time_block_temp', F.concat_ws(' ', F.col('dob'), F.col('time_block_time'))) \
.withColumn('time_block_datetime', F.to_timestamp(F.col('time_block_temp'), 'yyyy-MM-dd HH:mm')) \
.withColumn('time_block_pay', ((F.col('pay') + F.col('overpay')) / F.col('minutes')) * F.col('minutes_worked')) \
.drop('time_block_temp', 'pay', 'overpay', 'minutes')
# output some interesting logs for debugging
timeBlockDF.printSchema()
The problem was with the udf.
There were several cases not handled by the conditions, but the stack expression was working fine.
Here is a working example [without considering shifts that span midnight].
def create_time_block_columns(row_dict):
    """Add one minutes-worked entry per 15-minute block of the day.

    Reads ``inhour``/``inminute`` (clock-in) and ``outhour``/``outminute``
    (clock-out) from ``row_dict`` and adds 96 keys named ``t_<hour>_<minute>``
    (``t_0_0``, ``t_0_15``, ``t_0_30``, ``t_0_45``, ``t_1_0``, ...).  Each
    value is the number of minutes, 0 to 15, that the shift overlaps that
    block.  ``row_dict`` is modified in place and also returned.

    Shifts that span midnight are not supported: when the clock-out time is
    earlier than the clock-in time every block is 0.
    """
    # Express both shift boundaries as minutes since midnight so that a
    # single overlap formula covers the first hour, the last hour, full
    # hours in between and shifts inside one hour.
    shift_start = row_dict['inhour'] * 60 + row_dict['inminute']
    shift_end = row_dict['outhour'] * 60 + row_dict['outminute']
    # loop through hours of day
    for i in range(24):
        # loop through quarter hour blocks
        for j in range(1, 5):
            lowerBound = (j - 1) * 15
            upperBound = j * 15
            timeBlockColumnName = F't_{i}_{lowerBound}'
            block_start = i * 60 + lowerBound
            block_end = i * 60 + upperBound
            # Length of the intersection of [shift_start, shift_end) and
            # [block_start, block_end); negative means no overlap at all.
            overlap = min(shift_end, block_end) - max(shift_start, block_start)
            row_dict[timeBlockColumnName] = max(0, overlap)
    return row_dict
mappedDF = Map.apply(frame = dyF, f = create_time_block_columns).toDF()
# output some interesting logs for debugging
mappedDF.printSchema()
# Build expression to stack new columns as rows
stack_expression = F"stack({24*4}"
for i in range(24):
for j in range(1,5):
stack_expression += F", 't_{i}_{(j-1)*15}', t_{i}_{(j-1)*15}"
stack_expression += ') as (time_block, minutes_worked)'
timeBlockDF = mappedDF.select('pos_key', 'p_dob', 'dob', 'employee', 'rate', 'jobcode', 'pay', 'overpay', 'minutes', F.expr(stack_expression))
timeBlockDF = timeBlockDF.filter('minutes_worked > 0') \
.withColumn("dob",F.col("dob").cast(DateType()))

Efficiently count IntervalVars between given start/end times

Is there an efficient way to count the number of IntervalVars between a given start and end time?
I'm trying to implement an employee rostering script. We have a demand that we have already generated that tells us how many employees should be working during a given interval.
What I would like to end up with is an IntVar for each i in the 24 (hourly) intervals, giving the total number of employees with starttime <= i <= endtime.
Below is a simple example.
from ortools.sat.python import cp_model
def main():
# init model
model = cp_model.CpModel()
emps = range(0,3)
emp_intervalvars = []
for e in emps:
start = model.NewIntVar(0,24,'st_e%i' % e)
end = model.NewIntVar(0,24,'et_e%i' % e)
dur = model.NewIntVar(0,24,'dur_e%i' % e)
pres = model.NewBoolVar('pres_e%i' % e)
interval = model.NewOptionalIntervalVar(start, dur, end, pres, 'interval_e%s' % e)
# calc start
model.Add(start == (end - dur)).OnlyEnforceIf(pres)
# make sure to set start/end to 0 if not present
model.Add(dur == 0).OnlyEnforceIf(pres.Not())
model.Add(start == 0).OnlyEnforceIf(pres.Not())
model.Add(end == 0).OnlyEnforceIf(pres.Not())
# make sure to set start/duration to > 0 if present
model.Add(dur > 0).OnlyEnforceIf(pres)
model.Add(end > 0).OnlyEnforceIf(pres)
# all emps between 8am and 6pm
model.Add(start >= 8).OnlyEnforceIf(pres)
model.Add(end <= 18).OnlyEnforceIf(pres)
if e == 0:
# lets say emp0 works mornings
model.Add(end <= 14)
elif e == 2:
# and emp2 works evenings
model.Add(start >= 11)
emp_intervalvars.append({
"present":pres,
"start":start,
"end":end,
"duration":dur,
"interval":interval
})
# simple objective
durations = list(map(lambda v: v["duration"], emp_intervalvars))
model.Maximize(sum(durations))
solver = cp_model.CpSolver()
solver.parameters.num_search_workers=8
solver.parameters.max_time_in_seconds=30
solver.parameters.log_search_progress=True
status = solver.Solve(model)
print(solver.StatusName(status))
for i,field in enumerate(model._CpModel__model.variables):
if field.name == '':
continue
print("{} : {}".format(field.name,solver._CpSolver__solution.solution[i]))
return
if __name__ == '__main__':
main()
a few comments:
# calc start
model.Add(start == (end - dur)).OnlyEnforceIf(pres)
This is already enforced by the interval var (which is actually exactly this constraint).
model.Add(end > 0).OnlyEnforceIf(pres)
is most likely not useful. But you can keep it.
Now, to your question:
given start and end variables and a time i
overlap_i = model.NewBoolVar('overlap_%i' % i)
before_i = model.NewBoolVar('before_%i' % i)
after_i = model.NewBoolVar('after_%i' % i)
model.Add(start <= i).OnlyEnforceIf(overlap_i)
model.Add(end > i).OnlyEnforceIf(overlap_i) # Intervals are open ended on the right
model.Add(end <= i).OnlyEnforceIf(before_i)
model.Add(start > i).OnlyEnforceIf(after_i)
model.Add(overlap_i + before_i + after_i == 1)
should do the trick

A python 3.x KeyError with already 'dict' maybe..?

def get_data_list(file_object,column_number):
contents = []
for string in file_object:
contents.append(tuple(string.split(',')))
list = []
for i in range(len(contents) - 1):
list.append((contents[i + 1][0], float(contents[i + 1][column_number])))
list.sort()
return list
def average_data(list_of_tuples):
dict = {'01':'January','02':'Februday','03':'March','04':'April','05':'May','06':'June','07':'July','08':'August','09':'September','10':'October','11':'November','12':'December'}
current_month = 0
total = 0
count = 1
average_data = []
for k in list_of_tuples:
for data in k:
data = str(data)
month = data[4:6]
if month == current_month:
total += k[1]
if count != 1:
count += 1
else:
current_month = month
average = float(total/count)
average_data.append((float(average),dict[data[4:6]]+data[0:4]))
total = 0
average_data = sorted(average_data)
return average_data
These are my code but when I try to run it returns an error:
KeyError:'28'or some other keyError with numbers
But I thought I already set all the numbers in dict...
And also, the data[4:6] comes from the date number, such as 20160407, 20141105.

Trouble with Python Code. Objective is to code suffix during input

Objective: Write a function that takes an integer as its only parameter and returns the ordinal abbreviation for that integer as its only result. For example, if your function is passed the integer 1 then it should return the string "1st". If it is passed the integer 12 then it should return the string "12th". If it is passed 2003 then it should return the string "2003rd". Your function must not print anything on the screen.
def convert (n):
self.num = num
n = int(self.num)
if 4 <= n <= 20:
suffix = 'th'
elif n == 1 or (n % 10) == 1:
suffix = 'st'
elif n == 2 or (n % 10) == 2:
suffix = 'nd'
elif n == 3 or (n % 10) == 3:
suffix = 'rd'
elif n < 100:
suffix = 'th'
ord_num = str(n) + suffix
return ord_num
def main ():
day = int(input("Enter the day:"))
month = int(input("Enter the month:"))
year = int(input("Enter the year:"))
print("on the %n" %n, convert(day), "day of the %n" %month,
convert(month), "month of the %n" %year, convert(year),",
something amazing happened!")
main()
This is my code however it keeps saying I haven't defined n when I run it. But above I've already defined it so not sure what the problem is.
This is probably closer to what you want:
def convert(n):
    """Return the ordinal abbreviation of ``n``, e.g. 1 -> "1st".

    ``n`` is anything ``int()`` accepts.  Numbers whose last two digits are
    11, 12 or 13 take "th" (11th, 112th, 2013th); otherwise the last digit
    decides: 1 -> "st", 2 -> "nd", 3 -> "rd", everything else -> "th".
    """
    n = int(n)
    # abs() so a leading minus sign does not change the digits inspected.
    last_two = abs(n) % 100
    if 11 <= last_two <= 13:
        # The "teens" are irregular: eleventh, twelfth, thirteenth.
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(last_two % 10, 'th')
    return str(n) + suffix
def main():
    """Ask for a day, month and year and print them as ordinals."""
    prompts = ("Enter the day: ", "Enter the month: ", "Enter the year: ")
    # Read and validate the three numbers in the order they are prompted.
    day, month, year = [int(input(prompt)) for prompt in prompts]
    ordinals = (convert(day), convert(month), convert(year))
    print("on the %s day of the %s month of the %s, something amazing happened!" % ordinals)
main()
There are a few issues. You cannot use n in main() when it is only defined inside convert(). Also, %n is not a valid format character. suffix must have a value on every path (for example by initialising it before the if/elif chain), because the year is also run through the conversion function and can be larger than 100. Also, you probably copied the code from within a class definition, so I removed the self.

Categories

Resources