How do I create a time series with 15min buckets in pyspark? - python

I'm trying to create a report that shows the total number of minutes worked by a group of employees in 15 minute increments.
The source table has the time in/out and total minutes worked, one record for each employee.
I've created an RDD row-wise mapping function that loops through the hours in a day, with an inner loop for each 15-minute increment.
Each loop should add a column to the RDD row dictionary.
I've confirmed the resulting schema contains these new columns, but I'm missing lots of data in the final output.
I'm not sure if it's a problem with the row iteration or the stacking.
This is the starting schema -
Any ideas?
final schema -
Updated code -
def create_time_block_columns(row_dict):
    """Add one 't_<hour>_<minute>' key per quarter-hour block of the day to row_dict.

    Reads the 'inhour', 'inminute', 'outhour' and 'outminute' keys, writes 96 new
    keys (each initialised to 0 and then overwritten by whichever branch below
    matches) and returns the same dictionary.
    """
    inhour = row_dict['inhour']
    outhour = row_dict['outhour']
    inminute = row_dict['inminute']
    outminute = row_dict['outminute']
    # loop through hours of day
    for i in range(24):
        # loop through quarter hour blocks
        for j in range(1,5):
            lowerBound = (j-1)*15
            upperBound = j*15
            # create column names like 't_0_0', 't_0_15', 't_0_30', 't_0_45', 't_1_0', etc...
            timeBlockColumnName = F't_{i}_{lowerBound}'
            # Add a new key in the dictionary with the new column name and value,
            # initialized to 0
            row_dict[timeBlockColumnName] = 0
            # only hours between the clock-in hour and the clock-out hour (inclusive)
            # are examined; every other block keeps the 0 written above
            if (inhour <= i) & (outhour >= i):
                # clock-in happens in this hour and clock-out in a later hour:
                # the rest of the hour was worked (start_during_end_after)
                if (i == inhour) & (outhour > i):
                    if (inminute >= lowerBound):
                        # NOTE(review): negative when inminute >= upperBound, i.e. for
                        # quarter-hour blocks that end before the clock-in minute
                        row_dict[timeBlockColumnName] = (upperBound - inminute)
                    else:
                        row_dict[timeBlockColumnName] = 15
                # intended for hours strictly between clock-in and clock-out (all 15
                # minutes of every quarter worked).
                # NOTE(review): unreachable - the enclosing guard requires inhour <= i,
                # so `i < inhour` is never true and those hours keep the value 0
                elif (i < inhour) & (i > outhour):
                    row_dict[timeBlockColumnName] = 15
                # intended for the clock-out hour of a shift that started in an earlier hour.
                # NOTE(review): unreachable for the same reason (`i < inhour`)
                elif (i < inhour) & (i == outhour):
                    if (outminute < lowerBound):
                        row_dict[timeBlockColumnName] = outminute - lowerBound
                    else:
                        row_dict[timeBlockColumnName] = 15
                # clock-in and clock-out fall in this same hour:
                # work out the minutes from the difference between the two
                elif (i == inhour) & (i == outhour):
                    if (inminute >= lowerBound) & (outminute <= upperBound):
                        row_dict[timeBlockColumnName] = outminute - inminute
                    elif (inminute < lowerBound) & (outminute >= upperBound):
                        row_dict[timeBlockColumnName] = 15
                    elif (inminute >= lowerBound) & (outminute >= upperBound):
                        row_dict[timeBlockColumnName] = upperBound - inminute
                    elif (inminute < lowerBound) & (outminute <= upperBound):
                        row_dict[timeBlockColumnName] = outminute - lowerBound
            # else: we don't do anything because the employee wasn't clocked in
    return row_dict
mappedDF = Map.apply(frame = dyF, f = create_time_block_columns).toDF()
# output some interesting logs for debugging
mappedDF.printSchema()
# Build expression to stack new columns as rows
# One ('<column name>', <column>) pair per quarter-hour block of the day, so that
# stack() turns the 96 block columns into (time_block, minutes_worked) rows.
block_names = [F"t_{hour}_{minute}" for hour in range(24) for minute in range(0, 60, 15)]
stack_pairs = "".join(F", '{name}', {name}" for name in block_names)
stack_expression = F"stack({len(block_names)}{stack_pairs}) as (time_block, minutes_worked)"
timeBlockDF = mappedDF.select('pos_key', 'p_dob', 'dob', 'employee', 'rate', 'jobcode', 'pay', 'overpay', 'minutes', F.expr(stack_expression))
timeBlockDF = timeBlockDF.filter('minutes_worked > 0') \
.withColumn("dob",F.col("dob").cast(DateType()))
# create time block identifier column
time_pattern = r't_(\d+)_(\d+)'
timeBlockDF = timeBlockDF.withColumn('time_block_hour', F.regexp_extract('time_block', time_pattern, 1).cast(IntegerType())) \
.withColumn('time_block_min', F.regexp_extract('time_block', time_pattern, 2).cast(IntegerType())) \
.drop('time_block') \
.withColumn('time_block_time', F.concat_ws(':', F.format_string("%02d", F.col('time_block_hour')), F.format_string("%02d", F.col('time_block_min')))) \
.withColumn('time_block_temp', F.concat_ws(' ', F.col('dob'), F.col('time_block_time'))) \
.withColumn('time_block_datetime', F.to_timestamp(F.col('time_block_temp'), 'yyyy-MM-dd HH:mm')) \
.withColumn('time_block_pay', ((F.col('pay') + F.col('overpay')) / F.col('minutes')) * F.col('minutes_worked')) \
.drop('time_block_temp', 'pay', 'overpay', 'minutes')
# output some interesting logs for debugging
timeBlockDF.printSchema()

The problem was with the udf.
There were several cases not handled by the conditions, but the stack expression was working fine.
Here is a working example [without considering shifts that span midnight].
def create_time_block_columns(row_dict):
    """Add the minutes worked in every quarter-hour block of the day to a row.

    The shift is read from the 'inhour'/'inminute' (clock-in) and
    'outhour'/'outminute' (clock-out) keys. For each of the 96 quarter-hour blocks
    a key named 't_<hour>_<minute>' ('t_0_0', 't_0_15', ..., 't_23_45') is written,
    holding how many minutes of the shift fall inside that block (0 to 15).

    Shifts that span midnight (clock-out earlier than clock-in) are not handled and
    produce 0 for every block.

    Args:
        row_dict: mutable mapping for one row; it is modified in place.

    Returns:
        The same ``row_dict`` with the 96 block keys added.
    """
    # Work in minutes since midnight so that every case (shift starts in the block,
    # ends in it, covers it entirely, or lies inside it) reduces to one interval
    # overlap. A branch per case is easy to get wrong: testing `i < inhour` inside an
    # `inhour <= i` guard can never match, which drops the final hour of a shift.
    shift_start = row_dict['inhour'] * 60 + row_dict['inminute']
    shift_end = row_dict['outhour'] * 60 + row_dict['outminute']

    for hour in range(24):
        for minute in range(0, 60, 15):
            block_start = hour * 60 + minute
            block_end = block_start + 15
            overlap = min(shift_end, block_end) - max(shift_start, block_start)
            # A negative overlap means the block lies entirely outside the shift.
            row_dict[F't_{hour}_{minute}'] = max(0, overlap)
    return row_dict
mappedDF = Map.apply(frame = dyF, f = create_time_block_columns).toDF()
# output some interesting logs for debugging
mappedDF.printSchema()
# Build expression to stack new columns as rows
# Collect the block count followed by one "'name', column" argument per quarter-hour
# block, then join everything into the stack() expression in a single step.
stack_args = [str(24 * 4)]
for hour in range(24):
    for quarter in range(4):
        column = F"t_{hour}_{quarter * 15}"
        stack_args.append(F"'{column}', {column}")
stack_expression = "stack(" + ", ".join(stack_args) + ") as (time_block, minutes_worked)"
timeBlockDF = mappedDF.select('pos_key', 'p_dob', 'dob', 'employee', 'rate', 'jobcode', 'pay', 'overpay', 'minutes', F.expr(stack_expression))
timeBlockDF = timeBlockDF.filter('minutes_worked > 0') \
.withColumn("dob",F.col("dob").cast(DateType()))

Related

how to select a value referring to the last repeated item in a list

I have 5 columns ( NO = index of vehicle / LEADER = index of the vehicle of the front / SEC = instant in seconds / X = position of the vehicle)
Some vehicles stop ( X stay the same for a while) and I want to get the exact time it starts to move again. Then, calculate the difference between their instant and their respective 'leader' vehicle.
I made a code but it has so many bugs
OBS: Some vehicles never stop or stop,but never return to move. I want to remove them too.
Here's my code:
dados = pd.read_excel('teste-reaction_001.xlsx')
n=dados["NO"].unique()
final=np.zeros(1)
for i in n:
botao = 0
array_aux=dados[dados["NO"] == i ]["X"]
df = pd.DataFrame(array_aux)
aux=df.diff().to_numpy()
count1 = 0
aux1 = []
for j in aux:
if j == 0:
botao = 1
elif j != 0 and botao == 1:
aux1=np.where(aux==j)[0]
aux1=aux1[np.where(aux1>=count1)[0]]
break
else :
botao = 0
count1 = count1 + 1
aux2=dados["SEC"][dados[dados["NO"]==i]["SEC"].index[aux1]].values[0]
final=np.append(final,aux2)
tr=np.zeros(1)
for i in n:
aux=dados[dados["NO"] == i ]["LEADER"].unique()[0]
aux1=np.where(dados["NO"].unique()==i)[0]
aux2=np.where(dados["NO"].unique()==aux)[0]
if aux2>-1:
aux3=final[int(aux1)]-final[aux2]
else:
aux3 = "s"
tr=np.append(tr,aux3)
columns = ["N", "TR"]
tabela = np.array([dados["NO"].unique(), tr[1:]])
res = pd.DataFrame(data=tabela.T,index=np.arange(len(tabela.T)), columns=columns)

Issues with replicating results from R to python by writing customised function

I am trying to convert the R code to python by writing customised function or without function in python based on this following lines of code
customers_df$segment = "NA"
customers_df$segment[which(customers_df$recency > 365*3)] = "inactive"
customers_df$segment[which(customers_df$recency <= 365*3 & customers_df$recency > 365*2)] = "cold"
customers_df$segment[which(customers_df$recency <= 365*2 & customers_df$recency > 365*1)] = "warm"
customers_df$segment[which(customers_df$recency <= 365)] = "active"
customers_df$segment[which(customers_df$segment == "warm" & customers_df$first_purchase <= 365*2)] = "new warm"
customers_df$segment[which(customers_df$segment == "warm" & customers_df$amount < 100)] = "warm low value"
customers_df$segment[which(customers_df$segment == "warm" & customers_df$amount >= 100)] = "warm high value"
customers_df$segment[which(customers_df$segment == "active" & customers_df$first_purchase <= 365)] = "new active"
customers_df$segment[which(customers_df$segment == "active" & customers_df$amount < 100)] = "active low value"
customers_df$segment[which(customers_df$segment == "active" & customers_df$amount >= 100)] = "active high value"
table(customers_2015$segment)
active high value active low value cold inactive
573 3313 1903 9158
new active new warm warm high value warm low value
1512 938 119 901
Python Function
I tried to replicate the code above in Python by writing functions. However, I do not get the same categories as in R, and the counts in each category also differ.
def mang_segment(s):
    """Classify one customer row by its 'recency' value.

    'recency' is compared against multiples of 365 (presumably days - confirm
    against the data): above three -> "inactive", above two -> "cold",
    above one -> "warm", otherwise "active".

    Returns None when no comparison holds (for example a NaN recency).
    """
    recency = s['recency']
    year = 365
    if recency > 3 * year:
        return "inactive"
    if 2 * year < recency <= 3 * year:
        return "cold"
    if year < recency <= 2 * year:
        return "warm"
    if recency <= year:
        return "active"
    return None
def mang_segment_up(s):
    """Refine the coarse 'segment' of one customer row into its final segment.

    "warm" rows become "new warm" (first_purchase <= 365*2), otherwise
    "warm low value" (amount < 100) or "warm high value" (amount >= 100).
    "active" rows become "new active" (first_purchase <= 365), otherwise
    "active low value" or "active high value" by the same amount threshold.
    The first matching rule wins, mirroring the order of the R assignments.

    Any row that matches no rule ("inactive", "cold", or a warm/active row whose
    amount cannot be compared, e.g. NaN) keeps its current segment. Falling off
    the end and returning None would erase those segments when the result is
    assigned back to the 'segment' column.

    Args:
        s: row-like mapping with 'segment', 'first_purchase' and 'amount'.

    Returns:
        The refined segment label, or the unchanged ``s['segment']``.
    """
    segment = s['segment']
    if segment == "warm":
        if s['first_purchase'] <= 365 * 2:
            return "new warm"
        if s['amount'] < 100:
            return "warm low value"
        if s['amount'] >= 100:
            return "warm high value"
    elif segment == "active":
        if s['first_purchase'] <= 365:
            return "new active"
        if s['amount'] < 100:
            return "active low value"
        if s['amount'] >= 100:
            return "active high value"
    return segment
active low value 19664
warm low value 4083
active high value 3374
new active 1581
new warm 980
warm high value 561
Any pointer/suggestion would be appreciated.
Thanks in advance
I am a little confused about the purpose of the function (and whether it is working as you expect). If you are seeking to mimic your R code within a function, your syntax can line up much more closely with your initial code than it currently does. Assuming you are using pandas/numpy:
import numpy as np
import pandas as pd
#toy example
s = pd.DataFrame({'rec' : [2000, 1500, 3000, 750]})
def mang_segment (s):
s.loc[(s['rec'] > 365*3), 'seg'] = "inactive" # creating a new column seg with our values
s.loc[(s['rec'] <= 365*3) & (s['rec'] > 365*2), 'seg'] = "cold"
#etc...
#run function
mang_segment(s)
#get value counts
s['seg'].value_counts()
Here we add a column to our dataframe to capture our values, which we can later summarize. This is different from a function that returns a value: even if it were working and called appropriately, it would not assign the result directly to your data frame.
There are other functions and ways to get at this, too. Check out np.where as another option.

Efficiently count IntervalVars between given start/end times

Is there an efficient way to count the number of IntervalVars between a given start and end time?
I'm trying to implement an employee rostering script. We have a demand that we have already generated that tells us how many employees should be working during a given interval.
What I would like to end up with is an IntVar for each i in the 24 (hour) intervals, giving the total number of employees with starttime <= i <= endtime.
Below is a simple example.
from ortools.sat.python import cp_model
def main():
# init model
model = cp_model.CpModel()
emps = range(0,3)
emp_intervalvars = []
for e in emps:
start = model.NewIntVar(0,24,'st_e%i' % e)
end = model.NewIntVar(0,24,'et_e%i' % e)
dur = model.NewIntVar(0,24,'dur_e%i' % e)
pres = model.NewBoolVar('pres_e%i' % e)
interval = model.NewOptionalIntervalVar(start, dur, end, pres, 'interval_e%s' % e)
# calc start
model.Add(start == (end - dur)).OnlyEnforceIf(pres)
# make sure to set start/end to 0 if not present
model.Add(dur == 0).OnlyEnforceIf(pres.Not())
model.Add(start == 0).OnlyEnforceIf(pres.Not())
model.Add(end == 0).OnlyEnforceIf(pres.Not())
# make sure to set start/duration to > 0 if present
model.Add(dur > 0).OnlyEnforceIf(pres)
model.Add(end > 0).OnlyEnforceIf(pres)
# all emps between 8am and 6pm
model.Add(start >= 8).OnlyEnforceIf(pres)
model.Add(end <= 18).OnlyEnforceIf(pres)
if e == 0:
# lets say emp0 works mornings
model.Add(end <= 14)
elif e == 2:
# and emp2 works evenings
model.Add(start >= 11)
emp_intervalvars.append({
"present":pres,
"start":start,
"end":end,
"duration":dur,
"interval":interval
})
# simple objective
durations = list(map(lambda v: v["duration"], emp_intervalvars))
model.Maximize(sum(durations))
solver = cp_model.CpSolver()
solver.parameters.num_search_workers=8
solver.parameters.max_time_in_seconds=30
solver.parameters.log_search_progress=True
status = solver.Solve(model)
print(solver.StatusName(status))
for i,field in enumerate(model._CpModel__model.variables):
if field.name == '':
continue
print("{} : {}".format(field.name,solver._CpSolver__solution.solution[i]))
return
if __name__ == '__main__':
main()
a few comments:
# calc start
model.Add(start == (end - dur)).OnlyEnforceIf(pres)
This is already enforced by the interval var (which is actually exactly this constraint).
model.Add(end > 0).OnlyEnforceIf(pres)
is most likely not useful. But you can keep it.
Now, to your question:
given start and end variables and a time i
overlap_i = model.NewBoolVar('overlap_%i' % i)
before_i = model.NewBoolVar('before_%i' % i)
after_i = model.NewBoolVar('after_%i' % i)
model.Add(start <= i).OnlyEnforceIf(overlap_i)
model.Add(end > i).OnlyEnforceIf(overlap_i) # Intervals are open ended on the right
model.Add(end <= i).OnlyEnforceIf(before_i)
model.Add(start > i).OnlyEnforceIf(after_i)
model.Add(overlap_i + before_i + after_i == 1)
should do the trick

Difference between two similar if loops in Python

I have two codes which should perform the same thing but in the first, I am not getting the result but in the second one I am getting output
if (Method == "EMM" ):
if ((Loan_Obligation/12)+EMI) !=0:
DSCR_Post = EBITDA_EMM/((Loan_Obligation/12)+EMI)
else:
0
elif (Method != "EMM" ):
if ((Loan_Obligation/12)+EMI) !=0:
DSCR_Post = EBITDA/((Loan_Obligation/12)+EMI)
else:
0
and other one is:
if (Method == "EMM"):
DSCR_Post = EBITDA_EMM/((Loan_Obligation/12)+EMI) if ((Loan_Obligation/12)+EMI) !=0 else 0
else:
DSCR_Post = EBITDA/((Loan_Obligation/12)+EMI) if ((Loan_Obligation/12)+EMI) !=0 else 0
print('DSCR_Post:',DSCR_Post)
Can someone help me understand the difference between the two snippets?
In your first code snippet, you are not assigning the 0 to DSCR_Post as you do in the second. Modify as follows:
if Method == "EMM" :
if (Loan_Obligation / 12) + EMI !=0:
DSCR_Post = EBITDA_EMM / ((Loan_Obligation / 12) + EMI)
else:
DSCR_Post = 0 # the 0 has to be assigned!
else: # you do not need a condition here! It can either be equal or not, no third state possible.
if (Loan_Obligation / 12) + EMI !=0:
DSCR_Post = EBITDA / ((Loan_Obligation / 12) + EMI)
else:
DSCR_Post = 0
print('DSCR_Post:',DSCR_Post)
Which can be simplified to the following:
ebid = EBITDA_EMM if Method == "EMM" else EBITDA
DSCR_Post = 0 # 0 will be overwritten if ...
if (Loan_Obligation / 12) + EMI != 0:
DSCR_Post = ebid / ((Loan_Obligation / 12) + EMI)
print('DSCR_Post:',DSCR_Post)

How to create a list of totals for durations?

I want to calculate a bonus based on the two consecutive months in which sales were highest. So I can iterate over the total for every two consecutive months to find the max value, i.e. get
value = Max[total_between_firstdayMonth1_and_lastDayMonth2, total_between_firstdayMonth2_and_lastDayMonth3, ... , total_between_firstdaySecondToLastMonth_andlastDayLastMonth]
So I might need a list of pairs of datetime objects or something similar.
start= model.Order.order('created').get().created # get the oldest order
end = model.Order.order('-created').get().created # get the newest order
So inbetween start and end I must partition the time in overlapping pairs of consecutive 2 months eg. if first order was in december 2008 and the last order was in november 2011 then the list from where to pick the max should be [total_december2008 + total_january2009, total_january2009 + total_february2009, ... , total_october2011 + total_november2011]
But then how do I get the last day of the second month if I know the start like above? How can I create the list of times and totals?
I might not be able to create the list of totals right away but if I can create the list of starts and ends then I can call a helper function we can assume eg.
total(start_datetime, end_datetime)
Thanks for any help
Update
I think I found how to calculate the time for an example interval where the timeline is from any date to last day next month:
>>> d = date(2007,12,18)
>>> print d
2007-12-18
>>> d + relativedelta(months=2) - timedelta(days=d.day)
datetime.date(2008, 1, 31)
Update 2
I can calculate upto the first level the first duration. Now I only have to generalize it to loop through all the durations and check which was the highest level:
def level(self):
    """Return the level name earned in the distributor's first two-month period.

    The period starts on the creation date of the PAID order returned by the
    query for this distributor and ends one day before the date two months later.
    The level depends on the sum of ``personal_silver`` and ``non_manager_silver``
    over that period.
    """
    startdate = model.Order.all().filter('status =', 'PAID').filter('distributor_id =' , self._key.id()).get().created.date()
    last_day_nextmonth = startdate + relativedelta(months=2) - timedelta(days=1)
    total = self.personal_silver(startdate, last_day_nextmonth) + self.non_manager_silver(startdate, last_day_nextmonth)
    # Comparing with `< 25` and then `> 25` leaves a total of exactly 25 unmatched,
    # so no level would be assigned before the return. 25 counts as reaching the
    # threshold here.
    if total < 25:
        maxlevel = _('New distributor')
    else:
        maxlevel = _('Assistant Teamleader')
    return maxlevel
Update 3
Closer to what I mean is taking the max of some function values from the beginning up to now. The base case can be that the last day of next month is in the future, and the helper function can be recursive, but I didn't have the time or help to make it recursive, so for now it only works for the first 2 periods, i.e. 4 months from the start:
def level(self):
startdate = model.Order.all().filter('status =', 'PAID'
).filter('distributor_id =',
self._key.id()).get().created.date()
last_day_nextmonth = startdate + relativedelta(months=2) \
- timedelta(days=1)
total = self.personal_silver(startdate, last_day_nextmonth) + self.non_manager_silver(startdate, last_day_nextmonth)
if total >= 125:
level = 5
elif total >= 75:
level = 4
elif total >= 25:
level = 3
elif total >= 2:
level = 2
else:
level = 1
return self.levelHelp(level, last_day_nextmonth + timedelta(days=1))
def levelHelp(self, level, startdate):
#if startdate in future return level
last_day_nextmonth = startdate + relativedelta(months=2) \
- timedelta(days=1)
total = self.personal_silver(startdate, last_day_nextmonth) + self.non_manager_silver(startdate, last_day_nextmonth)
if total >= 125:
newlevel = 5
elif total >= 75:
newlevel = 4
elif total >= 25:
newlevel = 3
elif total >= 2:
newlevel = 2
else:
newlevel = 1
return level if level > newlevel else newlevel
Update 4
I added the recursion where base case is that next step is in the future, if so it will return the max level:
def level(self):
startdate = model.Order.all().filter('status =', 'PAID'
).filter('distributor_id =',
self._key.id()).get().created.date()
last_day_nextmonth = startdate + relativedelta(months=2) \
- timedelta(days=1)
total = self.personal_silver(startdate, last_day_nextmonth) + self.non_manager_silver(startdate, last_day_nextmonth)
if total >= 125:
level = 5
elif total >= 75:
level = 4
elif total >= 25:
level = 3
elif total >= 2:
level = 2
else:
level = 1
return self.levelHelp(level, last_day_nextmonth + timedelta(days=1))
def levelHelp(self, level, startdate):
last_day_nextmonth = startdate + relativedelta(months=2) \
- timedelta(days=1)
total = self.personal_silver(startdate, last_day_nextmonth) + self.non_manager_silver(startdate, last_day_nextmonth)
if total >= 125:
newlevel = 5
elif total >= 75:
newlevel = 4
elif total >= 25:
newlevel = 3
elif total >= 2:
newlevel = 2
else:
newlevel = 1
maxlevel = level if level > newlevel else newlevel
nextstart = last_day_nextmonth + timedelta(days=1)
now = datetime.now().date()
if nextstart > now: #next start in is the future
return maxlevel
else: return self.levelHelp(maxlevel, nextstart)
This sounds like a fine job for functional approach. At the end there is a full working example, but I just want to emphasize the elegance and simplicity of the core function, written in FP style:
def find_best_two_months(orders):
    """Return the two consecutive calendar months with the highest combined total.

    Args:
        orders: iterable of (order_date, amount) tuples; order_date must expose
            ``year`` and ``month``.

    Returns:
        A ("YYYY-MM - YYYY-MM", combined_amount) tuple. On a tie the earliest
        period is returned.

    Raises:
        ValueError: if the orders cover fewer than two different months.
    """
    first = lambda x: x[0]
    second = lambda x: x[1]
    # Number the months consecutively so that calendar neighbours differ by exactly
    # one. Pairing only the months that have orders would treat e.g. January and
    # March as consecutive whenever February had no orders.
    month_no = lambda d: d.year * 12 + d.month - 1
    label = lambda n: "%04d-%02d" % (n // 12, n % 12 + 1)
    sorted_orders = sorted(
        ((month_no(order_date), amount) for order_date, amount in orders),
        key=first)
    totals_by_month = dict(
        (n, sum(map(second, groupped_orders)))
        for n, groupped_orders in groupby(sorted_orders, key=first))
    if len(totals_by_month) < 2:
        raise ValueError("orders must cover at least two different months")
    totals_two_months = [
        ("%s - %s" % (label(n), label(n + 1)),
         totals_by_month.get(n, 0) + totals_by_month.get(n + 1, 0))
        for n in range(min(totals_by_month), max(totals_by_month))]
    return max(totals_two_months, key=second)
Here is a full working example with comments:
#!/usr/bin/python
from random import randint
from datetime import date, timedelta
from itertools import groupby
""" finding best two months the functional way """
def find_best_two_months(orders):
    """
    Expect a list of tuples of form (date_of_order, amount):
    [ (date1, amount1), (date2, amount2), ...]

    Return a ("YYYY-MM - YYYY-MM", combined_amount) tuple for the two consecutive
    calendar months with the highest combined amount (the earliest such period on
    a tie). Raise ValueError if the orders cover fewer than two different months.
    """
    # helper functions for extracting first or second from tuple
    first = lambda x: x[0]
    second = lambda x: x[1]

    # Months are numbered consecutively (year * 12 + month - 1) so that calendar
    # neighbours differ by exactly one; label() turns a number back into YYYY-MM.
    month_no = lambda d: d.year * 12 + d.month - 1
    label = lambda n: "%04d-%02d" % (n // 12, n % 12 + 1)

    # converts [(date, amount)] -> [(month number, amount)], sorted by month
    # (groupby only merges neighbouring items, so the sort is required)
    sorted_orders = sorted(
        ((month_no(order_date), amount) for order_date, amount in orders),
        key=first)

    # Compresses orders from the same month, so we get {month number: total_amount}
    totals_by_month = dict(
        (n, sum(map(second, groupped_orders)))
        for n, groupped_orders in groupby(sorted_orders, key=first))
    if len(totals_by_month) < 2:
        raise ValueError("orders must cover at least two different months")

    # Builds every two-month period between the first and last month with orders.
    # A month without orders counts as 0 instead of being skipped; skipping it
    # would pair months that are not actually consecutive.
    totals_two_months = [
        ("%s - %s" % (label(n), label(n + 1)),
         totals_by_month.get(n, 0) + totals_by_month.get(n + 1, 0))
        for n in range(min(totals_by_month), max(totals_by_month))]

    # Returns the two-month period with the maximum total amount. If there were
    # many periods with the max amount, only the first is returned.
    return max(totals_two_months, key=second)
"""
this code is for generating random list of orders
and is not a part of the solution
"""
# Bounds for the randomly generated demo orders.
MIN_AMOUNT = 70
MAX_AMOUNT = 500
MAX_DAY_SPREAD = 5


def gen_order(last_date):
    """Return a random (order_date, amount) tuple.

    The order date lies 0 to MAX_DAY_SPREAD days after ``last_date`` and the
    amount lies between MIN_AMOUNT and MAX_AMOUNT (both inclusive).
    """
    return (
        last_date + timedelta(days=randint(0, MAX_DAY_SPREAD)),  # new date
        randint(MIN_AMOUNT, MAX_AMOUNT),  # amount
    )


def gen_orders(total, start_date):
    """Return ``total`` random orders whose dates never decrease from ``start_date``."""
    orders = []
    last_date = start_date
    for _ in range(total):
        order = gen_order(last_date)
        orders.append(order)
        # the next order is dated relative to this one
        last_date = order[0]
    return orders


if __name__ == "__main__":
    orders = gen_orders(300, date(2010, 1, 1))
    # print as a call with one argument: valid on both Python 2 and Python 3
    print(find_best_two_months(orders))

Categories

Resources