Is there a more efficient way of writing the following? I currently have this set up to calculate using a for-loop, and at this pace it will take a few days to run.
I am forecasting demand over a period of 6 years on a weekly basis (52 weeks) broken down by product type (586 types) and zip code (892 unique ZIPs). The rand arrays are the parameter demand shares for each year drawn from a normal distribution and have dimensions [#weeks/#types/#zips x 6]. The demand growth array is the annual demand for each year.
I ultimately need to produce a data frame that has the following:
Year | Week of the Year | Product | Zip Code | Qty
This is what I currently have
# Annual demand for each forecast year, plus random demand shares whose
# columns are indexed by year (rows: week / product type / ZIP code).
demand_growth = [10, 15, 20, 23, 26, 30]
rand_week_total = np.random.rand(52, 6)
rand_product_total = np.random.rand(586, 6)
rand_zipcode_total = np.random.rand(892, 6)

# Parallel output columns: Year | Week of the Year | Product | Zip Code | Qty.
forecast_year, forecast_week, forecast_product = [], [], []
forecast_ZIP, forecast_qty = [], []

for yr in range(len(years)):
    for wk in range(len(week)):
        for pr in range(len(product)):
            for zp in range(len(zipcode)):
                a = np.rint(
                    demand_growth[yr]
                    * rand_week_total[wk, yr]
                    * rand_product_total[pr, yr]
                    * rand_zipcode_total[zp, yr]
                )
                # Combinations whose rounded quantity is zero are left out.
                if a == 0:
                    continue
                forecast_year.append(years[yr])
                forecast_week.append(week[wk])
                forecast_product.append(product[pr])
                forecast_ZIP.append(zipcode[zp])
                forecast_qty.append(a)
'''
Edited: included examples of the arrays being multiplied
Any recommendations would be greatly appreciated!
I think you can do more than that by studying how to use arrays and/or threading. For now, the best I got was 3x faster. I used smaller array sizes so as not to spend the night on this.
import numpy as np
import timeit
def f1():
    """Benchmark baseline: evaluate the full four-factor product in the innermost loop.

    Draws fresh random share tables, walks every (year, week, product, zip)
    combination and collects the combinations whose rounded quantity is
    non-zero into five parallel lists. The lists are discarded; the function
    exists only to be timed and returns None.
    """
    demand_growth = np.array([10, 15, 20, 23, 26, 30])
    rand_week_total = np.random.rand(52, 6)
    rand_product_total = np.random.rand(23, 6)
    rand_zipcode_total = np.random.rand(43, 6)

    # Five years are iterated, so the sixth column of the tables is never read.
    years = np.arange(2015, 2020)
    weeks = np.arange(52)
    product = np.arange(23)
    zipcode = np.arange(43)

    forecast_year, forecast_week, forecast_product = [], [], []
    forecast_ZIP, forecast_qty = [], []

    for yr in range(years.size):
        for wk in range(weeks.size):
            for pr in range(product.size):
                for zp in range(zipcode.size):
                    qty = np.rint(
                        demand_growth[yr]
                        * rand_week_total[wk, yr]
                        * rand_product_total[pr, yr]
                        * rand_zipcode_total[zp, yr]
                    )
                    if qty == 0:
                        continue
                    forecast_year.append(years[yr])
                    forecast_week.append(weeks[wk])
                    forecast_product.append(product[pr])
                    forecast_ZIP.append(zipcode[zp])
                    forecast_qty.append(qty)
def f2():
    """Benchmark variant: hoist the partial products out of the inner loops.

    Same iteration order and collected values as the baseline, but the
    year*week factor is computed once per week and the year*week*product
    factor once per product, leaving one multiplication in the innermost
    loop. The collected lists are discarded; returns None.
    """
    demand_growth = np.array([10, 15, 20, 23, 26, 30])
    rand_week_total = np.random.rand(52, 6)
    rand_product_total = np.random.rand(23, 6)
    rand_zipcode_total = np.random.rand(43, 6)

    # Five years are iterated, so the sixth column of the tables is never read.
    years = np.arange(2015, 2020)
    weeks = np.arange(52)
    product = np.arange(23)
    zipcode = np.arange(43)

    forecast_year, forecast_week, forecast_product = [], [], []
    forecast_ZIP, forecast_qty = [], []

    for yr in range(years.size):
        for wk in range(weeks.size):
            year_week = demand_growth[yr] * rand_week_total[wk, yr]
            for pr in range(product.size):
                year_week_product = year_week * rand_product_total[pr, yr]
                for zp in range(zipcode.size):
                    qty = np.rint(year_week_product * rand_zipcode_total[zp, yr])
                    if qty == 0:
                        continue
                    forecast_year.append(years[yr])
                    forecast_week.append(weeks[wk])
                    forecast_product.append(product[pr])
                    forecast_ZIP.append(zipcode[zp])
                    forecast_qty.append(qty)
def f3():
    """Benchmark variant: vectorise over the year axis.

    For each (week, product, zip) combination the quantities of all year
    columns are computed in one array expression; the non-zero entries of the
    first len(years) columns are then collected one by one. The collected
    lists are discarded; returns None.
    """
    demand_growth = np.array([10, 15, 20, 23, 26, 30])
    rand_week_total = np.random.rand(52, 6)
    rand_product_total = np.random.rand(23, 6)
    rand_zipcode_total = np.random.rand(43, 6)

    # Five years are collected although the arrays carry six year columns.
    years = np.arange(2015, 2020)
    weeks = np.arange(52)
    product = np.arange(23)
    zipcode = np.arange(43)

    forecast_year, forecast_week, forecast_product = [], [], []
    forecast_ZIP, forecast_qty = [], []

    for wk in range(weeks.size):
        per_week = demand_growth * rand_week_total[wk]
        for pr in range(product.size):
            per_week_product = per_week * rand_product_total[pr]
            for zp in range(zipcode.size):
                qty_by_year = np.rint(per_week_product * rand_zipcode_total[zp])
                for yr in range(years.size):
                    if qty_by_year[yr] == 0:
                        continue
                    forecast_year.append(years[yr])
                    forecast_week.append(weeks[wk])
                    forecast_product.append(product[pr])
                    forecast_ZIP.append(zipcode[zp])
                    forecast_qty.append(qty_by_year[yr])
# Print, for each variant in turn, the total time in seconds of five calls.
for variant in (f1, f2, f3):
    print(timeit.Timer(variant).timeit(5))
I have a file that contains 3 lists with pairs of coordinates. I would like to read the files and separate the first field as a name and the rest as coordinates. However, I don't know how to do this.
I am using the following code to read the txt file.
# Read the file one line at a time and print each line split on single spaces.
arquivo = open('dados_utm.txt', 'rt')
while True:
    t = arquivo.readline()
    t1 = t.split(' ')
    print(t1)
    # readline() returns '' only at end of file; that final empty read is
    # still split and printed (as ['']) before the loop stops.
    if t == '':
        break
Output:
['Poly', '"Pampulha"', '420545.,8039109.', '421826.,8039269.',
'424213.,8041682.', '424189.,8043000.', '424331.,8044861.',
'426457.,8047689.', '427082.,8047013.', '427713.,8044612.',
'427710.,8042703.', '428712.,8040642.', '428713.,8040196.',
'428790.,8039499.', '428356.,8038819.', '427844.,8039050.',
'426759.,8038697.', '426595.,8035314.', '427213.,8033950.',
'426558.,8030343.', '426113.,8030041.', '420041.,8030502.',
'419081.,8031438.', '419678.,8037604.', '420545.,8039109.\n']
['Poly',
'"Jacaré"', '425846.,8055763.', '424723.,8057841.',
'422398.,8058414.', '413568.,8058765.', '410307.,8060688.',
'403022.,8068114.', '402543.,8071067.', '403423.,8071846.',
'417134.,8073069.', '419408.,8074047.', '424638.,8068255.',
'429946.,8065755.', '430183.,8064351.', '433594.,8058696.',
'434290.,8058940.', '434296.,8057197.', '431016.,8051616.',
'430041.,8051612.', '428278.,8051122.\n']
['Poly', '"Patos"',
'437525.,7991091.', '439184.,7993615.', '435440.,8005422.',
'437290.,8006397.', '443981.,8000217.', '445662.,7995572.',
'448275.,7988217.', '446432.,7984918.', '438654.,7985476.',
'437525.,7991091.'] ['']
The second step is to separate the x and y coordinates for different variables. For this I am using the following code.
# Attempt to split the coordinate pairs into separate x and y values.
# NOTE: t1 is a list, and t1[1,0] indexes it with the tuple (1, 0); this is the
# expression behind "TypeError: list indices must be integers or slices, not tuple".
for i in t1[1,0]:
x = []
y = []
# NOTE: readline() is called on t1 here, but t1 is the list produced by split(),
# not the open file object.
xy = t1.readline()
# Split one "x,y" pair on the comma and convert both halves to float.
xy = xy.split(',')
x.append(float(xy[0]))
y.append(float(xy[1]))
print(x, y)
However I have the following error:
TypeError: list indices must be integers or slices, not tuple
txt file:
Poly "Pampulha" 420545.,8039109. 421826.,8039269. 424213.,8041682.
424189.,8043000. 424331.,8044861. 426457.,8047689. 427082.,8047013. 427713.,8044612. 427710.,8042703. 428712.,8040642. 428713.,8040196. 428790.,8039499. 428356.,8038819. 427844.,8039050. 426759.,8038697. 426595.,8035314. 427213.,8033950. 426558.,8030343. 426113.,8030041. 420041.,8030502. 419081.,8031438. 419678.,8037604. 420545.,8039109.
Poly "Jacaré" 425846.,8055763. 424723.,8057841. 422398.,8058414.
413568.,8058765. 410307.,8060688. 403022.,8068114. 402543.,8071067. 403423.,8071846. 417134.,8073069. 419408.,8074047. 424638.,8068255. 429946.,8065755. 430183.,8064351. 433594.,8058696. 434290.,8058940. 434296.,8057197. 431016.,8051616. 430041.,8051612. 428278.,8051122.
Poly "Patos" 437525.,7991091. 439184.,7993615. 435440.,8005422.
437290.,8006397. 443981.,8000217. 445662.,7995572. 448275.,7988217. 446432.,7984918. 438654.,7985476. 437525.,7991091.
what am I doing wrong?
You need more than one list because you're overwriting t1 on every iteration. You got that error from having [1, 0] after t1 in your for loop. Also, t1 is a list, so readline() won't work on it.
This should work and put coords as lists of tuples into dict t2 with the names as keys:
def parse_polygons(lines):
    """Parse polygon records into a dict mapping name -> list of (x, y) tuples.

    Each non-blank line is expected to look like
    ``Poly "Name" x1,y1 x2,y2 ...``. The key is the first two fields joined
    by a single space (e.g. ``'Poly "Pampulha"'``); every remaining field is
    split on the comma and converted to a pair of floats.

    Args:
        lines: any iterable of text lines (an open file works).

    Returns:
        dict of name -> list of (x, y) float tuples, in file order.

    Raises:
        ValueError: if a coordinate field is not of the form ``x,y`` with
            numeric halves.
    """
    polygons = {}
    for line in lines:
        # split() with no argument also drops the trailing newline and
        # tolerates repeated spaces.
        fields = line.split()
        if not fields:
            # Blank lines (including the empty read at end of file) carry no record.
            continue
        # Slicing takes exactly the two leading name fields; popping index 0
        # and then index 1 would remove the first coordinate instead.
        name = ' '.join(fields[:2])
        coords = []
        for pair in fields[2:]:
            x, y = pair.split(',')
            coords.append((float(x), float(y)))
        polygons[name] = coords
    return polygons


if __name__ == '__main__':
    # The context manager closes the file even if parsing fails.
    with open('dados_utm.txt', 'rt') as arquivo:
        t2 = parse_polygons(arquivo)
    print(t2)
You might want to think about pandas; it's a good library.
import pandas as pd


def polygons_to_columns(lines):
    """Parse polygon records into ``{name: {"X": [...], "Y": [...]}}``.

    Each non-blank line is expected to look like
    ``Poly "Name" x1,y1 x2,y2 ...``. The key is the first two fields joined
    by a space with the double quotes removed (e.g. ``'Poly Pampulha'``).
    The coordinate pairs are loaded into a two-column DataFrame, converted
    to float, and returned as plain lists.

    Args:
        lines: any iterable of text lines (an open file works).

    Returns:
        dict of name -> {"X": list of floats, "Y": list of floats}.
    """
    polygons = {}
    for line in lines:
        fields = line.split()
        if not fields:
            # Skip blank lines.
            continue
        name = ' '.join(fields[:2]).replace('"', '')
        # astype(float) turns the text halves ("420545.") into numbers.
        df = pd.DataFrame(
            [pair.split(',') for pair in fields[2:]],
            columns=["X", "Y"],
        ).astype(float)
        polygons[name] = {
            "X": df["X"].to_list(),
            "Y": df["Y"].to_list(),
        }
    return polygons


if __name__ == '__main__':
    # The context manager closes the file once it has been read.
    with open('untitled.txt', 'rt') as source:
        t2 = polygons_to_columns(source)
    print(t2)
I have this script I'm running to try to create a dataframe to summarize some statistics:
# Monthly source frames and the month numbers used as column labels.
month = [may, june, july, august, sept]
month_str = [5, 6, 7, 8, 9]

avg_age, avg_use, avg_kwh, avg_coll, avg_cred = [], [], [], [], []

for frame in month:
    # Zero values are excluded before each mean is taken.
    age = frame['Age']
    avg_age.append(age[age != 0].mean())

    usage = frame['AverageBilledUsage']
    avg_use.append(usage[usage != 0].mean())

    kwh = frame['AverageKWH']
    avg_kwh.append(kwh[kwh != 0].mean())

    collected = frame['Total Collected']
    avg_coll.append(collected[collected != 0].mean())

    # Credit scores additionally exclude the value 99999.
    score = frame['credit_score']
    avg_cred.append(score[(score != 0) & (score != 99999)].mean())

# One row per statistic, one column per month number.
pd.DataFrame(
    data=[avg_age, avg_use, avg_kwh, avg_coll, avg_cred],
    columns=month_str,
    index=['Age', 'Usage', 'kwh', 'collected', 'creditscore'],
)
It returns exactly what I want to see. But when I place it inside a function I get the following error:
AssertionError: 5 columns passed, passed data had 1 columns
Here is the code inside the function:
# Asker's failing version of get_nums; it is reported to raise
# "AssertionError: 5 columns passed, passed data had 1 columns".
def get_nums():
# Monthly source frames and the month numbers used as column labels.
months = [may,june,july,august,sept]
month_str = [5,6,7,8,9]
# One accumulator list per statistic; each receives one mean per month.
avg_age = []
avg_use = []
avg_kwh = []
avg_coll = []
avg_cred = []
for i in months:
# Zeros are excluded before each mean is taken.
avg_age.append(i[i['Age']!=0]['Age'].mean())
avg_use.append(i[i['AverageBilledUsage']!=0]['AverageBilledUsage'].mean())
avg_kwh.append(i[i['AverageKWH']!=0]['AverageKWH'].mean())
avg_coll.append(i[i['Total Collected']!=0]['Total Collected'].mean())
# Credit scores additionally exclude the value 99999.
avg_cred.append(i[(i['credit_score']!=0) & (i['credit_score']!=99999)]['credit_score'].mean())
# NOTE(review): according to the accompanying answer, this assignment sat inside the
# for loop in the asker's code, so the frame was built while every list still held a
# single value. The pasted text has lost its indentation; confirm against the original post.
this_df = pd.DataFrame(data = [avg_age,avg_use,avg_kwh,avg_coll,avg_cred],columns = month_str,index = ['Age','Usage','kwh','collected','creditscore'])
return this_df
You have a problem with the last line of the for loop in the function. this_df is being defined in every iteration of the loop.
The corrected code is below.
def get_nums():
    """Summarise the monthly frames as a table of per-month averages.

    Returns a DataFrame with one row per statistic ('Age', 'Usage', 'kwh',
    'collected', 'creditscore') and one column per month number (5-9).
    Zeros are ignored in every average; credit scores equal to 99999 are
    ignored as well.
    """
    months = [may, june, july, august, sept]
    month_str = [5, 6, 7, 8, 9]

    # Row label -> list of monthly means, filled in month order.
    averages = {'Age': [], 'Usage': [], 'kwh': [], 'collected': [], 'creditscore': []}
    for frame in months:
        averages['Age'].append(frame.loc[frame['Age'] != 0, 'Age'].mean())
        averages['Usage'].append(
            frame.loc[frame['AverageBilledUsage'] != 0, 'AverageBilledUsage'].mean()
        )
        averages['kwh'].append(frame.loc[frame['AverageKWH'] != 0, 'AverageKWH'].mean())
        averages['collected'].append(
            frame.loc[frame['Total Collected'] != 0, 'Total Collected'].mean()
        )
        valid_score = (frame['credit_score'] != 0) & (frame['credit_score'] != 99999)
        averages['creditscore'].append(frame.loc[valid_score, 'credit_score'].mean())

    # Built once, after the loop, when every row holds one value per month.
    this_df = pd.DataFrame(
        data=list(averages.values()),
        columns=month_str,
        index=list(averages),
    )
    return this_df
Based on my understanding, you do not need the for loop here:
month = [may, june, july, august, sept]
month_str = [5, 6, 7, 8, 9]
# Stack the monthly frames under an outer index level keyed by month number.
df = pd.concat(month, keys=month_str)
# `|` binds tighter than `==`, so each comparison needs its own parentheses;
# without them the expression becomes a chained comparison around `0 | df`.
# NOTE(review): this blanks 0 and 99999 in every column, not only in
# credit_score -- confirm that is acceptable for the other columns.
df = df.mask((df == 0) | (df == 99999))
# Mean of each column per month, transposed so that months become the columns.
df.groupby(level=0).mean().T
Question source: SPOJ.. ORDERS
def swap(ary, idx1, idx2):
    """Exchange, in place, the elements of ary at positions idx1 and idx2."""
    ary[idx1], ary[idx2] = ary[idx2], ary[idx1]
def mkranks(size):
    """Return the list [1, 2, ..., size]; empty when size is less than 1.

    Built directly from range() rather than by repeated list concatenation,
    which copied the whole list on every step.
    """
    return list(range(1, size + 1))
def permutations(ordered, movements):
    """Shift each element of ordered left by its step count, in place.

    For every index from 1 upward (the leftmost element never moves), the
    element starting at that index is swapped with its left neighbour
    int(movements[index]) times. The same list object is returned.

    NOTE(review): the result lists starting positions in final order; check
    whether the task expects the inverse mapping instead.
    """
    for start in range(1, len(ordered)):
        pos = start
        for _ in range(0, int(movements[start])):
            swap(ordered, pos, pos - 1)
            pos -= 1
    return ordered
numberofcases = input()
for i in range(0, numberofcases):
sizeofcase = input()
tmp = raw_input()
movements = ""
for i in range(0, len(tmp)):
if i % 2 != 1:
movements = movements + tmp[i]
ordered = mkranks(sizeofcase)
ordered = permutations(ordered, movements)
output = ""
for i in range(0, sizeofcase - 1):
output = output + str(ordered[i]) + " "
output = output + str(ordered[sizeofcase - 1])
print output
Having made your code a bit more Pythonic (but without altering its flow/algorithm):
def swap(ary, idx1, idx2):
    """Exchange, in place, the elements of ary at positions idx1 and idx2."""
    ary[idx1], ary[idx2] = ary[idx2], ary[idx1]
def permutations(ordered, movements):
    """Shift each element of ordered left by its step count, in place.

    For every index from 1 upward, the element starting at that index is
    swapped with its left neighbour movements[index] times (movements holds
    integers). The same list object is returned.

    NOTE(review): the result lists starting positions in final order; check
    whether the task expects the inverse mapping instead.
    """
    for i in range(1, len(ordered)):
        for j in range(movements[i]):
            swap(ordered, i - j, i - j - 1)
    return ordered
numberofcases = input()
for i in range(numberofcases):
sizeofcase = input()
movements = [int(s) for s in raw_input().split()]
ordered = [str(i) for i in range(1, sizeofcase+1)]
ordered = permutations(ordered, movements)
output = " ".join(ordered)
print output
I see it runs correctly in the sample case given at the SPOJ URL you indicate. What is your failing case?