Time efficiency by eliminating three for loops - python

I have a script similar to this:
import random
import pandas as pd

FA = []
FB = []
Value = []
df = pd.DataFrame()
df_save = pd.DataFrame(index=['min', 'max'])
days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
numbers = list(range(24))  # FA.unique()
mix = '(pairwise combination of days and numbers, i.e. 0Monday, 0Tuesday, ... 1Monday, 1Tuesday, ...)'  # I don't know how to do this combination btw

def Calculus():
    global min, max
    min = df['Value'][boolean].min()
    max = df['Value'][boolean].max()

for i in range(1000):
    FA.append(random.randrange(0, 23, 1))
    FB.append(random.choice(days))
    Value.append(random.random())

df['FA'] = FA
df['FB'] = FB
df['FAB'] = df['FA'].astype(str) + df['FB'].astype(str)
df['Value'] = Value
mix_factor = df['FA'].astype(str) + df['FB'].astype(str)

for i in numbers:
    boolean = df['FA'] == i
    Calculus()
    df_save[str(i)] = [min, max]

for i in days:
    boolean = df['FB'] == i
    Calculus()
    df_save[str(i)] = [min, max]

for i in mix_factor.unique():
    boolean = df['FAB'] == i
    Calculus()
    df_save[str(i)] = [min, max]
My question is: is there another way to do the same thing more time efficiently? My real data (df in this case) is a CSV with millions of rows, and these three loops are taking forever.
Maybe using 'apply', but I have never worked with it before.
Any insight will be much appreciated, thanks.

You could put all three loops into one, depending on what your exact code is. Is there a parameter for Calculus? If not, combining them into one loop would let you call Calculus() fewer times.
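For the pandas side of it, here is a minimal vectorized sketch, assuming the goal is just the min and max of Value per FA value, per FB value, and per FA+FB pair (names taken from the question's code). One groupby per factor replaces each loop:
import random
import pandas as pd

# Rebuild the toy data from the question.
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df = pd.DataFrame({
    'FA': [random.randrange(0, 23, 1) for _ in range(1000)],
    'FB': [random.choice(days) for _ in range(1000)],
    'Value': [random.random() for _ in range(1000)],
})
df['FAB'] = df['FA'].astype(str) + df['FB'].astype(str)

# .agg computes min and max for every group in a single pass over the frame.
per_fa = df.groupby('FA')['Value'].agg(['min', 'max'])
per_fb = df.groupby('FB')['Value'].agg(['min', 'max'])
per_fab = df.groupby('FAB')['Value'].agg(['min', 'max'])

# Stack the three results and transpose to get the df_save layout:
# one column per factor level, rows 'min' and 'max'.
per_fa.index = per_fa.index.astype(str)
df_save = pd.concat([per_fa, per_fb, per_fab]).T
print(df_save)
Each original loop scans the whole frame once per factor level to build a boolean mask, while groupby partitions the frame in a single pass, which is usually the difference that matters on millions of rows.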

Related

Filling 0's with Local Means

Hi, I am working on a dataset that has a host_id column and two other columns: reviews_per_month and number_of_reviews. For every host_id, the majority of the values in these two columns are present, whereas some of them are zeros. For each column, I want to replace those 0 values with the mean of all the values related to that host_id. Here is the code I have tried:
def process_rpm_nor(data):
    data['reviews_per_month'] = data['reviews_per_month'].fillna(0)
    data['number_of_reviews'] = data['number_of_reviews'].fillna(0)
    data_list = []
    for host_id in set(data['host_id']):
        data_temp = data[data['host_id'] == host_id]
        nor_non_zero = np.mean(data_temp[data_temp['number_of_reviews'] > 0]['number_of_reviews'])
        rpm_non_zero = np.mean(data_temp[data_temp['reviews_per_month'] > 0]['reviews_per_month'])
        data_temp['number_of_reviews'] = data_temp['number_of_reviews'].replace(0, nor_non_zero)
        data_temp['reviews_per_month'] = data_temp['reviews_per_month'].replace(0, rpm_non_zero)
        data_list.append(data_temp)
    return pd.concat(data_list, axis=1)
Though the code works, it takes a lot of time to process, and I was wondering if anyone could offer an alternative solution to this problem or help me optimize my code. I'd really appreciate the help.
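No answer is recorded here, but a minimal vectorized sketch (assuming the host_id, reviews_per_month and number_of_reviews columns named in the question) would replace the per-host loop with a grouped transform:
import pandas as pd

def process_rpm_nor(data):
    # Work on a copy so the caller's frame is untouched.
    data = data.copy()
    for col in ['reviews_per_month', 'number_of_reviews']:
        data[col] = data[col].fillna(0)
        # Mean of the non-zero values within each host_id, broadcast back to every row.
        group_mean = (data[col].where(data[col] > 0)
                               .groupby(data['host_id'])
                               .transform('mean'))
        # Replace only the zeros; non-zero values are left untouched.
        data[col] = data[col].mask(data[col] == 0, group_mean)
    return data
transform('mean') ignores the NaNs introduced by where, so only non-zero rows contribute to each host's mean, and the whole operation runs without a Python-level loop over host_id values.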

My program computes values as strings and not as floats even when I change the type

I have a problem with my program and I'm confused. I don't know why it won't change the type of the columns, or maybe it is changing the type of the columns and it just still computes them as strings. When I change the type to float and want a value multiplied by 8, it gives me, for example with 4, 44444444. Here is my code.
import pandas as pd
import re
import numpy as np

link = "excelfilett.txt"
file = open(link, "r")
frames = []
is_count_frames = False
for line in file:
    if "[Frames]" in line:
        is_count_frames = True
    if is_count_frames == True:
        frames.append(line)
    if "[EthernetRouting]" in line:
        break

number_of_rows = len(frames) - 3
header = re.split(r'\t', frames[1])
number_of_columns = len(header)
frame_array = np.full((number_of_rows, number_of_columns), 0)
df_frame_array = pd.DataFrame(frame_array)
df_frame_array.columns = header
for row in range(number_of_rows):
    frame_row = re.split(r'\t', frames[row + 2])
    for position in range(len(frame_row)):
        df_frame_array.iloc[row, position] = frame_row[position]

df_frame_array['[MinDistance (ms)]'].astype(float)
df_frame_array.loc[:, '[MinDistance (ms)]'] *= 8
print(df_frame_array['[MinDistance (ms)]'])
But it gives me the value repeated 8 times, like (100100...100100). I also tried putting them in a list:
MinDistList = df_frame_array['[MinDistance (ms)]'].tolist()
product = []
for i in MinDistList:
    product.append(i * 8)
print(product)
but it still won't work, any ideas?
df_frame_array['[MinDistance (ms)]'].astype(float) doesn't change the column in place, but returns a new one.
You had the right idea, so just store it back:
df_frame_array['[MinDistance (ms)]'] = df_frame_array['[MinDistance (ms)]'].astype(float)
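As a side note on the symptom itself: the 44444444 output is ordinary Python string repetition, not arithmetic, which is why converting the column (and storing the result back) fixes it:
print('4' * 8)         # '44444444'  -> repeating a string
print(float('4') * 8)  # 32.0        -> multiplying a number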

How to vectorize this peak finding for loop in Python?

Basically I'm writing a peak finding function that needs to be able to beat scipy.argrelextrema in benchmarking. Here is a link to the data I'm using, and the code:
https://drive.google.com/open?id=1U-_xQRWPoyUXhQUhFgnM3ByGw-1VImKB
If this link expires, the data can be found at dukascopy bank's online historical data downloader.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('EUR_USD.csv')
data.columns = ['Date', 'open', 'high', 'low', 'close', 'volume']
data.Date = pd.to_datetime(data.Date, format='%d.%m.%Y %H:%M:%S.%f')
data = data.set_index(data.Date)
data = data[['open', 'high', 'low', 'close']]
data = data.drop_duplicates(keep=False)
price = data.close.values

def fft_detect(price, p=0.4):
    trans = np.fft.rfft(price)
    trans[round(p*len(trans)):] = 0
    inv = np.fft.irfft(trans)
    dy = np.gradient(inv)
    peaks_idx = np.where(np.diff(np.sign(dy)) == -2)[0] + 1
    valleys_idx = np.where(np.diff(np.sign(dy)) == 2)[0] + 1
    patt_idx = list(peaks_idx) + list(valleys_idx)
    patt_idx.sort()
    label = [x for x in np.diff(np.sign(dy)) if x != 0]
    # Look for Better Peaks
    l = 2
    new_inds = []
    for i in range(0, len(patt_idx[:-1])):
        search = np.arange(patt_idx[i] - (l + 1), patt_idx[i] + (l + 1))
        if label[i] == -2:
            idx = price[search].argmax()
        elif label[i] == 2:
            idx = price[search].argmin()
        new_max = search[idx]
        new_inds.append(new_max)
    plt.plot(price)
    plt.plot(inv)
    plt.scatter(patt_idx, price[patt_idx])
    plt.scatter(new_inds, price[new_inds], c='g')
    plt.show()
    return peaks_idx, price[peaks_idx]
It basically smoothes the data using a fast Fourier transform (FFT), then takes the derivative to find the minimum and maximum indices of the smoothed data, then finds the corresponding peaks on the unsmoothed data. Sometimes the peaks it finds are not ideal due to smoothing effects, so I run this for loop to search for higher or lower points for each index between the bounds specified by l. I need help vectorizing this for loop! I have no idea how to do it. Without the for loop, my code is about 50% faster than scipy.argrelextrema, but the for loop slows it down. So if I can find a way to vectorize it, it'd be a very quick and very effective alternative to scipy.argrelextrema. These two images represent the data without and with the for loop respectively.
This may do it. It's not perfect, but hopefully it obtains what you want and shows you a bit of how to vectorize. Happy to hear any improvements you think up.
label = np.array(label[:-1]) # not sure why this is 1 unit longer than search.shape[0]?
# the idea is to make the index matrix you're for looping over row by row all in one go.
# This part is sloppy and you can improve this generation.
search = np.vstack((np.arange(patt_idx[i]-(l+1),patt_idx[i]+(l+1)) for i in range(0,len(patt_idx[:-1])))) # you can refine this.
# then you can make the price matrix
price = price[search]
# and you can swap the sign of elements so you only need to do argmin instead of both argmin and argmax
price[label==-2] = - price[label==-2]
# now find the indices of the minimum price on each row
idx = np.argmin(price,axis=1)
# and then extract the refined indices from the search matrix
new_inds = search[np.arange(idx.shape[0]),idx] # this too can be cleaner.
# not sure what's going on here so that search[:,idx] doesn't work for me
# probably just a misunderstanding
I find that this reproduces your result but I did not time it. I suspect the search generation is quite slow but probably still faster than your for loop.
Edit:
Here's a better way to produce search:
patt_idx = np.array(patt_idx)
starts = patt_idx[:-1]-(l+1)
stops = patt_idx[:-1]+(l+1)
ds = stops-starts
s0 = stops.shape[0]
s1 = ds[0]
search = np.reshape(np.repeat(stops - ds.cumsum(), ds) + np.arange(ds.sum()),(s0,s1))
Here is an alternative... it uses a list comprehension, which is generally faster than a for loop:
l = 2
# Define the bounds beforehand, it's marginally faster than doing it in the loop
upper = np.array(patt_idx) + l + 1
lower = np.array(patt_idx) - l - 1
# List comprehension...
new_inds = [price[low:hi].argmax() + low if lab == -2 else
            price[low:hi].argmin() + low
            for low, hi, lab in zip(lower, upper, label)]
# Find maximum within each interval
new_max = price[new_inds]
new_global_max = np.max(new_max)

How to improve this script. Dividing a column by 3 in openpyxl

Looking for a way to simplify the division of a column. Is there a loop I can use?
from openpyxl import load_workbook
xfile = load_workbook('camdatatest.xlsx')
sheet =xfile.get_sheet_by_name('Sheet1')
sheet['D2'] = '=C2/3' # Want to divide all values in Column C with the new value in D.
sheet['D3'] = '=C4/3'
sheet['D4'] = '=C5/3'
sheet['D5'] = '=C6/3'
sheet['D6'] = '=C7/3'
sheet['D7'] = '=C8/3'
sheet['D8'] = '=C9/3'
sheet['D9'] = '=C10/3'
xfile.save("camdatatestoutput.xlsx")
This is just one of the possibilities.
for i in range(2, 11):
    sheet['D{}'.format(i)] = '=C{}/3'.format(i)
Notice that, because of the way range works, if you want the number 10 to be included you need to pass in 11.
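If the number of rows is not known in advance, a small variation of the same loop can read the sheet's max_row attribute instead of a hard-coded range; this is a sketch assuming the data starts in row 2 and the file and sheet names from the question:
from openpyxl import load_workbook

xfile = load_workbook('camdatatest.xlsx')
sheet = xfile['Sheet1']  # key access replaces the deprecated get_sheet_by_name

# Fill column D with a formula for every populated row in the sheet.
for i in range(2, sheet.max_row + 1):
    sheet['D{}'.format(i)] = '=C{}/3'.format(i)

xfile.save('camdatatestoutput.xlsx')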

pandas: setting last N rows of multi-index to NaN for speeding up groupby with shift

I am trying to speed up my groupby.apply + shift, and thanks to this previous question and answer: How to speed up Pandas multilevel dataframe shift by group? I can prove that it does indeed speed things up when you have many groups.
From that question I now have the following code to set the first entry in each multi-index group to NaN, so now I can do my shift globally rather than per group.
df.iloc[df.groupby(level=0).size().cumsum()[:-1]] = np.nan
but I want to look forward, not backwards, and need to do calculations across N rows. So I am trying to use some similar code to set the last N entries to NaN, but obviously I am missing some important indexing knowledge as I just can't figure it out.
I figure I want to convert this so that every entry is a range rather than a single integer. How would I do that?
# the start of each group, ignoring the first entry
df.groupby(level=0).size().cumsum()[1:]
Test setup (for backwards shift) if you want to try it:
length = 5
groups = 3
rng1 = pd.date_range('1/1/1990', periods=length, freq='D')
frames = []
for x in xrange(0, groups):
    tmpdf = pd.DataFrame({'date': rng1,
                          'category': int(10000000 * abs(np.random.randn())),
                          'colA': np.random.randn(length),
                          'colB': np.random.randn(length)})
    frames.append(tmpdf)
df = pd.concat(frames)
df.sort(columns=['category', 'date'], inplace=True)
df.set_index(['category', 'date'], inplace=True, drop=True)
df['tmpShift'] = df['colB'].shift(1)
df.iloc[df.groupby(level=0).size().cumsum()[:-1]] = np.nan
# Yay this is so much faster.
df['newColumn'] = df['tmpShift'] / df['colA']
df.drop('tmpShift', 1, inplace=True)
Thanks!
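For reference, the positional-range version the question asks about could look like the sketch below, which builds on the same cumulative group sizes; it assumes every group has at least N rows and that tmpShift is the column to blank out:
import numpy as np

N = 2
# End position (exclusive) of each group in the sorted frame.
ends = df.groupby(level=0).size().cumsum().values
# Positions of the last N rows of every group, as one flat index array.
last_n = np.concatenate([np.arange(e - N, e) for e in ends])
df.iloc[last_n, df.columns.get_loc('tmpShift')] = np.nan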
I ended up doing it using a groupby apply as follows (and coded to work forwards or backwards):
def replace_tail(grp, col, N, value):
    if N > 0:
        grp[col][:N] = value
    else:
        grp[col][N:] = value
    return grp

df = df.groupby(level=0).apply(replace_tail, 'tmpShift', 2, np.nan)
So the final code is:
def replace_tail(grp, col, N, value):
    if N > 0:
        grp[col][:N] = value
    else:
        grp[col][N:] = value
    return grp

length = 5
groups = 3
rng1 = pd.date_range('1/1/1990', periods=length, freq='D')
frames = []
for x in xrange(0, groups):
    tmpdf = pd.DataFrame({'date': rng1,
                          'category': int(10000000 * abs(np.random.randn())),
                          'colA': np.random.randn(length),
                          'colB': np.random.randn(length)})
    frames.append(tmpdf)
df = pd.concat(frames)
df.sort(columns=['category', 'date'], inplace=True)
df.set_index(['category', 'date'], inplace=True, drop=True)
shiftBy = -1
df['tmpShift'] = df['colB'].shift(shiftBy)
df = df.groupby(level=0).apply(replace_tail, 'tmpShift', shiftBy, np.nan)
# Yay this is so much faster.
df['newColumn'] = df['tmpShift'] / df['colA']
df.drop('tmpShift', 1, inplace=True)
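As a final note, more recent pandas versions can express the per-group forward shift directly with GroupBy.shift, which inserts the boundary NaNs itself; a minimal sketch assuming the same category/date index and columns as above:
shiftBy = -1
# Shift within each category; the rows exposed at each group's tail become NaN automatically.
df['tmpShift'] = df.groupby(level=0)['colB'].shift(shiftBy)
df['newColumn'] = df['tmpShift'] / df['colA']
df.drop('tmpShift', 1, inplace=True)
Whether this beats the global-shift trick depends on the number of groups, so it is worth benchmarking on the real data.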
