I am working on a data analysis task in which I have to generate histograms. My code has more than 7 nested for-loops. Each loop filters the data frame by one unique value of a category column to form a sub-category data frame, which the next loop then splits further in the same way. Each day has around 400,000 records, and I have to process the records of the last 30 days. The goal is to produce histograms of the values (a single numerical column) for the innermost category, which cannot be split any further. How can I reduce the complexity? Are there any alternative methods?
# Visit every existing combination of the seven category columns in a single
# pass. groupby partitions data_frame once, instead of re-filtering the frame
# inside each of seven nested loops. Rows with a missing value in any key
# column are skipped, just as the `== value` filters never matched them.
category_columns = [
    'MasterCustomerID', 'Service', 'Source', 'SubComponentType',
    'KPI', 'Device_Type', 'Access_type',
]
for (customer, service, source, subcomponent,
     kpi, device, access), df_access_type in data_frame.groupby(
        category_columns, sort=False):
    # Copy before adding a column: the group is a slice of data_frame, and
    # writing to a slice triggers SettingWithCopyWarning.
    df_access_type = df_access_type.copy()
    df_access_type['Day'] = ifweekday(df_access_type['PerformanceTimeStamp'])
You can use pandas.groupby to find unique combinations of different levels of the columns (see here and here) and then loop over the dataframe grouped by each combination. There are ~4000 combinations so be careful when uncommenting the histogram code below.
import string
import numpy as np, pandas as pd
from matplotlib import pyplot as plt
# Generate 400,000 records (400 obs for 1000 individuals in 6 columns)
NIDS = 1000
NOBS = 400
NCOLS = 6
cols = ['Service', 'Source', 'SubComponentType',
        'KPI', 'Device_Type', 'Access_type']
# Integer code -> lowercase letter; only codes 0-3 are drawn below.
mapper = dict(zip(range(26), list(string.ascii_lowercase)))
# Index a letter lookup array with the random codes. This labels all cells in
# one vectorised step instead of running a 26-key replace over the frame.
letters = np.array([mapper[code] for code in range(26)])
df = pd.DataFrame(
    letters[np.random.randint(0, 4, size=(NIDS * NOBS, NCOLS))],
    columns=cols,
)
# Generate IDs for individuals
df['MasterCustomerID'] = np.repeat(range(NIDS), NOBS)
# Generate values of interest (to be plotted)
df['value2plot'] = np.random.rand(NIDS * NOBS)
# View the counts for each unique combination of column levels
print(df.groupby(cols).size())
# Do something with the different subsets (such as make histograms).
# One figure is written per combination (~4000 files), so plotting is off
# by default; flip the flag to enable it.
MAKE_HISTOGRAMS = False
if MAKE_HISTOGRAMS:
    for levels, group in df.groupby(cols):
        fig, ax = plt.subplots()
        ax.hist(group['value2plot'])
        ax.set_title(", ".join(levels))
        plt.savefig("hist_" + "_".join(levels) + ".png")
        # Close this figure explicitly so thousands of them do not pile up.
        plt.close(fig)
I have to calculate the removal percentages of chemical/biological parameters (e.g. after an oxidation process) in a waste water treatment plant.
My code works so far and does exactly what it should do, but it is really slow.
For the original dataset, a 15x80 data frame, the calculation took about 10 seconds on my laptop and 4 seconds on my PC. That is too long, especially if I have to deal with more rows.
What the code does:
The formula for the single removal is defined as: 1 - n(i)/n(i-1)
and for the total removal: 1 - n(i)/n(0)
Every measuring point has its own ID. The code searches for the ID's and performs the calculation and saves it in the data frame.
Here is an example (I cant post the original data):
import pandas as pd
import numpy as np
data = {"ID": ["X1_P0001", "X2_P0001", "X3_P0001", "X1_P0002", "X2_P0002", "X3_P0002", "X4_P0002","X5_P0002", "X1_P0003", "X2_P0003", "X3_P0003"],
        "Measurement": [100, 80, 60, 120,90,70,50,25, 85,65,35]}
# Build the frame before adding the result columns; without this line `df`
# does not exist yet and the assignments below fail.
df = pd.DataFrame(data)
# Result columns start out as NaN and are filled in by the removal calculation.
df["S_removal"] = np.nan
df["T_removal"] = np.nan
Data Frame before calculation
this is my function for the calculation:
def removal_TEST(Rem1, Measure, Rem2, ids=None):
    """Fill in single and total removal for every measuring-point group.

    Rows belong to the same group when the part of their ID after the first
    underscore is equal (``"X2_P0001"`` -> ``"P0001"``). Within a group, in
    row order:

    * single removal of row i is ``1 - n(i) / n(i-1)``
    * total removal of row i is ``1 - n(i) / n(0)``

    The first row of every group gets NaN in both outputs.

    Parameters
    ----------
    Rem1 : pandas.Series or list
        Receives the single removal; overwritten in place.
    Measure : sequence of numbers
        Measurements, one per row, in the same order as ``ids``.
    Rem2 : pandas.Series or list
        Receives the total removal; overwritten in place.
    ids : sequence of str, optional
        Row identifiers. Defaults to the ``"ID"`` column of the module-level
        ``df``, which is what the function always used.

    Returns
    -------
    tuple
        ``(Rem1, Rem2)``. Assign these back to the frame when pandas
        copy-on-write is active, because in that mode writing to a column
        extracted from a frame does not update the frame.
    """
    if ids is None:
        ids = df["ID"]
    group_keys = np.asarray([item.split("_")[1] for item in ids], dtype=object)
    meas = np.asarray(Measure, dtype=float)

    # Integer code per group, and the position of each group's first row.
    codes, _ = pd.factorize(group_keys)
    _, first_pos = np.unique(codes, return_index=True)
    is_first = np.zeros(len(meas), dtype=bool)
    is_first[first_pos] = True

    # Previous measurement inside the same group (NaN on a group's first row).
    previous = pd.Series(meas).groupby(codes).shift(1).to_numpy()

    # A zero measurement yields inf/NaN rather than an exception; silence the
    # numpy warning for it.
    with np.errstate(divide="ignore", invalid="ignore"):
        single = 1 - meas / previous
        total = 1 - meas / meas[first_pos][codes]
    total[is_first] = np.nan  # first entry of an ID must be NaN

    Rem1[:] = single
    Rem2[:] = total
    return Rem1, Rem2
this is the result:
Final Data Frame
I am new to Python and to stackoverflow (so sorry if my code and question are not so good to read). Are there any good libraries to speed up my code, or do you have some suggestions?
Thank you :)
Your use of Pandas seems to be getting in the way of solving the problem. The only relevant state seems to be when the group changes and the first and previous measurement values for each row.
I'd be tempted to solve this just using Python primitives, but you could solve this in other ways if you had lots of data (i.e. millions of rows).
import pandas as pd
df = pd.DataFrame({
    "ID": ["X1_P0001", "X2_P0001", "X3_P0001", "X1_P0002", "X2_P0002", "X3_P0002", "X4_P0002","X5_P0002", "X1_P0003", "X2_P0003", "X3_P0003"],
    "Measurement": [100, 80, 60, 120,90,70,50,25, 85,65,35],
    "S_removal": float('nan'),
    "T_removal": float('nan'),
})
# somewhere keep track of the last group identifier
last = None
# iterate over rows
for idx, ID, meas in zip(df.index, df['ID'], df['Measurement']):
    # what's the current group name
    _, grp = ID.split('_', 1)
    # see if we're in a new group
    if grp != last:
        last = grp
        # track the group's measurement
        grp_meas = meas
    else:
        # calculate things; only rows after a group's first one have a
        # previous measurement, so the first row keeps its NaN
        df.loc[idx, 'S_removal'] = 1 - meas / last_meas
        df.loc[idx, 'T_removal'] = 1 - meas / grp_meas
    # keep track of the last measurement
    last_meas = meas
I've commented the code in the hopes it makes sense. This takes ~2 seconds for 1000 copies of your example data, so 11000 rows.
Given that OP has said this needs to be done for a wide dataset, here's another version that reduces runtime to ~30ms for 11000 rows and 2 columns:
import numpy as np
import pandas as pd
data = {
    "ID": ["X1_P0001", "X2_P0001", "X3_P0001", "X1_P0002", "X2_P0002", "X3_P0002", "X4_P0002","X5_P0002", "X1_P0003", "X2_P0003", "X3_P0003"],
    "M1": [100, 80, 60, 120,90,70,50,25, 85,65,35],
    "M2": [100, 80, 60, 120,90,70,50,25, 85,65,35],
}
# reset_index() because code below assumes they are unique
df = pd.concat([pd.DataFrame(data)]*1000).reset_index()
# column names
measurement_col_names = ['M1', 'M2']
single_output_names = ['S1', 'S2']
total_output_names = ['T1', 'T2']
# create the output columns up front so every row without a result is NaN
for name in single_output_names + total_output_names:
    df[name] = np.nan
# somewhere keep track of the last group identifier
last = None
# somewhere to store intermediate state
vals_idx = []
meas_vals = []
last_vals = []
grp_vals = []
# iterate over rows
for idx, ID, meas in zip(df.index, df['ID'], df.loc[:,measurement_col_names].values):
    # what's the current group name
    _, grp = ID.split('_', 1)
    # we're in a new group
    if grp != last:
        last = grp
        # track the group's measurement
        grp_meas = meas
    else:
        # track values and which rows they apply to
        vals_idx.append(idx)
        meas_vals.append(meas)
        last_vals.append(last_meas)
        grp_vals.append(grp_meas)
    # keep track of the last measurement
    last_meas = meas
# convert to numpy array so it vectorises nicely
meas_vals = np.array(meas_vals)
last_vals = np.array(last_vals)
grp_vals = np.array(grp_vals)
# perform calculation using fast numpy operations
df.loc[vals_idx, single_output_names] = 1 - (meas_vals / last_vals)
df.loc[vals_idx, total_output_names] = 1 - (meas_vals / grp_vals)
I'm using nested loops to add new columns whose names are built dynamically from each dataset column (`col`) and every other column (`int_col`, which I call the interact column; it comes from the frame with `col` dropped). This works well for small datasets, but it becomes very slow for datasets with a very large number of features. Any tips for simplifying the process to make it faster?
import numpy as np
import pandas as pd
X = pd.read_csv('water_potability.csv')
X = X.drop(columns='Unnamed: 0')
X_columns = np.array(X.columns)
# Columns already handled as the left operand. add/multi are commutative, so
# a pair is only generated the first time it is seen.
done_list = []
# Collect the new columns in a dict and attach them in one concat; inserting
# thousands of columns one by one into a frame is what makes this slow.
new_features = {}
for col in X_columns:
    for int_col in X_columns:
        if int_col == col:
            continue
        new_features['({})_minus_({})'.format(col, int_col)] = X[col] - X[int_col]
        new_features['({})_div_({})'.format(col, int_col)] = X[col] / X[int_col]
        if int_col not in done_list:
            new_features['({})_add_({})'.format(col, int_col)] = X[col] + X[int_col]
            new_features['({})_multi_({})'.format(col, int_col)] = X[col] * X[int_col]
    # Without this the "not in done_list" guard above never filters anything
    # and every add/multi column is produced twice.
    done_list.append(col)
fi_df = pd.concat([X, pd.DataFrame(new_features, index=X.index)], axis=1)
I have weekly trade export time-series data for which I need to make a stacked bar plot that visualizes trade activity. To do so, I sum each column over all rows and then use nlargest() to select the top n columns. However, this may not be accurate: I draw one stacked plot per year in a loop, and the top n columns can differ from year to year, whereas my selection is based on the total over all rows (i.e. over all years), which is biased. So I am looking for a different approach; perhaps I could group the time series by year first and then make the stacked plot. Is there another way to select the top n columns from time-series data than calling nlargest() on the overall totals? Any ideas?
my current attempt:
this is my current attempt to manipulate time series data, where I aggregate each columns for all rows then select top n columns using nlargest():
import pandas as pd
# load the data
url = 'https://gist.githubusercontent.com/adamFlyn/a6048e547b5a963c7af356c964d15af6/raw/c57c7915cf14f81edc9d5eadaf14efbd43d3e58a/trade_df.csv'
df_ = pd.read_csv(url, parse_dates=['weekly'])
df_.set_index('weekly', inplace=True)
# Rank the columns by their total over all weeks. Taking the totals as a
# separate Series avoids appending a 'Total' row, which would put a string
# label into the otherwise datetime index.
top_columns = df_.sum(axis=0).nlargest(6).index
df1 = df_[top_columns].T
# 'weekly' is the index here; reset_index() turns it back into a column so
# the lookups and .dt accessors below can find it.
df2 = df1.T.reset_index()
df2['weekly'] = pd.to_datetime(df2['weekly'])
df2['year'] = df2['weekly'].dt.year
df2['week'] = df2['weekly'].dt.strftime('%W').astype('int')
then I visualize the plotting data with matplotlib as follow:
import matplotlib.pyplot as plt
# Index by (year, week) and drop the raw timestamp so only value columns remain.
plt_df = df2.set_index(['year', 'week']).drop(columns="weekly")
# One stacked bar chart per year; .loc[year] removes the year level so the
# x axis shows the week number only.
for year, year_frame in plt_df.groupby(level=0):
    weekly_values = year_frame.loc[year]
    ax = weekly_values.plot.bar(stacked=True, title=f'{year} Year', figsize=(8,5))
Although the stacked plot produced by this approach looks fine, selecting the top n columns with nlargest() on the overall totals is not accurate. For example, according to the 2019 USDA report China was not a top trade partner of the US, but in late 2020 China was buying more products from the US. If I use nlargest() on the overall totals to select the top columns (i.e. trade partners), China may be missing from the list and therefore from the plot.
As #Vaishali suggested in the comment with this post, using head() might be good idea to extract top columns, so I tried like this:
for n, g in plt_df.groupby(level=0):
for i in g:
gg = g[i].sort_values(g[i].values,ascending = False).groupby('week').head(5)
ax = gg.loc[n].plot.bar(stacked=True, title=f'{n} Year', figsize=(8,5))
but this is not working. Can anyone point me out how to select top n columns from time series data? any idea?
You can try something like this:
url = 'https://gist.githubusercontent.com/adamFlyn/a6048e547b5a963c7af356c964d15af6/raw/c57c7915cf14f81edc9d5eadaf14efbd43d3e58a/trade_df.csv'
df_ = pd.read_csv(url, parse_dates=['weekly'])
df_['weekly'] = pd.to_datetime(df_['weekly'])
df_ = df_.set_index('weekly')
# One chart per calendar year of the index.
for g, n in df_.groupby(df_.index.year):
    # Rank the columns by this year's total (1 = largest) and keep ranks below 5.
    yearly_rank = n.sum().rank(ascending=False, method='min')
    ng = n.loc[:, yearly_rank < 5]
    # Divide every row by its own sum so each week is shown as shares.
    shares = ng.div(ng.sum(axis=1), axis=0)
    shares.plot.area(title=f'{g}')
Bar chart:
import matplotlib.ticker as mticker
url = 'https://gist.githubusercontent.com/adamFlyn/a6048e547b5a963c7af356c964d15af6/raw/c57c7915cf14f81edc9d5eadaf14efbd43d3e58a/trade_df.csv'
df_ = pd.read_csv(url, parse_dates=['weekly'])
df_['weekly'] = pd.to_datetime(df_['weekly'])
df_ = df_.set_index('weekly')
# One chart per calendar year of the index.
for g, n in df_.groupby(df_.index.year):
    # Rank the columns by this year's total (1 = largest) and keep ranks below 5.
    yearly_rank = n.sum().rank(ascending=False, method='min')
    ng = n.loc[:, yearly_rank < 5]
    # Replace the timestamps with formatted strings for the tick labels.
    ng.index = ng.index.strftime('%m/%d/%Y')
    ax = ng.plot.bar(stacked=True, figsize=(10,8))
Stacked 100% bar chart:
#(previous code)
ax = ng.div(ng.sum(axis=1), axis=0).plot.bar(stacked=True, figsize=(10,8))
I am not sure I understand the requirement correctly here, but this is based on your output charts:
find top n countries using sum and nlargest
filter df by top_countries, groupby year and week, sum
for each unique year, plot stacked chart
df.columns = df.columns.str.strip()
# Top 6 columns by their total over the whole period (column 0 is 'weekly').
top_countries = df.iloc[:, 1:].sum().nlargest(6).index.tolist()
df['weekly'] = pd.to_datetime(df['weekly'])
year_key = df['weekly'].dt.year.rename('year')
# Series.dt.week was removed in pandas 2.0; isocalendar().week is its
# replacement and yields the same ISO week numbers.
week_key = df['weekly'].dt.isocalendar().week.rename('week')
agg = df[top_countries].groupby([year_key, week_key]).sum()
for year in df['weekly'].dt.year.unique():
    # xs() selects one year and drops that index level, leaving weeks on the x axis.
    agg.xs(year, level='year').plot.bar(stacked = True, figsize = (10,5), title = str(year))
If you want to filter top countries by year, move the part where you are filtering df into the loop,
df.columns = df.columns.str.strip()
df['weekly'] = pd.to_datetime(df['weekly'])
for year, year_df in df.groupby(df['weekly'].dt.year):
    # Rank the countries on this year's rows only. Ranking on the whole frame
    # inside the loop would return the same six columns for every year.
    top_countries = year_df.iloc[:, 1:].sum().nlargest(6).index.tolist()
    # Series.dt.week was removed in pandas 2.0; isocalendar().week is its
    # replacement and yields the same ISO week numbers.
    week_key = year_df['weekly'].dt.isocalendar().week.rename('week')
    agg = year_df[top_countries].groupby(week_key).sum()
    agg.plot.bar(stacked = True, figsize = (10,5), title = str(year))
You can try this
import pandas as pd
# load the data
url = 'https://gist.githubusercontent.com/adamFlyn/a6048e547b5a963c7af356c964d15af6/raw/c57c7915cf14f81edc9d5eadaf14efbd43d3e58a/trade_df.csv'
df = pd.read_csv(url, parse_dates=['weekly'])
df['weekly'] = pd.to_datetime(df['weekly'])
df['year'] = df['weekly'].dt.year
df['week'] = df['weekly'].dt.strftime('%W').astype('int')
# Index by (year, week) and drop the raw timestamp so only value columns remain.
df = df.set_index(['year', 'week']).drop(columns='weekly')
# Yearly total of every column, transposed: one column per year.
df_year_sums = df.groupby(level='year').sum().T
for year in df_year_sums.columns:
    # The six columns with the largest total in this particular year.
    largest = df_year_sums[year].nlargest(6).index.tolist()
    df_plot = df.xs(year, level='year').loc[:, largest]
    df_plot.plot.bar(stacked=True, title=f'{year} Year', figsize=(8,5))
for key,row in df.iterrows():
for row_value in row:
if type(row_value)==float:
fp = fp[(fp.T != 0).any()]
fp2 = fp2[(fp2.T != 0).any()]
fp3 = fp3[(fp3.T != 0).any()]
df = pd.DataFrame(np.random.randint(1,100,(100)),columns=["column1"])
I would like to find out the row which meets the condition RSI < 25.
However, the result is generated with one data frame. Is it possible to create separate dataframes for any single row?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas_datareader import data as wb
ck_df = wb.DataReader(stock, data_source='yahoo', start='2015-01-01')
rsi_period = 14
# Change in the closing price from one row to the next.
chg = ck_df['Close'].diff(1)
# Split the change into its non-negative part (gain) and non-positive part (loss).
gain = chg.clip(lower=0)
loss = chg.clip(upper=0)
ck_df['Gain'] = gain
ck_df['Loss'] = loss
# Exponentially weighted means with com = period - 1; no value is produced
# until rsi_period observations are available.
avg_gain = gain.ewm(com=rsi_period - 1, min_periods=rsi_period).mean()
avg_loss = loss.ewm(com=rsi_period - 1, min_periods=rsi_period).mean()
ck_df['Avg Gain'] = avg_gain
ck_df['Avg Loss'] = avg_loss
# avg_loss is non-positive, so take the magnitude of the ratio.
rs = (avg_gain / avg_loss).abs()
rsi = 100 - (100 / (1 + rs))
ck_df['RSI'] = rsi
# Boolean mask of the rows whose RSI is below 25.
RSIFactor = ck_df['RSI'] < 25
If you want to know at what index the RSI < 25 then just use:
ck_df[ck_df['RSI'] <25].index
The result will also be a dataframe. If you insist on making a new one then:
new_df = ck_df[ck_df['RSI'] <25].copy()
To split the rows found by #Omkar's solution into separate dataframes you might use this function taken from here: Pandas: split dataframe into multiple dataframes by number of rows;
def split_dataframe_to_chunks(df, n):
    """Split ``df`` into consecutive pieces of at most ``n`` rows each.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; it is sliced by position, so any index works.
    n : int
        Maximum number of rows per piece. Use ``n=1`` for one frame per row.

    Returns
    -------
    list of pandas.DataFrame
        The pieces in their original order. The last one may be shorter.
        An empty frame gives an empty list.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 (the loop would otherwise never advance).
    """
    if n < 1:
        raise ValueError("n must be a positive number of rows")
    return [df.iloc[start:start + n] for start in range(0, len(df), n)]
With this you get a list of dataframes.
I have a question about eliminating outliers from two time series. One time series contains spot market prices and the other contains power outputs. Both series run from 2012 to 2016 and are stored as CSV files with a timestamp and a value per row. An example row for the power output is 2012-01-01 00:00:00,2335.2152646951617 and for the price 2012-01-01 00:00:00,17.2
Because the spot market prices are very volatile and have a lot of outliers, I have filtered them. In the second time series, I now have to delete the values whose timestamps were eliminated from the price series. I thought about generating a list of the deleted values and writing a loop that deletes the values with the same timestamps from the second time series, but so far that has not worked and I am stuck. Does anyone have an idea?
My python code looks as follow:
import pandas as pd
import matplotlib.pyplot as plt
power_output = pd.read_csv("./data/external/power_output.csv", delimiter=",", parse_dates=[0], index_col=[0])
spotmarket = pd.read_csv("./data/external/spotmarket_dhp.csv", delimiter=",", parse_dates=[0], index_col=[0])
# Percentage change of the price from one row to the next.
r = spotmarket['price'].pct_change().dropna() * 100
# Quartiles of the changes, and fences two interquartile ranges beyond them.
Q1, Q3 = r.quantile(.25), r.quantile(.75)
iqr = Q3 - Q1
q1 = Q1 - 2 * iqr
q3 = Q3 + 2 * iqr
# Keep only the changes inside the fences (both bounds included).
a = r[(r >= q1) & (r <= q3)]
Can somebody help me?
If your question is about how to compare two timestamps you can have a look at this.
Basically you could do:
out = r[~r.between(q1, q3)] # negation of your between to get the outliers
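Then remove those timestamps from the second series, for example with a left merge that uses `indicator=True` and keeps only the rows marked `left_only`. That is a merge operation which keeps only the rows that are present in the left dataframe alone.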
The following suggestion is based on an answer of mine from a previous post.
You can solve your problem by merging both of your series and storing them in pandas dataframe. Then you can use any desired technique to identify and remove outliers. Take a look at the post mentioned above.
Here is my take on your particular problem using a snippet that can handle more than one series:
Since I don't have access to your data, the following snippet will produce two series where one of them has a distinctive outlier:
def sample(colname):
    """Return a noisy 20-day series with a trend as a one-column DataFrame.

    The values are Gaussian noise (standard deviation 10) around a base level
    of 100 plus a small sine term, multiplied by a slowly growing trend.

    Parameters
    ----------
    colname : str
        Name of the single column of the result.

    Returns
    -------
    pandas.DataFrame
        One float column named ``colname`` on a daily DatetimeIndex called
        ``'dates'`` that starts on 2016-01-01.
    """
    base = 100
    nsample = 20
    sigma = 10
    # Trend axis and daily timestamps. pd.datetime no longer exists in pandas,
    # so the start date is passed as a string.
    trend1 = np.linspace(0, 1, nsample)
    dates = pd.date_range('2016-01-01', periods=nsample, name='dates')
    # Gaussian Noise with amplitude sigma
    noise = sigma * np.random.normal(size=nsample)
    level = noise + base + np.sin(trend1)
    trend2 = 1 / (np.cos(trend1) / 1.05)
    # Only the final series is kept, under the requested name; the original
    # built six helper columns and then tried to give the frame one name.
    return pd.DataFrame({colname: level * trend2}, index=dates)
df_sample1 = sample(colname = 'series1')
df_sample2 = sample(colname = 'series2')
# Plant the outlier with one positional write on the frame itself. Chained
# assignment (df[col].iloc[i] = v) may write to a temporary copy and leave
# the frame unchanged.
df_sample2.iloc[10, df_sample2.columns.get_loc('series2')] = 800
Series 1 - No outliers
Series 2 - A distinctive outlier
Now you can merge those series like this:
# Merge dataframes
df_merged = pd.merge(df_sample1, df_sample2, how='outer', left_index=True, right_index=True)
What is considered an outlier will depend fully on the nature of your dataset. In this case, you can set the level for identifying outliers using scipy.stats.zscore(). In the following case, every observation whose first difference has an absolute z-score of 3 or more is considered an outlier.
# A function for removing outliers
def noSpikes(df, level, keepFirst):
    """Replace spike rows of ``df`` with the last preceding non-spike row.

    A row counts as a spike when the z-score of its first difference reaches
    ``level`` in absolute value in at least one column. Rows whose difference
    contains NaN are treated the same way. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns on a unique index.
    level : float
        Z-score threshold; differences with ``abs(z) >= level`` are removed.
    keepFirst : bool
        The first row has no difference and is therefore never kept by the
        filter. If True it is restored from ``df``; otherwise it stays NaN.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``df``, with removed rows forward-filled.
    """
    # Imported here so the function does not depend on where the module-level
    # scipy import sits in the file.
    from scipy import stats

    # 1. Remember the first row; it is lost when differencing.
    first_val = df.iloc[:1]
    # 2./3. First differences, without rows that contain missing values.
    df_diff = df.diff().dropna()
    # 4. Keep the rows whose differences stay below the z-score level in
    #    every column.
    z_scores = np.abs(stats.zscore(df_diff.to_numpy()))
    ix_keep = df_diff.index[(z_scores < level).all(axis=1)]
    # 5./6. Take the kept rows from the raw data and put them back on the full
    #       index; removed rows become NaN.
    # 8. Fill those gaps with the previous kept values.
    df_complete = df.loc[ix_keep].reindex(df.index).ffill()
    # Keep the first value
    if keepFirst and len(df_complete):
        df_complete.iloc[0] = first_val.iloc[0]
    return df_complete
df_clean = noSpikes(df = df_merged, level = 3, keepFirst = True)
Let me know how this works out for you.
Here's the whole thing for an easy copy-paste:
# Imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats
# A function for noisy data with a trend element
def sample(colname):
    """Return a noisy 20-day series with a trend as a one-column DataFrame.

    The values are Gaussian noise (standard deviation 10) around a base level
    of 100 plus a small sine term, multiplied by a slowly growing trend.

    Parameters
    ----------
    colname : str
        Name of the single column of the result.

    Returns
    -------
    pandas.DataFrame
        One float column named ``colname`` on a daily DatetimeIndex called
        ``'dates'`` that starts on 2016-01-01.
    """
    base = 100
    nsample = 20
    sigma = 10
    # Trend axis and daily timestamps. pd.datetime no longer exists in pandas,
    # so the start date is passed as a string.
    trend1 = np.linspace(0, 1, nsample)
    dates = pd.date_range('2016-01-01', periods=nsample, name='dates')
    # Gaussian Noise with amplitude sigma
    noise = sigma * np.random.normal(size=nsample)
    level = noise + base + np.sin(trend1)
    trend2 = 1 / (np.cos(trend1) / 1.05)
    # Only the final series is kept, under the requested name; the original
    # built six helper columns and then tried to give the frame one name.
    return pd.DataFrame({colname: level * trend2}, index=dates)
df_sample1 = sample(colname = 'series1')
df_sample2 = sample(colname = 'series2')
# Plant the outlier with one positional write on the frame itself. Chained
# assignment (df[col].iloc[i] = v) may write to a temporary copy and leave
# the frame unchanged.
df_sample2.iloc[10, df_sample2.columns.get_loc('series2')] = 800
# Merge dataframes
df_merged = pd.merge(df_sample1, df_sample2, how='outer', left_index=True, right_index=True)
# A function for removing outliers
def noSpikes(df, level, keepFirst):
    """Replace spike rows of ``df`` with the last preceding non-spike row.

    A row counts as a spike when the z-score of its first difference reaches
    ``level`` in absolute value in at least one column. Rows whose difference
    contains NaN are treated the same way. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns on a unique index.
    level : float
        Z-score threshold; differences with ``abs(z) >= level`` are removed.
    keepFirst : bool
        The first row has no difference and is therefore never kept by the
        filter. If True it is restored from ``df``; otherwise it stays NaN.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``df``, with removed rows forward-filled.
    """
    # Imported here so the function does not depend on where the module-level
    # scipy import sits in the file.
    from scipy import stats

    # 1. Remember the first row; it is lost when differencing.
    first_val = df.iloc[:1]
    # 2./3. First differences, without rows that contain missing values.
    df_diff = df.diff().dropna()
    # 4. Keep the rows whose differences stay below the z-score level in
    #    every column.
    z_scores = np.abs(stats.zscore(df_diff.to_numpy()))
    ix_keep = df_diff.index[(z_scores < level).all(axis=1)]
    # 5./6. Take the kept rows from the raw data and put them back on the full
    #       index; removed rows become NaN.
    # 8. Fill those gaps with the previous kept values.
    df_complete = df.loc[ix_keep].reindex(df.index).ffill()
    # Keep the first value
    if keepFirst and len(df_complete):
        df_complete.iloc[0] = first_val.iloc[0]
    return df_complete
df_clean = noSpikes(df = df_merged, level = 3, keepFirst = True)