I would like to run two separate loops on df. In the first step, I would like to filter df by sex (male, female) and year (2008-2013) and save these dataframes in a list. In the second step, I would like to apply some kind of analysis to each element of the list and name the output based on which sex & year combination it came from.
I realize I could do this in one step, but my actual code is significantly more complex and throws an error, which stops the loop before it ever advances to the second stage. Consequently, I need to break it up into two steps. This is what I have so far; I would like to ask for help on the second stage. How do I run the make_graph function on each element of the list and name the result according to the sex & year combination?
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df_toy=pd.DataFrame([])
df_toy['value'] = np.random.randint(low=1, high=1000, size=100000)
df_toy['age'] = np.random.choice(range(0, 92), 100000)
df_toy['sex'] = np.random.choice([0, 1], 100000)
df_toy['year'] = np.random.randint(low=2008, high=2013, size=100000)
def format_data(df_toy, SEX, YEAR):
    df_toy = df_toy[(df_toy["sex"] == SEX) & (df_toy["year"] == YEAR)]
    return df_toy

def make_graph(df_):
    plt.scatter(age, value)
    return df_toy
dfs = []
for SEX in range(0,3):
    for YEAR in range(2008,2014):
        dfs.append(format_data(df_toy, SEX, YEAR))

for i in range(len(dfs)):
    df_ = dfs[i]
    make_graph(df_)
    df_YEAR_SEX = df_
IIUC you could filter, plot, and save the data like this. Since I don't know the actual data, I don't know why you need to do it in two steps, but here is how you could do it with a few changes.
# Input data
df_toy = pd.DataFrame({
    'value': np.random.randint(low=1, high=1000, size=100000),
    'age': np.random.choice(range(0, 92), 100000),
    'sex': np.random.choice([0, 1], 100000),
    'year': np.random.randint(low=2008, high=2013, size=100000)
})
def filter_and_plot(df, SEX, YEAR):
    # filter the df for sex and year
    tmp = df[(df["sex"] == SEX) & (df["year"] == YEAR)]
    # create a new plot for each filtered df and plot it
    fig, ax = plt.subplots()
    ax.scatter(x=tmp['age'], y=tmp['value'], s=0.4)
    # return the filtered df
    return tmp
result_dict = {}
for SEX in range(0, 2):
    for YEAR in range(2008, 2013):
        # use an f-string to build a key in the dictionary which includes sex and year
        # keys look like "df_1_2009"; the value for each key is the filtered dataframe
        result_dict[f"df_{SEX}_{YEAR}"] = filter_and_plot(df_toy, SEX, YEAR)
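To pull a specific subset back out later, you can index the dictionary with the same key pattern, or loop over everything that was stored (the key names below are just the ones built by the loop above):

# e.g. the filtered dataframe for sex == 1 and year == 2009
df_1_2009 = result_dict["df_1_2009"]
# or iterate over all stored subsets
for name, frame in result_dict.items():
    print(name, len(frame))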
I have to calculate the removal percentages of chemical/biological parameters (e.g. after an oxidation process) in a waste water treatment plant.
My code works so far and does exactly what it should do, but it is really slow.
On my laptop the calculation for the original dataset took about 10 s, and on my PC about 4 s, for a 15x80 data frame. That is too long, especially if I have to deal with more rows.
What the code does:
The formula for the single removal is defined as: 1 - n(i)/n(i-1)
and for the total removal: 1 - n(i)/n(0)
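For example, with the first ID group in the sample data below (measurements 100, 80, 60), the single removals are 1 - 80/100 = 0.20 and 1 - 60/80 = 0.25, and the total removals are 1 - 80/100 = 0.20 and 1 - 60/100 = 0.40; the first entry of each group stays NaN.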
Every measuring point has its own ID. The code searches for the IDs, performs the calculation and saves the result in the data frame.
Here is an example (I can't post the original data):
import pandas as pd
import numpy as np
data = {"ID": ["X1_P0001", "X2_P0001", "X3_P0001", "X1_P0002", "X2_P0002", "X3_P0002", "X4_P0002","X5_P0002", "X1_P0003", "X2_P0003", "X3_P0003"],
"Measurement": [100, 80, 60, 120,90,70,50,25, 85,65,35]}
df["S_removal"]= np.nan
df["T_removal"]= np.nan
Data Frame before calculation
this is my function for the calculation:
def removal_TEST(Rem1, Measure, Rem2):
    lst = [i.split("_")[1] for i in df["ID"]] # takes relevant ID information
    y = np.unique(lst) # stores unique ID values to loop over them
    for ID in y:
        id_list = []
        for i in range(0, len(df["ID"])):
            if ID in df["ID"][i]:
                id_list.append(i)
            else: # this stores only the relevant id in a new list
                id_list.append(np.nan)
        indexlist = pd.Series(id_list)
        first_index = indexlist.first_valid_index() # gets the first and last index of the id list
        last_index = indexlist.last_valid_index()
        col_indizes = []
        for i in range(first_index, last_index+1):
            col_indizes.append(i)
        for i in col_indizes:
            if i == 0:
                continue # for i=0 there is no 0-1 element, so i=0 should be skipped
            else:
                Rem1[i] = 1-(Measure[i]/Measure[i-1])
        Rem1[first_index] = np.nan # first entry of an ID must be a NaN value
        for i in range(first_index, last_index+1):
            col_indizes.append(i)
        for i in range(len(Rem2)):
            for i in col_indizes:
                Rem2[i] = 1-(Measure[i]/Measure[first_index])
        Rem2[first_index] = np.nan
this is the result:
Final Data Frame
I am new to Python and to Stack Overflow (so sorry if my code and question are hard to read). Are there any good libraries to speed up my code, or do you have any suggestions?
Thank you :)
Your use of Pandas seems to be getting in the way of solving the problem. The only state you actually need is whether the group has changed, plus the group's first measurement and the previous row's measurement.
I'd be tempted to solve this just using Python primitives, but you could solve this in other ways if you had lots of data (i.e. millions of rows).
import pandas as pd
df = pd.DataFrame({
"ID": ["X1_P0001", "X2_P0001", "X3_P0001", "X1_P0002", "X2_P0002", "X3_P0002", "X4_P0002","X5_P0002", "X1_P0003", "X2_P0003", "X3_P0003"],
"Measurement": [100, 80, 60, 120,90,70,50,25, 85,65,35],
"S_removal": float('nan'),
"T_removal": float('nan'),
})
# somewhere keep track of the last group identifier
last = None
# iterate over rows
for idx, ID, meas in zip(df.index, df['ID'], df['Measurement']):
    # what's the current group name
    _, grp = ID.split('_', 1)
    # see if we're in a new group
    if grp != last:
        last = grp
        # track the group's measurement
        grp_meas = meas
    else:
        # calculate things
        df.loc[idx, 'S_removal'] = 1 - meas / last_meas
        df.loc[idx, 'T_removal'] = 1 - meas / grp_meas
    # keep track of the last measurement
    last_meas = meas
I've commented the code in the hopes it makes sense. This takes ~2 seconds for 1000 copies of your example data, so 11000 rows.
Given that OP has said this needs to be done for a wide dataset, here's another version that reduces runtime to ~30ms for 11000 rows and 2 columns:
import numpy as np
import pandas as pd
data = {
"ID": ["X1_P0001", "X2_P0001", "X3_P0001", "X1_P0002", "X2_P0002", "X3_P0002", "X4_P0002","X5_P0002", "X1_P0003", "X2_P0003", "X3_P0003"],
"M1": [100, 80, 60, 120,90,70,50,25, 85,65,35],
"M2": [100, 80, 60, 120,90,70,50,25, 85,65,35],
}
# reset_index() because code below assumes they are unique
df = pd.concat([pd.DataFrame(data)]*1000).reset_index()
# column names
measurement_col_names = ['M1', 'M2']
single_output_names = ['S1', 'S2']
total_output_names = ['T1', 'T2']
# somewhere keep track of the last group identifier
last = None
# somewhere to store intermediate state
vals_idx = []
meas_vals = []
last_vals = []
grp_vals = []
# iterate over rows
for idx, ID, meas in zip(df.index, df['ID'], df.loc[:, measurement_col_names].values):
    # what's the current group name
    _, grp = ID.split('_', 1)
    # we're in a new group
    if grp != last:
        last = grp
        # track the group's measurement
        grp_meas = meas
    else:
        # track values and which rows they apply to
        vals_idx.append(idx)
        meas_vals.append(meas)
        last_vals.append(last_meas)
        grp_vals.append(grp_meas)
    # keep track of the last measurement
    last_meas = meas
# convert to numpy array so it vectorises nicely
meas_vals = np.array(meas_vals)
# perform calculation using fast numpy operations
df.loc[vals_idx, single_output_names] = 1 - (meas_vals / last_vals)
df.loc[vals_idx, total_output_names] = 1 - (meas_vals / grp_vals)
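For reference, the same two columns can also be computed without an explicit row loop by grouping on the part of the ID after the underscore. This is only a sketch against the example data above, shown for a single measurement column, not a drop-in replacement for the wide case:

import numpy as np
import pandas as pd

df = pd.DataFrame(data)  # the example data dict from above
# group key is the part of the ID after the first underscore, e.g. "P0001"
grp = df['ID'].str.split('_', n=1).str[1]
# single removal: compare each measurement to the previous one within its group
df['S1'] = 1 - df['M1'] / df.groupby(grp)['M1'].shift(1)
# total removal: compare each measurement to the first one within its group,
# then blank out the first row of each group to match the loop-based output
df['T1'] = (1 - df['M1'] / df.groupby(grp)['M1'].transform('first')).where(df['S1'].notna())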
I have a roughly 8-million-row data frame consisting of sales for 615 products across 16 stores each day for five years.
I need to make new columns that consist of the sales shifted back from 1 to 7 days. I've decided to sort the data frame by date, product and location. Then I concatenate product and location as its own column.
Using that column I loop through each unique product/location concatenation and make the shifted sales columns. This code is below:
import pandas as pd
#sort values by item, location, date
df = df.sort_values(['date', 'product', 'location'])
df['sort_values'] = df['product']+"_"+df['location']
df1 = pd.DataFrame()
z = 0
for i in list(df['sort_values'].unique()):
    df_ = df[df['sort_values']==i]
    df_ = df_.sort_values('ORD_DATE')
    df_['eaches_1'] = df_['eaches'].shift(-1)
    df_['eaches_2'] = df_['eaches'].shift(-2)
    df_['eaches_3'] = df_['eaches'].shift(-3)
    df_['eaches_4'] = df_['eaches'].shift(-4)
    df_['eaches_5'] = df_['eaches'].shift(-5)
    df_['eaches_6'] = df_['eaches'].shift(-6)
    df_['eaches_7'] = df_['eaches'].shift(-7)
    df1 = pd.concat((df1, df_))
    z += 1
    if z % 100 == 0:
        print(z)
The above code gets me exactly what I want, but takes FOREVER to complete. Is there a faster way to accomplish what I want?
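As a rough sketch (assuming the column names used in the loop above: 'product', 'location', 'eaches', and 'ORD_DATE' as the date column), the per-group loop can usually be replaced by a single groupby plus shift, which avoids rebuilding the frame with repeated concat calls:

import pandas as pd

# sort once so the shifts line up within each product/location group
df = df.sort_values(['product', 'location', 'ORD_DATE'])
g = df.groupby(['product', 'location'])['eaches']
for k in range(1, 8):
    df[f'eaches_{k}'] = g.shift(-k)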
I have weekly trade export time-series data that I need to visualize as a stacked bar plot of trade activity. To do so, I aggregated my data by summing each column over all rows, then used nlargest() to select the top n columns. However, this is not quite accurate, because I make a stacked plot for each year in a loop and the top n columns can differ from year to year; taking the total sum of each column over all rows (i.e. over all years) and then selecting the top n columns is biased. So I am looking for a different way of doing this; perhaps I should group the time series by year and then make the stacked plot. Is there another way of selecting the top n columns from time-series data instead of using nlargest()?
my current attempt:
this is my current attempt to manipulate the time series data, where I sum each column over all rows and then select the top n columns using nlargest():
import pandas as pd
# load the data
url = 'https://gist.githubusercontent.com/adamFlyn/a6048e547b5a963c7af356c964d15af6/raw/c57c7915cf14f81edc9d5eadaf14efbd43d3e58a/trade_df.csv'
df_ = pd.read_csv(url, parse_dates=['weekly'])
df_.set_index('weekly', inplace=True)
df_.loc['Total',:]= df_.sum(axis=0)
df1 = df_.T
df1 =df1.nlargest(6, columns=['Total'])
df1.drop('Total', axis=1, inplace=True)
df2 = df1.T
df2.reset_index(inplace=True)
df2['weekly'] = pd.to_datetime(df2['weekly'])
df2['year'] = df2['weekly'].dt.year
df2['week'] = df2['weekly'].dt.strftime('%W').astype('int')
then I visualize the plotting data with matplotlib as follows:
import matplotlib.pyplot as plt
plt_df = df2.set_index(['year','week'])
plt_df.drop("weekly", axis=1, inplace=True)
for n, g in plt_df.groupby(level=0):
    ax = g.loc[n].plot.bar(stacked=True, title=f'{n} Year', figsize=(8,5))
    plt.show()
Although the output of the current approach looks fine as a stacked plot, selecting the top n columns with nlargest() is not quite accurate. For example, in the 2019 USDA report China wasn't a top trade partner of the US, but in late 2020 China was getting more products from the US; if I use nlargest() over the whole period to select the top columns (trade partners), this becomes problematic and China won't make the list or appear in the plot.
Update
As @Vaishali suggested in a comment on this post, using head() might be a good idea to extract the top columns, so I tried this:
for n, g in plt_df.groupby(level=0):
    for i in g:
        gg = g[i].sort_values(g[i].values, ascending=False).groupby('week').head(5)
        ax = gg.loc[n].plot.bar(stacked=True, title=f'{n} Year', figsize=(8,5))
but this is not working. Can anyone point out how to select the top n columns from time series data?
You can try something like this:
url = 'https://gist.githubusercontent.com/adamFlyn/a6048e547b5a963c7af356c964d15af6/raw/c57c7915cf14f81edc9d5eadaf14efbd43d3e58a/trade_df.csv'
df_ = pd.read_csv(url, parse_dates=['weekly'])
df_['weekly'] = pd.to_datetime(df_['weekly'])
df_.set_index('weekly', inplace=True)
for g, n in df_.groupby(df_.index.year):
    ng = n.loc[:, n.sum().rank(ascending=False, method='min')<5]
    ng.div(ng.sum(axis=1), axis=0).plot.area(title=f'{g}')
Output:
Bar chart:
import matplotlib.ticker as mticker
url = 'https://gist.githubusercontent.com/adamFlyn/a6048e547b5a963c7af356c964d15af6/raw/c57c7915cf14f81edc9d5eadaf14efbd43d3e58a/trade_df.csv'
df_ = pd.read_csv(url, parse_dates=['weekly'])
df_['weekly'] = pd.to_datetime(df_['weekly'])
df_.set_index('weekly', inplace=True)
for g, n in df_.groupby(df_.index.year):
    ng = n.loc[:, n.sum().rank(ascending=False, method='min')<5]
    ng.index = ng.index.strftime('%m/%d/%Y')
    ax = ng.plot.bar(stacked=True, figsize=(10,8))
Output:
Stacked 100% Bar chart:
#(previous code)
ax = ng.div(ng.sum(axis=1), axis=0).plot.bar(stacked=True, figsize=(10,8))
Output:
I am not sure I understand the requirement correctly here, but this is based on your output charts:
1. find top n countries using sum and nlargest
2. filter df by top_countries, groupby year and week, sum
3. for each unique year, plot stacked chart
df.columns = df.columns.str.strip()
top_countries = df.iloc[:, 1:].sum().nlargest(6).index.tolist()
df['weekly'] = pd.to_datetime(df['weekly'])
agg = df[top_countries].groupby([df['weekly'].dt.year.rename('year'),df['weekly'].dt.week.rename('week')]).sum()
for year in df['weekly'].dt.year.unique():
    agg[agg.index.get_level_values(0) == year].droplevel(level=0).plot.bar(stacked=True, figsize=(10,5), title=year)
Edit:
If you want to pick the top countries per year, move the part that computes top_countries into the loop and filter the frame to that year first:
df.columns = df.columns.str.strip()
df['weekly'] = pd.to_datetime(df['weekly'])
for year in df['weekly'].dt.year.unique():
    # compute the top countries from that year's rows only
    top_countries = df[df['weekly'].dt.year == year].iloc[:, 1:].sum().nlargest(6).index.tolist()
    agg = df[top_countries].groupby([df['weekly'].dt.year.rename('year'), df['weekly'].dt.week.rename('week')]).sum()
    agg[agg.index.get_level_values(0) == year].droplevel(level=0).plot.bar(stacked=True, figsize=(10,5), title=year)
You can try this
import pandas as pd
# load the data
url = 'https://gist.githubusercontent.com/adamFlyn/a6048e547b5a963c7af356c964d15af6/raw/c57c7915cf14f81edc9d5eadaf14efbd43d3e58a/trade_df.csv'
df = pd.read_csv(url, parse_dates=['weekly'])
df['weekly'] = pd.to_datetime(df['weekly'])
df['year'] = df['weekly'].dt.year
df['week'] = df['weekly'].dt.strftime('%W').astype('int')
df.set_index(['year', 'week'], inplace=True)
df.drop('weekly', axis=1, inplace=True)
df_year_sums = df.groupby(level='year').sum().T
for year in df_year_sums.columns:
    largest = list(df_year_sums[year].nlargest(6).index)
    df_plot = df.xs(year, level='year')[largest]
    df_plot.plot.bar(stacked=True, title=f'{year} Year', figsize=(8,5))
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('trade_df.csv', parse_dates=['weekly'])
df['weekly'] = pd.to_datetime(df['weekly'])
# derive the year and week columns (as in the question); they are used below
df['year'] = df['weekly'].dt.year
df['week'] = df['weekly'].dt.strftime('%W').astype('int')
df['Total'] = 0
df.reset_index()
for key, row in df.iterrows():
    sum = 0.0
    for row_value in row:
        if type(row_value) == float:
            sum += row_value
    df.loc[key, 'Total'] = sum
results = df.sort_values(by="Total", ascending=False)
print(results.head(5))
#grouped=df.groupby('year').sum().T.plot.bar(stacked=True)
#plt.show()
filter = df['year'].isin([2018])
results_2018 = df[filter].sort_values(by=['Total'], ascending=False).head(5)
filter = df['year'].isin([2019])
results_2019 = df[filter].sort_values(by=['Total'], ascending=False).head(5)
filter = df['year'].isin([2020])
results_2020 = df[filter].sort_values(by=['Total'], ascending=False).head(5)
grouped=df.groupby('year').sum().T.plot.bar(stacked=True)
plt.show()
fp=results_2018.pivot_table(index=['week'],aggfunc='sum').fillna(0)
fp = fp[(fp.T != 0).any()]
fp2=results_2019.pivot_table(index=['week'],aggfunc='sum').fillna(0)
fp2 = fp2[(fp2.T != 0).any()]
fp3=results_2020.pivot_table(index=['week'],aggfunc='sum').fillna(0)
fp3 = fp3[(fp3.T != 0).any()]
#print(fp)
fig,ax=plt.subplots(3,1,figsize=(16,16))
fp.plot.bar(stacked=True,ax=ax[0])
fp2.plot.bar(stacked=True,ax=ax[1])
fp3.plot.bar(stacked=True,ax=ax[2])
plt.show()
df = pd.DataFrame(np.random.randint(1,100,(100)),columns=["column1"])
results=np.array(df.sort_values(by="column1",ascending=False)).flatten()
print(results[:5])
I am having trouble reformatting a dataframe.
My input has one row per day and one column per symbol (each symbol has values on different dates):
Input
code to generate input
data = [("01-01-2010", 15, 10), ("02-01-2010", 16, 11), ("03-01-2010", 16.5, 10.5)]
labels = ["date", "AAPL", "AMZN"]
df_input = pd.DataFrame.from_records(data, columns=labels)
The needed output is one row per stock per month, with the business start/end dates of that month:
Needed output
code to generate output
data = [("01-01-2010","29-01-2010", "AAPL", 15, 20), ("01-01-2010","29-01-2010", "AMZN", 10, 15),("02-02-2010","30-02-2010", "AAPL", 20, 32)]
labels = ['bd start month', 'bd end month','stock', 'start_month_value', "end_month_value"]
df = pd.DataFrame.from_records(data, columns=labels)
Meaning (pseudo code):
1. For each row, take only the non-NaN values to create a new "row" (maybe a dictionary with the date as the index and [stock, value] as the value).
2. Take only the rows that fall on the business start or business end of a month.
3. Write those rows to a new dataframe.
I have read several posts like this and this and several more.
All of them deal with dataframes of the same "type" and just resample, while I need to change the structure...
My code so far
# creating the new index with business days
df1 =pd.DataFrame(range(10000), index = pd.date_range(df.iloc[0].name, periods=10000, freq='D'))
from pandas.tseries.offsets import CustomBusinessMonthBegin
from pandas.tseries.holiday import USFederalHolidayCalendar
bmth_us = CustomBusinessMonthBegin(calendar=USFederalHolidayCalendar())
df2 = df1.resample(bmth_us).mean()
# creating the new index interseting my old one (daily) with the monthly index
new_index = df.index.intersection(df2.index)
# selecting only the rows I want
df = df.loc[new_index]
# creating a dict that will be my new dataset
import collections
new_dict = collections.OrderedDict()
# iterating over the rows and adding to dictionary
for index, row in df.iterrows():
    # print index
    date = df.loc[index].name
    # values are the not none values
    values = df.loc[index][~df.loc[index].isnull().values]
    new_dict[date] = values
# from dict to list
data=[]
for key, values in new_dict.items():
    for i in range(0, len(values)):
        date = key
        stock_name = str(values.index[i])
        stock_value = values.iloc[i]
        row = (key, stock_name, stock_value)
        data.append(row)
# from the list to df
labels = ['date','stock', 'value']
df = pd.DataFrame.from_records(data, columns=labels)
df.to_excel("migdal_format.xls")
Current output I get
One big problem:
I only get the value of each stock on the first business day of the month. I need both the start and the end value so I can calculate the stock's gain for that month.
One smaller problem:
I am sure this is not the cleanest and fastest code :)
Thanks a lot!
So I have found a way:
1. loop through each column
2. group by month
3. take the first and last value I have in that month
4. calculate the return
import re

df_migdal = pd.DataFrame()
for col in df_input.columns[0:]:
    stock_position = df_input.loc[:, col]
    name = stock_position.name
    name = re.sub('[^a-zA-Z]+', '', name)
    name = name[0:-4]
    stock_position = stock_position.groupby([pd.TimeGrouper('M')]).agg(['first', 'last'])
    stock_position["name"] = name
    stock_position["return"] = ((stock_position["last"] / stock_position["first"]) - 1) * 100
    stock_position.dropna(inplace=True)
    df_migdal = df_migdal.append(stock_position)
df_migdal = df_migdal.round(decimals=2)
I tried a way cooler way, but did not know how to handle the MultiIndex I got... I needed, for each column, to take the two sub-columns and create a third one with some lambda function.
df_input.groupby([pd.TimeGrouper('M')]).agg(['first', 'last'])
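For what it's worth, here is a sketch of one way to handle that MultiIndex, using pd.Grouper (pd.TimeGrouper has been removed from recent pandas versions) and the column names from the toy frame above:

import pandas as pd

# assumes df_input from the snippet above, with 'date' parsed to datetimes
df_input['date'] = pd.to_datetime(df_input['date'], dayfirst=True)
# monthly first/last per symbol -> columns become a MultiIndex like ('AAPL', 'first')
monthly = df_input.set_index('date').groupby(pd.Grouper(freq='M')).agg(['first', 'last'])
# stack the symbol level into the rows, leaving plain 'first'/'last' columns
per_stock = monthly.stack(level=0)
per_stock.index.names = ['month', 'stock']
# the "third column" is now a simple column-wise expression
per_stock['return'] = (per_stock['last'] / per_stock['first'] - 1) * 100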
I am working on a data analysis and I have to generate histograms. My code has more than 7 nested for-loops. Each nested loop filters the data frame by a unique value from one category to form a new data frame of sub-categories, which is then split further in the same way. Each day has around 400,000 records, and I have to process the last 30 days of records. The goal is to produce histograms of the values (there is only one numerical column) for each combination at the deepest, un-splittable level. How do I reduce the complexity? Are there any alternate methods?
for customer in data_frame['MasterCustomerID'].unique():
    df_customer = data_frame.loc[data_frame['MasterCustomerID'] == customer]
    for service in df_customer['Service'].unique():
        df_service = df_customer.loc[df_customer['Service'] == service]
        for source in df_service['Source'].unique():
            df_source = df_service.loc[df_service['Source'] == source]
            for subcomponent in df_source['SubComponentType'].unique():
                df_subcomponenttypes = df_source.loc[df_source['SubComponentType'] == subcomponent]
                for kpi in df_subcomponenttypes['KPI'].unique():
                    df_kpi = df_subcomponenttypes.loc[df_subcomponenttypes['KPI'] == kpi]
                    for device in df_kpi['Device_Type'].unique():
                        df_device_type = df_kpi.loc[df_kpi['Device_Type'] == device]
                        for access in df_device_type['Access_type'].unique():
                            df_access_type = df_device_type.loc[df_device_type['Access_type'] == access]
                            df_access_type['Day'] = ifweekday(df_access_type['PerformanceTimeStamp'])
You can use pandas.groupby to find unique combinations of different levels of the columns (see here and here) and then loop over the dataframe grouped by each combination. There are ~4000 combinations so be careful when uncommenting the histogram code below.
import string
import numpy as np, pandas as pd
from matplotlib import pyplot as plt
np.random.seed(100)
# Generate 400,000 records (400 obs for 1000 individuals in 6 columns)
NIDS = 1000; NOBS = 400; NCOLS = 6
df = pd.DataFrame(np.random.randint(0, 4, size = (NIDS*NOBS, NCOLS)))
mapper = dict(zip(range(26), list(string.ascii_lowercase)))
df.replace(mapper, inplace = True)
cols = ['Service', 'Source', 'SubComponentType', \
'KPI', 'Device_Type', 'Access_type']
df.columns = cols
# Generate IDs for individuals
df['MasterCustomerID'] = np.repeat(range(NIDS), NOBS)
# Generate values of interest (to be plotted)
df['value2plot'] = np.random.rand(NIDS*NOBS)
# View the counts for each unique combination of column levels
df.groupby(cols).size()
# Do something with the different subsets (such as make histograms)
for levels, group in df.groupby(cols):
    print(levels)
    # fig, ax = plt.subplots()
    # ax.hist(group['value2plot'])
    # ax.set_title(", ".join(levels))
    # plt.savefig("hist_" + "_".join(levels) + ".png")
    # plt.close()