Resampling timeseries Data Frame for different customized seasons and finding aggregates - python

I am working on a huge timeseries dataset of the following format:
import pandas as pd
import numpy as np
import random
import seaborn as sns
df = pd.DataFrame({'date':pd.date_range('1990',end = '1994',freq='3H'),
'A': np.random.randint(low = 0,high=100,size=11689),
'B': np.random.randint(low = 10,high=45,size=11689) })
df['date'] = pd.to_datetime(, format='%Y/%m/%d %H:%M',errors ='coerce')
df.index = pd.DatetimeIndex(
df.drop('date', axis = 1, inplace = True)
My aim is to first filter the dataframe according to the customized seasons: winter (12,1,2) (i.e. Dec, Jan, Feb), Pre-monsoon (3,4,5) , monsoon (6,7,8,9) and post-monsoon (10,11). I am aware of resample('Q-NOV') function but it is quarterly only. As mentioned, I need to customize the months.
I have been able to do so by executing the following code:
# DF-Winter
winterStart = '-12'
winterEnd = '-02'
df_winter = pd.concat([df.loc[str(year) + winterStart : str(year+1) + winterEnd].mean() for year in range(1990, 1994)]) # DJF
# used year and year+1 because winter season spans from an initial year to the next year.
# DF - Premonsoon
df_preMonsoon = df[df.index.month.isin([3,4,5])] # MAM
and so on.
I want to find the seasonal average values (every year and season) of my parameters A and B for my data period. Any help will be highly appreciated.
Thank you in advance.

Could something like that work for you:
import pandas as pd
import numpy as np
seasons_order = (
dates = pd.date_range('1990',end = '1994',freq='3H')
def define_seasons(month):
    """Map a calendar month number to its customised season name.

    Winter is December-February, Pre-monsoon is March-May, Monsoon is
    June-September and Post-monsoon is October-November.

    Args:
        month: Month number, 1 (January) through 12 (December).

    Returns:
        The season name as a string, or ``None`` when ``month`` does not
        fall inside 1-12.
    """
    # December belongs to the same season as January and February, so it is
    # tested together with them instead of in a separate trailing branch.
    if month == 12 or 1 <= month <= 2:
        return 'Winter'
    if 3 <= month <= 5:
        return 'Pre-monsoon'
    if 6 <= month <= 9:
        return 'Monsoon'
    if 10 <= month <= 11:
        return 'Post-monsoon'
    return None
def define_seasons_year(date_to_convert):
    """Return the year that a date's season is attributed to.

    December is counted towards the following year, so that a winter
    spanning December-February is grouped under a single year.

    Args:
        date_to_convert: Any object exposing ``month`` and ``year``
            attributes (for example a ``datetime.date`` or a pandas
            ``Timestamp``).

    Returns:
        ``year + 1`` for December dates, otherwise ``year``.
    """
    if date_to_convert.month == 12:
        return date_to_convert.year + 1
    return date_to_convert.year
df = pd.DataFrame(
'A': np.random.randint(low = 0, high=100, size=dates.size),
'B': np.random.randint(low = 10, high=45, size=dates.size),
'season': pd.Categorical(,
seasonal_average_values = df.groupby(['season_year', 'season']).mean()


Sum of a numeric columns into specific ranges and counting its occurrences

I am quite new to programming in Python.
I have a dataset which needs to be modified. I tried a few methods for the sum part, but I don't get the exact results.
Dataset : My data table
To categorize the debit and credit values into the following ranges/bins :
a) 2000-4000
b) 5000-8000
c) 9000-20000
The sum of debit should be for 20 days period like
if the transaction happened on 2020-01-01 then
the sum of credit should be from 2020-01-01 to 2020-01-20
I also want the record of occurrences i.e
the number of times the value from the bins lies in the category
Required Result : [Result][2]
The code I tried for credit values:
EndDate = BM['transaction_date']+ pd.to_timedelta(20, unit='D')
StartDate= BM['transaction_date']
dfx['EndDate'] = EndDate
dfx['StartDate'] = StartDate
dfx['Debit'] = dfx.apply(lambda x: BM.loc[(df['transaction_date'] >= x.StartDate) &
<=x.EndDate),'Debit'].sum(), axis=1)
error :
I have created a lot of functions and broke the problem into smaller tasks. Hope the comments make this understandable.
# Adds a "sum_<debitORCredit>_20days" column to a copy of df, holding sums of
# the debitORCredit column over windows that end 20 days after a start date.
# NOTE(review): this snippet has lost its indentation and its docstring quotes
# (the four prose lines after the signature are the docstring text); both must
# be restored before it can run.
def sum20Days(df, debitORCredit):
Calculates the sum of all amount in the debitORCredit column of df looking 20 days into the future within df
df: pandas DataFrame. Should already do groupby on name
debitORCredit : String. Takes either debit or credit. Column names in the dataframe
df: Creates a column sum_debit_20days, adds the sum amount and returns the final dataframe
# Work on a copy so the caller's frame is not modified.
df = df.copy()
# Only rows with a strictly positive amount in the chosen column are summed.
temp_df = df[df[debitORCredit]>0]
# Distinct transaction dates of those rows, in ascending order.
# NOTE(review): dates[0] below raises IndexError when no row has a positive amount.
dates = sorted(temp_df["transaction_date"].unique())
# The first window starts at the earliest date and ends 20 days later.
curr_date = dates[0]
date_20days = curr_date + pd.Timedelta(20, unit="D")
i = 0
# NOTE(review): presumably every line down to `i += 1` is the loop body — confirm.
while i < len(dates):
date = dates[i]
# A date past the current window end opens a new 20-day window
# (presumably only the next two lines belong to this `if`).
if date > date_20days:
curr_date = date
date_20days = curr_date + pd.Timedelta(20, unit="D")
# Rows dated between `date` and the window end, both inclusive; the boolean
# mask is computed on df but applied to temp_df.
series = temp_df.loc[(df["transaction_date"]>=date)&(df["transaction_date"]<=date_20days), :]
# Store the window total on the highest-index row dated on the window's last
# transaction date.
df.loc[max(df.loc[df["transaction_date"] == series["transaction_date"].max()].index), f"sum_{debitORCredit}_20days"] = sum(series[debitORCredit])
# Number of distinct dates covered by this window.
new_i = series["transaction_date"].nunique()
# NOTE(review): an `else:` before `i += 1` may have been lost in extraction;
# `i = new_i+1` also sets an absolute position rather than advancing from the
# current one — confirm the intended stepping.
if new_i > 1:
i = new_i+1
i += 1
return df
def groupListUsingList(inp, groupby):
    """Split the sorted values of ``inp`` into runs ending at each boundary.

    Every value in ``groupby`` acts as an inclusive upper boundary: a run
    collects the values greater than the previous boundary and less than or
    equal to the current one. Values above the last boundary form a final
    run. Boundaries with no matching values produce no run.

    Args:
        inp: Iterable of sortable values to group.
        groupby: Iterable of boundary values, comparable with ``inp``.

    Returns:
        A list of lists, in ascending order.

    Example:
        inp = [0, 1, 2, 3, 4, 5, 6, 7], groupby = [3, 6] gives
        [[0, 1, 2, 3], [4, 5, 6], [7]].
    """
    # Function-scope imports; itertools.groupby is aliased because the
    # ``groupby`` parameter shadows that name.
    from bisect import bisect_left
    from itertools import groupby as _group_runs

    boundaries = sorted(groupby)
    items = sorted(inp)

    def bucket(item):
        # Position of the first boundary >= item; values above every
        # boundary share the bucket len(boundaries).
        return bisect_left(boundaries, item)

    # items is sorted, so bucket numbers never decrease and consecutive
    # grouping is enough.
    return [list(run) for _, run in _group_runs(items, key=bucket)]
def count_amounts_in_category(df, debitORCredit, category_info):
    """Count the amounts that fall inside the range of the assigned category.

    The category is read from the ``category_<debitORCredit>`` column of the
    last row of the matching transaction type.

    Args:
        df: pandas DataFrame with a ``debitorcredit`` column ("D"/"C"), an
            amount column named ``debitORCredit`` and a
            ``category_<debitORCredit>`` column.
        debitORCredit: "debit" or "credit" (case-insensitive for selecting
            the transaction type; used as given for the column names).
        category_info: Dict mapping a category label to an inclusive
            ``(low, high)`` amount range.

    Returns:
        The number of amounts within the category's range, or ``np.nan``
        when there are no matching transactions or the last row has no
        known category.

    Raises:
        ValueError: If ``debitORCredit`` is neither "debit" nor "credit".
    """
    flag_by_kind = {"debit": "D", "credit": "C"}
    kind = debitORCredit.lower()
    if kind not in flag_by_kind:
        raise ValueError(
            f"debitORCredit must be 'debit' or 'credit', got {debitORCredit!r}"
        )

    temp_df = df.loc[df["debitorcredit"] == flag_by_kind[kind]]
    if temp_df.empty:
        return np.nan

    category = temp_df.iloc[-1].loc[f"category_{debitORCredit}"]
    amount_range = category_info.get(category)
    if amount_range is None:
        # No category was assigned (e.g. NaN), so there is no range to count in.
        return np.nan

    # between() is inclusive on both ends, matching low <= x <= high.
    return temp_df[debitORCredit].between(amount_range[0], amount_range[1]).sum()
def assign_category(amount, category_info):
    """Assign a category to an amount using the categorisation rules.

    Args:
        amount: Float/int amount; may be missing (NaN/None).
        category_info: Dict mapping a category label to an inclusive
            ``(low, high)`` amount range.

    Returns:
        The label of the first category whose range contains ``amount``,
        or ``np.nan`` when the amount is missing or matches no range.
    """
    if pd.isna(amount):
        return np.nan
    for category, bounds in category_info.items():
        if bounds[0] <= amount <= bounds[1]:
            return category
    return np.nan
# Categorisation rules: category label -> inclusive (low, high) amount range.
category_info = {
    "A": (2000, 4000),
    "B": (5000, 8000),
    "C": (9000, 20000),
}
debitORCredit = "debit"
sum_column = f"sum_{debitORCredit}_20days"

# Group by name, then for each group calculate the sum of debitORCredit
# amounts over the 20-day windows. The per-name frames are collected first
# and concatenated once, instead of re-copying a growing frame per group.
frames = [
    sum20Days(group, debitORCredit=debitORCredit)
    for _, group in df.groupby("name")
]
new_df = pd.concat(frames) if frames else pd.DataFrame()
new_df = new_df.reset_index(drop=True)

# Based on the 20-day sum, use the categorisation rules to assign a category.
new_df[f"category_{debitORCredit}"] = new_df[sum_column].apply(
    lambda total: assign_category(total, category_info)
)

# After assigning a category, group by name and then by 20-day window to
# count the transactions that fall inside the category assigned to the window.
for _, group in new_df.groupby("name"):
    # The last row of every window is the one carrying a 20-day sum; the
    # group's index is split at those rows.
    window_ends = group[group[sum_column].notna()].index
    for index in groupListUsingList(inp=group.index, groupby=window_ends):
        count = count_amounts_in_category(
            df=new_df.loc[index],
            debitORCredit=debitORCredit,
            category_info=category_info,
        )
        new_df.loc[index[-1], f"count_{debitORCredit}"] = count

How do I iterate through a pandas dataframe and access a lagging or leading row?

I have the following DataFrame:
df = pd.DataFrame(
'date': ['2020-12-05', '2020-12-06', '2020-12-07'],
'day': ['Saturday', 'Sunday', 'Monday'],
'score': [2, 3, 0]
In the DataFrame above, I want to update the score on Monday if the scores on the weekend were non-zero values. For the DataFrame above, Monday's score would be 2.5. But it should work for other, longer DataFrames as well.
I know I can use the following:
df.score.loc[( == 'Monday') & (df.score != 0) & (df.score.shift(1) != 0) & (df.score.shift(2) != 0)] = (df.score + df.score.shift(1)+df.score.shift(2))/3
df.score.loc[( == 'Monday') & (df.score != 0) & (df.score.shift(1) != 0) & (df.score.shift(2) == 0)] = (df.score + df.score.shift(1))/2
df.score.loc[( == 'Monday') & (df.score != 0) & (df.score.shift(1) == 0) & (df.score.shift(2) != 0)] = (df.score + df.score.shift(2))/2
df.score.loc[( == 'Monday') & (df.score == 0) & (df.score.shift(1) != 0) & (df.score.shift(2) != 0)] = (df.score.shift(1) + df.score.shift(2))/2
df.score.loc[( == 'Monday') & (df.score == 0) & (df.score.shift(1) != 0) & (df.score.shift(2) == 0)] = df.score.shift(1)
df.score.loc[( == 'Monday') & (df.score == 0) & (df.score.shift(1) == 0) & (df.score.shift(2) != 0)] = df.score.shift(2)
but this is too lengthy. I think I need to iterate through the DataFrame, something like this:
for index, row in df.iterrows():
if == 'Monday':
non_zeros = []
if row.score != 0:
if row.score.shift(1) != 0:
if row.score.shift(2) != 0:
mon_score = sum(non_zeros)/len(non_zeros)[index, 'score'] = mon_score
The code above doesn't work because I get an error:
AttributeError: 'float' object has no attribute 'shift'
So, it seems that shift() isn't correct.
How would I access the previous row and how would I access the score in the previous row? Is there a better way than manually listing the combinations of conditions, like I've done above?
How would I access the previous row
Keep the previous row in a variable - you actually want to see the previous two rows.
rows = df.iterrows()
index,minus2 = next(rows)
index,minus1 = next(rows)
for index, current in rows:
if == 'Monday':
print(f'Saturday:{}, Sunday:{}, Monday:{}')
print(f'Sat score:{minus2.score}, Sun score:{minus2.score}, Mon score:{current.score}')
minus2,minus1 = minus1,current
Here is another way to do it.
import pandas as pd
from numpy.random import default_rng
rng = default_rng()
dates = pd.date_range("2020-12-04", periods=60, freq="D")
days = dates.day_name()
score = rng.choice([0, 0, 0, 0, 0, 0, 0.1, 0.2, 0.3], size=60)
df = pd.DataFrame({"date": dates, "day": days, "score": score})
Groupby weekend/weekday; if there is a score on the weekend; calculate the new Monday score; use the group's Monday index to assign a new value to the DataFrame.
weekends ="Saturday|Sunday|Monday")
days_of_interest = df[weekends]
gb = df.groupby(( == "Saturday").cumsum())
for k, g in gb:
if (g.iloc[:2].score != 0).any():
monday = (g.iloc[-1]).name
new_score = g.score.mean()
# print(new_score))
df.loc[monday, "score"] = new_score
# print(g)
It assumes the data does not break/stop/start on a Saturday,Sunday,Monday boundary - necessary assumption because it uses .iloc slicing/indexing and groups starting on 'Saturday'. Extra logic or alternate selection methods would be needed to accommodate the edge cases if the assumption is not correct.
I wasn't too sure on how the monday score was to be updated, looked like you were averaging over non-zero scores (sum(non_zeros)/len(non_zeros)) - including monday(?). Maybe it should have been:
# add average of weekend scores to monday score
weekend_score = g.iloc[:2].score.mean()
df.loc[monday,'score'] += weekend_score
# or just the sum of all the scores?
df.loc[monday,'score'] = g.score.sum()
If I understand correctly, this might be the solution you are looking for. I have assumed that your data looks like the dummy data (no nulls or any other randomized rows). Try the following if you want to use shift().
import pandas as pd
import math
df = pd.DataFrame(
'date': ['2020-11-30','2020-12-05', '2020-12-06', '2020-12-07'],
'day': ['Monday','Saturday', 'Sunday', 'Monday'],
'score': [4, 2, 3, 0]
# Row-wise callback for DataFrame.apply(axis=1): rewrites a Monday row's score
# from that row and the two frames passed as df1/df2 (the caller passes
# df.shift(1) and df.shift(2)).
# NOTE(review): this snippet lost its indentation and some tokens in
# extraction (empty `iloc[]`, a condition ending in `and`, probably an
# `else:`); it cannot run as written.
def update(row,df1,df2):
score_Mon = row['score']
# NOTE(review): the index inside iloc[] is missing; presumably the current
# row's position, giving the previous day's score — confirm.
score_Sun = df1.iloc[]['score']
# Same lookup in the second frame; presumably the score two days earlier.
score_Sat = df2.iloc[]['score']
# Base condition if previous Saturday,Sunday values available, else put
# default value only
# NOTE(review): the rest of this condition is truncated after `and`.
if row['day'] == 'Monday' and not(math.isnan(score_Sun)) and
# Non zero condition
if score_Mon != 0 and score_Sun > 0 and score_Sat > 0:
# Average of the Monday score and the two preceding scores.
row['score'] = (score_Mon + score_Sun + score_Sat)/3
# Zero condition
# NOTE(review): presumably the `else:` branch of the check above — confirm.
row['score'] = (score_Sun + score_Sat)/2
return row
df = df.apply(update,axis=1,args = [df.shift(1),df.shift(2)])
df = pd.DataFrame(
'date': ['2020-12-05', '2020-12-06', '2020-12-07', '2020-12-05', '2020-12-06', '2020-12-07'],
'day': ['Saturday', 'Sunday', 'Monday','Saturday', 'Sunday', 'Monday'],
'score': [-0.2, 0, 0.0, -0.3, 0, 0.0]
Based on the question, if we have to access previous rows relative to the row in question, we can use the code below:
mondays = df['day']=='Monday'
sundays = df.score.shift(1)[mondays]
saturdays = df.score.shift(2)[mondays]
# row_mask_for_upd = mondays & sundays.astype(bool) | saturdays.astype(bool) # if either of sundays or saturdays have to be non zero
row_mask_for_upd = mondays & sundays.astype(bool) & saturdays.astype(bool) # if both sundays and saturdays have to be non zero
if True in set(row_mask_for_upd):
df.loc[row_mask_for_upd, "score"] = (sundays + saturdays)/2
Take n rows from a spark dataframe and pass to toPandas()

I have this code:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.withColumn('age2', df.age + 2).toPandas()
Works fine, does what it needs to. Suppose though I only want to display the first n rows, and then call toPandas() to return a pandas dataframe. How do I do it? I can't call take(n) because that doesn't return a dataframe and thus I can't pass it to toPandas().
So to put it another way, how can I take the top n rows from a dataframe and call toPandas() on the resulting dataframe? Can't think this is difficult but I can't figure it out.
I'm using Spark 1.6.0.
You can use the limit(n) function:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.limit(2).withColumn('age2', df.age + 2).toPandas()
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.withColumn('age2', df.age + 2).limit(2).toPandas()
You could get first rows of Spark DataFrame with head and then create Pandas DataFrame:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df_pandas = pd.DataFrame(df.head(3), columns=df.columns)
In [4]: df_pandas
name age
0 Alice 1
1 Jim 2
2 Sandra 3
Try it:
def showDf(df, count=None, percent=None, maxColumns=0):
    """Show part of a Spark DataFrame as a pandas DataFrame.

    Args:
        df: Spark DataFrame to show; nothing happens when it is ``None``.
        count: Number of leading rows to show. ``0`` means every row.
            Takes precedence over ``percent``. Defaults to 10 when neither
            ``count`` nor ``percent`` is given.
        percent: Fraction (0.0-1.0) of rows to sample at random, without
            replacement. Values outside that range show nothing.
        maxColumns: Value for pandas' ``display.max_columns``; ``0`` means
            all columns of ``df``; a negative value leaves the option
            untouched.

    Returns:
        None. The resulting pandas DataFrame is rendered with
        ``IPython.display.display``.
    """
    if df is None:
        return

    import pandas
    from IPython.display import display

    pandas.set_option('display.encoding', 'UTF-8')

    # pandas DataFrame to be shown
    dfp = None

    # maxColumns param
    if maxColumns >= 0:
        if maxColumns == 0:
            maxColumns = len(df.columns)
        pandas.set_option('display.max_columns', maxColumns)

    # count param
    if count is None and percent is None:
        count = 10  # Default count

    if count is not None:
        count = int(count)
        if count == 0:
            count = df.count()
        pandas.set_option('display.max_rows', count)
        dfp = pandas.DataFrame(df.head(count), columns=df.columns)
    # percent param
    elif percent is not None:
        percent = float(percent)
        if 0.0 <= percent <= 1.0:
            import datetime

            # The sampling seed is derived from the current wall-clock time.
            now = datetime.datetime.now()
            seed = int(now.strftime("%H%M%S"))
            dfs = df.sample(False, percent, seed)
            count = df.count()
            pandas.set_option('display.max_rows', count)
            dfp = dfs.toPandas()

    # NOTE(review): the extracted snippet built dfp but never used it, even
    # though `display` is imported and the usage examples say the function
    # "shows" rows; the display call is restored on that basis — confirm.
    if dfp is not None:
        display(dfp)
Examples of usages are:
# Shows the ten first rows of the Spark dataframe
showDf(df, 10)
showDf(df, count=10)
# Shows a random sample which represents 15% of the Spark dataframe
showDf(df, percent=0.15)

Create new dataframe from multiple multi-index dataframes

I want to create a new dataframe with x amount of years which takes random seasons from previous weather data.
Code to illustrate the problem:
import pandas as pd
import numpy as np
dates = pd.date_range('20070101',periods=3200)
df = pd.DataFrame(data=np.random.randint(0,100,(3200,1)), columns =list('A'))
df['date'] = dates
df = df[['date','A']]
Apply season function to the datetime index
def get_season(row):
    """Return the season code for a row's date.

    Args:
        row: Mapping-like row (for example a pandas Series) whose ``'date'``
            entry exposes a ``month`` attribute.

    Returns:
        ``'2'`` for March-May, ``'3'`` for June-August, ``'4'`` for
        September-November and ``'1'`` for every other month.
    """
    month = row['date'].month
    if 3 <= month <= 5:
        return '2'
    if 6 <= month <= 8:
        return '3'
    if 9 <= month <= 11:
        return '4'
    return '1'
Apply the function
df['Season'] = df.apply(get_season, axis=1)
Create a 'Year' column for indexing
df['Year'] = df['date'].dt.year
Multi-index by Year and Season
df = df.set_index(['Year', 'Season'], inplace=False)
Create new dataframes based on season to select from
winters = df.query('Season == "1"')
springs = df.query('Season == "2"')
summers = df.query('Season == "3"')
autumns = df.query('Season == "4"')
I now want to create a new DataFrame which takes a random winter from the winters dataframe, followed by a random spring from springs, followed by a random summer from summers and a random autumn from autumns, and does this for a specified number of years (e.g. 100), but I can't see how to do this.
Duplicate seasons are allowed (it should sample seasons randomly), and the first spring does not have to belong to the same year as the first winter, this doesn't matter.
EDIT 2: Solution using all seasonal dataframes:
# Years available for sampling.
years = df['date'].dt.year.unique()
dfs = []
for _ in range(outputyears):
    # One randomly chosen year per season, in winter-spring-summer-autumn
    # order. np.random.choice(years) returns a scalar, so the %d formatting
    # does not rely on converting a one-element array to an int.
    for season_frame in (winters, springs, summers, autumns):
        dfs.append(season_frame.query("Year == %d" % np.random.choice(years)))
rnd = pd.concat(dfs)
It's most probably not the best way to do it, but you can do it this way:
# Years available for sampling.
years = df['date'].dt.year.unique()
dfs = []
for _ in range(100):
    # One randomly chosen year per season code, in '1'..'4' order.
    # np.random.choice(years) returns a scalar, so the %d formatting does
    # not rely on converting a one-element array to an int.
    for season in ('1', '2', '3', '4'):
        year = np.random.choice(years)
        dfs.append(df.query("Year == %d and Season == '%s'" % (year, season)))
rnd = pd.concat(dfs)

How to generate random time series data including February with 29 days?

I have used the following code to generate random rainfall data from 1950 to 2009 with known probabilities, means and standard deviations. However, I am struggling to make February run for 29 days in leap years. Also, I was trying to save the output to a text file, but it gives an error message like
TypeError: float argument required, not numpy.string_
Can anyone please help me out?
My code:
import numpy as np
import random
import itertools
import datetime
dry =[0.33,0.27,0.32,0.41,0.42,0.45,0.57,0.52,0.45,0.39,0.37,0.37]
wet = [0.66,0.72,0.67,0.58,0.57,0.54,0.42,0.47,0.54,0.60,0.62,0.62]
d2d_tran = [0.56,0.50,0.58,0.62,0.63,0.67,0.73,0.66,0.60,0.56,0.57,0.62]
w2w_tran = [0.78,0.80,0.79,0.73,0.72,0.72,0.63,0.64,0.66,0.71,0.74,0.76]
mu = [3.71, 4.46, 4.11, 2.94, 3.00, 2.87, 2.31, 2.44, 2.56, 3.45, 4.32, 4.12]
sigma = [6.72,7.92,7.49,6.57,6.09,5.53,4.38,4.69,4.31,5.71,7.64,7.54]
days = [31,28,31,30,31,30,31,31,30,31,30,31]
rain = []
for y in xrange(0,60):
for m in xrange(0,12):
random_num = np.random.rand(days[m])
if random.random() <= dry[m]:
random_num[0] = 0
r = abs(random.gauss(mu[m],sigma[m]))
random_num[0] = r
for i in xrange(1,days[m]):
if random_num[i-1] == 0:
if random_num[i] <= d2d_tran[m]:
random_num[i] = 0
r = abs(random.gauss(mu[m],sigma[m]))
random_num[i] = r
if random_num[i] <= w2w_tran[m]:
r = abs(random.gauss(mu[m],sigma[m]))
random_num[i] = r
random_num[i] = 0
rain_series = []
for j in itertools.chain.from_iterable(rain):
y = np.array(rain_series).reshape(-1, 1)
date_series = []
def generate_dates(start_date, end_date):
    """Generate every date from ``start_date`` through ``end_date`` inclusive.

    Args:
        start_date: First date to produce.
        end_date: Last date to produce.

    Returns:
        A generator of dates one day apart; it is empty when ``end_date``
        is before ``start_date``.
    """
    # range() exists on both Python 2 and 3, unlike xrange().
    day_count = (end_date - start_date).days + 1
    return (
        start_date + datetime.timedelta(days=offset)
        for offset in range(day_count)
    )
start_date =, 1, 1)
end_date =, 12, 16)
for current_date in generate_dates(start_date, end_date):
f = current_date.strftime('%Y-%m-%d')
z = np.array(date_series).reshape(-1, 1)
#### Here I have 365x60 = 21900 rainfall values, that is why I had to
####set the end_date upto (2009,12,16). If
#### the February days for leap years can be set as 29 in days[] of
####rain_series than this problem would be solved.
data = np.concatenate((z,y), axis=1)
print data
data1 = data.reshape((21900,2))
np.savetxt('Random Rainfall Data.txt', data1)
#### I want to shape data in two columns like dates and rainfall.
#### And than, save it into a text file. But, it provides an error!!!
Use the calendar.monthrange() to get the number of days of a month.
for year in xrange(1950,2020):
for month in xrange(1,13):
day_num = calendar.monthrange(year, month)[1]
random_num = np.random.rand(day_num)
Regarding the data writing problem, you must add a third argument. It depends on your specific problem:
np.savetxt('test.txt', a, '%s') # Example
See documentation for more formatting info.

