Resampling a time-series DataFrame for custom seasons and finding aggregates - python

I am working on a huge time-series dataset of the following format:
import pandas as pd
import numpy as np
import random
import seaborn as sns
df = pd.DataFrame({'date': pd.date_range('1990', end='1994', freq='3H'),
                   'A': np.random.randint(low=0, high=100, size=11689),
                   'B': np.random.randint(low=10, high=45, size=11689)})
df['date'] = pd.to_datetime(df.date.astype(str), format='%Y-%m-%d %H:%M:%S', errors='coerce')
df.index = pd.DatetimeIndex(df.date)
df.drop('date', axis=1, inplace=True)
My aim is to first filter the dataframe according to customized seasons: winter (12, 1, 2) (i.e. Dec, Jan, Feb), pre-monsoon (3, 4, 5), monsoon (6, 7, 8, 9) and post-monsoon (10, 11). I am aware of the resample('Q-NOV') function, but it is quarterly only; as mentioned, I need to customize the months.
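For context, a quick sketch of why quarterly anchoring cannot express these seasons: Q-NOV produces quarters ending in February, May, August and November, i.e. fixed three-month blocks, whereas the monsoon here spans four months.
# Quarterly resample anchored at November: quarters end in Feb/May/Aug/Nov.
# These are fixed three-month blocks, so a four-month monsoon (Jun-Sep)
# cannot be expressed this way.
print(df.resample('Q-NOV').mean().head())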
I have been able to filter by these seasons by executing the following code:
# DF - Winter
winterStart = '-12'
winterEnd = '-02'
# year and year+1 because the winter season spans from one year into the next
df_winter = pd.concat([df.loc[str(year) + winterStart: str(year + 1) + winterEnd].mean()
                       for year in range(1990, 1994)])  # DJF

# DF - Pre-monsoon
df_preMonsoon = df[df.index.month.isin([3, 4, 5])]  # MAM
and so on.
Problem
I want to find the seasonal average values (for every year and season) of my parameters A and B over my data period. Any help will be highly appreciated.
Thank you in advance.

Could something like this work for you:
import pandas as pd
import numpy as np

seasons_order = (
    'Winter',
    'Pre-monsoon',
    'Monsoon',
    'Post-monsoon',
)

dates = pd.date_range('1990', end='1994', freq='3H')

def define_seasons(month):
    if 1 <= month <= 2:
        return 'Winter'
    if 3 <= month <= 5:
        return 'Pre-monsoon'
    if 6 <= month <= 9:
        return 'Monsoon'
    if 10 <= month <= 11:
        return 'Post-monsoon'
    if month == 12:
        return 'Winter'

def define_seasons_year(date_to_convert):
    if date_to_convert.month == 12:
        return date_to_convert.year + 1
    else:
        return date_to_convert.year

df = pd.DataFrame(
    {
        'A': np.random.randint(low=0, high=100, size=dates.size),
        'B': np.random.randint(low=10, high=45, size=dates.size),
        'season': pd.Categorical(
            dates.month.map(define_seasons),
            categories=seasons_order,
            ordered=True,
        ),
        'season_year': dates.map(define_seasons_year),
    },
    index=dates,
)
print(df)

seasonal_average_values = df.groupby(['season_year', 'season']).mean()
print(seasonal_average_values)
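For comparison, a possibly more compact sketch of the same grouping, assuming a plain month-to-season dict instead of the helper functions (December again rolled into the following winter):
# Hedged sketch: same seasonal means via a month -> season dict.
month_to_season = {12: 'Winter', 1: 'Winter', 2: 'Winter',
                   3: 'Pre-monsoon', 4: 'Pre-monsoon', 5: 'Pre-monsoon',
                   6: 'Monsoon', 7: 'Monsoon', 8: 'Monsoon', 9: 'Monsoon',
                   10: 'Post-monsoon', 11: 'Post-monsoon'}
season = df.index.month.map(month_to_season)
season_year = df.index.year + (df.index.month == 12)  # December counts toward the next winter
print(df[['A', 'B']].groupby([season_year, season]).mean())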

Related

Sum of a numeric column into specific ranges and counting its occurrences

I am quite new to programming in Python.
I have a dataset which needs to be modified. I tried a few methods for the sum part but I don't get the exact results.
Dataset: (table shown as an image in the original post)
Requirements:
To categorize the debit and credit values into the following ranges/bins:
a) 2000-4000
b) 5000-8000
c) 9000-20000
The sum should cover a 20-day period, i.e. if a transaction happened on 2020-01-01, then the sum should run from 2020-01-01 to 2020-01-20.
I also want a record of occurrences, i.e. the number of times a value falls in the assigned bin.
Required result: (table shown as an image in the original post)
The code I tried for the credit values:
EndDate = BM['transaction_date'] + pd.to_timedelta(20, unit='D')
StartDate = BM['transaction_date']
dfx = BM
dfx['EndDate'] = EndDate
dfx['StartDate'] = StartDate
dfx['Debit'] = dfx.apply(lambda x: BM.loc[(BM['transaction_date'] >= x.StartDate) &
                                          (BM['transaction_date'] <= x.EndDate),
                                          'Debit'].sum(), axis=1)
I have created a lot of functions and broken the problem into smaller tasks. Hope the comments make this understandable.
def sum20Days(df, debitORCredit):
    """
    Calculates the sum of all amounts in the debitORCredit column of df,
    looking 20 days into the future within df.
    df: pandas DataFrame. Should already be grouped by name.
    debitORCredit: String. Either "debit" or "credit"; column names in the dataframe.
    Returns:
    df: Creates a column sum_debit_20days, adds the summed amount and returns the final dataframe.
    """
    df = df.copy()
    temp_df = df[df[debitORCredit] > 0]
    dates = sorted(temp_df["transaction_date"].unique())
    curr_date = dates[0]
    date_20days = curr_date + pd.Timedelta(20, unit="D")
    i = 0
    while i < len(dates):
        date = dates[i]
        if date > date_20days:
            curr_date = date
            date_20days = curr_date + pd.Timedelta(20, unit="D")
        series = temp_df.loc[(df["transaction_date"] >= date) & (df["transaction_date"] <= date_20days), :]
        df.loc[max(df.loc[df["transaction_date"] == series["transaction_date"].max()].index),
               f"sum_{debitORCredit}_20days"] = sum(series[debitORCredit])
        new_i = series["transaction_date"].nunique()
        if new_i > 1:
            i = new_i + 1
        else:
            i += 1
    return df
def groupListUsingList(inp, groupby):
    """
    Groups inp by list groupby.
    inp: List
    groupby: List
    Example: inp = [0, 1, 2, 3, 4, 5, 6, 7], groupby = [3, 6]
             then output = [[0, 1, 2, 3], [4, 5, 6], [7]]
    """
    groupby = sorted(groupby)
    inp = sorted(inp)
    lst = []
    arr = []
    for i in inp:
        if len(groupby) > 0:
            if i <= groupby[0]:
                arr.append(i)
            else:
                if len(arr) > 0:
                    lst.append(arr)
                arr = [i]
                groupby.pop(0)
        else:
            # groupby exhausted: the remaining items all belong to the last group
            arr.append(i)
    if len(arr) > 0:
        lst.append(arr)
    return lst
def count_amounts_in_category(df, debitORCredit, category_info):
    """
    Based on the category assigned, finds the number of amounts belonging to that category.
    Inputs -
    df: pandas DataFrame. Grouped by name; only contains the transactions belonging to a single category calculation.
    debitORCredit: String. Either "credit" or "debit". Used to pick the column in df.
    category_info: Dict. Contains the rules of categorization.
    Output -
    count: Float. Returns the count.
    """
    if debitORCredit.lower() == "debit":
        temp_df = df.loc[df["debitorcredit"] == "D"]
    elif debitORCredit.lower() == "credit":
        temp_df = df.loc[df["debitorcredit"] == "C"]
    if temp_df.shape[0] == 0:
        return np.nan
    category = temp_df.iloc[-1].loc[f"category_{debitORCredit}"]
    amount_range = category_info.get(category)
    count = temp_df[debitORCredit].apply(lambda x: 1 if amount_range[0] <= x <= amount_range[1] else 0).sum()
    return count
def assign_category(amount, category_info):
    """
    Assigns a category based on the amount and the categorization rules.
    Input -
    amount: Float/Int. The amount.
    category_info: Dict. Contains the rules of categorization.
    Output -
    Returns the String category based on the categorization rules.
    """
    if pd.isna(amount):
        return np.nan
    for k, v in category_info.items():
        if v[0] <= amount <= v[1]:
            return k
    return np.nan

category_info = {"A": (2000, 4000),
                 "B": (5000, 8000),
                 "C": (9000, 20000)}
debitORCredit = "debit"
new_df = pd.DataFrame()
# Group by name, then for each date in a group, calculate the sum of
# debitORCredit amounts over the next 20 days
for group in df.groupby("name"):
    temp_df = sum20Days(group[1], debitORCredit=debitORCredit)
    new_df = pd.concat([new_df, temp_df])
new_df = new_df.reset_index(drop=True)
# Based on the 20-day sum, use the categorization rules to assign a category
new_df[f"category_{debitORCredit}"] = new_df[f"sum_{debitORCredit}_20days"].apply(lambda x: assign_category(x, category_info))
# After assigning a category, group by name, then group each 20-day window of
# transactions to count the transactions belonging to the category assigned
# to that window
for group in new_df.groupby("name"):
    # To group every 20-day window, identify the last row of each window (the
    # rows that have a sum_debit_20days value) and split the name-group on the
    # last value in the index
    indices = groupListUsingList(inp=group[1].index, groupby=group[1][group[1][f"sum_{debitORCredit}_20days"].notna()].index)
    for index in indices:
        count = count_amounts_in_category(df=new_df.loc[index], debitORCredit=debitORCredit, category_info=category_info)
        new_df.loc[index[-1], f"count_{debitORCredit}"] = count
new_df
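As a side note, the categorization step alone could also be expressed with pd.cut over an IntervalIndex; a minimal sketch, assuming the same bin edges as category_info (note that the intervals are right-closed by default):
import pandas as pd

# Hedged sketch: non-contiguous bins via an IntervalIndex.
# Values outside every interval become NaN.
bins = pd.IntervalIndex.from_tuples([(2000, 4000), (5000, 8000), (9000, 20000)])
labels = dict(zip(bins, ["A", "B", "C"]))

amounts = pd.Series([2500, 4500, 7000, 15000])
print(pd.cut(amounts, bins).map(labels))  # A, NaN, B, C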

How do I iterate through a pandas dataframe and access a lagging or leading row?

I have the following DataFrame:
df = pd.DataFrame(
    {
        'date': ['2020-12-05', '2020-12-06', '2020-12-07'],
        'day': ['Saturday', 'Sunday', 'Monday'],
        'score': [2, 3, 0]
    }
)
df
In the DataFrame above, I want to update the score on Monday if the scores on the weekend were non-zero values. For the DataFrame above, Monday's score would be 2.5. But it should work for other, longer DataFrames as well.
I know I can use the following:
df.score.loc[(df.day == 'Monday') & (df.score != 0) & (df.score.shift(1) != 0) & (df.score.shift(2) != 0)] = (df.score + df.score.shift(1)+df.score.shift(2))/3
df.score.loc[(df.day == 'Monday') & (df.score != 0) & (df.score.shift(1) != 0) & (df.score.shift(2) == 0)] = (df.score + df.score.shift(1))/2
df.score.loc[(df.day == 'Monday') & (df.score != 0) & (df.score.shift(1) == 0) & (df.score.shift(2) != 0)] = (df.score + df.score.shift(2))/2
df.score.loc[(df.day == 'Monday') & (df.score == 0) & (df.score.shift(1) != 0) & (df.score.shift(2) != 0)] = (df.score.shift(1) + df.score.shift(2))/2
df.score.loc[(df.day == 'Monday') & (df.score == 0) & (df.score.shift(1) != 0) & (df.score.shift(2) == 0)] = df.score.shift(1)
df.score.loc[(df.day == 'Monday') & (df.score == 0) & (df.score.shift(1) == 0) & (df.score.shift(2) != 0)] = df.score.shift(2)
but this is too lengthy. I think I need to iterate through the DataFrame, something like this:
for index, row in df.iterrows():
    if row.day == 'Monday':
        non_zeros = []
        if row.score != 0:
            non_zeros.append(row.score)
        if row.score.shift(1) != 0:
            non_zeros.append(row.score.shift(1))
        if row.score.shift(2) != 0:
            non_zeros.append(row.score.shift(2))
        mon_score = sum(non_zeros) / len(non_zeros)
        df.at[index, 'score'] = mon_score
The code above doesn't work because I get an error:
AttributeError: 'float' object has no attribute 'shift'
So, it seems that shift() isn't correct.
How would I access the previous row and how would I access the score in the previous row? Is there a better way than manually listing the combinations of conditions, like I've done above?
How would I access the previous row
Keep the previous row in a variable - you actually want to see the previous two rows.
rows = df.iterrows()
index, minus2 = next(rows)
index, minus1 = next(rows)
for index, current in rows:
    if current.day == 'Monday':
        print(f'Saturday:{minus2.date}, Sunday:{minus1.date}, Monday:{current.date}')
        print(f'Sat score:{minus2.score}, Sun score:{minus1.score}, Mon score:{current.score}')
        print('*********')
    minus2, minus1 = minus1, current
Here is another way to do it.
Setup
import pandas as pd
from numpy.random import default_rng

rng = default_rng()
dates = pd.date_range("2020-12-04", periods=60, freq="D")
days = dates.day_name()
score = rng.choice([0, 0, 0, 0, 0, 0, 0.1, 0.2, 0.3], size=60)
df = pd.DataFrame({"date": dates, "day": days, "score": score})
print(df.head(10))
Group the Saturday/Sunday/Monday rows; if there is a non-zero score on the weekend, calculate the new Monday score and use the group's Monday index to assign the new value to the DataFrame.
weekends = df.day.str.contains(r"Saturday|Sunday|Monday")
days_of_interest = df[weekends]
gb = df.groupby((days_of_interest.day == "Saturday").cumsum())
for k, g in gb:
    if (g.iloc[:2].score != 0).any():
        monday = g.iloc[-1].name
        new_score = g.score.mean()
        # print(new_score)
        df.loc[monday, "score"] = new_score
        # print(g)
print(df)
It assumes the data does not break/stop/start on a Saturday/Sunday/Monday boundary - a necessary assumption because it uses .iloc slicing/indexing and groups starting on 'Saturday'. Extra logic or alternate selection methods would be needed to accommodate the edge cases if the assumption does not hold.
I wasn't too sure how the Monday score was to be updated; it looked like you were averaging over the non-zero scores (sum(non_zeros)/len(non_zeros)) - including Monday(?). Maybe it should have been:
# add the average of the weekend scores to Monday's score
weekend_score = g.iloc[:2].score.mean()
df.loc[monday, 'score'] += weekend_score
# or just the sum of all the scores?
df.loc[monday, 'score'] = g.score.sum()
If I get it correctly, this might be the solution you have been looking for. I have assumed that your dummy data stays the same (no nulls or other randomized rows). Try the following if you want to use shift().
import pandas as pd
import math

df = pd.DataFrame(
    {
        'date': ['2020-11-30', '2020-12-05', '2020-12-06', '2020-12-07'],
        'day': ['Monday', 'Saturday', 'Sunday', 'Monday'],
        'score': [4, 2, 3, 0]
    }
)

def update(row, df1, df2):
    score_Mon = row['score']
    score_Sun = df1.iloc[row.name]['score']
    score_Sat = df2.iloc[row.name]['score']
    # Base condition: only update if previous Saturday/Sunday values are
    # available, else keep the default value
    if row['day'] == 'Monday' and not math.isnan(score_Sun) and not math.isnan(score_Sat):
        # Non-zero condition
        if score_Mon != 0 and score_Sun > 0 and score_Sat > 0:
            row['score'] = (score_Mon + score_Sun + score_Sat) / 3
        # Zero condition
        else:
            row['score'] = (score_Sun + score_Sat) / 2
    return row

df = df.apply(update, axis=1, args=[df.shift(1), df.shift(2)])
df
df = pd.DataFrame(
    {
        'date': ['2020-12-05', '2020-12-06', '2020-12-07', '2020-12-05', '2020-12-06', '2020-12-07'],
        'day': ['Saturday', 'Sunday', 'Monday', 'Saturday', 'Sunday', 'Monday'],
        'score': [-0.2, 0, 0.0, -0.3, 0, 0.0]
    }
)
Based on the question, if we have to access previous rows relative to the row in question, we can use the below:
mondays = df['day'] == 'Monday'
sundays = df.score.shift(1)[mondays]
saturdays = df.score.shift(2)[mondays]
# row_mask_for_upd = mondays & sundays.astype(bool) | saturdays.astype(bool)  # if either sundays or saturdays has to be non-zero
row_mask_for_upd = mondays & sundays.astype(bool) & saturdays.astype(bool)  # if both sundays and saturdays have to be non-zero
if True in set(row_mask_for_upd):
    df.loc[row_mask_for_upd, "score"] = (sundays + saturdays) / 2
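With the sample frame above, both weekend scores are zero, so the mask is all False and nothing is updated. A minimal hedged sketch with the non-zero weekend scores from the original question (expected Monday score 2.5):
import pandas as pd

df = pd.DataFrame(
    {
        'date': ['2020-12-05', '2020-12-06', '2020-12-07'],
        'day': ['Saturday', 'Sunday', 'Monday'],
        'score': [2, 3, 0]
    }
)

mondays = df['day'] == 'Monday'
sundays = df.score.shift(1)[mondays]
saturdays = df.score.shift(2)[mondays]
mask = mondays & sundays.astype(bool) & saturdays.astype(bool)
df.loc[mask, 'score'] = (sundays + saturdays) / 2
print(df)  # Monday's score becomes (3 + 2) / 2 = 2.5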

Take n rows from a spark dataframe and pass to toPandas()

I have this code:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.withColumn('age2', df.age + 2).toPandas()
Works fine, does what it needs to. Suppose though I only want to display the first n rows, and then call toPandas() to return a pandas dataframe. How do I do it? I can't call take(n) because that doesn't return a dataframe and thus I can't pass it to toPandas().
So to put it another way, how can I take the top n rows from a dataframe and call toPandas() on the resulting dataframe? Can't think this is difficult but I can't figure it out.
I'm using Spark 1.6.0.
You can use the limit(n) function:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.limit(2).withColumn('age2', df.age + 2).toPandas()
Or:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.withColumn('age2', df.age + 2).limit(2).toPandas()
You could get the first rows of the Spark DataFrame with head and then create a pandas DataFrame:
import pandas as pd

l = [('Alice', 1), ('Jim', 2), ('Sandra', 3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df_pandas = pd.DataFrame(df.head(3), columns=df.columns)
In [4]: df_pandas
Out[4]:
     name  age
0   Alice    1
1     Jim    2
2  Sandra    3
Try it:
def showDf(df, count=None, percent=None, maxColumns=0):
    if df is None:
        return
    import pandas
    from IPython.display import display
    pandas.set_option('display.encoding', 'UTF-8')
    # Pandas dataframe
    dfp = None
    # maxColumns param
    if maxColumns >= 0:
        if maxColumns == 0:
            maxColumns = len(df.columns)
        pandas.set_option('display.max_columns', maxColumns)
    # count param
    if count is None and percent is None:
        count = 10  # Default count
    if count is not None:
        count = int(count)
        if count == 0:
            count = df.count()
        pandas.set_option('display.max_rows', count)
        dfp = pandas.DataFrame(df.head(count), columns=df.columns)
        display(dfp)
    # percent param
    elif percent is not None:
        percent = float(percent)
        if 0.0 <= percent <= 1.0:
            import datetime
            now = datetime.datetime.now()
            seed = long(now.strftime("%H%M%S"))  # Python 2; use int() on Python 3
            dfs = df.sample(False, percent, seed)
            count = df.count()
            pandas.set_option('display.max_rows', count)
            dfp = dfs.toPandas()
            display(dfp)
Examples of usage:
# Shows the first ten rows of the Spark dataframe
showDf(df)
showDf(df, 10)
showDf(df, count=10)
# Shows a random sample which represents 15% of the Spark dataframe
showDf(df, percent=0.15)

Create new dataframe from multiple multi-index dataframes

I want to create a new dataframe spanning x years, built by taking random seasons from previous weather data.
Code to illustrate the problem:
import pandas as pd
import numpy as np

dates = pd.date_range('20070101', periods=3200)
df = pd.DataFrame(data=np.random.randint(0, 100, (3200, 1)), columns=list('A'))
df['date'] = dates
df = df[['date', 'A']]
Define a season function based on the date column:
def get_season(row):
    if row['date'].month >= 3 and row['date'].month <= 5:
        return '2'
    elif row['date'].month >= 6 and row['date'].month <= 8:
        return '3'
    elif row['date'].month >= 9 and row['date'].month <= 11:
        return '4'
    else:
        return '1'
Apply the function
df['Season'] = df.apply(get_season, axis=1)
Create a 'Year' column for indexing
df['Year'] = df['date'].dt.year
Multi-index by Year and Season
df = df.set_index(['Year', 'Season'], inplace=False)
Create new dataframes based on season to select from
winters = df.query('Season == "1"')
springs = df.query('Season == "2"')
summers = df.query('Season == "3"')
autumns = df.query('Season == "4"')
I now want to create a new DataFrame which takes a random winter from the winters dataframe, followed by a random spring from springs, followed by a random summer from summers and a random autumn from autumns, and does this for a specified number of years (e.g. 100), but I can't see how to do this.
EDIT:
Duplicate seasons are allowed (it should sample seasons randomly), and the first spring does not have to belong to the same year as the first winter; this doesn't matter.
EDIT 2: Solution using all seasonal dataframes:
years = df['date'].dt.year.unique()
dfs = []
for i in range(outputyears):
    dfs.append(winters.query("Year == %d" % np.random.choice(years, 1)))
    dfs.append(springs.query("Year == %d" % np.random.choice(years, 1)))
    dfs.append(summers.query("Year == %d" % np.random.choice(years, 1)))
    dfs.append(autumns.query("Year == %d" % np.random.choice(years, 1)))
rnd = pd.concat(dfs)
It's most probably not the best way to do it, but you can do it this way:
years = df['date'].dt.year.unique()
dfs = []
for i in range(100):
    dfs.append(df.query("Year == %d and Season == '1'" % np.random.choice(years, 1)))
    dfs.append(df.query("Year == %d and Season == '2'" % np.random.choice(years, 1)))
    dfs.append(df.query("Year == %d and Season == '3'" % np.random.choice(years, 1)))
    dfs.append(df.query("Year == %d and Season == '4'" % np.random.choice(years, 1)))
rnd = pd.concat(dfs)
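A possibly tidier sketch of the same idea, assuming the multi-indexed df from the question: pick a random year per season with query's @-variable syntax, and concatenate. The sampling logic is identical; only the selection mechanics differ.
years = df['date'].dt.year.unique()
parts = []
for _ in range(100):
    for season in ['1', '2', '3', '4']:
        year = np.random.choice(years)  # a random year; duplicates allowed
        parts.append(df.query("Year == @year and Season == @season"))
rnd = pd.concat(parts)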

How to generate random time series data including February with 29 days?

I have used the following code to generate random rainfall data from 1950 to 2009 with known probabilities, means and standard deviations. But I have been struggling to iterate the days of February as 29 days in the leap years. Also, when I try to save the output in a text file, it gives an error message like
TypeError: float argument required, not numpy.string_
Can anyone please help me out?
My code:
import numpy as np
import random
import itertools
import datetime

dry = [0.33, 0.27, 0.32, 0.41, 0.42, 0.45, 0.57, 0.52, 0.45, 0.39, 0.37, 0.37]
wet = [0.66, 0.72, 0.67, 0.58, 0.57, 0.54, 0.42, 0.47, 0.54, 0.60, 0.62, 0.62]
d2d_tran = [0.56, 0.50, 0.58, 0.62, 0.63, 0.67, 0.73, 0.66, 0.60, 0.56, 0.57, 0.62]
w2w_tran = [0.78, 0.80, 0.79, 0.73, 0.72, 0.72, 0.63, 0.64, 0.66, 0.71, 0.74, 0.76]
mu = [3.71, 4.46, 4.11, 2.94, 3.00, 2.87, 2.31, 2.44, 2.56, 3.45, 4.32, 4.12]
sigma = [6.72, 7.92, 7.49, 6.57, 6.09, 5.53, 4.38, 4.69, 4.31, 5.71, 7.64, 7.54]
days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

rain = []
for y in xrange(0, 60):
    for m in xrange(0, 12):
        random_num = np.random.rand(days[m])
        if random.random() <= dry[m]:
            random_num[0] = 0
        else:
            r = abs(random.gauss(mu[m], sigma[m]))
            random_num[0] = r
        for i in xrange(1, days[m]):
            if random_num[i-1] == 0:
                if random_num[i] <= d2d_tran[m]:
                    random_num[i] = 0
                else:
                    r = abs(random.gauss(mu[m], sigma[m]))
                    random_num[i] = r
            else:
                if random_num[i] <= w2w_tran[m]:
                    r = abs(random.gauss(mu[m], sigma[m]))
                    random_num[i] = r
                else:
                    random_num[i] = 0
        rain.append(random_num)

rain_series = []
for j in itertools.chain.from_iterable(rain):
    rain_series.append(j)
y = np.array(rain_series).reshape(-1, 1)

date_series = []
def generate_dates(start_date, end_date):
    return (start_date + datetime.timedelta(days=d) for d in xrange((end_date - start_date).days + 1))

start_date = datetime.date(1950, 1, 1)
end_date = datetime.date(2009, 12, 16)
for current_date in generate_dates(start_date, end_date):
    f = current_date.strftime('%Y-%m-%d')
    date_series.append(f)
z = np.array(date_series).reshape(-1, 1)

# Here I have 365x60 = 21900 rainfall values, which is why I had to set
# end_date to (2009, 12, 16). If the February days for leap years could be
# set as 29 in days[], this problem would be solved.
data = np.concatenate((z, y), axis=1)
print data
data1 = data.reshape((21900, 2))
np.savetxt('Random Rainfall Data.txt', data1)
# I want to shape the data into two columns, dates and rainfall, and then
# save it into a text file. But it gives an error!
Use calendar.monthrange() to get the number of days in a month:
import calendar

for year in xrange(1950, 2010):
    for month in xrange(1, 13):
        day_num = calendar.monthrange(year, month)[1]
        random_num = np.random.rand(day_num)
        ...
Regarding the data writing problem, you must add a third argument. It depends on your specific problem:
np.savetxt('test.txt', a, '%s') # Example
See documentation for more formatting info.
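Alternatively, a hedged sketch using pandas: pd.date_range accounts for leap years automatically, so the date axis and a rainfall array built from calendar month lengths stay aligned, and to_csv sidesteps the savetxt type error:
import calendar
import numpy as np
import pandas as pd

# Leap days are included automatically: 60 years of 365 days plus 15 leap days.
dates = pd.date_range('1950-01-01', '2009-12-31', freq='D')
n_days = sum(calendar.monthrange(y, m)[1]
             for y in range(1950, 2010) for m in range(1, 13))
assert len(dates) == n_days == 21915

rainfall = np.random.rand(len(dates))  # placeholder for the generated series
pd.DataFrame({'date': dates.strftime('%Y-%m-%d'), 'rain': rainfall}).to_csv(
    'Random Rainfall Data.txt', sep=' ', index=False, header=False)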
