Duplicate rows with some changes in Python

I ran into a problem duplicating rows with a loop in Python. I have a pandas DataFrame like this:
   userId  period      Date
0   41851       4  1/4/2015
1   13575       1  1/4/2015
I want to duplicate the first row 3 times (one extra row for each period beyond 1), so that every resulting row has period 1 and each successive duplicate has one more month added to the date. The result should look like this:
   userId  period      Date
0   41851       1  1/4/2015
1   41851       1  2/4/2015
2   41851       1  3/4/2015
3   41851       1  4/4/2015
4   13575       1  1/4/2015
Does someone know how to do that? Thanks!

The idea is to repeat the rows with Index.repeat and DataFrame.loc, then add whole months using GroupBy.cumcount together with the numpy helpers below, and finally, if necessary, change the format of the datetimes with Series.dt.strftime:
import numpy as np
import pandas as pd

def combine64(years, months=1, days=1, weeks=None, hours=None, minutes=None,
              seconds=None, milliseconds=None, microseconds=None, nanoseconds=None):
    years = np.asarray(years) - 1970
    months = np.asarray(months) - 1
    days = np.asarray(days) - 1
    types = ('<M8[Y]', '<m8[M]', '<m8[D]', '<m8[W]', '<m8[h]',
             '<m8[m]', '<m8[s]', '<m8[ms]', '<m8[us]', '<m8[ns]')
    vals = (years, months, days, weeks, hours, minutes, seconds,
            milliseconds, microseconds, nanoseconds)
    return sum(np.asarray(v, dtype=t) for t, v in zip(types, vals)
               if v is not None)

def year(dates):
    "Return an array of the years given an array of datetime64s"
    return dates.astype('M8[Y]').astype('i8') + 1970

def month(dates):
    "Return an array of the months given an array of datetime64s"
    return dates.astype('M8[M]').astype('i8') % 12 + 1

def day(dates):
    "Return an array of the days of the month given an array of datetime64s"
    return (dates - dates.astype('M8[M]')) / np.timedelta64(1, 'D') + 1

df['Date'] = pd.to_datetime(df['Date'])

# repeat each row 'period' times
df1 = df.loc[df.index.repeat(df['period'])]
# 0, 1, 2, ... within each original row = number of months to add
g = df1.groupby(level=0).cumcount()

start = df1['Date'].values
df1['Date'] = combine64(year(start), months=month(start) + g,
                        days=day(start))
df1['period'] = 1
df1 = df1.reset_index(drop=True)
df1['Date'] = df1['Date'].dt.strftime('%m/%d/%Y')
print (df1)
   userId  period        Date
0   41851       1  01/04/2015
1   41851       1  02/04/2015
2   41851       1  03/04/2015
3   41851       1  04/04/2015
4   13575       1  01/04/2015
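As an aside, the month arithmetic can also be done without the numpy helpers, using pandas' own DateOffset per duplicated row. This is only a sketch of an alternative (the names out and shift are illustrative), assuming whole-month shifts are what is wanted:
import pandas as pd

df['Date'] = pd.to_datetime(df['Date'])
out = df.loc[df.index.repeat(df['period'])].copy()
# 0, 1, 2, ... within each original row = number of months to shift
shift = out.groupby(level=0).cumcount()
# add that many calendar months to each duplicated date
out['Date'] = [d + pd.DateOffset(months=int(m))
               for d, m in zip(out['Date'], shift)]
out['period'] = 1
out = out.reset_index(drop=True)
out['Date'] = out['Date'].dt.strftime('%m/%d/%Y')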

Related

Iterating a suffix using apply in pandas

I'm trying to use the apply function in pandas with a lambda to create a new column.
I have the TIME column and I want the ACT_NUM column as shown in the following.
df
TIME ACT_NUM
3 act_1
3 act_1
4 act_1
12 act_2
3 act_2
15 act_3
The logic of the wanted column: when a value in the TIME column is > 10, the suffix number should be incremented by 1 permanently (for that row and all following rows).
I have tried this, but it does not work because I couldn't carry the variable n from row to row when using apply:
n = 1
df['ACT_NUM'] = df['TIME'].apply(lambda x: 'act_'+(n+1) if x>10 else 'act_'+n)
A cumulative sum of the mask df["TIME"] > 10 seems to be what you are looking for: the cumsum starts from 0 and increases by 1 whenever a row with df["TIME"] > 10 is encountered.
Code
df["ACT_NUM"] = (df["TIME"] > 10).cumsum().apply(lambda el: f"act_{el + 1}")
Result
print(df)
TIME ACT_NUM
0 3 act_1
1 3 act_1
2 4 act_1
3 12 act_2
4 3 act_2
5 15 act_3
Just use str(n+1) and str(n) to convert the numbers to strings:
df['ACT_NUM'] = df['TIME'].apply(lambda x: 'act_'+str(n+1) if x>10 else 'act_'+str(n))
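As a side note, the apply can be avoided entirely by building the strings vectorially; a small sketch, assuming the same df:
# string prefix concatenated with the cumulative count of TIME > 10 rows
df["ACT_NUM"] = "act_" + ((df["TIME"] > 10).cumsum() + 1).astype(str)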

Identify customer segments based on transactions that they have made in specific period using Python

For customer segmentation purposes, I want to analyse how many transactions each customer made in the prior 10 days and 20 days, based on the given table of transaction records with dates.
In this table, the last 2 columns were added using the following code.
I'm not satisfied with this code; please suggest improvements.
import pandas as pd
df4 = pd.read_excel(path)

# Since there are two customers, A and B, two separate dataframes are created
df4A = df4[df4['Customer_ID'] == 'A']
df4B = df4[df4['Customer_ID'] == 'B']

from datetime import date
from dateutil.relativedelta import relativedelta

txn_prior_10days = []
for i in range(len(df4)):
    current_date = df4.iloc[i,2]
    prior_10days_date = current_date - relativedelta(days=10)
    if df4.iloc[i,1] == 'A':
        No_of_txn = ((df4A['Transaction_Date'] >= prior_10days_date) & (df4A['Transaction_Date'] < current_date)).sum()
        txn_prior_10days.append(No_of_txn)
    elif df4.iloc[i,1] == 'B':
        No_of_txn = ((df4B['Transaction_Date'] >= prior_10days_date) & (df4B['Transaction_Date'] < current_date)).sum()
        txn_prior_10days.append(No_of_txn)

txn_prior_20days = []
for i in range(len(df4)):
    current_date = df4.iloc[i,2]
    prior_20days_date = current_date - relativedelta(days=20)
    if df4.iloc[i,1] == 'A':
        no_of_txn = ((df4A['Transaction_Date'] >= prior_20days_date) & (df4A['Transaction_Date'] < current_date)).sum()
        txn_prior_20days.append(no_of_txn)
    elif df4.iloc[i,1] == 'B':
        no_of_txn = ((df4B['Transaction_Date'] >= prior_20days_date) & (df4B['Transaction_Date'] < current_date)).sum()
        txn_prior_20days.append(no_of_txn)

df4['txn_prior_10days'] = txn_prior_10days
df4['txn_prior_20days'] = txn_prior_20days
df4
Your code would be very difficult to extend if you had,
e.g., 10 different Customer_IDs.
Fortunately, there is a much shorter solution.
When you read your file, convert Transaction_Date to datetime,
e.g. by passing parse_dates=['Transaction_Date'] to read_excel.
Define a function counting how many dates in a group (gr) fall
between tDtl (a Timedelta) before the current date (dd) and
1 day before it:
def cntPrevTr(dd, gr, tDtl):
    return gr.between(dd - tDtl, dd - pd.Timedelta(1, 'D')).sum()
It will be applied twice to each group by Customer_ID
(actually to its Transaction_Date column only),
once with tDtl == 10 days and a second time with tDtl == 20 days.
Define a function computing both columns with the numbers of previous
transactions for the current group of transaction dates:
def priorTx(td):
    return pd.DataFrame({
        'tx10' : td.apply(cntPrevTr, args=(td, pd.Timedelta(10, 'D'))),
        'tx20' : td.apply(cntPrevTr, args=(td, pd.Timedelta(20, 'D')))})
Generate the result:
df[['txn_prior_10days', 'txn_prior_20days']] = df.groupby('Customer_ID')\
    .Transaction_Date.apply(priorTx)
The code above:
groups df by Customer_ID,
takes only the Transaction_Date column from each group,
applies the priorTx function to it,
and saves the result in the 2 target columns.
The result (with Transaction_ID slightly shortened) is:
Transaction_ID Customer_ID Transaction_Date txn_prior_10days txn_prior_20days
0 912410 A 2019-01-01 0 0
1 912341 A 2019-01-03 1 1
2 312415 A 2019-01-09 2 2
3 432513 A 2019-01-12 2 3
4 357912 A 2019-01-19 2 4
5 912411 B 2019-01-06 0 0
6 912342 B 2019-01-11 1 1
7 312416 B 2019-01-13 2 2
8 432514 B 2019-01-20 2 3
9 357913 B 2019-01-21 3 4
A plain rolling computation does not fit well here, because:
rolling calculations by default include the current row, whereas
you want to exclude it,
and the window has to be anchored to each row's own transaction date,
per customer, with an exact lower bound of 10 (or 20) days back.
This is why I came up with the above solution (just 8 lines of code).
Details of how my solution works
To see all the details, create the test DataFrame the following way (the data below is fixed-width formatted so that read_fwf can parse it):
import io
import pandas as pd

txt = '''
Transaction_ID Customer_ID Transaction_Date
912410         A           2019-01-01
912341         A           2019-01-03
312415         A           2019-01-09
432513         A           2019-01-12
357912         A           2019-01-19
912411         B           2019-01-06
912342         B           2019-01-11
312416         B           2019-01-13
432514         B           2019-01-20
357913         B           2019-01-21'''

df = pd.read_fwf(io.StringIO(txt), skiprows=1,
                 widths=[15, 12, 16], parse_dates=[2])
Perform the groupby, but for now retrieve only the group with key 'A':
gr = df.groupby('Customer_ID')
grp = gr.get_group('A')
It contains:
Transaction_ID Customer_ID Transaction_Date
0 912410 A 2019-01-01
1 912341 A 2019-01-03
2 312415 A 2019-01-09
3 432513 A 2019-01-12
4 357912 A 2019-01-19
Let's start with the most detailed issue: how cntPrevTr works.
Retrieve one of the dates from grp:
dd = grp.iloc[2,2]
It contains Timestamp('2019-01-09 00:00:00').
To test an example invocation of cntPrevTr for this date, run:
cntPrevTr(dd, grp.Transaction_Date, pd.Timedelta(10, 'D'))
i.e. you want to check how many prior transactions this customer performed
before this date, but not earlier than 10 days back.
The result is 2.
To see how the whole first column is computed, run:
td = grp.Transaction_Date
td.apply(cntPrevTr, args=(td, pd.Timedelta(10, 'D')))
The result is:
0 0
1 1
2 2
3 2
4 2
Name: Transaction_Date, dtype: int64
The left column is the index and the right one holds the values returned
by cntPrevTr for each date.
And the last thing is to show how the result for the whole group
is generated. Run:
priorTx(grp.Transaction_Date)
The result (a DataFrame) is:
tx10 tx20
0 0 0
1 1 1
2 2 2
3 2 3
4 2 4
The same procedure takes place for all other groups, then
all partial results are concatenated (vertically) and the last
step is to save both columns of the whole DataFrame in
respective columns of df.

Count number of registers in interval & location

Recently I asked how one could count the number of registers by interval, as answered in Count number of registers in interval.
The solution works great, but I had to adapt it to also take into account a location key.
I did that with the following code:
import numpy as np
import pandas as pd

def time_features(df, time_key, T, location_key, output_key):
    """
    Create features based on time such as: how many BDs are open in the same GRA at this moment (hour)?
    """
    from datetime import date
    assert np.issubdtype(df[time_key], np.datetime64)
    output = pd.DataFrame()
    grouped = df.groupby(location_key)
    for name, group in grouped:
        # initialize times: registers open as 1, close as -1
        start_times = group.copy()
        start_times[time_key] = group[time_key] - pd.Timedelta(hours=T)
        start_times[output_key] = 1
        aux = group.copy()
        all_times = start_times.copy()
        aux[output_key] = -1
        all_times = all_times.append(aux, ignore_index=True)
        # sort by time and perform a cumulative sum to get opened registers
        # (subtract 1 since you don't want to include the current time as opened)
        all_times = all_times.sort_values(by=time_key)
        all_times[output_key] = all_times[output_key].cumsum() - 1
        # revert the index back to original order, and truncate closed times
        all_times = all_times.sort_index().iloc[:len(all_times)//2]
        output = output.append(all_times, ignore_index=True)
    return output
Output:
time loc1 loc2
0 2013-01-01 12:56:00 1 "a"
1 2013-01-01 12:00:12 1 "b"
2 2013-01-01 10:34:28 2 "c"
3 2013-01-01 09:34:54 2 "c"
4 2013-01-01 08:34:55 3 "d"
5 2013-01-01 08:34:55 5 "d"
6 2013-01-01 16:35:19 4 "e"
7 2013-01-01 16:35:30 4 "e"
time_features(df, time_key='time', T=2, location_key='loc1', output_key='count')
This works great for small data, but for longer data (I am using it with a file of 1 million rows) it takes "forever" to run. I wonder if I could optimize this computation somehow.
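As a hedged sketch of one possible speed-up (assuming the intent is, for each row, to count how many other registers at the same location fall within the previous T hours, excluding the row itself), a searchsorted-based variant per location avoids the repeated DataFrame appends; the names time_features_fast and count_recent are illustrative:
import numpy as np
import pandas as pd

def time_features_fast(df, time_key, T, location_key, output_key):
    # hypothetical vectorized variant, not the original answer
    delta = pd.Timedelta(hours=T)

    def count_recent(g):
        vals = np.sort(g[time_key].values)          # sorted times in this location
        lo = np.searchsorted(vals, (g[time_key] - delta).values, side='left')
        hi = np.searchsorted(vals, g[time_key].values, side='left')
        # rows in [t - delta, t), i.e. strictly before each row's own time
        return pd.Series(hi - lo, index=g.index)

    out = df.copy()
    out[output_key] = df.groupby(location_key, group_keys=False).apply(count_recent)
    return out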

Pandas Week of Year from July 1

So I have pandas dataframe with a 'date' column. Our calendar is based off of July 1st being the first day. I know I can do df['date'].dt.week, but that gives me the week from Jan 1. Is there a way to take my df and make a new column 'week' where 'week' is 0 for the first days in July until Sunday and then 1... etc.? Basically the same way that dt.week works... just shifted to Jul 1. I know that resample allows me to shift this way, I just can't seem to figure out how to get it all correct as a column.
Thanks
Update: Currently doing this... not exactly working.
def get_academic_year(x):
    if (x.month < 7):
        year = x.year - 1
    else:
        year = x.year
    return year

def get_week(x):
    return ((x['date'].week -
             pd.to_datetime(pd.datetime(x['academic_year'], 7, 1)).week) % 52)

df_x['academic_year'] = df_x['date'].apply(lambda x: get_academic_year(x))
df_x['week'] = df_x.apply(lambda x: get_week(x), axis=1)
My Dataset:
'{"date":{"0":1414368000000,"1":1414454400000,"2":1414540800000,"3":1414627200000,"4":1414713600000,"5":1414800000000,"6":1414886400000,"7":1425254400000,"8":1425340800000,"9":1425427200000,"10":1425513600000,"11":1425600000000,"12":1425686400000,"13":1425772800000,"14":1433116800000,"15":1433203200000,"16":1433289600000,"17":1433376000000,"18":1433462400000,"19":1433548800000,"20":1433635200000,"21":1444262400000,"22":1444348800000,"23":1444608000000,"24":1444694400000,"25":1444780800000,"26":1444867200000,"27":1444953600000,"28":1445040000000,"29":1445126400000,"30":1452643200000,"31":1452729600000,"32":1452816000000,"33":1452902400000,"34":1452988800000,"35":1460505600000,"36":1460937600000,"37":1461024000000,"38":1461110400000,"39":1461196800000,"40":1461283200000,"41":1461369600000,"42":1461456000000,"43":1465776000000,"44":1465862400000,"45":1465948800000,"46":1466035200000,"47":1466121600000,"48":1470873600000,"49":1470960000000,"50":1471219200000,"51":1471305600000,"52":1471392000000,"53":1486598400000,"54":1489968000000,"55":1490054400000,"56":1490140800000,"57":1490227200000,"58":1490313600000,"59":1492387200000,"60":1492473600000,"61":1492560000000,"62":1492646400000,"63":1492732800000,"64":1494201600000,"65":1494288000000,"66":1494374400000,"67":1494460800000,"68":1494547200000,"69":1502668800000,"70":1502755200000,"71":1502841600000,"72":1502928000000,"73":1503014400000,"74":1503100800000,"75":1503187200000,"76":1505174400000,"77":1505433600000,"78":1507507200000,"79":1507593600000,"80":1507680000000,"81":1507766400000,"82":1507852800000,"83":1507939200000,"84":1508025600000,"85":1508976000000,"86":1509062400000,"87":1509148800000,"88":1509235200000,"89":1509321600000,"90":1509408000000,"91":1512086400000,"92":1524268800000,"93":1524355200000,"94":1529884800000,"95":1529971200000,"96":1530057600000,"97":1530144000000,"98":1530230400000}}'
Update #2:
import pandas as pd

def get_academic_year(x):
    if (x.month < 7):
        year = x.year - 1
    else:
        year = x.year
    return year

def get_week(x):
    return int(((x['date'] - pd.to_datetime(pd.datetime(x['academic_year'], 7, 1)))).days / 7) + 1

rng = pd.date_range('7/1/2015', periods=365*3, freq='D')
df_x = pd.DataFrame()
df_x['date'] = rng
df_x['academic_year'] = df_x['date'].apply(lambda x: get_academic_year(x))
df_x['week'] = df_x.apply(lambda x: get_week(x), axis=1)
df_x
This might work for you.
df = pd.DataFrame({'A': ['2017-07-05', '2017-07-21', '2017-07-22',
                         '2017-08-01', '2017-08-15', '2017-08-30']})
df['A'] = pd.to_datetime(df['A'])
df['Week'] = df['A'].dt.week - pd.to_datetime('2017-07-01').week
# A Week
# 0 2017-07-05 1
# 1 2017-07-21 3
# 2 2017-07-22 3
# 3 2017-08-01 5
# 4 2017-08-15 7
# 5 2017-08-30 9
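Note that Series.dt.week has since been deprecated in favour of Series.dt.isocalendar().week; a sketch of the same idea on newer pandas (1.1+), assuming the same df, would be:
# ISO week of each date minus the ISO week of July 1st
df['Week'] = (df['A'].dt.isocalendar().week
              - pd.Timestamp('2017-07-01').isocalendar()[1])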

Python pandas resampling

I have the following dataframe:
Timestamp S_time1 S_time2 End_Time_1 End_time_2 Sign_1 Sign_2
0 2413044 0 0 0 0 x x
1 2422476 0 0 0 0 x x
2 2431908 0 0 0 0 x x
3 2441341 0 0 0 0 x x
4 2541232 2526631 2528631 2520631 2530631 10 80
5 2560273 2544946 2546496 2546496 2548496 40 80
6 2577224 2564010 2566010 2566010 2568010 null null
7 2592905 2580959 2582959 2582959 2584959 null null
The table goes on like that. The first column is a timestamp in milliseconds. S_time1 and End_Time_1 give the duration in which a particular sign (number) appears. For example, in the 5th row S_time1 is 2526631, End_Time_1 is 2520631, and the corresponding Sign_1 is 10, which means that between those two timestamps the sign 10 is displayed. The same goes for S_time2 and End_time_2: the corresponding values in Sign_2 appear in the duration from S_time2 to End_time_2.
I want to resample the index column (Timestamp) into 100-millisecond bins and check which bins each sign belongs to. For instance, between each start time and end time there is a difference of 2000 milliseconds, so the corresponding sign number will appear repeatedly in around 20 consecutive bins, because each bin is 100 milliseconds. In the end I need only two columns: one with the bin times and one with the signs, like the following table (the bin times are just made up as an example):
Bin_time signs
...100 0
...200 0
...300 10
...400 10
...500 10
...600 10
The sign 10 will be shown for the duration of the corresponding S_time1 to End_Time_1, then the next sign, which is 80, continues for the duration of S_time2 to End_time_2. I am not sure if this can be done in pandas or not, but I really need help, either in pandas or with other methods.
Thanks for your help and suggestions in advance.
Input:
print df
Timestamp S_time1 S_time2 End_Time_1 End_time_2 Sign_1 Sign_2
0 2413044 0 0 0 0 x x
1 2422476 0 0 0 0 x x
2 2431908 0 0 0 0 x x
3 2441341 0 0 0 0 x x
4 2541232 2526631 2528631 2520631 2530631 10 80
5 2560273 2544946 2546496 2546496 2548496 40 80
6 2577224 2564010 2566010 2566010 2568010 null null
7 2592905 2580959 2582959 2582959 2584959 null null
2 approaches:
In [231]: %timeit s(df)
1 loops, best of 3: 2.78 s per loop
In [232]: %timeit m(df)
1 loops, best of 3: 690 ms per loop
def m(df):
    # resample column Timestamp by 100ms, convert back to integers
    df['Timestamp'] = df['Timestamp'].astype('timedelta64[ms]')
    df['i'] = 1
    df = df.set_index('Timestamp')
    df1 = df[[]].resample('100ms', how='first').reset_index()
    df1['Timestamp'] = (df1['Timestamp'] / np.timedelta64(1, 'ms')).astype(int)
    # helper column i for merging
    df1['i'] = 1
    #print df1
    out = df1.merge(df, on='i', how='left')
    out1 = out[['Timestamp', 'Sign_1']][(out.Timestamp >= out.S_time1) & (out.Timestamp <= out.End_Time_1)]
    out2 = out[['Timestamp', 'Sign_2']][(out.Timestamp >= out.S_time2) & (out.Timestamp <= out.End_time_2)]
    out1 = out1.rename(columns={'Sign_1':'Bin_time'})
    out2 = out2.rename(columns={'Sign_2':'Bin_time'})
    df = pd.concat([out1, out2], ignore_index=True).drop_duplicates(subset='Timestamp')
    df1 = df1.set_index('Timestamp')
    df = df.set_index('Timestamp')
    df = df.reindex(df1.index).reset_index()
    #print df.head(10)
    return df
def s(df):
    # resample column Timestamp by 100ms, convert back to integers
    df['Timestamp'] = df['Timestamp'].astype('timedelta64[ms]')
    df = df.set_index('Timestamp')
    out = df[[]].resample('100ms', how='first')
    out = out.reset_index()
    out['Timestamp'] = (out['Timestamp'] / np.timedelta64(1, 'ms')).astype(int)
    #print out.head(10)

    # search start/end intervals for each resampled timestamp
    def search(x):
        mask1 = (df.S_time1 <= x['Timestamp']) & (df.End_Time_1 >= x['Timestamp'])
        # if at least one True, return first value of series
        if mask1.any():
            return df.loc[mask1].Sign_1.iloc[0]
        # check second start and end time
        else:
            mask2 = (df.S_time2 <= x['Timestamp']) & (df.End_time_2 >= x['Timestamp'])
            if mask2.any():
                # if at least one True, return first value
                return df.loc[mask2].Sign_2.iloc[0]
            else:
                # if all False, return NaN
                return np.nan

    out['Bin_time'] = out.apply(search, axis=1)
    #print out.head(10)
    return out
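A quick usage sketch, assuming the two functions return their results as above; note also that on pandas 0.25+ the how= keyword was removed from resample, so resample('100ms', how='first') would need to become resample('100ms').first():
result = m(df.copy())   # or s(df.copy()) for the apply-based variant
print(result[['Timestamp', 'Bin_time']].head(10))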
