I am trying to loop over a dataframe and apply a custom function; however, my date column either gets corrupted or ends up with brackets around each element.
Does anyone know what I am doing wrong?
import datetime
import random
import string

import numpy as np
import pandas as pd
# This is the custom function I use
def summarise_dummy(x):
    """Summarise one group by the rows holding its largest ``groups_2`` value.

    Parameters
    ----------
    x : pandas.DataFrame
        One group; the ``groups_2``, ``date`` and ``y`` columns are read.

    Returns
    -------
    pandas.Series
        Two entries labelled ``date`` and ``y``. Each entry is the whole
        (possibly multi-row) selection taken from ``x``, not a scalar.
    """
    # Boolean mask of the rows whose groups_2 equals the group's maximum.
    date_index = x['groups_2'] == max(x['groups_2'])
    d = {}
    # NOTE(review): both values below are selections, not scalars; presumably
    # this nesting is what shows up as brackets once the result is serialised.
    d['date'] = x['date'][date_index]  # do something with date
    d['y'] = x['y'][date_index]  # do something with y
    return pd.Series(d, index=['date', 'y'])  # return a series
# Generate some dummy data
todays_date = datetime.datetime.now().date()
date = pd.date_range(todays_date - datetime.timedelta(10), periods=10, freq='D')
columns = ['y']
data = [random.randint(0, 10) for i in range(0, 10)]
df = pd.DataFrame(data, columns=columns)
df['date'] = date
# string.letters only exists in Python 2; ascii_letters is the Python 3 name.
random.choice(string.ascii_letters)
df['date'] = pd.to_datetime(df['date'])
df['groups_1'] = list(np.random.choice(list(string.ascii_lowercase[0:5]), 10))
df['groups_2'] = list(np.random.choice(list(string.ascii_lowercase[0:2]), 10))
# ***
#df.loc[:,'date'] = df.loc[:,'date'].dt.strftime('%Y-%m-%d')
# Apply the function for each group_1
grouped = df.groupby(['groups_1'])
summarised = grouped.apply(summarise_dummy)
# Upon inspecting the date column, the values are all NaT. However, if you uncomment *** (above)
# and re-run, dates are returned?
summarised['date']
# But when I finally run with *** uncommented and convert my output to JSON, date has []'s in its series
summarised_json = summarised.to_json(orient='records')
What final output are you looking to get?
Does it work if you change pd.Series to pd.DataFrame within def summarise_dummy(x), setting the date and y along the columns.
import numpy as np
import string
import random
import pandas as pd
import datetime
# This is the custom function I use
def summarise_dummy(x):
    """Return the ``date`` and ``y`` of the rows with the largest ``groups_2``.

    Parameters
    ----------
    x : pandas.DataFrame
        One group; the ``groups_2``, ``date`` and ``y`` columns are read.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` and ``y``, one row per selected row of ``x``, keeping
        the original row labels.
    """
    # Boolean mask of the rows whose groups_2 equals the group's maximum.
    date_index = x['groups_2'] == max(x['groups_2'])
    d = {}
    d['date'] = x['date'][date_index]  # do something with date
    d['y'] = x['y'][date_index]  # do something with y
    # A DataFrame keeps every selected row as its own record.
    return pd.DataFrame(d, columns=['date', 'y'])
# Generate some dummy data: ten consecutive days ending yesterday.
date = pd.date_range(datetime.datetime.now().date() - datetime.timedelta(10), periods=10, freq='D')
print(date)
columns = ['y']
data = [random.randint(0, 10) for _ in range(10)]
df = pd.DataFrame(data, columns=columns)
df['date'] = date
df['groups_1'] = list(np.random.choice(list(string.ascii_lowercase[0:5]), 10))
df['groups_2'] = list(np.random.choice(list(string.ascii_lowercase[0:2]), 10))
# Store the dates as 'YYYY-MM-DD' strings before grouping.
df['date'] = df['date'].dt.strftime('%Y-%m-%d')
print(df)
# Apply the function for each group_1
grouped = df.groupby(['groups_1'])
summarised = grouped.apply(summarise_dummy)
print(summarised)
# Serialise one JSON object per summarised row.
summarised_json = summarised.to_json(orient='records')
print(summarised_json)
After apply:
date y
groups_1
a 9 2018-08-21 0
b 6 2018-08-18 7
c 4 2018-08-16 0
7 2018-08-19 5
8 2018-08-20 1
d 1 2018-08-13 6
3 2018-08-15 8
e 5 2018-08-17 1
After to_json:
[{"date":"2018-08-21","y":0},{"date":"2018-08-18","y":7},{"date":"2018-08-16","y":0},{"date":"2018-08-19","y":5},{"date":"2018-08-20","y":1},{"date":"2018-08-13","y":6},{"date":"2018-08-15","y":8},{"date":"2018-08-17","y":1}]
Additionally, you can configure the json format with orient.
Related
I need to add seconds in YYYY-MM-DD-HH-MM-SS. My code works perfectly for one data point but not for the whole set. The data.txt consists of 7 columns and around 200 rows.
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
# Read the tab-separated file without a header row, so columns are labelled 0..6.
df = pd.read_csv('data.txt',sep='\t',header=None)
# Copy each of the seven columns into its own NumPy array.
a = np.array(list(df[0]))
b = np.array(list(df[1]))
c = np.array(list(df[2]))
d = np.array(list(df[3]))
e = np.array(list(df[4]))
f = np.array(list(df[5]))
g = np.array(list(df[6]))
# NOTE(review): a..f are whole-column arrays here, while datetime() takes one
# value per field -- presumably why this only works for a single data point.
t1=datetime(year=a, month=b, day=c, hour=d, minute=e, second=f)
# Add the seventh column as a number of seconds.
t = t1 + timedelta(seconds=g)
print(t)
You can pass the names parameter to read_csv to set the column names in the first step, then convert the first 6 columns to datetimes with to_datetime and add the seconds, converted to timedeltas with to_timedelta:
# Name the seven tab-separated columns while reading; "new" holds the seconds to add.
names = ["year", "month", "day", "hour", "minute", "second", "new"]
df = pd.read_csv('data.txt', sep='\t', names=names)
# Assemble timestamps from the six date/time columns only: to_datetime rejects
# columns it does not recognise, such as "new". Then add the offset in seconds.
df['out'] = pd.to_datetime(df[names[:-1]]) + pd.to_timedelta(df["new"], unit='s')
use apply with axis=1 to apply a function to every row of the dataframe.
def _row_to_timestamp(x):
    """Build a datetime from fields 0-5 of one row and add field 6 as seconds."""
    base = datetime(year=x[0], month=x[1], day=x[2],
                    hour=x[3], minute=x[4], second=x[5])
    return base + timedelta(seconds=int(x[6]))


# axis=1 hands the function one row at a time.
df.apply(_row_to_timestamp, axis=1)
generating dataset
simple to do as pandas series
s = 20  # number of rows to generate
# randint's upper bound is exclusive, so each bound below is one past the
# largest value that should be able to occur.
df = pd.DataFrame(
    np.array([
        np.random.randint(2015, 2020, s),  # year 2015..2019
        np.random.randint(1, 13, s),       # month 1..12
        np.random.randint(1, 29, s),       # day 1..28, valid in every month
        np.random.randint(0, 24, s),       # hour 0..23
        np.random.randint(0, 60, s),       # minute 0..59
        np.random.randint(0, 60, s),       # second 0..59
        np.random.randint(0, 200, s),      # seconds to add
    ]).T,
    columns=["year", "month", "day", "hour", "minute", "second", "add"],
)
# Element-wise variant: build one Timedelta per row with apply().
pd.to_datetime(df.loc[:, ["year", "month", "day", "hour", "minute", "second"]]) + df["add"].apply(lambda sec: pd.Timedelta(seconds=sec))
without using apply()
# Same sum as the apply() variant, but the whole "add" column is converted to
# timedeltas in a single to_timedelta call.
pd.to_datetime(df.loc[:,["year","month","day","hour","minute","second"]]) + pd.to_timedelta(df["add"], unit="s")
I have DataFrame like below:
df = pd.DataFrame({"data": [
    "25.01.2020",
    # ... and many more other dates
]})
# The strings are dot-separated day.month.year, so the format needs the dots.
df["data"] = pd.to_datetime(df["data"], format="%d.%m.%Y")
And I have a series of special dates like below:
# dayfirst=True reads "16.01.2020" as 16 January rather than month 16.
special_date = pd.Series(pd.to_datetime([
    "16.01.2020",
    "27.01.2020",
    # ... and many more other dates
], dayfirst=True))
And I need to calculate 2 more columns in this DataFrame:
col1 = number of weeks to the next special date
col2 = number of weeks after last special date
So I need results like below:
col1 = 1 because next special date after 25.01 is 27.01 so it is the same week
col2 = 2 because last special date before 25.01 is 16.01 so it is 2 weeks ago
*please be aware that I have many more dates, so code needs to work for more dates than only 2 special dates or only 1 data from df.
You can use broadcasting to create a matrix of time deltas and then calculate the minima for your new columns
import numpy as np, pandas as pd
# Dates to evaluate and the special dates they are measured against.
df = pd.DataFrame(
    {'data': pd.to_datetime(["01.01.2020", "25.01.2020", "20.02.2020"], dayfirst=True)}
)
s = pd.Series(
    pd.to_datetime(["16.01.2020", "27.01.2020", "08.02.2020", "19.02.2020"], dayfirst=True)
)

# One row per special date, one column per df date: entry = special - date, in days.
special_column = s.to_numpy()[:, None]
gap = special_column - df['data'].to_numpy()
delta = gap.astype('timedelta64[D]') / np.timedelta64(1, 'D')

# Smallest positive gap = days until the next special date; infinity if none follows.
n = np.min(delta, axis=0, where=delta > 0, initial=np.inf)
# Smallest non-positive gap, sign flipped = days since the previous special date.
p = np.min(-delta, axis=0, where=delta <= 0, initial=np.inf)

df['next'] = np.ceil(n / 7) #consider np.floor
df['prev'] = np.ceil(p / 7)
Alternatively to using the where argument you could perform the steps by hand:
# Next special date: blank out the non-positive gaps, then take the column minimum.
n = np.where(delta <= 0, np.inf, delta)
n = np.abs(n.min(axis=0))
# Previous special date: blank out the positive gaps, flip the sign, take the minimum.
p = np.where(delta > 0, -np.inf, delta)
p = np.abs((-p).min(axis=0))
I would like to use a function that produces multiple outputs to create multiple new columns in an existing pandas dataframe.
For example, say I have this test function which outputs 2 things:
def testfunc (TranspoId, LogId):
thing1 = TranspoId + LogId
thing2 = LogId - TranspoId
return thing1, thing2
I can give those returned outputs to 2 different variables like so:
# Unpack the two returned values into separate names.
Thing1,Thing2 = testfunc(4,28)
print(Thing1)
print(Thing2)
I tried to do this with a dataframe in the following way:
data = {'Name':['Picard','Data','Guinan'],'TranspoId':[1,2,3],'LogId':[12,14,23]}
df = pd.DataFrame(data, columns = ['Name','TranspoId','LogId'])
print(df)
# NOTE(review): df['thing1','thing2'] indexes with one tuple key, not a list of
# two column names, so this presumably stores each returned pair in a single
# column instead of creating two columns -- confirm against the printed output.
df['thing1','thing2'] = df.apply(lambda row: testfunc(row.TranspoId, row.LogId), axis=1)
print(df)
What I want is something that looks like this:
# Hand-built frame showing the desired result: the inputs plus one column per
# value returned by testfunc.
data = {'Name':['Picard','Data','Guinan'],'TranspoId':[1,2,3],'LogId':[12,14,23], 'Thing1':[13,16,26], 'Thing2':[11,12,20]}
df = pd.DataFrame(data, columns=['Name','TranspoId','LogId','Thing1','Thing2'])
print(df)
In the real world that function is doing a lot of heavy lifting, and I can't afford to run it twice, once for each new variable being added to the df.
I've been hitting myself in the head with this for a few hours. Any insights would be greatly appreciated.
I believe the best way is to change the order and make a function that works with Series.
import pandas as pd
# Create function that deals with series
def testfunc (Series1, Series2):
Thing1 = Series1 + Series2
Thing2 = Series1 - Series2
return Thing1, Thing2
# Create df
data = {
    'Name': ['Picard', 'Data', 'Guinan'],
    'TranspoId': [1, 2, 3],
    'LogId': [12, 14, 23],
}
df = pd.DataFrame(data, columns=['Name', 'TranspoId', 'LogId'])
# Apply function once to whole columns and show both results
Thing1, Thing2 = testfunc(df['TranspoId'], df['LogId'])
for computed in (Thing1, Thing2):
    print(computed)
# Assign both new columns in a single call
df = df.assign(Thing1=Thing1, Thing2=Thing2)
# print df
print(df)
Your function should return a series that calculates the new columns in one pass. Then you can use pandas.apply() to add the new fields.
import pandas as pd
df = pd.DataFrame( {'TranspoId':[1,2,3], 'LogId':[4,5,6]})
def testfunc(row):
new_cols = pd.Series([
row['TranspoId'] + row['LogId'],
row['LogId'] - row['TranspoId']])
return new_cols
df[['thing1','thing2']] = df.apply(testfunc, axis = 1)
print(df)
Output:
TranspoId LogId thing1 thing2
0 1 4 5 3
1 2 5 7 3
2 3 6 9 3
I have a million-row time-series dataframe, in which some of the values in the Date column have muddled day/month values.
How do I efficiently unmuddle them without also ruining those that are correct?
# this creates a dataframe with muddled dates
import pandas as pd
import numpy as np
from pandas import Timestamp
start = Timestamp(2013, 1, 1)
# 942 consecutive days, newest first.
dates = pd.date_range(start, periods=942)[::-1]
# Map every true date to its muddled form: whenever the day could pass for a
# month number (day < 13), day and month trade places.
muddler = {
    d: Timestamp(d.year, d.day, d.month) if d.day < 13 else Timestamp(d.year, d.month, d.day)
    for d in dates
}
df = pd.DataFrame()
df['Date'] = dates
df['Date'] = df['Date'].map(muddler)
# now what? (assuming I don't know how the dates are muddled)
An option might be to calculate a fit for the timestamps and modify those that deviate from the fit greater than a certain threshold. Example:
import pandas as pd
import numpy as np
start = pd.Timestamp(2013, 1, 1)
# 942 consecutive days, newest first.
dates = pd.date_range(start, periods=942)[::-1]
# Map every true date to its muddled form: day and month trade places
# whenever the day could pass for a month number (day < 13).
muddler = {}
for d in dates:
    if d.day < 13:
        muddler[d] = pd.Timestamp(d.year, d.day, d.month)
    else:
        muddler[d] = pd.Timestamp(d.year, d.month, d.day)
df = pd.DataFrame()
df['Date'] = dates
df['Date'] = df['Date'].map(muddler)
# convert date col to posix timestamp (float seconds since the Unix epoch).
# np.float no longer exists in current NumPy; dividing timedeltas also avoids
# depending on the resolution the column is stored in.
epoch = pd.Timestamp(0)
one_second = pd.Timedelta(seconds=1)
df['ts'] = (df['Date'] - epoch) / one_second
# calculate a linear fit for ts col
x = np.linspace(df['ts'].iloc[0], df['ts'].iloc[-1], df['ts'].size)
df['ts_linfit'] = np.polyval(np.polyfit(x, df['ts'], 1), x)
# set a thresh and derive a mask that masks differences between
# fit and timestamp greater than thresh:
thresh = 1.2e6 # you might want to tweak this...
m = np.absolute(df['ts'] - df['ts_linfit']) > thresh
# A day above 12 cannot be the result of a day/month swap, and swapping it
# back would raise because there is no such month, so never touch those rows.
m &= df['Date'].dt.day <= 12
# create new date col as copy of original
df['Date_filtered'] = df['Date']
# modify values that were caught in the mask
df.loc[m, 'Date_filtered'] = df.loc[m, 'Date'].apply(
    lambda ts: pd.Timestamp(ts.year, ts.day, ts.month)
)
# also to posix timestamp
df['ts_filtered'] = (df['Date_filtered'] - epoch) / one_second
# Plot the original and the filtered timestamp columns, then add a legend so
# the two curves can be told apart.
ax = df['ts'].plot(label='original')
ax = df['ts_filtered'].plot(label='filtered')
ax.legend()
While attempting to create a minimal reproducible example, I have actually solved my problem -- but I expect there is a more efficient and effective way to do what I'm trying to do...
# i first define a function to examine the dates
def disordered_muddle(date_series, future_first=True):
    """Check whether a series of dates is disordered or just muddled.

    The distinct dates are walked in order of first appearance. Each one is
    expected to fall one day-of-year before its predecessor, or in the
    preceding calendar year. A date that does neither is re-read with day and
    month swapped: if that fits, it is reported as muddled and the swapped
    value becomes the reference for the next step; otherwise it is reported
    as disordered.

    Parameters
    ----------
    date_series : pandas.Series
        Timestamps, expected newest first.
    future_first : bool, optional
        Not used by the implementation; kept so existing calls keep working.

    Returns
    -------
    bool or tuple of list
        ``False`` when nothing suspicious was found (including empty input),
        otherwise ``(disordered, muddle)`` holding the offending timestamps
        as they appear in the input.
    """
    disordered = []
    muddle = []
    different_dates = pd.Series(date_series.unique())
    if different_dates.empty:
        # Nothing to compare; avoids failing on the first-element lookup below.
        return False
    date = different_dates[0]
    for d in different_dates[1:]:
        # we expect the date's dayofyear to decrease by one,
        # unless the year is changing
        follows_on = d.dayofyear == date.dayofyear - 1
        year_changes = d.year == date.year - 1
        if not follows_on and not year_changes:
            try:
                # we check if the day and month are muddled;
                # if d.day > 12 there is no such month and this raises
                unmuddle = Timestamp(d.year, d.day, d.month)
            except ValueError:
                disordered.append(d)
            else:
                if (unmuddle.dayofyear == date.dayofyear - 1
                        or unmuddle.year == date.year - 1):
                    muddle.append(d)
                    # Continue the walk from the corrected value.
                    d = unmuddle
                else:
                    disordered.append(d)
        date = d
    if not disordered and not muddle:
        return False
    return disordered, muddle
# NOTE: this unpacking only works when problems were found; the function
# returns False instead of a pair when the dates are already in order.
disorder, muddle = disordered_muddle(df['Date'])
# finally unmuddle the dates
# A set makes each membership test constant-time, which matters with a
# million rows.
muddled_dates = set(muddle)
date_correction = {}
for d in df['Date']:
    if d in muddled_dates:
        date_correction[d] = Timestamp(d.year, d.day, d.month)
    else:
        date_correction[d] = Timestamp(d.year, d.month, d.day)
df['CorrectedDate'] = df['Date'].map(date_correction)
# Re-check the corrected column.
disordered_muddle(df['CorrectedDate'])
I have data like this:
import datetime as dt
import pandas as pd
# Two sample rows: a start date and the number of daily periods to generate from it.
df = pd.DataFrame({'date':[dt.datetime(2018,8,25), dt.datetime(2018,7,21)],
'n':[10,7]})
I would like to create a third column which contains a date range created by pd.date_range, using 'date' as the start date and 'n' as the number of periods.
So the first entry should be:
pd.date_range(dt.datetime(2018,8,25), periods=10, freq='d')
(I have a list of "target" dates, and my goal is to check whether the date_range contains any of those target dates).
I tried this:
# NOTE(review): apply() is called without axis=1 here; the KeyError on 'date'
# reported below suggests the lambda is not being handed whole rows.
df['date_range'] = df.apply(lambda x: pd.date_range(x['date'],
x['n'],
freq='d'))
But this gives a KeyError: ('date', 'occurred at index date')
Any idea on how to do this without using a for loop, or is there a better solution altogether?
You can solve your problem without creating date range or day columns. To check if a target date in tgt belongs to a date range specified by rows of df, you can calculate the end of date range, and then check if each date in tgt falls in between the start and end of a time interval. The code below implements this, and produces "target_date" column identical to the one in your own answer:
def flag_target_dates(frame, targets):
    """Flag the rows whose date range contains at least one target.

    A row's range is the half-open interval ``[date, date + n days)``, so the
    start date itself counts as inside the range.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must have a datetime ``date`` column and an integer ``n`` column.
    targets : iterable of datetime-like
        Dates to look for; an empty iterable flags nothing.

    Returns
    -------
    pandas.Series
        1 where a target falls inside the row's range, otherwise 0.
    """
    start = frame["date"]
    end = start + pd.to_timedelta(frame["n"], unit="D")  # exclusive upper bound
    hit = pd.Series(False, index=frame.index)
    for target in targets:
        hit |= (start <= target) & (target < end)
    return hit.astype(int)


df = pd.DataFrame({'date': [dt.datetime(2018, 8, 25), dt.datetime(2018, 7, 21)],
                   'n': [10, 7]})
# Vectorised end-of-range computation; no row-wise apply needed.
df["daterange_end"] = df["date"] + pd.to_timedelta(df["n"], unit="D")
tgt = [dt.datetime(2018, 8, 26)]
df['target_date'] = flag_target_dates(df, tgt)
print(df)
#         date   n daterange_end  target_date
# 0 2018-08-25  10    2018-09-04            1
# 1 2018-07-21   7    2018-07-28            0
You should add axis=1 in apply
# axis=1 hands each row to the lambda. The count must be passed as periods=:
# the second positional argument of date_range is the end date.
df['date_range'] = df.apply(lambda x: pd.date_range(x['date'], periods=x['n'], freq='D'), axis=1)
I came up with a solution that works (but I'm sure there's a nicer way...)
# define target
tgt = [dt.datetime(2018,8,26)]
# find max n
max_n = max(df['n'])
# walk through the day offsets and put a 1 where the shifted date matches the "tgt"
df['target_date'] = 0
for i in range(max_n):
    shifted = df['date'] + dt.timedelta(days=i)
    # Offset i lies inside a row's range only while i < n for that row;
    # without this check, short ranges would be tested against max_n days.
    within_range = df['n'] > i
    df['target_date'] = np.where(within_range & shifted.isin(tgt),
                                 1,
                                 df['target_date'])
# no intermediate date_<i> columns are created, so there is nothing to drop