I have a dataframe and attempt to make the following operation:
data['SD_rates']=np.array([int((data['actual value'][i]-data['means'][i])/data['std'][i]) for i in range (len(data['means']))])
It breaks with the following message:
"Can't convert float Nan to int"
It is an error I understand but tested the df with data.isnull() and no column involved includes NaN (I controlled it manually by sending data.to_csv).
I even filled data['std'] with fillna(-1, inplace=True) but still, it breaks. I don't understand why, since there is no division by 0 (i also controlled that there were no zeros in this column, so no original 0 and Null/Nan filled with -1), and actual values and means are fillna(0) for missing values, and anyway the substraction can't produce a nan (data range in [0-10]).
What could be wrong? (as i said, the data right before triggering the operation is correct...). Thanks
Here is a code snippet:
One of my hypotheses is that in some way, groupby might generate NaN, that I can't get rid off when calculating my means (but I believed that it was ignored by pandas automatically...) and that are not filled with 0 or -1 (I chose -1 for standard deviation deliberately to avoid dividing by 0).
def stats_setting(data):
print('Stats settings')
print(data.columns)
print(data.dtypes)
#sys.exit()
data['marks']=np.log1p(data['marks'].astype(float))
data['students']=np.log1p(data['students'].astype(float))#Rossman9 think this has to be tested
#were filled with fillna before)
#First Part: by studentType and Assortment
types_DoM_select=['Type','Type2','Category']
#First Block:types_DoM students grouped by categories
#wonder if can do a groupby of groupb
print("types_DoM_marks_means")
types_DoM_marks_means = data.groupby(types_DoM_select)['marks'].mean()
types_DoM_marks_means.name = 'types_DoM_marks_means'
types_DoM_marks_means = types_DoM_marks_means.reset_index()
data = pd.merge(data, types_DoM_marks_means, on = types_DoM_select, how='left')
print("types_DoM_students_means")
types_DoM_students_means = data.groupby(types_DoM_select)['students'].mean() #.students won't work. Why?
types_DoM_students_means.name = 'types_DoM_students_means'
types_DoM_students_means=types_DoM_students_means.reset_index()
data = pd.merge(data, types_DoM_students_means, on = types_DoM_select, how='left')
print("types_DoM_marks_medians")
types_DoM_marks_medians = data.groupby(types_DoM_select)['marks'].median()
types_DoM_marks_medians.name = 'types_DoM_marks_medians'
types_DoM_marks_medians = types_DoM_marks_medians.reset_index()
data = pd.merge(data, types_DoM_marks_medians, on = types_DoM_select, how='left')
print("types_DoM_students_medians")
types_DoM_students_medians = data.groupby(types_DoM_select)['students'].median() #.students won't work. Why?
types_DoM_students_medians.name = 'types_DoM_students_medians'
types_DoM_students_medians=types_DoM_students_medians.reset_index()
data = pd.merge(data, types_DoM_students_medians, on = types_DoM_select, how='left')
print("types_DoM_marks_std")
types_DoM_marks_std = data.groupby(types_DoM_select)['marks'].std()
types_DoM_marks_std.name = 'types_DoM_marks_std'
types_DoM_marks_std = types_DoM_marks_std.reset_index()
data = pd.merge(data, types_DoM_marks_std, on = types_DoM_select, how='left')
print("types_DoM_students_std")
types_DoM_students_std = data.groupby(types_DoM_select)['students'].std()
types_DoM_students_std.name = 'types_DoM_students_std'
types_DoM_students_std = types_DoM_students_std.reset_index()
data = pd.merge(data, types_DoM_students_std, on = types_DoM_select, how='left')
data['types_DoM_marks_means'].fillna(-1, inplace=True)
data['types_DoM_students_means'].fillna(-1, inplace=True)
data['types_DoM_marks_medians'].fillna(-1, inplace=True)
data['types_DoM_students_medians'].fillna(-1, inplace=True)
data['types_DoM_marks_std'].fillna(-1, inplace=True)
data['types_DoM_students_std'].fillna(-1, inplace=True)
#Second Part: by specific student
student_DoM_select=['Type','Type2','Category']
#First Block:student_DoM
#wonder if can do a groupby of groupb
print("student_DoM_marks_means")
student_DoM_marks_means = data.groupby(student_DoM_select)['marks'].mean()
student_DoM_marks_means.name = 'student_DoM_marks_means'
student_DoM_marks_means = student_DoM_marks_means.reset_index()
data = pd.merge(data, student_DoM_marks_means, on = student_DoM_select, how='left')
print("student_DoM_students_means")
student_DoM_students_means = data.groupby(student_DoM_select)['students'].mean() #.students won't work. Why?
student_DoM_students_means.name = 'student_DoM_students_means'
student_DoM_students_means=student_DoM_students_means.reset_index()
data = pd.merge(data, student_DoM_students_means, on = student_DoM_select, how='left')
print("student_DoM_marks_medians")
student_DoM_marks_medians = data.groupby(student_DoM_select)['marks'].median()
student_DoM_marks_medians.name = 'student_DoM_marks_medians'
student_DoM_marks_medians = student_DoM_marks_medians.reset_index()
data = pd.merge(data, student_DoM_marks_medians, on = student_DoM_select, how='left')
print("student_DoM_students_medians")
student_DoM_students_medians = data.groupby(student_DoM_select)['students'].median() #.students won't work. Why?
student_DoM_students_medians.name = 'student_DoM_students_medians'
student_DoM_students_medians=student_DoM_students_medians.reset_index()
data = pd.merge(data, student_DoM_students_medians, on = student_DoM_select, how='left')
# May I use data['marks','students','marksMean','studentsMean','marksMedian','studentsMedian']=data['marks','students','marksMean','studentsMean','marksMedian','studentsMedian'].astype(int) to spare memory?
print("student_DoM_marks_std")
student_DoM_marks_std = data.groupby(student_DoM_select)['marks'].std()
student_DoM_marks_std.name = 'student_DoM_marks_std'
student_DoM_marks_std = student_DoM_marks_std.reset_index()
data = pd.merge(data, student_DoM_marks_std, on = student_DoM_select, how='left')
print("student_DoM_students_std")
student_DoM_students_std = data.groupby(student_DoM_select)['students'].std()
student_DoM_students_std.name = 'student_DoM_students_std'
student_DoM_students_std = student_DoM_students_std.reset_index()
data = pd.merge(data, student_DoM_students_std, on = student_DoM_select, how='left')
data['student_DoM_marks_means'].fillna(0, inplace=True)
data['student_DoM_students_means'].fillna(0, inplace=True)
data['student_DoM_marks_medians'].fillna(0, inplace=True)
data['student_DoM_students_medians'].fillna(0, inplace=True)
data['student_DoM_marks_std'].fillna(0, inplace=True)
data['student_DoM_students_std'].fillna(0, inplace=True)
#Third Part: Exceptional students
#I think int is better here as it helps defining categories but can't use it.#
#print(data.isnull().sum())
#print(data['types_DoM_marks_std'][data['types_DoM_marks_std']==0].sum())
#data.to_csv('ex')
#print(data.columns)
#Original version:#int raises the "can't convert Nan float to int. While there were no Nan as I verified in the data just before sending it to the
data['Except_student_IP2_DoM_marks_means']=np.array([int((data['student_IP2_DoM_marks_means'][i]-data['types_IP2_DoM_marks_means'][i])/data['types_IP2_DoM_students_std'][i]) for i in range (len(data['year']))])
data['Except_student_IP2_DoM_marks_medians']=np.array([int((data['student_IP2_DoM_marks_medians'][i]-data['types_IP2_DoM_marks_means'][i])/data['types_IP2_DoM_students_std'][i]) for i in range (len(data['year']))])
#Second version: raises no error but final data (returned) is filled with these stupid NaN
data['Except_student_P2M_DoM_marks_means']=np.array([np.round((data['student_DoM_marks_means'][i]-data['types_DoM_marks_means'][i])/data['types_DoM_marks_std'][i],0) for i in range (len(data['year']))])
data['Except_student_P2M_DoM_marks_medians']=np.array([np.round((data['student_DoM_marks_medians'][i]-data['types_DoM_marks_medians'][i])/data['types_DoM_marks_std'][i],0) for i in range (len(data['year']))])
#End
return data
Most likely you are correct that there are no Nans in your data frame, however you are creating them in your calculations. See the following:
In [15]: import pandas as pd
In [16]: df = pd.DataFrame([[1, 2], [0, 0]], columns=['actual value', 'col2'])
df['means'] = df.mean(axis=1)
df['std'] = df.std(axis=1)
In [17]: df
Out[17]:
actual value col2 means std
0 1 2 1.5 0.5
1 0 0 0.0 0.0
So the data frame doesn't have any Nans, but what about the calculations?
In [21]: [(df['actual value'][i]-df['means'][i])/df['std'][i] for i in range (len(df['means']))]
Out[21]: [-1.0, nan]
Now when you call int on that you get an error on the resulting list.
Finally, I would suggest (if possible) performing the operations directly in the underlying arrays rather then using a for loop, as it will be much faster.
In [25]: (df['actual value']-df['means'])/df['std']
Out[25]:
0 -1
1 NaN
dtype: float64
This may not be possible depending on what return value of a 0 division is desired though.
Related
I'm trying to subtract one data frame from another which all results should result in a 0 or blank based on the data in each my current excel files but will result in 0, 1, 2, or blank in the future. While some do result in a 0 or blank I'm also getting a -1 and 1. Any help that can be provided will be appreciated.
The two Excel sheets are identical except for number changes in second column.
Example
ExternalId TotalInteractions
name1 1
name2 2
name3 2
name4 1
Both sheets will look like the example and the output will look the same. I just need the difference between the two sheets
def GCList():
df1 = pd.read_excel('NewInter.xlsx')
df2 = pd.read_excel('PrevInter.xlsx')
df3 = df1['ExternalId']
df4 = df1['TotalInteractions']
df5 = df2['TotalInteractions']
df6 = df4.sub(df5)
frames = (df3, df6)
df = pd.concat(frames, axis = 1)
df.to_excel('GCList.xlsx')
GCList()
I managed to create a partial answer to getting the unexpected numbers. My problem now is that NewInter has more names than PrevInter does. Which results in a blank in TotalInteractions next to the new ExternalId. Any idea how to make it if it there is a blank to accept the value from NewInter?
def GCList():
df1 = pd.read_excel('NewInter.xlsx')
df2 = pd.read_excel('PrevInter.xlsx')
df3 = pd.merge(df1, df2, on = 'ExternalId', how = 'outer')
df4 = df3['TotalInteractions_x']
df5 = df3['TotalInteractions_y']
df6 = df3['ExternalId']
df7 = df4 - df5
frames = [df6,df7]
df = pd.concat(frames, axis = 1)
df.to_excel('GCList.xlsx')
GCList()
Figured out the issues. First part needed to be merged in order for the subtraction to work as the dataframes are not the same size. Also had to add in fill_value = 0 so it would take information from the new file.
def GCList():
df1 = pd.read_excel('NewInter.xlsx')
df2 = pd.read_excel('PrevInter.xlsx')
df3 = pd.merge(df1, df2, on = 'ExternalId', how = 'outer')
df4 = df3['TotalInteractions_x']
df5 = df3['TotalInteractions_y']
df6 = df3['ExternalId']
df7 = df4.sub(df5, fill_value = 0)
frames = [df6,df7]
df = pd.concat(frames, axis = 1)
df.to_excel('GCList.xlsx')
GCList()
I haven't found anything similar so.. I have 2 df's with the same Gene name but different p value's, example :
I am trying to run over combinedB values on "pvalues" column (numeric) and if they are >=0.05 to continue to combinedA values on "pvalues" column (numeric) which are <= 0.00005. I mustn't concat them
**EDITED
df = pd.read_csv("CombinedA.csv")
df = df['pvalue']
df1 = pd.read_csv("CombinedB.csv")
df1= df1['pvalue']
for i in df1:
if i >= 0.05:
while True:
for i in df:
if i <= 0.00005:
print(i)
Now it just running non stop. I think it prints only the "df" part
Here you are reading the table. You then overwrite df1 and get an array of the values.
df1 = pd.read_csv("CombinedB.csv")
df1= df1['pvalue']
Here you are iterating over the array of your values. These values are of type float.
for i in df1:
You are treating your float value as a dictionary. This is throwing the error.
if i['df1'] in df1 >= 0.05:
You probably meant to write:
if i >= 0.05
You are repeating the same mistake a couple more times.
df = pd.read_csv("Combined.csv", index_col = ["Gene"])
df = df['pvalue']
df.where(df <= 0.005, inplace = True)
df = df.replace(r'', np.NaN).dropna()
# Filter CombinedA
dfA = pd.read_csv("CombinedA.csv", index_col = ["Gene"])
dfA = dfA['pvalue']
dfA.where(dfA >= 0.05, inplace = True)
dfA = dfA.replace(r'', np.NaN).dropna()
df = df[df.index.isin(dfA.index)]
df.to_csv("CombinedRest.csv")
print(df)
This one is working.
Happy 2020! I would like to create a dataframe based on two others. I have the below two dataframes:
df1 = pd.DataFrame({'date':['03.05.1982','04.05.1982','05.05.1982','06.05.1982','07.05.1982','10.05.1982','11.05.1982'],'A': [63.63,64.08,64.19,65.11,65.36,65.25,65.36], 'B': [63.83, 64.10, 64.19, 65.08, 65.33, 65.28, 65.36], 'C':[63.99, 64.22, 64.30, 65.16, 65.41, 65.36, 65.44]})
df2 = pd.DataFrame({'Name':['A','B','C'],'Notice': ['05.05.1982','07.05.1982','12.05.1982']})
The idea is to create df3 such that this dataframe takes the value of A until A's notice date (found in df2) is reached, then df3 switches to the values of B until B's notice date is reached and so on. When we are during notice date, it should take the mean between the current column and the next one.
In the above example, df3 should be as follows (with formulas to illustrate):
df3 = pd.DataFrame({'date':['03.05.1982','04.05.1982','05.05.1982','06.05.1982','07.05.1982','10.05.1982','11.05.1982'], 'Result':[63.63,64.08,(64.19+64.19)/2,65.08,(65.33+65.41)/2,65.36,65.44]})
My idea was to first create a temporary dataframe with same dimensions as df1 and to fill it with 1's when the index date is prior to notice and 0's after. Doing a rolling mean with window 1 would give for each column a series of 1 until I reach 0.5 (signalling a switch).
Not sure if there is a better way to get df3?
I tried the following:
def fill_rule(df_p,df_t):
return np.where(df_p.index > df_t[df_t.Name==df_p.name]['Notice'][0], 0, 1)
df1['date'] = pd.to_datetime(df1['date'])
df2['notice'] = pd.to_datetime(df2['notice'])
df1.set_index("date", inplace = True)
temp = df1.apply(lambda x: fill_rule(x, df2), axis = 0)
And I got the following error: KeyError: (0, 'occurred at index B')
df1['t'] = df1['date'].map(df2.set_index(["Notice"])['Name'])
df1['t'] =df1['t'].fillna(method='bfill').fillna("C")
df3 = pd.DataFrame()
df3['Result'] = df1.apply(lambda row: row[row['t']],axis =1)
df3['date'] = df1['date']
You can use the between method to select the specific date ranges in both dataframes and then use iloc to substitute the specific values
#Initializing the output
df3 = df1.copy()
df3.drop(['B','C'], axis = 1, inplace = True)
df3.columns = ['date','Result']
df3['Result'] = 0.0
df3['count'] = 0
#Modifying df2 to add a dummy sample at the beginning
temp = df2.copy()
temp = temp.iloc[0]
temp = pd.DataFrame(temp).T
temp.Name ='Z'
temp.Notice = pd.to_datetime("05-05-1980")
df2 = pd.concat([temp,df2])
for i in range(len(df2)-1):
startDate = df2.iloc[i]['Notice']
endDate = df2.iloc[i+1]['Notice']
name = df2.iloc[i+1]['Name']
indices = [df1.date.between(startDate, endDate, inclusive=True)][0]
df3.loc[indices,'Result'] += df1[indices][name]
df3.loc[indices,'count'] += 1
df3.Result = df3.apply(lambda x : x.Result/x['count'], axis = 1)
I have a question about eliminating outliers from two-time series. One time series includes spot market prices and the other includes power outputs. The two series are from 2012 to 2016 and are both CSV files with the with a timestamp and then a value. As example for the power output: 2012-01-01 00:00:00,2335.2152646951617 and for the price: 2012-01-01 00:00:00,17.2
Because the spot market prices are very volatile and have a lot of outliers, I have filtered them. For the second time series, I have to delete the values with the same timestamp, which were eliminated in the time series of the prices. I thought about generating a list with the deleted values and writing a loop to delete the values with the same timestamp in the second time series. But so far that has not worked and I'm not really on. Does anyone have an idea?
My python code looks as follow:
import pandas as pd
import matplotlib.pyplot as plt
power_output = pd.read_csv("./data/external/power_output.csv", delimiter=",", parse_dates=[0], index_col=[0])
print(power_output.head())
plt.plot(power_output)
spotmarket = pd.read_csv("./data/external/spotmarket_dhp.csv", delimiter=",", parse_dates=[0], index_col=[0])
print(spotmarket.head())
r = spotmarket['price'].pct_change().dropna() * 100
print(r)
plt.plot(r)
Q1 = r.quantile(.25)
Q3 = r.quantile(.75)
q1 = Q1-2*(Q3-Q1)
q3 = Q3+2*(Q3-Q1)
a = r[r.between(q1, q3)]
print(a)
plt.plot(a)
Can somebody help me?
If your question is about how to compare two timestamps you can have a look at this.
Basically you could do:
out = r[~r.between(q1, q3)] # negation of your between to get the outliers
df=pd.merge(spotmarker,out,on=['date'],how="outer",indicator=True)
df=df[df['_merge']=='left_only']
Which is a merge operation that conserves only those rows that are only present in the left dataframe
The following suggestion is based on an answer of mine from a previous post.
You can solve your problem by merging both of your series and storing them in pandas dataframe. Then you can use any desired technique to identify and remove outliers. Take a look at the post mentioned above.
Here is my take on your particular problem using a snippet that can handle more than one series:
Since I don't have access to your data, the following snippet will produce two series where one of them has a distinctive outlier:
def sample(colname):
base = 100
nsample = 20
sigma = 10
# Basic df with trend and sinus seasonality
trend1 = np.linspace(0,1, nsample)
y1 = np.sin(trend1)
dates = pd.date_range(pd.datetime(2016, 1, 1).strftime('%Y-%m-%d'), periods=nsample).tolist()
df = pd.DataFrame({'dates':dates, 'trend1':trend1, 'y1':y1})
df = df.set_index(['dates'])
df.index = pd.to_datetime(df.index)
# Gaussian Noise with amplitude sigma
df['y2'] = sigma * np.random.normal(size=nsample)
df['y3'] = df['y2'] + base + (np.sin(trend1))
df['trend2'] = 1/(np.cos(trend1)/1.05)
df['y4'] = df['y3'] * df['trend2']
df=df['y4'].to_frame()
df.columns = [colname]
return(df)
df_sample1 = sample(colname = 'series1')
df_sample2 = sample(colname = 'series2')
df_sample2['series2'].iloc[10] = 800
df_sample1.plot()
df_sample2.plot()
Series 1 - No outliers
Series 2 - A distinctive outlier
Now you can merge those series like this:
# Merge dataframes
df_merged = pd.merge(df_sample1, df_sample2, how='outer', left_index=True, right_index=True)
df_merged.plot()
What is considered an outlier will depend full on the nature of your dataset. In this case, you can set the level for identifying outliers using sscipy.zscore(). In the following case, every observation with a difference that exceeds 3 is considered an outlier.
# A function for removing outliers
def noSpikes(df, level, keepFirst):
# 1. Get some info about the original data:
##%%
#df = df_merged
#level = 3
#keepFirst = True
##%%
firstVal = df[:1]
colNames = df.columns
colNumber = len(df.columns)
#cleanBy = 'Series1'
# 2. Take the first difference and
df_diff = df.diff()
# 3. Remove missing values
df_clean = df_diff.dropna()
# 4. Select a level for a Z-score to identify and remove outliers
df_Z = df_clean[(np.abs(stats.zscore(df_clean)) < level).all(axis=1)]
ix_keep = df_Z.index
# 5. Subset the raw dataframe with the indexes you'd like to keep
df_keep = df.loc[ix_keep]
# 6.
# df_keep will be missing some indexes.
# Do the following if you'd like to keep those indexes
# and, for example, fill missing values with the previous values
df_out = pd.merge(df_keep, df, how='outer', left_index=True, right_index=True)
# 7. Keep only the original columns (drop the diffs)
df_out = df_out.ix[:,:colNumber]
# 8. Fill missing values
df_complete = df_out.fillna(axis=0, method='ffill')
# 9. Reset column names
df_complete.columns = colNames
# Keep the first value
if keepFirst:
df_complete.iloc[0] = firstVal.iloc[0]
return(df_complete)
df_clean = noSpikes(df = df_merged, level = 3, keepFirst = True)
df_clean.plot()
Let me know how this works out for you.
Here's the whole thing for an easy copy-paste:
# Imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats
np.random.seed(22)
# A function for noisy data with a trend element
def sample(colname):
base = 100
nsample = 20
sigma = 10
# Basic df with trend and sinus seasonality
trend1 = np.linspace(0,1, nsample)
y1 = np.sin(trend1)
dates = pd.date_range(pd.datetime(2016, 1, 1).strftime('%Y-%m-%d'), periods=nsample).tolist()
df = pd.DataFrame({'dates':dates, 'trend1':trend1, 'y1':y1})
df = df.set_index(['dates'])
df.index = pd.to_datetime(df.index)
# Gaussian Noise with amplitude sigma
df['y2'] = sigma * np.random.normal(size=nsample)
df['y3'] = df['y2'] + base + (np.sin(trend1))
df['trend2'] = 1/(np.cos(trend1)/1.05)
df['y4'] = df['y3'] * df['trend2']
df=df['y4'].to_frame()
df.columns = [colname]
return(df)
df_sample1 = sample(colname = 'series1')
df_sample2 = sample(colname = 'series2')
df_sample2['series2'].iloc[10] = 800
df_sample1.plot()
df_sample2.plot()
# Merge dataframes
df_merged = pd.merge(df_sample1, df_sample2, how='outer', left_index=True, right_index=True)
df_merged.plot()
# A function for removing outliers
def noSpikes(df, level, keepFirst):
# 1. Get some info about the original data:
firstVal = df[:1]
colNames = df.columns
colNumber = len(df.columns)
#cleanBy = 'Series1'
# 2. Take the first difference and
df_diff = df.diff()
# 3. Remove missing values
df_clean = df_diff.dropna()
# 4. Select a level for a Z-score to identify and remove outliers
df_Z = df_clean[(np.abs(stats.zscore(df_clean)) < level).all(axis=1)]
ix_keep = df_Z.index
# 5. Subset the raw dataframe with the indexes you'd like to keep
df_keep = df.loc[ix_keep]
# 6.
# df_keep will be missing some indexes.
# Do the following if you'd like to keep those indexes
# and, for example, fill missing values with the previous values
df_out = pd.merge(df_keep, df, how='outer', left_index=True, right_index=True)
# 7. Keep only the original columns (drop the diffs)
df_out = df_out.ix[:,:colNumber]
# 8. Fill missing values
df_complete = df_out.fillna(axis=0, method='ffill')
# 9. Reset column names
df_complete.columns = colNames
# Keep the first value
if keepFirst:
df_complete.iloc[0] = firstVal.iloc[0]
return(df_complete)
df_clean = noSpikes(df = df_merged, level = 3, keepFirst = True)
df_clean.plot()
I have this large data frame and I need to when certain resource are available for the first time. Let me explain it from my code.
df1 = df[df['Resource_ID'] == 1348]
df1 = df1[['Format', 'Range_Start', 'Number']]
df1["Range_Start"] = df1["Range_Start"].str[:7]
df1 = df1.groupby(['Format', 'Range_Start'], as_index=True).last()
pd.options.display.float_format = '{:,.0f}'.format
df1 = df1.unstack()
df1.columns = df1.columns.droplevel()
df2 = df1[1:4].sum(axis=0)
df2.name = 'sum'
df2 = df1.append(df2)
df3 = df2.T[['entry', 'sum']].copy()
df3.index = pd.to_datetime(df3.index)
Now print(df3.first('1D')) gives the following output:
Format entry sum
Range_Start
2011-07-01 97 72
I can now see that Resource_ID 1348 first occurs on 2011-07-01, how do I extract only the Year from this information?
This is my sample input csv data:
Access_Stat_ID,Resource_ID,Range_Start,Range_End,Name,Format,Number,Matched_URL
1,15,"2009-03-01 00:00:00","2009-03-31 23:59:59","Mar 2009","entry",3,""
203,13,"2009-04-01 00:00:00","2009-04-30 23:59:59","Apr 2009","entry",18,""
204,13,"2009-04-01 00:00:00","2009-04-30 23:59:59","Apr 2009","pdf",7,""
It seems need:
first_year = df3.index.year[0]