so i'm currently trying to make a timeseries forecasting using LSTM and i'm still on the early stage where i wanted to make sure my data clean.
for the background:
i'm trying to make a model using LSTM for temperature, rain(?), and humidity (my english not good) for 3 Station, and so if i'm correct there will be 9 models, 3 models each for each station. as of now i'm doing an experiment using 1 year worth of data
the problem:
i named my file based on the index of the month, Jan as 1, Feb as 2, Mar as 3, and so on.
Using the os library i managed to loop through the folder for each file and clean the file, drop the column, filling the missing value, etc.
But when i'm trying to append the order of the month is not correct, it starts from month 11 then go to 8 etc. what am i doing wrong?
and how to print a full dataframe? currently i succed printing the full dataframe using this method
Here is the code:
Dir_data = '/content/DATA'
excel_clean = pd.DataFrame()
train_data=[]
for i in os.listdir(Dir_data):
excel_test = pd.read_excel(i)
#drop column
excel_test.drop(columns=['ff_avg', 'ddd_x', 'ddd_car', 'ff_avg', 'ff_x','ss','Tn','Tx'],inplace = True)
#Start Cleaning
excel_test = excel_test.replace(8888,'x').replace(9999,'x').replace('','x')
excel_test['RR'] = pd.to_numeric(excel_test['RR'], errors='coerce').astype('float64')
excel_test['RH_avg'] = pd.to_numeric(excel_test['RH_avg'], errors='coerce').astype('int64')
excel_test['Tavg'] = pd.to_numeric(excel_test['Tavg'], errors='coerce').astype('float64')
#excel_test.dtypes
#Filling Missing Values
excel_test['RR'] = excel_test['RR'].fillna(excel_test['RR'].mean())
excel_test['RH_avg'] = excel_test['RH_avg'].fillna(excel_test['RH_avg'].mean())
excel_test['Tavg'] = excel_test['Tavg'].fillna(excel_test['Tavg'].mean())
excel_test['RR'] = excel_test['RR'].round(decimals=1)
excel_test['Tavg'] = excel_test['Tavg'].round(decimals=1)
excel_clean = excel_clean.append(excel_test)
pd.set_option('max_rows', 99999)
pd.set_option('max_colwidth', 400)
pd.describe_option('max_colwidth')
excel_clean.reset_index(drop=True,inplace=True)
excel_clean
it's only for 1 station as this is an experiment
I am using the this dataset for a project.
I am trying to find the total yield for each inverter for the 34 day duration of the dataset (basically use the final and initial value available for each inverter). I have been able to get the list of inverters using pd.unique()(there are 22 inverters for each solar power plant.
I am having trouble querying the total_yield data for each inverter.
Here is what I have tried:
def get_yields(arr: np.ndarray, df:pd.core.frame.DataFrame) -> np.ndarray:
delta = np.zeros(len(arr))
index =0
for i in arr:
initial = df.loc[df["DATE_TIME"]=="15-05-2020 02:00"]
initial = initial.loc[initial["INVERTER_ID"]==i]
initial.reset_index(inplace=True,drop=True)
initial = initial.at[0,"TOTAL_YIELD"]
final = df.loc[(df["DATE_TIME"]=="17-06-2020 23:45")]
final = final.loc[final["INVERTER_ID"]==i]
final.reset_index(inplace=True, drop=True)
final = final.at[0,"TOTAL_YIELD"]
delta[index] = final - initial
index = index + 1
return delta
Reference: arr is the array of inverters, listed below. df is the generation dataframe for each plant.
The problem is that not every inverter has a data point for each interval. This makes this function only work for the inverters at the first plant, not the second one.
My second approach was to filter by the inverter first, then take the first and last data points. But I get an error- 'Series' objects are mutable, thus they cannot be hashed
Here is the code for that so far:
def get_yields2(arr: np.ndarray, df: pd.core.frame.DataFrame) -> np.ndarry:
delta = np.zeros(len(arr))
index = 0
for i in arr:
initial = df.loc(df["INVERTER_ID"] == i)
index += 1
break
return delta
List of inverters at plant 1 for reference(labeled as SOURCE_KEY):
['1BY6WEcLGh8j5v7' '1IF53ai7Xc0U56Y' '3PZuoBAID5Wc2HD' '7JYdWkrLSPkdwr4'
'McdE0feGgRqW7Ca' 'VHMLBKoKgIrUVDU' 'WRmjgnKYAwPKWDb' 'ZnxXDlPa8U1GXgE'
'ZoEaEvLYb1n2sOq' 'adLQvlD726eNBSB' 'bvBOhCH3iADSZry' 'iCRJl6heRkivqQ3'
'ih0vzX44oOqAx2f' 'pkci93gMrogZuBj' 'rGa61gmuvPhdLxV' 'sjndEbLyjtCKgGv'
'uHbuxQJl8lW7ozc' 'wCURE6d3bPkepu2' 'z9Y9gH1T5YWrNuG' 'zBIq5rxdHJRwDNY'
'zVJPv84UY57bAof' 'YxYtjZvoooNbGkE']
List of inverters at plant 2:
['4UPUqMRk7TRMgml' '81aHJ1q11NBPMrL' '9kRcWv60rDACzjR' 'Et9kgGMDl729KT4'
'IQ2d7wF4YD8zU1Q' 'LYwnQax7tkwH5Cb' 'LlT2YUhhzqhg5Sw' 'Mx2yZCDsyf6DPfv'
'NgDl19wMapZy17u' 'PeE6FRyGXUgsRhN' 'Qf4GUc1pJu5T6c6' 'Quc1TzYxW2pYoWX'
'V94E5Ben1TlhnDV' 'WcxssY2VbP4hApt' 'mqwcsP2rE7J0TFp' 'oZ35aAeoifZaQzV'
'oZZkBaNadn6DNKz' 'q49J1IKaHRwDQnt' 'rrq4fwE8jgrTyWY' 'vOuJvMaM2sgwLmb'
'xMbIugepa2P7lBB' 'xoJJ8DcxJEcupym']
Thank you very much.
I can't download the dataset to test this. Getting "To May Requests" Error.
However, you should be able to do this with a groupby.
import pandas as pd
result = df.groupby('INVERTER_ID')['TOTAL_YIELD'].agg(['max','min'])
result['delta'] = result['max']-result['min']
print(result[['delta']])
So if I'm understanding this right, what you want is the TOTAL_YIELD for each inverter for the beginning of the time period starting 5-05-2020 02:00 and ending 17-06-2020 23:45. Try this:
# enumerate lets you have an index value along with iterating through the array
for i, code in enumerate(arr):
# to filter the info to between the two dates, but not necessarily assuming that
# each inverter's data starts and ends at each date
inverter_df = df.loc[df['DATE_TIME'] >= pd.to_datetime('15-05-2020 02:00:00')]
inverter_df = inverter_df.loc[inverter_df['DATE_TIME'] <= pd.to_datetime('17-06-2020
23:45:00')]
inverter_df = inverter_df.loc[inverter_df["INVERTER_ID"]==code]]
# sort by date
inverter_df.sort_values(by='DATE_TIME', inplace= True)
# grab TOTAL_YIELD at the first available date
initial = inverter_df['TOTAL_YIELD'].iloc[0]
# grab TOTAL_YIELD at the last available date
final = inverter_df['TOTAL_YIELD'].iloc[-1]
delta[index] = final - initial
I have 13 CSV files that contain billing information in an unusual format. Multiple readings are recorded every 30 minutes of the day. Five days are recorded beside each other (columns). Then the next five days are recorded under it. To make things more complicated, the day of the week, date, and billing day is shown over the first recording of KVAR each day.
The image blow shows a small example. However, imagine that KW, KVAR, and KVA repeat 3 more times before continuing some 50 rows later.
My goal as to create a simple python script that would make the data into a data frame with the columns: DATE, TIME, KW, KVAR, KVA, and DAY.
The problem is my script returns NaN data for the KW, KVAR, and KVA data after the first five days (which is correlated with a new instance of a for loop). What is weird to me is that when I try to print out the same ranges I get the data that I expect.
My code is below. I have included comments to help further explain things. I also have an example of sample output of my function.
def make_df(df):
#starting values
output = pd.DataFrame(columns=["DATE", "TIME", "KW", "KVAR", "KVA", "DAY"])
time = df1.loc[3:50,0]
val_start = 3
val_end = 51
date_val = [0,2]
day_type = [1,2]
# There are 7 row movements that need to take place.
for row_move in range(1,8):
day = [1,2,3]
date_val[1] = 2
day_type[1] = 2
# There are 5 column movements that take place.
# The basic idea is that I would cycle through the five days, grab their data in a temporary dataframe,
# and then append that dataframe onto the output dataframe
for col_move in range(1,6):
temp_df = pd.DataFrame(columns=["DATE", "TIME", "KW", "KVAR", "KVA", "DAY"])
temp_df['TIME'] = time
#These are the 3 values that stop working after the first column change
# I get the values that I expect for the first 5 days
temp_df['KW'] = df.iloc[val_start:val_end, day[0]]
temp_df['KVAR'] = df.iloc[val_start:val_end, day[1]]
temp_df['KVA'] = df.iloc[val_start:val_end, day[2]]
# These 2 values work perfectly for the entire data set
temp_df['DAY'] = df.iloc[day_type[0], day_type[1]]
temp_df["DATE"] = df.iloc[date_val[0], date_val[1]]
# trouble shooting
print(df.iloc[val_start:val_end, day[0]])
print(temp_df)
output = output.append(temp_df)
# increase values for each iteration of row loop.
# seems to work perfectly when I print the data
day = [x + 3 for x in day]
date_val[1] = date_val[1] + 3
day_type[1] = day_type[1] + 3
# increase values for each iteration of column loop
# seems to work perfectly when I print the data
date_val[0] = date_val[0] + 55
day_type [0]= day_type[0] + 55
val_start = val_start + 55
val_end = val_end + 55
return output
test = make_df(df1)
Below is some sample output. It shows where the data starts to break down after the fifth day (or first instance of the column shift in the for loop). What am I doing wrong?
Could be pd.append requiring matched row indices for numerical values.
import pandas as pd
import numpy as np
output = pd.DataFrame(np.random.rand(5,2), columns=['a','b']) # fake data
output['c'] = list('abcdefghij') # add a column of non-numerical entries
tmp = pd.DataFrame(columns=['a','b','c'])
tmp['a'] = output.iloc[0:2, 2]
tmp['b'] = output.iloc[3:5, 2] # generates NaN
tmp['c'] = output.iloc[0:2, 2]
data.append(tmp)
(initial response)
How does df1 look like? Is df.iloc[val_start:val_end, day[0]] have any issue past the fifth day? The codes didn't show how you read from the csv files, or df1 itself.
My guess: if val_start:val_end gives invalid indices on the sixth day, or df1 happens to be malformed past the fifth day, df.iloc[val_start:val_end, day[0]] will return an empty Series object and possibly make its way into temp_df. iloc do not report invalid row indices, though similar column indices would trigger IndexError.
import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.rand(5,3), columns=['a','b','c'], index=np.arange(5)) # fake data
df.iloc[0:2, 1] # returns the subset
df.iloc[100:102, 1] # returns: Series([], Name: b, dtype: float64)
A little off topic but I would recommend preprocessing the csv files rather than deal with indexing in Pandas DataFrame, as the original format was kinda complex. Slice the data by date and later use pd.melt or pd.groupby to shape them into the format you like. Or alternatively try multi-index if stick with Pandas I/O.
Hi I have a dataset in the following format:
Code for replicating the data:
import pandas as pd
d1 = {'Year':
['2008','2008','2008','2008','2008','2008','2008','2008','2008','2008'],
'Month':['1','1','2','6','7','8','8','11','12','12'],
'Day':['6','22','6','18','3','10','14','6','16','24'],
'Subject_A':['','30','','','','35','','','',''],
'Subject_B':['','','','','','','','40','',''],
'Subject_C': ['','','','','','65','','50','','']}
d1 = pd.DataFrame(d1)
I input the numbers as a string to show blank cells
Where the first three columns denotes date (Year, Month and Day) and the following columns represent individuals (My actual data file consists of about 300 such rows and about 1000 subjects. I presented a subset of the data here).
Where the column value refers to expenditure on FMCG products.
What I would like to do is the following:
Part 1 (Beginning and end points)
a) For each individual locate the first observation and duplicate the value of the first observation for atleast the previous six months. For example: Subject C's 1st observation is on the 10th of August 2008. In that case I would want all the rows from June 10, 2008 to be equal to 65 for Subject C (Roughly 2/12/2008
is the cutoff date. SO we leave the 3rd cell from the top for Subject_C's column blank).
b) Locate last observation and repeat the last observation for the following 3 months. For example for Subject_A, we repeat 35 twice (till 6th November 2008).
Please refer to the following diagram for the highlighted cell with the solutions.
Part II - (Rows in between)
Next I would like to do two things (I would need to do the following three steps separately, not all at one time):
For individuals like Subject_A, locate two observations that come one after the other (30 and 35).
i) Use the average of the two observations. In this case we would have 32.5 in the four rows without caring about time.
for eg:
ii) Find the total time between two observations and take the mean of the time. For the 1st half of the time period assign the first value and for the 2nd half assign the second value. For example - for subject 1, the total days between 01/22/208 and 08/10/2008 is 201 days. For the first 201/2 = 100.5 days assign the value of 30 to Subject_A and for the remaining value assign 35. In this case the columns for Subject_A and Subject_C will look like:
The final dataset will use (a), (b) & (i) or (a), (b) & (ii)
Final data I [using a,b and i]
Final data II [using a,b and ii]
I would appreciate any help with this. Thanks in advance. Please let me know if the steps are unclear.
Follow up question and Issues
Thanks #Juan for the initial answer. Here's my follow up question. Suppose that Subject_A has more than 2 observations (code for the example data below). Would we be able to extend this code to incorporate more than 2 observations?
import pandas as pd
d1 = {'Year':
['2008','2008','2008','2008','2008','2008','2008','2008','2008','2008'],
'Month':['1','1','2','6','7','8','8','11','12','12'],
'Day':['6','22','6','18','3','10','14','6','16','24'],
'Subject_A':['','30','','45','','35','','','',''],
'Subject_B':['','','','','','','','40','',''],
'Subject_C': ['','','','','','65','','50','','']}
d1 = pd.DataFrame(d1)
Issues
For the current code, I found an issue for part II (ii). This is the output that I get:
This is actually on the right track. The two cells above 35 does not seem to get updated. Is there something wrong on my end? Also the same question as before, would we be able to extend it to the case of >2 observations?
Here a code solution for subject A. Should work with the other subjects:
d1 = {'Year':
['2008','2008','2008','2008','2008','2008','2008','2008','2008','2008'],
'Month':['1','1','2','6','7','8','8','11','12','12'],
'Day':['6','22','6','18','3','10','14','6','16','24'],
'Subject_A':['','30','','45','','35','','','',''],
'Subject_B':['','','','','','','','40','',''],
'Subject_C': ['','','','','','65','','50','','']}
d1 = pd.DataFrame(d1)
d1 = pd.DataFrame(d1)
## Create a variable named date
d1['date']= pd.to_datetime(d1['Year']+'/'+d1['Month']+'/'+d1['Day'])
# convert to float, to calculate mean
d1['Subject_A'] = d1['Subject_A'].replace('',np.nan).astype(float)
# index of the not null rows
subja = d1['Subject_A'].notnull()
### max and min index row with notnull value
max_id_subja = d1.loc[subja,'date'].idxmax()
min_id_subja = d1.loc[subja,'date'].idxmin()
### max and min date for Sub A with notnull value
max_date_subja = d1.loc[subja,'date'].max()
min_date_subja = d1.loc[subja,'date'].min()
### value for max and min date
max_val_subja = d1.loc[max_id_subja,'Subject_A']
min_val_subja = d1.loc[min_id_subja,'Subject_A']
#### Cutoffs
min_cutoff = min_date_subja-pd.Timedelta(6, unit='M')
max_cutoff = max_date_subja+pd.Timedelta(3, unit='M')
## PART I.a
d1.loc[(d1['date']<min_date_subja) & (d1['date']>min_cutoff),'Subject_A'] = min_val_subja
## PART I.b
d1.loc[(d1['date']>max_date_subja) & (d1['date']<max_cutoff),'Subject_A'] = max_val_subja
## PART II
d1_2i = d1.copy()
d1_2ii = d1.copy()
lower_date = min_date_subja
lower_val = min_val_subja.copy()
next_dates_index = d1_2i.loc[(d1['date']>min_date_subja) & subja].index
for N in next_dates_index:
next_date = d1_2i.loc[N,'date']
next_val = d1_2i.loc[N,'Subject_A']
#PART II.i
d1_2i.loc[(d1['date']>lower_date) & (d1['date']<next_date),'Subject_A'] = np.mean([lower_val,next_val])
#PART II.ii
mean_time_a = pd.Timedelta((next_date-lower_date).days/2, unit='d')
d1_2ii.loc[(d1['date']>lower_date) & (d1['date']<=lower_date+mean_time_a),'Subject_A'] = lower_val
d1_2ii.loc[(d1['date']>lower_date+mean_time_a) & (d1['date']<=next_date),'Subject_A'] = next_val
lower_date = next_date
lower_val = next_val
print(d1_2i)
print(d1_2ii)