I have to use the same function twice. The first when the parameter is df, the second when the parameter is df3. How to do that? The function:
def add(df, df3):
df["timestamp"] = pd.to_datetime(df["timestamp"])
df = df.groupby(pd.Grouper(key = "timestamp", freq = "h")).agg("mean")
price = df["price"]
amount = df["amount"]
return (price * amount) // amount
The double use :
out = []
# This loop will use the add(df) function for every csv and append in a list
for f in csv_files:
df = pd.read_csv(f, header=0)
# Replace empty values with numpy, not sure if usefull, maybe pandas can handle this
df.replace("", np.nan)
#added aggregate DataFrame with new column to list of DataFrames
out.append(add(df))
out2 = []
df3 = pd.Series(dtype=np.float64)
for f in csv_files:
df2 = pd.read_csv(f, header=0)
df3 = pd.concat([df3, df2], ignore_index=True)
out2 = pd.DataFrame(add(df = df3))
out2
I got the error:
TypeError: add() missing 1 required positional argument: 'df3'
The names of the add function have nothing to do with the variable names df and df3 in the rest of the script.
As #garagnoth has stated, you only need one parameter in add. You can call it df, foo or myvariablename: it is not related to nor df, nor df3.
In your case, you can change the add function to the following:
def add(a_dataframe):
# I set the argument name to "a_dataframe" so you can
# see its name is not linked to outside variables
a_dataframe["timestamp"] = pd.to_datetime(a_dataframe["timestamp"])
a_dataframe = a_dataframe.groupby(pd.Grouper(key = "timestamp", freq = "h")).agg("mean")
price = a_dataframe["price"]
amount = a_dataframe["amount"]
return (price * amount) // amount
You can now call this function with df or df3 as the rest of the script already does.
Related
I've written a function to transform an excel sheet and take only one row from a monthly data. Mensually I'll have the data on a new excel sheet.
I've made this:
def bocapago(nombre):
path='/content/drive/MyDrive/Fundacion Frontera Economica/Muni/python/inputs/BOCAS DE PAGO'
filename = path + "/" + nombre.upper() + '.xlsx'
input_cols=[0,1,2,3] # Columnas a importar
df = pd.read_excel(filename,
header=0,
usecols = input_cols,
index_col=False,
)
df.columns = ['n_tasa','Fecha','Lugar','Importe']
pd.to_datetime(df['Fecha'])
df['Periodo'] = pd.DatetimeIndex(df['Fecha']).month
df['Periodo'] = nombre
df['Periodo'] = df['Periodo'].str[:3] + "-" + df['Periodo'].str[-4:]
df = pd.pivot_table(df, values='Importe', index='Periodo', columns='Lugar', aggfunc='sum')
df = df.assign(Total=df.sum(1))
df = df.rename(columns={'Total':'TOTAL GENERAL'})
df.head()
return df
That is the function to read an proccess the sheet. And then I did this as a second step:
ENERO1 = bocapago('ENERO2021')
FEBRERO1 = bocapago('FEBRERO2021')
MARZO1 = bocapago('MARZO2021')
MAYO1 = bocapago('MAYO2021')
ingxboca = [ENERO1, FEBRERO1, MARZO1, MAYO1]
ingxboca = pd.concat(ingxboca)
ingxboca = ingxboca.merge(ingresos['TOTAL IACM'], how='left', on='Periodo')
ingxboca['DIFERENCIA'] = ingxboca['TOTAL IACM']-ingxboca['TOTAL GENERAL']
ingxboca.head()
I use another dataframe called "ingresos" on this case to merge.
My doubt is how can I do a for or while bucle to do the second step, so I can include all of it inside the function called "bocapago" or make another function like "finishing".
I would keep bocapago as it's own function just like you've done and have the second function call it. It keeps the complexity of a single function lower and will be easier for code reuse in the future. IF I understood your question correctly, would this work?
def new_function(file_list:list):
ingxboca = pd.concat([bocapago(f) for f in file_list])
ingxboca = ingxboca.merge(ingresos['TOTAL IACM'], how='left', on='Periodo')
ingxboca['DIFERENCIA'] = ingxboca['TOTAL IACM']-ingxboca['TOTAL GENERAL']
return ingxboca.head()
I'm not sure if that answered the question or not. If so, I imagine the list comprehension in the first line did it. keep in mind you can add if statements to a list comprehension. You can also pass in a string and use something like glob to give you a filelist with rules in it.
I have the following df:
df = pd.DataFrame(columns=['Place', 'PLZ','shortName','Parzellen'])
new_row1 = {'Place':'Winterthur', 'PLZ':[8400, 8401, 8402, 8404, 8405, 8406, 8407, 8408, 8409, 8410, 8411], 'shortName':'WIN', 'Parzellen':[]}
new_row2 = {'Place':'Opfikon', 'PLZ':[8152], 'shortName':'OPF', 'Parzellen':[]}
new_row3 = {'Place':'Stadel', 'PLZ':[8174], 'shortName':'STA', 'Parzellen':[]}
new_row4 = {'Place':'Kloten', 'PLZ':[8302], 'shortName':'KLO', 'Parzellen':[]}
new_row5 = {'Place':'Niederhasli', 'PLZ':[8155,8156], 'shortName':'NIH', 'Parzellen':[]}
new_row6 = {'Place':'Bassersdorf', 'PLZ':[8303], 'shortName':'BAS', 'Parzellen':[]}
new_row7 = {'Place':'Oberglatt', 'PLZ':[8154], 'shortName':'OBE', 'Parzellen':[]}
new_row8 = {'Place':'Bülach', 'PLZ':[8180], 'shortName':'BUE', 'Parzellen':[]}
df = df.append(new_row1, ignore_index=True)
df = df.append(new_row2, ignore_index=True)
df = df.append(new_row3, ignore_index=True)
df = df.append(new_row4, ignore_index=True)
df = df.append(new_row5, ignore_index=True)
df = df.append(new_row6, ignore_index=True)
df = df.append(new_row7, ignore_index=True)
df = df.append(new_row8, ignore_index=True)
print (df)
Now I have a number like 8405 and I want to know the Place or whole Row which has this number under df['PLZ'].
I also tried with classes but it was hard to get all Numbers of all Objects because I want to be able to call all PLZ in a list and also check, if I have any number, to which Place it belongs. Maybe there is an obvious better way and I just don't know it.
try with boolean masking and map() method:
df[df['PLZ'].map(lambda x:8405 in x)]
OR
via boolean masking and agg() method:
df[df['PLZ'].agg(lambda x:8405 in x)]
#you can also use apply() in place of agg
output of above code:
Place PLZ shortName Parzellen
0 Winterthur [8400, 8401, 8402, 8404, 8405, 8406, 8407, 840... WIN []
I am trying to delete a column called Rank but nothing happens. The remaining code all executes without any issue but the column itself remains in the output file. I've highlighted the part of the code that is not working.
def read_csv():
file = "\mona" + yday+".csv"
#df=[]
df = pd.read_csv(save_path+file,skiprows=3,encoding = "ISO-8859-1",error_bad_lines=False)
return df
# replace . with / in column EPIC
def tickerchange():
df=read_csv()
df['EPIC'] = df['EPIC'].str.replace('.','/')
return df
def consolidate_AB_listings():
df=tickerchange()
Aline = df.loc[(df['EPIC'] =='RDSA'),'Mkt Cap (àm)']
Bline = df.loc[(df['EPIC'] =='RDSB'),'Mkt Cap (àm)']
df.loc[(df['EPIC'] =='RDSA'),'Mkt Cap (àm)']= float(Aline) + float(Bline)
df = df.loc[(df.Ind != 'I/E')]
df = df.loc[(df.Ind != 'FL')]
df = df.loc[(df.Ind != 'M')]
df = df.loc[(df.EPIC != 'RDSB')]
return df
def ranking_mktcap():
df = consolidate_AB_listings()
df['Rank']= df['Mkt Cap (àm)'].rank(ascending=False)
df = df.loc[(df.Rank != 1)]
df['Rank1']= df['Mkt Cap (Em)'].rank(ascending=False)
## This doesn't seem to work
df = df.drop(df['Security'], 1)
return df
def save_outputfile():
#df = drop()
df = ranking_mktcap()
df.to_csv(r'S:\Index_Analytics\UK\Index Methodology\FTSE\Py_file_download\MonitoredList.csv', index=False)
print("finished")
if __name__ == "__main__":
main()
read_csv()
tickerchange()
consolidate_AB_listings()
ranking_mktcap()
save_outputfile()
DataFrame.drop() takes the following: DataFrame.drop(self, labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise').
When you call df = df.drop(df['Security'], 1) it's using df['security'] as the labels to drop. And the 1 is being passed through the axis parameter.
If you want to drop the column 'Security' then you'd want to do:
df = df.drop('Security', axis=1)
# this is same as
df = df.drop(labels='Security', axis=1)
# you can also specify the column name directly, like this
df = df.drop(columns='Security')
Note: the columns= parameter can take a single lable (str) like above, or can take a list of column names.
Try by replacing
df = df.drop(df['Security'], 1)
By
df.drop(['Security'],axis=1, inplace=True)
I had the same issue and all I did was add inplace = True
So it will be df = df.drop(df['Security'], 1, inplace = True)
Happy 2020! I would like to create a dataframe based on two others. I have the below two dataframes:
df1 = pd.DataFrame({'date':['03.05.1982','04.05.1982','05.05.1982','06.05.1982','07.05.1982','10.05.1982','11.05.1982'],'A': [63.63,64.08,64.19,65.11,65.36,65.25,65.36], 'B': [63.83, 64.10, 64.19, 65.08, 65.33, 65.28, 65.36], 'C':[63.99, 64.22, 64.30, 65.16, 65.41, 65.36, 65.44]})
df2 = pd.DataFrame({'Name':['A','B','C'],'Notice': ['05.05.1982','07.05.1982','12.05.1982']})
The idea is to create df3 such that this dataframe takes the value of A until A's notice date (found in df2) is reached, then df3 switches to the values of B until B's notice date is reached and so on. When we are during notice date, it should take the mean between the current column and the next one.
In the above example, df3 should be as follows (with formulas to illustrate):
df3 = pd.DataFrame({'date':['03.05.1982','04.05.1982','05.05.1982','06.05.1982','07.05.1982','10.05.1982','11.05.1982'], 'Result':[63.63,64.08,(64.19+64.19)/2,65.08,(65.33+65.41)/2,65.36,65.44]})
My idea was to first create a temporary dataframe with same dimensions as df1 and to fill it with 1's when the index date is prior to notice and 0's after. Doing a rolling mean with window 1 would give for each column a series of 1 until I reach 0.5 (signalling a switch).
Not sure if there is a better way to get df3?
I tried the following:
def fill_rule(df_p,df_t):
return np.where(df_p.index > df_t[df_t.Name==df_p.name]['Notice'][0], 0, 1)
df1['date'] = pd.to_datetime(df1['date'])
df2['notice'] = pd.to_datetime(df2['notice'])
df1.set_index("date", inplace = True)
temp = df1.apply(lambda x: fill_rule(x, df2), axis = 0)
And I got the following error: KeyError: (0, 'occurred at index B')
df1['t'] = df1['date'].map(df2.set_index(["Notice"])['Name'])
df1['t'] =df1['t'].fillna(method='bfill').fillna("C")
df3 = pd.DataFrame()
df3['Result'] = df1.apply(lambda row: row[row['t']],axis =1)
df3['date'] = df1['date']
You can use the between method to select the specific date ranges in both dataframes and then use iloc to substitute the specific values
#Initializing the output
df3 = df1.copy()
df3.drop(['B','C'], axis = 1, inplace = True)
df3.columns = ['date','Result']
df3['Result'] = 0.0
df3['count'] = 0
#Modifying df2 to add a dummy sample at the beginning
temp = df2.copy()
temp = temp.iloc[0]
temp = pd.DataFrame(temp).T
temp.Name ='Z'
temp.Notice = pd.to_datetime("05-05-1980")
df2 = pd.concat([temp,df2])
for i in range(len(df2)-1):
startDate = df2.iloc[i]['Notice']
endDate = df2.iloc[i+1]['Notice']
name = df2.iloc[i+1]['Name']
indices = [df1.date.between(startDate, endDate, inclusive=True)][0]
df3.loc[indices,'Result'] += df1[indices][name]
df3.loc[indices,'count'] += 1
df3.Result = df3.apply(lambda x : x.Result/x['count'], axis = 1)
I am trying to create a simple time-series, of different rolling types. One specific example, is a rolling mean of N periods using the Panda python package.
I get the following error : ValueError: DataFrame constructor not properly called!
Below is my code :
def py_TA_MA(v, n, AscendType):
df = pd.DataFrame(v, columns=['Close'])
df = df.sort_index(ascending=AscendType) # ascending/descending flag
M = pd.Series(df['Close'].rolling(n), name = 'MovingAverage_' + str(n))
df = df.join(M)
df = df.sort_index(ascending=True) #need to double-check this
return df
Would anyone be able to advise?
Kind regards
found the correction! It was erroring out (new error), where I had to explicitly declare n as an integer. Below, the code works
#xw.func
#xw.arg('n', numbers = int, doc = 'this is the rolling window')
#xw.ret(expand='table')
def py_TA_MA(v, n, AscendType):
df = pd.DataFrame(v, columns=['Close'])
df = df.sort_index(ascending=AscendType) # ascending/descending flag
M = pd.Series(df['Close'], name = 'Moving Average').rolling(window = n).mean()
#df = pd.Series(df['Close']).rolling(window = n).mean()
df = df.join(M)
df = df.sort_index(ascending=True) #need to double-check this
return df