import yfinance as yf
import pandas as pd
import talib
# Ticker under test and the MACD/backtest parameter grid.
code = '2800'
para_dict = {
    'sample_period_list': [200],
    'fastperiod_list': [12, 16],
    'slowperiod_list': [26, 30],
    'signalperiod_list': [8, 12],
    'higher_percentile_list': [0.8],
    'profit_target': [0.04],
    'stop_loss': [-0.04],
}
start_date = '2020-01-01'
end_date = '2022-10-10'
df_dict = {}

# Download daily history, drop non-trading days (zero volume),
# and keep only the OHLC price columns with a flat integer index.
ticker = yf.Ticker(code + '.HK')
df = ticker.history(start=start_date, end=end_date)
df = df.loc[df['Volume'] > 0, ['Open', 'High', 'Low', 'Close']]
df = df.reset_index()
# Compute MACD for every (fast, slow, signal) combination.
# BUG FIX: the original stored the SAME `df` object under every key and
# overwrote its macd columns on each iteration, so every dict entry ended
# up identical (equal to the last combination). Work on a fresh copy per
# combination instead.
for fastperiod in para_dict['fastperiod_list']:
    for slowperiod in para_dict['slowperiod_list']:
        for signalperiod in para_dict['signalperiod_list']:
            macd_key = str(fastperiod) + '_' + str(slowperiod) + '_' + str(signalperiod)
            df_combo = df.copy()
            df_combo['macd'], df_combo['macdsignal'], df_combo['macdhist'] = talib.MACD(
                df_combo['Close'],
                fastperiod=fastperiod,
                slowperiod=slowperiod,
                signalperiod=signalperiod,
            )
            df_dict[macd_key] = df_combo
print(df_dict)
I can't get the right dataframe for each set of MACD periods — instead, the code below generates the same dataframe for every combination of MACD periods. Why?
The reason is that you're pointing to the same dataframe: if you change one, they all change, so in your example they will all be equal to the last `df`.
you can read more in it in those questions :
Modifying one dataframe appears to change another
Why can pandas DataFrames change each other?
To solve your case, you need to store a copy of the dataframe, not a reference to the actual dataframe:
df_dict[macd_key] = df.copy()
#instead of df_dict[macd_key] = df
That will solve your issue.
Related
Working with Python and pandas, I have cleaned some data, added a new column, and populated it with data. Now the dataframe refuses to sort for some reason. I have tried two different methods to ensure the column "Review_Score" is in numeric form, and both work. I then tried to sort by name, and it would not work either. Can anyone explain what went wrong here?
from itertools import count
from platform import platform
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import the csv file
# Load the raw data and drop the columns we don't need.
df = pd.read_csv("video_game.csv")
df = df.drop(columns=["NA_players", "EU_players", "JP_players", "Other_players", "Global_players", "User_Count", "Rating", "Critic_Count"])
# Coerce the score columns to numeric; unparsable values become NaN, then 0.
df['User_Score'] = pd.to_numeric(df['User_Score'], errors='coerce')
df = df.replace(np.nan, 0, regex=True)
df['User_Score'] = df['User_Score'].astype(float)
df['Critic_Score'] = pd.to_numeric(df['Critic_Score'], errors='coerce')
df = df.replace(np.nan, 0, regex=True)
df['Critic_Score'] = df['Critic_Score'].astype(float)
# Filter all the NES games released in 1988.
# .copy() makes df2 an independent frame so the column assignment below
# does not hit a view of df (avoids SettingWithCopyWarning / silent no-op).
df2 = df.loc[(df['Platform'] == 'NES') & (df['Year_of_Release'] == 1988)].copy()
score = [94, 78, 80, 76, 72, 43, 94, 95, 65, 35, 68]
# Add new column and populate it with the review scores from the list.
df2['Review_Score'] = score
df2['Review_Score'] = pd.to_numeric(df2['Review_Score'], errors='coerce')
# BUG FIX: sort_values returns a NEW frame — the original discarded the
# result, which is why df2 appeared "unsorted". Reassign it.
df2 = df2.sort_values('Review_Score', ascending=True)
print(df2)
For the sort_values method, you will need to add an extra parameter inplace=True in order to apply the changes to the original dataframe. You could do it as such:
df2.sort_values('Review_Score', ascending=True, inplace=True)
Another way you could apply the changes is by reassigning the dataframe to the original variable as such:
df2 = df2.sort_values('Review_Score', ascending=True)
I would like to ask how to do a summation using Python or Excel:
I'd like to sum the "number" column grouped by the "time" column.
Sum of the Duration for (00:00 am - 00:59 am) is (2+4) 6.
Sum of the Duration for (02:00 am - 02:59 am) is (3+1) 4.
Could you please advise how to do this?
When you have a dataframe you can use groupby to accomplish this:
# import pandas module
import pandas as pd

# Create a dictionary with the values
data = {
    'time': ["12:20:51", "12:40:51", "2:26:35", "2:37:35"],
    'number': [2, 4, 3, 1]}

# create a Pandas dataframe
df = pd.DataFrame(data)

# ...or, alternatively, load it from a CSV instead:
# df = pd.read_csv('path/dir/filename.csv')
# BUG FIX: the original ran the read_csv unconditionally, which both
# overwrote the frame built above and crashed on the placeholder path.

# Convert time column to datetime data type
df['time'] = df['time'].apply(pd.to_datetime, format='%H:%M:%S')

# add values by hour
dff = df.groupby(df['time'].dt.hour)['number'].sum()
print(dff.head(50))
output:
time
12 6
2 4
When you need more than one column. You can pass the columns as a list inside .groupby(). The code will look like this:
import pandas as pd

# Load the raw rows from disk.
frame = pd.read_csv('filename.csv')

# Parse the time-of-day and date columns into datetime values.
frame['time'] = frame['time'].apply(pd.to_datetime, format='%H:%M:%S')
frame['date'] = frame['date'].apply(pd.to_datetime, format='%d/%m/%Y')

# Total the `number` column per (calendar date, hour of day) pair.
hour_of_day = frame['time'].dt.hour
dff = frame.groupby([frame['date'], hour_of_day])['number'].sum()
print(dff.head(50))

# Persist the aggregated result.
dff.to_csv("filename.csv")
I have a dataframe which looks like this:
I wanted to make a dataframe which looks like this:
For this I have referred the post at pandas convert some columns into rows.
By using the merge function I get a dataframe as shown below:
How do I get my dataframe in the format required?
The complete code is as shown:
import pandas as pd
from nsepy import get_history
from datetime import date
import numpy as np
stock = ['APLAPOLLO','AUBANK','AARTIDRUGS','AARTIIND','AAVAS','ABBOTINDIA','ADANIENT','ADANIGAS','ADANIGREEN','ADANIPORTS']
res = dict(zip(stock, stock))
start = date(2020, 11, 22)
end = date(2020, 12, 22)

# Download one month of daily history per symbol.
for stock_name in stock:
    data = get_history(symbol=stock_name, start=start, end=end)
    res[stock_name] = data

# Tag each per-symbol frame with its key, then stack them into one frame.
for key, df_sym in res.items():
    df_sym['key_name'] = key
df = pd.concat(list(res.values()))

# True on days the stock closed above the previous close.
df['boolean'] = df['Prev Close'] < df['Close']

# trend = length of the current run of consecutive up-days (0 on down-days).
a = df['boolean']
b = a.cumsum()
df['trend'] = (b - b.mask(a).ffill().fillna(0).astype(int)).where(a, 0)

# Label each day as a win or a loss.
conditions = [(df['boolean'] == True), (df['boolean'] == False)]
values = ['Win', 'Loose']
df['Win/Loss'] = np.select(conditions, values)
# BUG FIX: the original dropped 'Win/Loose', a column that never existed
# (the column created above is named 'Win/Loss'), which raises KeyError.
df = df.drop(['Win/Loss'], axis=1)
df.to_csv('data.csv')

# Reshape the trend column per symbol.
df2 = df[['trend', 'Symbol']]
w = df2.melt(id_vars=["trend"], value_vars=['Symbol'])
IIUC, this can be solved with pivot_table():
Given the original dataframe you show in the first image:
new_df = df.pivot_table(index='Date',columns='Symbol',value='trend')
I have the following code, where I am binning a Pandas dataframe into given number of bins:
def contibin(data, target, bins=10):
    """Bin each predictor and compute Weight-of-Evidence / IV statistics.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding the predictor columns plus the binary target column.
    target : str
        Name of the binary (0/1) target column.
    bins : int, default 10
        Number of quantile bins used for numeric predictors that have
        more than 10 distinct values.

    Returns
    -------
    pd.DataFrame
        One row per bin per predictor: Range, Total, No. of Good,
        No. of Bad, Dist. of Good, Dist. of Bad, WoE, IV.
    """
    parts = []
    cols = data.columns
    for ivars in cols[~cols.isin([target])]:
        # Quantile-bin numeric columns that have enough distinct values;
        # otherwise group on the raw values directly.
        if (data[ivars].dtype.kind in 'bifc') and (len(np.unique(data[ivars])) > 10):
            binned_x = pd.qcut(data[ivars], bins, duplicates='drop')
            d0 = pd.DataFrame({'x': binned_x, 'y': data[target]})
        else:
            d0 = pd.DataFrame({'x': data[ivars], 'y': data[target]})
        d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
        d.columns = ['Range', 'Total', 'No. of Good']
        d['No. of Bad'] = d['Total'] - d['No. of Good']
        # Clamp counts at 0.5 so empty cells don't send the WoE log to +/-inf.
        d['Dist. of Good'] = np.maximum(d['No. of Good'], 0.5) / d['No. of Good'].sum()
        d['Dist. of Bad'] = np.maximum(d['No. of Bad'], 0.5) / d['No. of Bad'].sum()
        d['WoE'] = np.log(d['Dist. of Good'] / d['Dist. of Bad'])
        d['IV'] = d['WoE'] * (d['Dist. of Good'] - d['Dist. of Bad'])
        parts.append(d)
    # Concatenate once at the end instead of growing a frame per iteration
    # (the original also carried an unused `newDF` and dead commented code).
    woeDF = pd.concat(parts, axis=0) if parts else pd.DataFrame()
    return woeDF
The problem I am facing is when I try to integrate the code on front end using Django, I am not being able to represent woeDF['Range'] in Django the way I am able to see it normally. I tried converting the Pandas.Series to string, but it still isn't giving me what I want. To illustrate what I want to see in my frontend, I am attaching a picture of a sample table which I got by running this code on the Churn modelling Dataset.The image of the table I need
You can turn the Dataframe in an array of objects using DataFrame.itertuples(index=False)
you will then be able to iterate through the dataframe in Jinja by accessing the columns via their names. See the below example in Python:
import pandas as pd

# Small demo frame: five people with their ages.
records = {"name": ["john", "skip", "abu", "harry", "ben"],
           "age": [10, 20, 30, 40, 50]}
df = pd.DataFrame(records)
print(df)

# itertuples(index=False) yields one namedtuple-like object per row,
# so each column is reachable as an attribute by its name.
df_objects = df.itertuples(index=False)
for person in df_objects:
    print("{0}: {1}".format(person.name, person.age))
I have a dataframe with a start date and a finish date for several people :
# input df
# input df: one row per person with a start day and a finish day
df_input = pd.DataFrame(
    [
        ["John", "2018-08-03", "2018-08-05"],
        ["Jack", "2018-08-20", "2018-08-21"],
    ],
    columns=["name", "start_day", "finish_day"],
)
I want to create a date range for every people (I want a pd.Series that contains date range) :
# output df
# output df: same rows plus a stringified list of every date in the range
df_output = pd.DataFrame(
    [
        ["John", "2018-08-03", "2018-08-05", "['2018-08-03', '2018-08-04', '2018-08-05']"],
        ["Jack", "2018-08-20", "2018-08-21", "['2018-08-20', '2018-08-21']"],
    ],
    columns=["name", "start_day", "finish_day", "date_range"],
)
I don't know how to create that range.
Any idea ?
Challenging and interesting one! I think the following snippet gets pretty close to what you are asking, though the shape is a little different from the exact output you requested. Still, the restructured output does contain the range of dates, the name, and the end date.
import pandas as pd
# Sample input: one row per person with a start and an end date.
df_input = pd.DataFrame([["John", "2018-08-03", "2018-08-05"],["Jack", "2018-08-20", "2018-08-21"]], columns=['Name','Start_Date','End_Date'])
df_input['Start_Date'] = pd.to_datetime(df_input['Start_Date'], format='%Y-%m-%d')
df_input['End_Date'] = pd.to_datetime(df_input['End_Date'], format='%Y-%m-%d')
# Index by start date so reindex() below can expand to a daily range.
df_input.set_index('Start_Date', inplace=True)
def reindex_by_date(df_input):
    # Expand this group's index to a daily date range from its earliest
    # start date through End_Date, forward-filling row values into the
    # newly inserted dates.
    # NOTE(review): uses End_Date.min() — fine while each name has one
    # row, but with several rows per name this would truncate the range
    # to the earliest end date; confirm intent.
    dates = pd.date_range(df_input.index.min(), df_input['End_Date'].min())
    return df_input.reindex(dates).ffill()
# One expanded sub-frame per person, concatenated by groupby/apply.
finaldf = df_input.groupby('Name').apply(reindex_by_date)
finaldf