I have the following code, where I am binning a Pandas dataframe into given number of bins:
def contibin(data, target, bins=10):
#Empty Dataframe
newDF,woeDF = pd.DataFrame(), pd.DataFrame()
#Extract Column Names
cols = data.columns
for ivars in cols[~cols.isin([target])]:
if (data[ivars].dtype.kind in 'bifc') and (len(np.unique(data[ivars]))>10):
binned_x = pd.qcut(data[ivars], bins, duplicates='drop')
d0 = pd.DataFrame({'x': binned_x, 'y': data[target]})
#print(d0)
else:
d0 = pd.DataFrame({'x': data[ivars], 'y': data[target]})
d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
d.columns = ['Range', 'Total', 'No. of Good']
d['No. of Bad'] = d['Total'] - d['No. of Good']
d['Dist. of Good'] = np.maximum(d['No. of Good'], 0.5) / d['No. of Good'].sum()
d['Dist. of Bad'] = np.maximum(d['No. of Bad'], 0.5) / d['No. of Bad'].sum()
d['WoE'] = np.log(d['Dist. of Good']/d['Dist. of Bad'])
d['IV'] = d['WoE'] * (d['Dist. of Good'] - d['Dist. of Bad'])
#temp =pd.DataFrame({"Variable" : [ivars], "IV" : [d['IV'].sum()]}, columns = ["Variable", "IV"])
#newDF=pd.concat([newDF,temp], axis=0)
woeDF=pd.concat([woeDF,d], axis=0)
return woeDF
The problem I am facing is when I try to integrate the code on front end using Django, I am not being able to represent woeDF['Range'] in Django the way I am able to see it normally. I tried converting the Pandas.Series to string, but it still isn't giving me what I want. To illustrate what I want to see in my frontend, I am attaching a picture of a sample table which I got by running this code on the Churn modelling Dataset.The image of the table I need
You can turn the Dataframe in an array of objects using DataFrame.itertuples(index=False)
you will then be able to iterate through the dataframe in Jinja by accessing the columns via their names. See the below example in Python:
import pandas as pd
columns = {"name": ["john", "skip", "abu", "harry", "ben"],
"age": [10, 20, 30, 40, 50]}
df = pd.DataFrame(columns)
print(df)
df_objects = df.itertuples(index=False)
for person in df_objects:
print("{0}: {1}".format(person.name, person.age))
Related
Here's some data:
import numpy as np
import random
import pandas as pd
random.seed(365)
duration = np.random.exponential(scale = 5, size = 100).round(1)
numbers = np.random.normal(loc = 50, scale = 2, size = 100).round(2)
group = np.random.choice(["A", "B", "C", "D"], size = len(duration))
gender = np.random.choice(["Male", "Female"], p = [0.7, 0.3], size = len(duration))
provider = np.random.choice(["2Degrees", "Skinny", "Vodafone", "Spark"], p = [0.25, 0.25, 0.25, 0.25], size = len(duration))
df = pd.DataFrame(
{"Duration":duration,
"Numbers":numbers,
"Group":group,
"Gender":gender,
"Provider":provider}
)
I attempting to concatenate multiple pandas.styler objects together into one figure.
I have all the "pieces" of the figure as individual pandas.styler objects. These I created as data-frames and "styled" them to have their own individual captions.
Here is the code I used to generate the first two "pieces" of this figure (much of the other code I used to generate the other pieces is very similar):
#Gets the number of rows and columns
pd.DataFrame({
"Number of Rows":df.shape[0],
"Number of Columns":df.shape[1]
}, index = [""])
#Gets the info on the data set's categorical columns
data = []
for column in df:
if df[column].dtype == "object":
freq = df[column].value_counts(ascending = False)
data.append({
"Column Name":column,
"Unique Values":len(df[column].unique()),
"Missing Values":df[column].isna().sum(),
"Most Frequently Occurring":freq.index[0],
"Occurrences":freq[0],
"% of Total":freq[0] / freq.sum()*100
})
pd.DataFrame(data).style.format(precision = 1).set_caption("Categorical Columns").set_table_styles([{
"selector": "caption",
"props": [
("font-size", "16px")
]
}])
The figure I attempting to create looks something like this (this I made in an Excel spreadsheet):
See that the pandas.style objects (apart from the first data-frame which states the number of rows and columns in the data set) are stacked on top of each with enough padding between them
Ideally, this entire figure would be exportable to an Excel spreadsheet.
I pretty much have all the code I need, its just getting this final part together that I need help with. Any ideas how to tackle this?
I would like to run two separate loops on df. In the first step, I would like to filter the df by sex (male, female) and year (yrs 2008:2013) and save these dataframes in a list. In the second step, I would like to do some kind of analysis to each element of the list and name the output based on which sex & year combination it came from.
I realize I can do this in one step, but my actual code and significantly more complex and throws an error, which stops the loop and it never advances to the second stage. consequently, I need to break it up into two steps. This is what I have so far. I would like to ask for help on the second stage. How do I run the make_graph function on each element of the list and name it according to sex&year combination?
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df_toy=pd.DataFrame([])
df_toy['value'] = np.random.randint(low=1, high=1000, size=100000)
df_toy['age'] = np.random.choice(range(0, 92), 100000)
df_toy['sex'] = np.random.choice([0, 1], 100000)
df_toy['year'] = np.random.randint(low=2008, high=2013, size=100000)
def format_data(df_toy, SEX, YEAR):
df_toy = df_toy[(df_toy["sex"] == SEX) & (df_toy["year"] == YEAR) ]
return df_toy
def make_graph(df_):
plt.scatter(age, value)
return df_toy
dfs = []
for SEX in range(0,3):
for YEAR in range(2008,2014):
dfs.append(format_data(df_toy, SEX, YEAR))
for i in range(len(dfs)):
df_=dfs[i]
make_graph(df_)
df_YEAR_SEX=df_
IIUC you could filter plot and save the data like this. Since I don't know the actual data I don't know why you need to do it in 2 steps, here is how you could do it with a few changes.
# Input data
df_toy = pd.DataFrame({
'value' : np.random.randint(low=1, high=1000, size=100000),
'age' : np.random.choice(range(0, 92), 100000),
'sex' : np.random.choice([0, 1], 100000),
'year' : np.random.randint(low=2008, high=2013, size=100000)
})
def filter_and_plot(df, SEX, YEAR):
# filter the df for sex and year
tmp = df[(df["sex"] == SEX) & (df["year"] == YEAR)]
# create a new plot for each filtered df and plot it
fig, ax = plt.subplots()
ax.scatter(x=tmp['age'], y=tmp['value'], s=0.4)
# return the filtered df
return tmp
result_dict = {}
for SEX in range(0,2):
for YEAR in range(2008, 2013):
# use a f-string to build a key in a dictionary which includes sex and year
# keys look like this: "df_1_2009", the value to each key is the filtered dataframe
result_dict[f"df_{SEX}_{YEAR}"] = filter_and_plot(df_toy, SEX, YEAR)
import yfinance as yf
import pandas as pd
import talib
code = '2800'
para_dict = {
'sample_period_list': [200],
'fastperiod_list': [12, 16],
'slowperiod_list': [26, 30],
'signalperiod_list': [8, 12],
'higher_percentile_list': [0.8],
'profit_target': [0.04],
'stop_loss': [-0.04]
}
start_date = '2020-01-01'
end_date = '2022-10-10'
df_dict = {}
df = yf.Ticker(code + '.HK').history(start=start_date, end=end_date)
df = df[df['Volume'] > 0]
df = df[['Open', 'High', 'Low', 'Close']]
# df['pnl_percentage'] = df['Open'].pct_change()
df = df.reset_index()
for fastperiod in para_dict['fastperiod_list']:
for slowperiod in para_dict['slowperiod_list']:
for signalperiod in para_dict['signalperiod_list']:
macd_key = str(fastperiod) + '_' + str(slowperiod) + '_' + str(signalperiod)
df['macd'], df['macdsignal'], df['macdhist'] = talib.MACD(df['Close'], fastperiod=fastperiod, slowperiod=slowperiod, signalperiod=signalperiod)
df_dict[macd_key] = df
print(df_dict)
I cant get the right dataframe for different MACD periods, instead I generate the same dataframe using different MACD periods by below codes? WHY
I cant get the right dataframe for different MACD periods, instead I generate the same dataframe using different MACD periods by below codes? WHY
The reason is because you're pointing to the same dataframe , if you change one they all change so in your example they will be all equal to the last df.
you can read more in it in those questions :
Modifying one dataframe appears to change another
Why can pandas DataFrames change each other?
As a solution to your case , you need to use a copy of the dataframe not the actual dataframe :
df_dict[macd_key] = df.copy()
#instead of df_dict[macd_key] = df
it will solve your issue
I am creating my own dataset for a Uni project. I've used the merge function often and it always worked perfectly. This time I get x and y suffixes which I can not understand. I know pandas does this because -> The rows in the two data frames that match on the specified columns are extracted, and joined together. If there is more than one match, all possible matches contribute one row each. But I really don't get why. I assume it has to do with a warning I got earlier:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
unique_website_user['PurchaseOnWebsite'] = [apply_website_user() for x in unique_website_user.index]
I tried to merge the dataframes on the column 'CustomerID' where they obviously match. I really don't get the error.
Here is my code:
I first want to remove duplicate rows where the relevant columns are CustomerID and WebsiteID
Then I want to apply a function which returns true or false as a string randomly. Up to this point the resulting dataframe looks fine. The only warning I get is the one I described earlier.
And lastly I want to merge them and it results in a dataframe way larger than the original one. I really don't understand that.
import numpy as np
import pandas as pd
from numpy.random import choice
df = pd.DataFrame()
df['AdID'] = np.random.randint(1,1000001, size=100000)
df['CustomerID'] = np.random.randint(1,1001, size=len(df))
df["Datetime"] = choice(pd.date_range('2015-01-01', '2020-12-31'), len(df))
def check_weekday(date):
res = len(pd.bdate_range(date, date))
if res == 0:
result = "Weekend"
else:
result = "Working Day"
return result
df["Weekend"] = df["Datetime"].apply(check_weekday)
def apply_age():
age = choice([16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36],
p=[.00009, .00159, .02908, .06829, .09102, .10043, .10609, .10072, .09223, .08018, .06836, .05552,
.04549,.03864, .03009, .02439, .01939, .01586, .01280, .01069, .00905])
return age
def apply_income_class():
income_class = choice([np.random.randint(50,501),np.random.randint(502,1001), np.random.randint(1002,1501),np.random.randint(1502,2001)],
p=[.442, .387, .148, .023])
return income_class
def apply_gender():
gender = choice(['male', 'female'], p=[.537, .463])
return gender
unique_customers = df[['CustomerID']].drop_duplicates(keep="first")
unique_customers['Age'] = [apply_age() for x in unique_customers.index]
unique_customers['Gender'] = [apply_gender() for x in unique_customers.index]
unique_customers['Monthly Income'] = [apply_income_class() for x in unique_customers.index]
unique_customers['Spending Score'] = [np.random.randint(1,101) for x in unique_customers.index]
df = df.merge(unique_customers, on=['CustomerID'], how='left')
df['WebsiteID'] = np.random.randint(1,31, len(df))
df['OfferID'] = np.random.randint(1,2001, len(df))
df['BrandID'] = np.random.randint(1,10, len(df))
unique_offers = df[['OfferID']].drop_duplicates(keep="first")
print(len(unique_offers))
unique_offers['CategoryID'] = [np.random.randint(1,501) for x in unique_offers.index]
unique_offers['NPS'] = [np.random.randint(1, 101) for x in unique_offers.index]
df = df.merge(unique_offers, on=['OfferID'], how='left')
def apply_website_user():
purchase = np.random.choice(['True', 'False'])
return purchase
unique_website_user = df.drop_duplicates(subset=['CustomerID', 'WebsiteID'], keep="first").copy()
unique_website_user['PurchaseOnWebsite'] = [apply_website_user() for x in unique_website_user.index]
print(unique_website_user.head())
df = df.merge(unique_website_user[['CustomerID','PurchaseOnWebsite']], on='CustomerID', how='left')
#df['PurchaseOnWebsite']= df.groupby(['CustomerID', 'WebsiteID']).apply(apply_website_user)
print(df.head)
#Erstellen der csv-Datei
#df.to_csv(r'/Users/alina/Desktop/trainingsdaten.csv', sep=',', #index=False)
It's better to paste the data, rather than provide images, so this is just guidance as I can't test it. You have a couple issues and I don't think they are related.
copy or slice warning. You might be able to get rid of this two ways. One is reconfigure the line:
unique_website_user['PurchaseOnWebsite'] = [apply_website_user() for x in unique_website_user.index]
to the format it is suggesting. The other, more simple way that might work is to use .copy() on the line before it. You are dropping duplicates and then modifying it, and pandas is just warning that you are modifying a slice or view of the original. try this:
unique_website_user = df.drop_duplicates(subset=['CustomerID', 'WebsiteID'], keep="first").copy()
If you just want to merge back that one column and reduce number of columns, try this:
df = df.merge(unique_website_user[['CustomerID','PurchaseOnWebsite']], on='CustomerID', how='left')
Another alternative to this would be to use groupby() and apply your True/False function in and apply method. Something like:
df.groupby(['CustomerID']).apply(yourfunctionhere)
This gets rid of creating and merging dataframes. If you post all the code actual dataframe, we can be more specific.
UPDATE:
Saw your comment that you found your own answer. Also, this is way faster than your call to the weekday function.
df["Weekend"] = df['Datetime'].apply(lambda x: 'Weekend' if (x.weekday() == 5 or x.weekday() == 6) else 'Working Day')
I have a pandas dataframe which has the following columns ( pk1, pk2 type, qty_6, qty_7 ). I have type as predicted_90, override_90, predicted_50, override 50. Now Based upon combination of pk1 and pk2 If for type predicted_50, predicted_90 contains some value for override_50, override_90 apart from NaN, I want to update my dataframe columns predicted_50, predicted_90 with override_50 and override_90 respectively. Also, I want to capture this change in a boolean column called qty_6_overridden, qty_7_overridden. Also, I want to capture the difference between the both in a column qty_6_dev, qty_7_dev.
qty_6_dev = qty_6 override - qty_6 predicted
Example dataframe :
data=[
['B01FV0FBX4','2019-01-13','predicted_90',2207.931,2217.841],
['B01FV0FBX4','2019-01-13','predicted_50',1561.033,1521.567],
['B01FV0FBX4','2019-01-13','override_90',1973.000,np.NaN],
['B01FV0FBX4','2019-01-13','override_50',1233.000,np.NaN],
['B01FV0FBX4','2019-01-06','override_50',np.NaN,1233.000],
['B01FV0FBX4','2019-01-06','predicted_50',1210.129,1213.803],
['B01FV0FBX4','2019-01-06','override_90',np.NaN,1973.000],
['B01FV0FBX4','2019-01-06','predicted_90',1911.205,1921.594]
]
df = pd.DataFrame(data,columns=['pk1','pk2', 'type', 'qty_6', 'qty_7'])
Expected output :
data=[
['B01FV0FBX4','2019-01-13','predicted_90',1973.000,2217.841,-234.931,0,True,False],
['B01FV0FBX4','2019-01-13','predicted_50',1233.000,1521.567,-328.033,0,True,False],
['B01FV0FBX4','2019-01-13','override_90',1973.000,np.NaN,0,0,False,False],
['B01FV0FBX4','2019-01-13','override_50',1233.000,np.NaN,0,0,False,False],
['B01FV0FBX4','2019-01-06','override_50',np.NaN,1233.000,0,0,False,False],
['B01FV0FBX4','2019-01-06','predicted_50',1210.129,1213.000,0,-0.803,False,True],
['B01FV0FBX4','2019-01-06','override_90',np.NaN,1973.000,0,0,False,False],
['B01FV0FBX4','2019-01-06','predicted_90',1911.205,1973.000,0,51.406,False,True]
]
df = pd.DataFrame(data,columns=['pk1','pk2', 'type', 'qty_6', 'qty_7','qty_6_dev','qty_7_dev', 'qty_6_overridden','qty_7_overridden'])
In the example you can see, the quantities with override exchange quantitties with predicted and we get the corresponding columns 'qty_6_dev','qty_7_dev', 'qty_6_overridden','qty_7_overridden'.
I was able to write a solution. It works but it looks horrible and very difficult to understand for others.
import pandas as pd
import numpy as np
import math
data=[
['B01FV0FBX4','2019-01-13','predicted_90',2207.931,2217.841],
['B01FV0FBX4','2019-01-13','predicted_50',1561.033,1521.567],
['B01FV0FBX4','2019-01-13','override_90',1973.000,np.NaN],
['B01FV0FBX4','2019-01-13','override_50',1233.000,np.NaN],
['B01FV0FBX4','2019-01-06','override_50',np.NaN,1233.000],
['B01FV0FBX4','2019-01-06','predicted_50',1210.129,1213.803],
['B01FV0FBX4','2019-01-06','override_90',np.NaN,1973.000],
['B01FV0FBX4','2019-01-06','predicted_90',1911.205,1921.594]
]
df = pd.DataFrame(data,columns=['pk1','pk2', 'type', 'qty_6', 'qty_7'])
override_map = {
"predicted_50" : "override_50",
"predicted_90" : "override_90"
}
def transform_df(df):
transformed_df = pd.DataFrame()
for index, row in df.iterrows():
row_type = row['type']
row_pk1 = row['pk1']
row_pk2 = row['pk2']
if row_type in override_map.keys():
override_type = override_map.get(row_type)
else:
for i in range(6,8):
qty_dev_col = 'qty_'+str(i)+'_dev'
qty_override_col = 'qty_'+str(i)+'_overridden'
row[qty_dev_col] = 0
row[qty_override_col] = False
transformed_df=transformed_df.append(row, ignore_index=True)
continue
corr_df = df.loc[(df.type == override_type)
& (df.pk1 == row_pk1)
& (df.pk2 == row_pk2)]
for i in range(6,8):
qty_col = 'qty_'+str(i)
qty_dev_col = 'qty_'+str(i)+'_dev'
qty_override_col = 'qty_'+str(i)+'_overridden'
if not (math.isnan(corr_df[qty_col])) and (corr_df[qty_col].values[0] != row[qty_col]):
row[qty_dev_col] = corr_df[qty_col].values[0] - row[qty_col]
row[qty_col] = corr_df[qty_col].values[0]
row[qty_override_col] = True
else:
row[qty_dev_col] = 0
row[qty_override_col] = False
transformed_df=transformed_df.append(row, ignore_index=True)
return transformed_df
x1 = transform_df(df)
Is there a better way to do this using lambdas or something ? Also this takes like forever to run over a bigger dataframe.