Visually separating bar chart clusters in pandas - python

This is more of a hack that almost works.
#!/usr/bin/env python
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Create original dataframe
df = pd.DataFrame(np.random.rand(5, 4), index=['art', 'mcf', 'mesa', 'perl', 'gcc'],
                  columns=['pol1', 'pol2', 'pol3', 'pol4'])
# Compute the column averages
average = df.mean()
average.name = 'average'
# Append a dummy all-zero row, then the average row
# (DataFrame.append was removed in pandas 2.0, so use pd.concat)
row = pd.DataFrame([{p: 0.0 for p in df.columns}])
df = pd.concat([df, row, average.to_frame().T])
print(df)
df.plot(kind='bar')
plt.show()
and gives:
             pol1      pol2      pol3      pol4
art      0.247309  0.139797  0.673009  0.265708
mcf      0.951582  0.319486  0.447658  0.259821
mesa     0.888686  0.177007  0.845190  0.946728
perl     0.902977  0.863369  0.194451  0.698102
gcc      0.836407  0.700306  0.739659  0.265613
0        0.000000  0.000000  0.000000  0.000000
average  0.765392  0.439993  0.579993  0.487194
and the plot gives a visual separation between the benchmarks and the average.
Is there a way to get rid of the 0 on the x-axis?
It turns out that DataFrame does not allow me to have multiple dummy rows this way.
My solution was to change
row = pd.DataFrame([{p: 0.0 for p in df.columns}])
into
row = pd.Series({p: 0.0 for p in df.columns})
row.name = ""
A Series can be named with an empty string, so every dummy row gets a blank x-axis label.
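A minimal sketch of that approach (my addition, modernized to pd.concat since DataFrame.append is gone): several separator rows can coexist, because each empty-named Series contributes its own blank x-axis label:
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.rand(4, 2), columns=['pol1', 'pol2'],
                  index=['art', 'mcf', 'mesa', 'perl'])
sep = pd.Series({p: 0.0 for p in df.columns}, name="")
avg = df.mean()
avg.name = 'average'
# two separator rows appended as named Series; both show blank tick labels
df = pd.concat([df, sep.to_frame().T, avg.to_frame().T, sep.to_frame().T])
df.plot(kind='bar')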

Still pretty hacky, but it works:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Create original dataframe
df = pd.DataFrame(np.random.rand(5, 4), index=['art', 'mcf', 'mesa', 'perl', 'gcc'],
                  columns=['pol1', 'pol2', 'pol3', 'pol4'])
# Compute the column averages
average = df.mean()
average.name = 'average'
# Append a dummy row of zeros; its auto-assigned index label is 0
row = pd.DataFrame([{p: 0.0 for p in df.columns}])
df = pd.concat([df, row])
# Replace the falsy 0 label with '' so it disappears from the x-axis
df = df.reindex(np.where(df.index, df.index, ''))
# Append the average row (DataFrame.append was removed in pandas 2.0)
df = pd.concat([df, average.to_frame().T])
print(df)
df.plot(kind='bar')
plt.show()

Related

Matching geographic coordinates between two data frames

I have two data frames that have Longitude and Latitude columns.
DF1 and DF2:
DF1 = pd.DataFrame([[19.827658,-20.372238,8614], [19.825407,-20.362608,7412], [19.081514,-17.134456,8121]], columns=['Longitude1', 'Latitude1','Echo_top_height'])
DF2 = pd.DataFrame([[19.083727, -17.151207, 285.319994], [19.169403, -17.154144, 284.349994], [19.081514,-17.154456, 285.349994]], columns=['Longitude2', 'Latitude2','BT'])
I need to match each longitude/latitude pair in DF1 with a longitude/latitude pair in DF2, and where they match, add the corresponding value from DF2's BT column to DF1.
I used the code from here and managed to check if there is a match:
import numpy as np
from sklearn.metrics.pairwise import haversine_distances

threshold = 5000        # meters
earth_radius = 6371000  # meters

DF1['nearby'] = (
    # get the distance between all points of each DF
    haversine_distances(
        # note that you need to convert to radians with *np.pi/180
        X=DF1[['Latitude1', 'Longitude1']].to_numpy() * np.pi / 180,
        Y=DF2[['Latitude2', 'Longitude2']].to_numpy() * np.pi / 180)
    * earth_radius < threshold).any(axis=1).astype(int)
So the result I need would look like this:
Longitude1 Latitude1 Echo_top_height BT
19.82 -20.37 8614 290.345
19.82 -20.36 7412 289.235
and so on...
You can use BallTree:
# Update: in newer versions of sklearn, DistanceMetric lives in sklearn.metrics
from sklearn.neighbors import BallTree
from sklearn.metrics import DistanceMetric
# older versions: from sklearn.neighbors import BallTree, DistanceMetric

# build the tree on DF2's coordinates (the haversine metric expects radians)
coords = np.radians(df2[['Latitude2', 'Longitude2']])
dist = DistanceMetric.get_metric('haversine')
tree = BallTree(coords, metric=dist)

# query with DF1's coordinates for each row's single nearest neighbour
coords = np.radians(df1[['Latitude1', 'Longitude1']])
distances, indices = tree.query(coords, k=1)
df1['BT'] = df2['BT'].iloc[indices.flatten()].values
df1['Distance'] = distances.flatten()
Output:
   Longitude1  Latitude1  Echo_top_height      BT     Distance
0     19.8277   -20.3722             8614  284.35    0.0572097
1     19.8254   -20.3626             7412  284.35    0.0570377
2     19.0815   -17.1345             8121  285.32  0.000294681
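One caveat (my note, not from the original answer): with the haversine metric, BallTree returns distances in radians, so the Distance column above is in Earth radii. To compare against the 5000 m threshold from the question, scale by the Earth's radius, e.g.:
import numpy as np

earth_radius = 6371000  # meters, as in the question
df1['Distance_m'] = df1['Distance'] * earth_radius
# drop matches farther away than the 5 km threshold
df1.loc[df1['Distance_m'] > 5000, 'BT'] = np.nan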
It looks like you are comparing the dataframes by index, so you can use join and drop the unnecessary rows and columns:
DF3 = DF1.join(DF2[['BT']])
DF3 = DF3[DF3['nearby'].eq(1)].drop('nearby', axis=1)
DF3
Full reproducible code:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import haversine_distances

DF1 = pd.DataFrame([[19.827658,-20.372238,8614], [19.825407,-20.362608,7412], [19.081514,-17.134456,8121]], columns=['Longitude1', 'Latitude1','Echo_top_height'])
DF2 = pd.DataFrame([[19.083727, -17.151207, 285.319994], [19.169403, -17.154144, 284.349994], [19.081514,-17.154456, 285.349994]], columns=['Longitude2', 'Latitude2','BT'])

threshold = 5000        # meters
earth_radius = 6371000  # meters

DF1['nearby'] = (
    # get the distance between all points of each DF
    haversine_distances(
        # note that you need to convert to radians with *np.pi/180
        X=DF1[['Latitude1','Longitude1']].to_numpy()*np.pi/180,
        Y=DF2[['Latitude2','Longitude2']].to_numpy()*np.pi/180)
    * earth_radius < threshold).any(axis=1).astype(int)

DF3 = DF1.join(DF2[['BT']])
DF3 = DF3[DF3['nearby'].eq(1)].drop('nearby', axis=1)
DF3
Output:
Longitude1 Latitude1 Echo_top_height BT
2 19.081514 -17.134456 8121 285.349994
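Note that join matches rows purely by index, which works here because the nearby DF1 row happens to share its index with the matching BT row. A hedged alternative sketch (my addition, reusing the names from the reproducible code above) that explicitly picks the nearest DF2 row for each DF1 row:
# distance matrix in meters between every DF1/DF2 pair
dist_m = haversine_distances(
    DF1[['Latitude1', 'Longitude1']].to_numpy() * np.pi / 180,
    DF2[['Latitude2', 'Longitude2']].to_numpy() * np.pi / 180) * earth_radius
nearest = dist_m.argmin(axis=1)  # index of the closest DF2 row per DF1 row
DF1['BT'] = DF2['BT'].to_numpy()[nearest]
DF1.loc[dist_m.min(axis=1) > threshold, 'BT'] = np.nan  # no match within 5 km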

Dataframe with Monte Carlo Simulation calculation next row Problem

I want to build up a DataFrame from scratch where each value is calculated from the value before it, for a barrier option. I know that I can use a Monte Carlo simulation to solve it, but it just won't work the way I want it to.
The formula is:
Value in row before * np.exp((r-sigma**2/2)*T/TradingDays+sigma*np.sqrt(T/TradingDays)*z)
The first code I wrote just calculates the first column. I know that I need a second loop but can't really manage it.
The result should be that for each simulation it calculates a new value using the value before, for 500 days, meaning S_1 should evolve into S_500, with a total of 1000 simulations. (I need to generate new columns based on the previous value using the formula.)
Similar to this:
So for the 1st simulation 500 days, for the 2nd simulation 500 days, and so on...
import numpy as np
import pandas as pd
from scipy.stats import norm
import random as rd

S_0 = 42
T = 2
r = 0.02
sigma = 0.20
TradingDays = 500

rows = []
for i in range(TradingDays):
    z = norm.ppf(rd.random())
    S_1 = S_0 * np.exp((r - sigma**2/2)*T/TradingDays + sigma*np.sqrt(T/TradingDays)*z)
    rows.append({'S_0': S_0, 'S_1': S_1})

# collect the rows first; DataFrame.append was removed in pandas 2.0
df = pd.DataFrame(rows).round({'S_0': 2, 'S_1': 2})
df.index += 1
df.index.name = 'Simulation'
print(df)
I found another possible code here, and it does solve the problem, but only for one row; the next row is just the same calculation: Generate a Dataframe that follow a mathematical function for each column / row
If I just replace it with my formula, I get the same problem.
Replacing:
exp(r - q * sqrt(sigma))*T+ (np.random.randn(nrows) * sqrt(deltaT)))
with:
exp((r-sigma**2/2)*T/nrows+sigma*np.sqrt(T/nrows)*z))
import numpy as np
import pandas as pd
from scipy.stats import norm
import random as rd

S_0 = 42
T = 2
r = 0.02
sigma = 0.20
TradingDays = 50
Simulation = 100

df = pd.DataFrame({'s0': [S_0] * Simulation})
for i in range(1, TradingDays):
    z = norm.ppf(rd.random())
    df[f's{i}'] = df.iloc[:, -1] * np.exp((r-sigma**2/2)*T/TradingDays + sigma*np.sqrt(T/TradingDays)*z)
print(df)
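(Note, my observation: z here is drawn only once per column, so every simulation in that column receives the same shock; that is why the next row looks like the same calculation. Drawing one value per row, e.g. z = norm.ppf(np.random.rand(Simulation)), would give each simulation an independent path.)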
I would rather work with this last code and solve the problem based on it.
How about just overwriting the value of S_0 by the new value of S_1 while you loop and keeping all simulations in a list?
Like this:
import numpy as np
import pandas as pd
import random
from scipy.stats import norm

S_0 = 42
T = 2
r = 0.02
sigma = 0.20
trading_days = 50

output = []
for i in range(trading_days):
    z = norm.ppf(random.random())
    value = S_0 * np.exp((r - sigma**2 / 2) * T / trading_days + sigma * np.sqrt(T / trading_days) * z)
    output.append(value)
    S_0 = value

df = pd.DataFrame({'simulation': output})
Perhaps I'm missing something, but I don't see the need for a second loop.
Also, this eliminates calling df.append() in a loop, which should be avoided. (See here)
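For the full goal stated in the question (1000 simulations of 500 days each), here is a hedged sketch (my addition, not from the original answers) of a fully vectorized version: one array of shocks and a cumulative sum of log-returns produce every path with no Python loop at all.
import numpy as np
import pandas as pd

S_0, T, r, sigma = 42, 2, 0.02, 0.20
trading_days, n_sims = 500, 1000
dt = T / trading_days

# one standard-normal shock per (simulation, day)
z = np.random.standard_normal((n_sims, trading_days))
# cumulative sum of per-step log-returns, exponentiated, gives every path at once
paths = S_0 * np.exp(np.cumsum((r - sigma**2 / 2) * dt + sigma * np.sqrt(dt) * z, axis=1))
df = pd.DataFrame(paths, columns=[f'day_{d+1}' for d in range(trading_days)])
Each row of df is then one complete simulated path, which also sidesteps the row-by-row DataFrame construction entirely.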
Solution based on the answer of bartaelterman, thank you very much!
import numpy as np
import pandas as pd
from scipy.stats import norm
import random as rd
import math

# Divide the list into chunks to later append it to the dataframe in the right order
def chunk_list(lst, chunk_size):
    for i in range(0, len(lst), chunk_size):
        yield lst[i:i + chunk_size]

def blackscholes():
    # note: np.sqrt(T) rather than the hard-coded np.sqrt(2), which only matched because T = 2
    d1 = (math.log(S_0/K) + (r + sigma**2/2)*T) / (sigma*np.sqrt(T))
    d2 = (math.log(S_0/K) + (r - sigma**2/2)*T) / (sigma*np.sqrt(T))
    preis_call_option = S_0*norm.cdf(d1) - K*np.exp(-r*T)*norm.cdf(d2)
    return preis_call_option

K = 40
S_0 = 42
T = 2
r = 0.02
sigma = 0.2
U = 38
simulation = 10000
trading_days = 500
trading_days = trading_days - 1

# create 2 lists for the first and second loop
loop_simulation = []
loop_trading_days = []

# the first loop calculates the first column into a list
for j in range(0, simulation):
    print("Progressbar_1_2 {:2.2%}".format(j / simulation), end="\n\r")
    NORM_S_INV = norm.ppf(rd.random())
    S_Tag = S_0*np.exp((r-sigma**2/2)*T/trading_days + sigma*np.sqrt(T/trading_days)*NORM_S_INV)
    S_Tag_new = S_Tag
    loop_simulation.append(S_Tag)
    # the second loop calculates the rows for the remaining columns into a list
    for i in range(0, trading_days):
        NORM_S_INV = norm.ppf(rd.random())
        S_Tag = S_Tag_new*np.exp((r-sigma**2/2)*T/trading_days + sigma*np.sqrt(T/trading_days)*NORM_S_INV)
        loop_trading_days.append(S_Tag)
        S_Tag_new = S_Tag

# the values from the second loop are split into chunks of trading_days values per simulation
loop_trading_days_chunked = list(chunk_list(loop_trading_days, trading_days))
# first dataframe with just the first result from the first loop for each simulation
df1 = pd.DataFrame({'S_Tag 1': loop_simulation})
# the chunked list from the second loop becomes a second dataframe
df2 = pd.DataFrame(loop_trading_days_chunked)
# merge both dataframes into one
df3 = pd.concat([df1, df2], axis=1)
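df3 then has one row per simulation: the 'S_Tag 1' value from the first loop plus the 499 remaining daily values from the second loop, i.e. 500 trading days in total.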

Plot distribution of differences between two pandas dataframe columns

I have a pandas dataframe which has columns A and B.
I just want to plot a distribution graph of the percentage differences between columns A and B.
A B
1 1.051990e+10 1.051990e+04
2 1.051990e+10 1.051990e+04
5 4.841800e+10 1.200000e+10
8 2.327700e+10 2.716000e+10
9 1.204900e+10 2.100000e+08
The distribution graph should show, for example, how many records have a 10% difference, how many have a 20% difference, and so on.
I tried as follows:
def percCal(x,y):
    return (x-y)*100/x
df['perc'] = df.apply(lambda x: percCal(df['A'], df['B']), axis=1)
This is not working. As I'm a newbie, please help.
You don't need the lambda operation.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df1 = pd.DataFrame(np.random.randint(1, 10, (20, 2)), columns=['A', 'B'])

def percCal(x, y):
    return (x-y)*100/x

Apply the function and plot:
df1['diff'] = percCal(df1['A'], df1['B'])
df1['diff'].plot(kind='density')
Alternatively, just manipulate the columns directly:
df1['diff'] = (df1['A'] - df1['B']) * 100 / df1['A']
df['perc'] = (df['A'] - df['B']) *100/df['A']
def percCal(x, y):
    return (x-y)*100/x
df['perc'] = df.apply(lambda x: percCal(x['A'], x['B']), axis=1)
Change df in the lambda to x. In this case you are passing the function the row data x, meaning percCal receives what is in each row of the data frame; when you use df you are actually passing the whole data frame, and the function returns a data frame rather than a value. But please check your code: if x in the function can be 0, that is a problem (division by zero).
Think this is what you are looking for:
import pandas as pd
import seaborn as sns

# Dummy df
data = [
    [1.051990e+10, 1.051990e+04],
    [1.051990e+10, 1.051990e+04],
    [4.841800e+10, 1.200000e+10],
    [2.327700e+10, 2.716000e+10],
    [1.204900e+10, 2.100000e+08],
]
cols = ['A', 'B']
df2 = pd.DataFrame(data, columns=cols)

# Solution
df2['pct_diff'] = (df2['A'] - df2['B']) / df2['A']
# distplot is deprecated in recent seaborn releases; histplot is the replacement
sns.histplot(df2['pct_diff'], kde=True)

python: increase performance of finding the best timeshift for a correlation between each X column and y

I have a dataframe X with several columns and a dataframe y with only one column (a series). The rows in X represent timesteps, and I want to find the interval by which I need to shift each column of X to obtain the highest correlation with y. I wrote a function that loops over all columns and then over all timesteps, correlating the shifted X column with y. If the R² is better than before, I store the timestep. However, with over 300 columns this routine really takes some time, and I need to increase the performance. Is there a nice way to simplify this code?
(In the example I used the iris data set which is of course not a timeseries...)
from sklearn import datasets
import pandas as pd
import numpy as np
from copy import deepcopy

def get_best_shift(dfX, dfy, ti=60, maxt=1440):
    """
    Determines the best correlation for the last maxt minutes based on a
    timestep of ti minutes. Creates a dataframe with the shifted variables
    based on the best match (strongest correlation).
    """
    df_out = deepcopy(dfX)
    for xcol in dfX:
        bestshift = 0
        Rmax = 0
        for ishift in range(0, int(maxt / ti)):
            # align x[t] with y[t + ishift]
            xvals = dfX[xcol].iloc[0:(dfX.shape[0] - ishift)].values
            yvals = np.array([val[0] for val in dfy.iloc[ishift:dfy.shape[0]].values])
            # drop pairs where either value is NaN
            selector = np.array([str(val) != "nan" for val in (xvals * yvals)], dtype=bool)
            xvals = xvals[selector]
            yvals = yvals[selector]
            R = np.corrcoef(xvals, yvals)[0][1]
            if R ** 2 > Rmax:
                Rmax = R ** 2
                bestshift = ishift
        df_out[xcol] = list(np.zeros(bestshift)) + list(dfX[xcol].iloc[0:dfX.shape[0] - bestshift].values)
        df_out = df_out.rename(columns={xcol: ''.join([str(xcol), '_t-', str(bestshift)])})
    return df_out
iris = datasets.load_iris()
X = pd.DataFrame(iris.data)
y = pd.DataFrame(iris.target)
df = get_best_shift(X,y)
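A hedged sketch of a leaner equivalent (my addition): pandas' Series.shift plus Series.corr handle the slicing and NaN alignment that the inner loop does by hand, which removes the per-shift list comprehensions.
import numpy as np
import pandas as pd

def get_best_shift_fast(dfX: pd.DataFrame, dfy: pd.DataFrame, ti=60, maxt=1440):
    """Same idea as get_best_shift: pick, per column, the shift with max R^2."""
    y = dfy.iloc[:, 0]
    out = {}
    for xcol in dfX:
        # R^2 of y against each shifted copy of the column; corr skips NaN pairs
        r2 = {}
        for s in range(int(maxt / ti)):
            r = dfX[xcol].shift(s).corr(y)
            r2[s] = 0.0 if np.isnan(r) else r ** 2
        best = max(r2, key=r2.get)
        # shift down by the best lag and zero-fill the start, as in the original
        out[f'{xcol}_t-{best}'] = dfX[xcol].shift(best).fillna(0)
    return pd.DataFrame(out)
This still loops over shifts; in principle an FFT-based cross-correlation could compute all lags at once if more speed is needed.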

How to adjust dataframe rows to columns

import pandas as pd
# pandas.io.data was removed from pandas; the web module now lives in pandas-datareader
import pandas_datareader.data as web
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

matplotlib.style.use('ggplot')

stocks = {'xom': '2014-01-01', 'dvn': '2013-01-01', 'aapl': '2013-01-01'}
L = dict()
for stock, date in stocks.items():
    price = web.get_data_yahoo(stock, date)['Adj Close']
    change = price.diff().cumsum()
    perChange = change / price.iloc[0]
    L[stock] = perChange

df = pd.concat(L, axis=1)
df2 = df.describe()
How do I format df2 so that the columns are min, max, std, etc., and the rows are the stock symbols?
Use the transpose of the dataframe: DataFrame.T
df2 = df.describe().T  # the equivalent of df.describe().transpose()
print(df2)
      count      mean       std       min       25%       50%       75%       max
aapl    665  0.195720  0.331271 -0.284546 -0.089219  0.110605  0.501857  0.783157
dvn     665  0.202538  0.143291 -0.246586  0.104409  0.175463  0.286709  0.548577
xom     413 -0.049164  0.062285 -0.273573 -0.096234 -0.045035 -0.001124  0.060982
Alternatively, you can add:
df2 = df2.transpose()
