How to adjust dataframe rows to columns - python

import pandas as pd
import pandas.io.data as web
from pandas import Series, DataFrame
import matplotlib
import matplotlib.pyplot as plt
from numpy.random import randn
import numpy as np
matplotlib.style.use('ggplot')
stocks = {'xom': '2014-01-01', 'dvn': '2013-01-01', 'aapl': '2013-01-01'}
L = dict()
for stock, date in stocks.items():
price = web.get_data_yahoo(stock, date)['Adj Close']
change = price.diff().cumsum()
perChange = change / price.iloc[0]
L[stock] = perChange
df = pd.concat(L, axis=1)
df2 = df.describe()
How do I format df2 so that the columns are min, max, std, etc., and the rows are the stock symbols?

Use the transpose of the DataFrame: DataFrame.T
df2 = df.describe().T # this is the equivalent of df.describe().transpose()
print df2
count mean std min 25% 50% 75% max
aapl 665 0.195720 0.331271 -0.284546 -0.089219 0.110605 0.501857 0.783157
dvn 665 0.202538 0.143291 -0.246586 0.104409 0.175463 0.286709 0.548577
xom 413 -0.049164 0.062285 -0.273573 -0.096234 -0.045035 -0.001124 0.060982

You want to add:
df2 = df2.transpose()

Related

Plotting and calculating mid-price and weighted mid-price

I have a problem with my code - Somehow it keeps giving me a keyerror: "None of [Float]...."
I need to calculate: P_mid = (P_offer + P_bid) / 2
and
volume-weighted mid-price = VWMP = (P_bid * Size_offer + P_offer * Size_bid) / (Size_offer + Size_bid)
So far my code looks like this:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
nasdaq_1 = pd.read_csv (r'Path to csv')
np.array(nasdaq_1)
#print(nasdaq_1)
mid_price = (np.array(nasdaq_1.Offer_Price) + np.array(nasdaq_1.Bid_Price))/2
#print(mid_price)
weightet_mid_price = (np.array(nasdaq_1.Offer_Price)*np.array(nasdaq_1.Bid_Size) + np.array(nasdaq_1.Bid_Price)*np.array(nasdaq_1.Offer_Size))/(np.array(nasdaq_1.Offer_Size)+np.array(nasdaq_1.Bid_Size))
print(weightet_mid_price)
nasdaq_1[mid_price].plot()
plt.figure(figsize=(10,10))
plt.plot(nasdaq_1.index, nasdaq_1[mid_price])
plt.xlabel("Datetime")
plt.ylabel("$ price")
plt.title("Mid-price between bid and offer prices")
All help is highly appreciated!!
CSV data sample:
|DateTime,Time,Exchange,Symbol,Bid_Price,Bid_Size,Offer_Price,Offer_Size
|2017-01-03 09:30:00,93000766290000.0,T,PFE,32.55,8.0,32.76,8.0
|2017-01-03 09:30:01,93001992610000.0,T,PFE,32.67,8.0,32.7,31.0
|2017-01-03 09:30:02,93002933311000.0,T,PFE,32.67,7.0,32.7,2.0
|2017-01-03 09:30:03,93003882764000.0,T,PFE,32.7,1.0,32.76,17.0
|2017-01-03 09:30:04,93004943608000.0,T,PFE,32.7,1.0,32.73,13.0
|2017-01-03 09:30:05,93005991747000.0,T,PFE,32.69,2.0,32.74,41.0
|2017-01-03 09:30:06,93006506218000.0,T,PFE,32.67,5.0,32.74,41.0
Image shows the data I am using. Screenshot below.
You do not need to cast the data frame columns into numpy arrays for your calculations.
The error you see is due to the line nasdaq_1[mid_price].plot().
df[x] expects x to be either a column name or a list/array of columns. You are passing a numpy array with entries which cannot be found.
Try the following:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
s = io.StringIO("""DateTime,Time,Exchange,Symbol,Bid_Price,Bid_Size,Offer_Price,Offer_Size
2017-01-03 09:30:00,93000766290000.0,T,PFE,32.55,8.0,32.76,8.0
2017-01-03 09:30:01,93001992610000.0,T,PFE,32.67,8.0,32.7,31.0
2017-01-03 09:30:02,93002933311000.0,T,PFE,32.67,7.0,32.7,2.0
2017-01-03 09:30:03,93003882764000.0,T,PFE,32.7,1.0,32.76,17.0
2017-01-03 09:30:04,93004943608000.0,T,PFE,32.7,1.0,32.73,13.0
2017-01-03 09:30:05,93005991747000.0,T,PFE,32.69,2.0,32.74,41.0
2017-01-03 09:30:06,93006506218000.0,T,PFE,32.67,5.0,32.74,41.0
""")
nasdaq_1 = pd.read_csv(s, parse_dates=['DateTime'])
mid_price = (nasdaq_1["Offer_Price"] + nasdaq_1["Bid_Price"])/2
weightet_mid_price = (
(nasdaq_1["Offer_Price"]*nasdaq_1["Bid_Size"] + nasdaq_1["Bid_Price"]*nasdaq_1["Offer_Size"])
/ (nasdaq_1["Offer_Size"] + nasdaq_1["Bid_Size"])
)
fig, ax = plt.subplots(figsize=(10,10))
ax.plot(nasdaq_1["DateTime"], mid_price)
ax.set_xlabel("Datetime")
ax.set_ylabel("$ price")
ax.set_title("Mid-price between bid and offer prices")
fig.autofmt_xdate()
Edit:
Parse the DateTime column to make it datetime values instead of strings.

Plot distribution of differences between two pandas dataframe columns

I have a pandas dataframe which has columns A & B
I just want to plot a distribution graph of the percentage of differences between column A & B
A B
1 1.051990e+10 1.051990e+04
2 1.051990e+10 1.051990e+04
5 4.841800e+10 1.200000e+10
8 2.327700e+10 2.716000e+10
9 1.204900e+10 2.100000e+08
The distribution graph should show how many records have a 10% difference, how many have a 20% difference, and so on.
I tried as follows
def percCal(x, y):
    """Return the difference between x and y as a percentage of x.

    Computes ``(x - y) * 100 / x``. With plain numbers a zero ``x`` raises
    ZeroDivisionError.
    """
    return (x - y) * 100 / x
df['perc'] = df.apply(lambda x: percCal(df['A'], df['B']), axis=1)
This is not working. As I'm a newbie, please help.
You don't need the lambda operation.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df1 = pd.DataFrame(np.random.randint(1, 10, (20, 2)), columns=['A', 'B'])
def percCal(x, y):
    """Return the difference between x and y as a percentage of x.

    Computes ``(x - y) * 100 / x``. The arithmetic is applied as-is, so the
    arguments may be plain numbers or whole DataFrame columns (the result is
    then computed element by element). With plain numbers a zero ``x``
    raises ZeroDivisionError.
    """
    return (x - y) * 100 / x
Alternatively, just manipulate the columns directly:
df1['diff'] = (df1['A'] - df1['B']) * 100 / df1['A']
Apply the function and plot:
df1['diff'] = percCal(df1['A'], df1['B'])
df1['diff'].plot(kind='density')
df['perc'] = (df['A'] - df['B']) *100/df['A']
def percCal(x, y):
    """Return the difference between x and y as a percentage of x.

    Computes ``(x - y) * 100 / x``. When called once per row with scalar
    values, a zero ``x`` raises ZeroDivisionError.
    """
    return (x - y) * 100 / x
df['perc'] = df.apply(lambda x: percCal(x['A'], x['B']), axis=1)
Change df in the lambda to x. In this case you are giving the function the data x, which means you are giving percCal what you have in the current row of the data frame. When you use df you are actually passing the whole data frame, so the function returns a data frame, not a single value. But please check your code: if x in the function can be 0, that is a problem.
Think this is what you are looking for:
# Dummy df
data = [
[1.051990e+10, 1.051990e+04],
[1.051990e+10, 1.051990e+04],
[4.841800e+10, 1.200000e+10],
[2.327700e+10, 2.716000e+10],
[1.204900e+10, 2.100000e+08],
]
cols = ['A', 'B']
df2 = pd.DataFrame(data, columns=cols)
# Solution
import seaborn as sns
df2['pct_diff'] = (df2['A'] - df2['B']) / df2['A']
sns.distplot(df2['pct_diff']);

python scipy spearman correlations

I am trying to obtain the column names from the dataframe (df) and associate them to the resulting array produced by the spearmanr correlation function. I need to associate both the column names (a-j) back to the correlation value (spearman) and the p-values (spearman_pvalue). Is there an intuitive way to perform this task?
from scipy.stats import pearsonr,spearmanr
import numpy as np
import pandas as pd
df=pd.DataFrame(np.random.randint(0,100,size= (100,10)),columns=list('abcdefghij'))
def binary(row):
    """Return 1 when ``row`` is at least 50, otherwise 0."""
    return 1 if row >= 50 else 0
df['target']=df.a.apply(binary)
spearman,spearman_pvalue=spearmanr(df.drop(['target'],axis=1),df.target)
print(spearman)
print(spearman_pvalue)
It seems you need:
from scipy.stats import spearmanr
df=pd.DataFrame(np.random.randint(0,100,size= (100,10)),columns=list('abcdefghij'))
#print (df)
#faster for binary df
df['target'] = (df['a'] >= 50).astype(int)
#print (df)
spearman,spearman_pvalue=spearmanr(df.drop(['target'],axis=1),df.target)
df1 = pd.DataFrame(spearman.reshape(-1, 11), columns=df.columns)
#print (df1)
df2 = pd.DataFrame(spearman_pvalue.reshape(-1, 11), columns=df.columns)
#print (df2)
### Kyle, we can assign the index back to the column names for the total matrix:
df2=df2.set_index(df.columns)
df1=df1.set_index(df.columns)
Or:
df1 = pd.DataFrame(spearman.reshape(-1, 11),
columns=df.columns,
index=df.columns)
df2 = pd.DataFrame(spearman_pvalue.reshape(-1, 11),
columns=df.columns,
index=df.columns)

Pandas rolling standard deviation

Is anyone else having trouble with the new rolling.std() in pandas? The deprecated method was rolling_std(). The new method runs fine but produces a constant number that does not roll with the time series.
Sample code is below. If you trade stocks, you may recognize the formula for Bollinger bands. The output I get from rolling.std() tracks the stock day by day and is obviously not rolling.
This is in pandas 0.19.1. Any help would be appreciated.
import datetime
import pandas as pd
import pandas_datareader.data as web
start = datetime.datetime(2012,1,1)
end = datetime.datetime(2012,12,31)
g = web.DataReader(['AAPL'], 'yahoo', start, end)
stocks = g['Close']
stocks['Date'] = pd.to_datetime(stocks.index)
stocks['AAPL_LO'] = stocks['AAPL'] - stocks['AAPL'].rolling(20).std() * 2
stocks['AAPL_HI'] = stocks['AAPL'] + stocks['AAPL'].rolling(20).std() * 2
stocks.dropna(axis=0, how='any', inplace=True)
import pandas as pd
from pandas_datareader import data as pdr
import numpy as np
import datetime
end = datetime.date.today()
begin=end-pd.DateOffset(365*10)
st=begin.strftime('%Y-%m-%d')
ed=end.strftime('%Y-%m-%d')
data = pdr.get_data_yahoo("AAPL",st,ed)
def bollinger_strat(data, window, no_of_std):
    """Add Bollinger band columns to ``data`` in place.

    Parameters
    ----------
    data : DataFrame with a 'Close' column.
    window : rolling window passed to ``rolling()``.
    no_of_std : multiple of the rolling standard deviation added to and
        subtracted from the rolling mean.

    The 'Bollinger High' and 'Bollinger Low' columns are written onto
    ``data`` itself; nothing is returned.
    """
    close = data['Close']
    rolling_mean = close.rolling(window).mean()
    rolling_std = close.rolling(window).std()
    # Write to the frame that was passed in: the earlier version assigned to
    # `df`, a name this snippet never defines.
    data['Bollinger High'] = rolling_mean + (rolling_std * no_of_std)
    data['Bollinger Low'] = rolling_mean - (rolling_std * no_of_std)
bollinger_strat(data,20,2)

Visually separating bar chart clusters in pandas

This is more of a hack that almost works.
#!/usr/bin/env python
from pandas import *
import matplotlib.pyplot as plt
from numpy import zeros
# Create original dataframe
df = DataFrame(np.random.rand(5,4), index=['art','mcf','mesa','perl','gcc'],
columns=['pol1','pol2','pol3','pol4'])
# Estimate average
average = df.mean()
average.name = 'average'
# Append dummy row with zeros and then average
row = DataFrame([dict({p:0.0 for p in df.columns}), ])
df = df.append(row)
df = df.append(average)
print df
df.plot(kind='bar')
plt.show()
and gives:
pol1 pol2 pol3 pol4
art 0.247309 0.139797 0.673009 0.265708
mcf 0.951582 0.319486 0.447658 0.259821
mesa 0.888686 0.177007 0.845190 0.946728
perl 0.902977 0.863369 0.194451 0.698102
gcc 0.836407 0.700306 0.739659 0.265613
0 0.000000 0.000000 0.000000 0.000000
average 0.765392 0.439993 0.579993 0.487194
and
It gives the visual separation between benchmarks and average.
Is there a way to get rid of the 0 at the x-axis??
It turns out that DataFrame does not allow me to have multiple dummy rows this way.
My solution was to change
row = pd.DataFrame([dict({p:0.0 for p in df.columns}), ])
into
row = pd.Series([dict({p:0.0 for p in df.columns}), ])
row.name = ""
Series can be named with empty string.
Still pretty hacky, but it works:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Create original dataframe
df = pd.DataFrame(np.random.rand(5,4), index=['art','mcf','mesa','perl','gcc'],
columns=['pol1','pol2','pol3','pol4'])
# Estimate average
average = df.mean()
average.name = 'average'
# Append dummy row with zeros and then average
row = pd.DataFrame([dict({p:0.0 for p in df.columns}), ])
df = df.append(row)
df = df.reindex(np.where(df.index, df.index, ''))
df = df.append(average)
print df
df.plot(kind='bar')
plt.show()

Categories

Resources