import numpy as np
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
from IPython.display import display, HTML
# dataframe for this project
nba_wins_df = pd.read_csv('nba_wins_data.csv')

# Show the first five rows as an HTML table, then report the dataset size.
display(HTML(nba_wins_df.head().to_html()))
print("printed only the first five observations...")
print("Number of rows in the dataset =", len(nba_wins_df))

import scipy.stats as st

# ---- TODO: make your edits here ----
# Scatter plot of total wins against average relative skill ('o' = markers only).
plt.plot(nba_wins_df['avg_elo_n'], nba_wins_df['total_wins'], 'o')
plt.title('Total Number of Wins by Average Relative Skill', fontsize=20)
plt.xlabel('Average Relative Skill')
plt.ylabel('Total Number of Wins')
plt.show()

# ---- TODO: make your edits here ----
# pearsonr takes the two samples as separate arguments and returns the
# correlation coefficient together with its p-value.
correlation_coefficient, p_value = st.pearsonr(
    nba_wins_df['avg_elo_n'], nba_wins_df['total_wins']
)
print("Correlation between Average Relative Skill and the Total Number of Wins ")
print("Pearson Correlation Coefficient =", round(correlation_coefficient,4))
print("P-value =", round(p_value,4))
I tried importing scipy.stats, but that didn't work.
Related
How to find the phase shift between two sinusoids in Python.
For example, I created two sinusoids with a phase shift of roughly 180 degrees (3.12 radians, judged visually). Can we calculate the phase shift in a Python script if we know only graph_1 and graph_2?
import matplotlib.pyplot as plt
import numpy as np
data = []


def sin(f):
    """Return 199 samples of a sinusoid with amplitude 10 and phase offset ``f``.

    The sample points are the integers 1..199 and each sample advances the
    angle by 0.1 radians, i.e. the result is ``10 * sin(0.1 * x + f)``.

    Args:
        f: Phase offset in radians added to every sample's angle.

    Returns:
        A NumPy array of 199 floats.
    """
    x = np.arange(1, 200)
    return 10 * np.sin(0.1 * x + f)
import matplotlib.pyplot as plt
# Two sinusoids that differ only in their phase offset (3.12 rad vs 0 rad).
graph_1, graph_2 = sin(3.12), sin(0)

# Note: this draws graph_2 against graph_1 (x = graph_1, y = graph_2),
# not both curves against the sample index.
plt.plot(graph_1, graph_2)
plt.show()
Please see the image here
I have created a list of Shannon entropy values for a pair of multiple-sequence-aligned sequences. Plotting the values gives me a simple line plot. I want to plot a smooth curve over the lines. Can anyone suggest the right way to do this? Basically, I want a smooth curve that touches the tip of every bar and goes to zero wherever the y-axis value is zero.
link for image: [1]: https://i.stack.imgur.com/SY3jH.png
#importing the relevant packages
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import make_interp_spline
from Bio import AlignIO
import warnings
warnings.filterwarnings("ignore")
#function to calculate the Shannon Entropy of a MSA
# H = -sum[p(x).log2(px)]
def shannon_entropy(list_input):
    """Return the Shannon entropy, in bits, of the symbols in ``list_input``.

    Implements ``H = -sum(p(x) * log2(p(x)))`` where ``p(x)`` is the relative
    frequency of each distinct symbol. The sum is evaluated in the equivalent
    form ``sum(p(x) * log2(1 / p(x)))`` so that a column holding a single
    distinct symbol yields ``0.0`` rather than ``-0.0``.

    Args:
        list_input: Sized collection of hashable symbols (e.g. the residues
            of one alignment column).

    Returns:
        The entropy as a float; ``0`` when ``list_input`` is empty.
    """
    # Local import keeps the block self-contained without touching the
    # file-level import list.
    from collections import Counter

    total = len(list_input)
    # One pass over the column instead of one ``count`` scan per symbol.
    occurrences = Counter(list_input)
    return sum(
        (n_i / total) * math.log2(total / n_i)
        for n_i in occurrences.values()
    )
#importing the MSA file
#importing the clustal file
# Reads one alignment in "clustal" format.
# NOTE(review): the path is hard-coded and machine-specific; adjust before running elsewhere.
align_clustal1 =AlignIO.read("/home/clustal.aln", "clustal")
def shannon_entropy_list_msa(alignment_file):
    """Return the Shannon entropy of every column of an alignment.

    Args:
        alignment_file: Alignment object that supports ``alignment_file[0]``
            (first row, whose length gives the number of columns) and
            ``alignment_file[:, col]`` (all symbols of one column).
            Presumably a Biopython alignment as returned by ``AlignIO.read``;
            confirm against the caller.

    Returns:
        A list with one entropy value per column, in column order.
    """
    n_columns = len(alignment_file[0])
    return [
        shannon_entropy(list(alignment_file[:, col_no]))
        for col_no in range(n_columns)
    ]
# Per-column entropy values for the alignment loaded above.
clustal_omega1 = shannon_entropy_list_msa(align_clustal1)
# Plotting the data
# Only y-values are passed to plot(), so the x-axis is the list index
# (the column position); 'r' draws the line in red.
plt.figure(figsize=(18,10))
plt.plot(clustal_omega1, 'r')
plt.xlabel('Residue', fontsize=16)
plt.ylabel("Shannon's entropy", fontsize=16)
plt.show()
Edit 1:
Here is what my graph looks like after implementing the "pchip" method. link for the pchip output: https://i.stack.imgur.com/hA3KW.png
pchip monotonic spline output
One approach would be to use PCHIP interpolation, which will give you the monotonic curve with the required behaviour for zero values on the y-axis.
We can't run your exact code example on our machines because you point to a local Clustal file in your 'home' directory.
Here's a simple working example, with link to output image:
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import pchip
# Sample values with runs of zeros between the non-zero groups.
mylist = [10, 0, 0, 0, 0, 9, 9, 0, 0, 0, 11, 11, 11, 0, 0]
mylist_np = np.asarray(mylist)

# x-positions of the samples: 0, 1, ..., len(mylist) - 1.
samples = np.arange(len(mylist))

# Evaluate the PCHIP interpolant on 100 evenly spaced points spanning the samples.
xnew = np.linspace(samples.min(), samples.max(), 100)
interpolant = pchip(samples, mylist_np)
plt.plot(xnew, interpolant(xnew))
plt.show()
I have a program that uses matplotlib and pandas to plot the rolling mean and standard deviation of the price of bitcoin. I'm wondering how I can plot the z-values (the number of standard deviations the price is from the mean).
import pandas as pd
from matplotlib import pyplot as plt
# Raw string literal: in a normal string the "\U" in "C:\Users" starts a
# unicode escape, so the file would fail to parse with a SyntaxError.
btc_1_day = pd.read_csv(r'C:\Users\Oliver\Desktop\data\data1_btcusdt_1day.csv')
df1_btc = pd.DataFrame(btc_1_day)

# Rolling mean and standard deviation of the closing price over 10 rows.
df1_btc['SMA_10'] = df1_btc.price_close.rolling(10).mean()
df1_btc['SMSD_10'] = df1_btc.price_close.rolling(10).std()

# Price, rolling mean and rolling standard deviation on one set of axes.
plt.grid(True)
plt.plot(btc_1_day.price_close)
plt.plot(df1_btc['SMA_10'],label='10 day moving average')
plt.plot(df1_btc['SMSD_10'],label='10 day standard deviation')
plt.legend(loc=2)
plt.show()
Since I don't have your csv file, I'll show you how I would do this using some random data and a pandas dataframe. You can find the z-score using stats.zscore(df['btc']), but that would give you numbers on a very different scale from the ones you're trying to plot in your example.
Plot 1:
Code 1:
import pandas as pd
from matplotlib import pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime
from scipy import stats
# data: a seeded random walk starting near 100
np.random.seed(1234)
numdays = 100
steps = np.random.randint(low=-20, high=20, size=numdays)
df = pd.DataFrame({'btc': (steps.cumsum() + 100).tolist()})

# moving average and the band one rolling standard deviation either side of it
window = df['btc'].rolling(10)
rolling_mean = window.mean()
rolling_std = window.std()
df['SMA_10'] = rolling_mean
df['SMSD_10+sigma'] = rolling_mean + rolling_std
df['SMSD_10-sigma'] = rolling_mean - rolling_std

# z-score of every value relative to the whole series (not the rolling window)
df['ZScore'] = stats.zscore(df['btc'])

# matplotlib: price and z-score on the same axes
plt.figure()
for column in ('btc', 'ZScore'):
    df[column].plot()
plt.show()
In order to illustrate your dataset together with averages and standard deviations for rolling windows, I'd rather use an approach such as:
Plot 2:
Code 2:
import pandas as pd
from matplotlib import pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime
from scipy import stats
# data: a seeded random walk starting near 100
np.random.seed(1234)
numdays = 100
df = pd.DataFrame({'btc': (np.random.randint(low=-20, high=20, size=numdays).cumsum()+100).tolist()})

# moving averages and standard deviations
# The rolling window is built once and reused for every derived column.
window = df['btc'].rolling(10)
rolling_mean = window.mean()
rolling_std = window.std()
df['SMA_10'] = rolling_mean
df['SMSD_10+sigma'] = rolling_mean + rolling_std
df['SMSD_10-sigma'] = rolling_mean - rolling_std

# matplotlib
plt.grid(True)
# 'tab:orange' is the second colour of the default cycle, which this line
# received when the price series was also plotted ahead of it.
plt.plot(df['SMA_10'], label='10 day moving average', color='tab:orange')
# Each band edge gets its own label so the legend has no duplicate entries.
plt.plot(df['SMSD_10+sigma'], label='10 day moving average + 1 std',
         color='green',
         linewidth=0.5)
plt.plot(df['SMSD_10-sigma'], label='10 day moving average - 1 std',
         color='green',
         linewidth=0.5)
# The price is drawn once, last, so it sits on top of the other lines.
plt.plot(df['btc'], label='btc', color='blue', linewidth=1.5)
plt.legend(loc=2)
plt.show()
How can I check, for any given number, how many times a plot reaches that value (crossing it horizontally) and then goes higher?
I have already tried:
import os
import numpy as np
import pylab as plt
import pandas as pd
df = pd.read_csv('C:/Users/Payam/Desktop/tesla-stock-price.csv')

# Row-wise mean of the daily high and low.
df['avg'] = df[['high', 'low']].mean(axis=1)

# Pull the columns of interest out as plain arrays.
e, x, y, z = (df[name].values for name in ('avg', 'date', 'close', 'open'))

f, ax = plt.subplots(figsize=(20, 10))
ax.plot(x, y, 'b')            # closing price, blue line
ax.set_xticks(x[::150])       # keep every 150th date as a tick
plt.xticks(rotation=90)

# Black dots at the constant level 50, one per sample.
n_points = len(x)
ax.plot(np.arange(n_points), np.full(n_points, 50.0), 'k.')
Given some test array
test_array = np.array([1, 6, 8, 65, 4, 2, 5, 8, 9, 6, 4, 6, 9, 0, 8, 6, 4, 32])
you can get all indices where the value is greater than a number, say 5, like so
# Boolean mask marking every element strictly greater than 5.
above_five = test_array > 5
print (np.nonzero(above_five))    # tuple holding the array of matching indices
print (test_array[above_five])    # the matching values themselves
I am producing the probability density function (PDF) of my variable, which is temperature:
and I am going to produce several plots with temperature PDF evolution.
For this reason, I would like to link the color of the plot (rainbow-style) with the value of the peak of the temperature distribution.
In this way, it is easy to associate the average value of the temperature just by looking at the color.
Here's the code I have written for producing plots of the PDF evolution:
from netCDF4 import Dataset
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
import seaborn as sns
from scipy.stats import gaussian_kde
my_file = 'tas/tas.nc'

# The context manager closes the dataset even if reading a variable fails.
with Dataset(my_file, mode='r') as fh:
    lons = (fh.variables['rlon'][:])
    lats = (fh.variables['rlat'][:])
    # Offset by 273; the x-axis label below reports the result as Celsius.
    # NOTE(review): presumably a Kelvin source; the exact offset would be 273.15.
    t = (fh.variables['tas'][:])-273
    t_units = fh.variables['tas'].units

# t is indexed as t[step, lat, lon] throughout this script.
step = len(t[:,0,0])
len_lon = len(t[0,0,:])
len_lat = len(t[0,:,0])
len_tot = len_lat*len_lon
temperature = np.zeros(len_tot)

# The evaluation grid is the same for every step, so build it once.
xs = np.linspace(-80,50,200)

for i in range(step):
    temperature = t[i,:,:]
    temperature_array = temperature.ravel()
    # A scalar bw_method is used directly as the KDE bandwidth factor; this
    # is the public equivalent of overriding covariance_factor and calling
    # the private _compute_covariance().
    density = gaussian_kde(temperature_array, bw_method=0.25)
    plt.title(str(1999+i))
    plt.xlabel("Temperature (C)")
    plt.ylabel("Frequency")
    # NOTE(review): no new figure is created per step, so each saved image
    # also contains the curves of all earlier steps; confirm that is intended.
    plt.plot(xs,density(xs))
    plt.savefig('temp_'+str(i))
Because the question is lacking a working snippet, I had to come up with some sample data. This creates three datasets, where each one is colored with a specific color between blue (cold) and red (hot) according to their maximum value.
import matplotlib.pyplot as plt
import random
from colour import Color
nrange = 20

# Three small datasets, each holding 3 distinct integers below nrange.
mydata1 = random.sample(range(nrange), 3)
mydata2 = random.sample(range(nrange), 3)
mydata3 = random.sample(range(nrange), 3)

# nrange colours graded from blue to red; index = data value.
colorlist = list(Color('blue').range_to(Color('red'), nrange))

# Each dataset is drawn in the colour selected by its maximum value.
for series in (mydata1, mydata2, mydata3):
    plt.plot(series, color='{}'.format(colorlist[max(series)]))
plt.show()