I am writing a program to determine the expectation value E(X), as well as E(X^2) and E(X - X_avg)^2. I have written a program like so:
# program : expectation value
import csv
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import seaborn as sns
import matplotlib.pyplot as plt
import logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
# Step 1: read csv
probabilityCSV = open('probability.csv')
df = pd.read_csv(probabilityCSV)
logging.debug(df['X'])
logging.debug(df['P'])
logging.debug(type(df['X']))
logging.debug(type(df['P']))
# Step 2: convert dataframe to ndarray
# https://stackoverflow.com/questions/13187778/convert-pandas-dataframe-to-numpy-array
X = df['X'].to_numpy()
p = df['P'].to_numpy()
logging.debug(f'X={X}')
logging.debug(f'p={p}')
# Step 3: calculate E(X)
# https://www.statology.org/expected-value-in-python/
def expected_value(values, weights):
    return np.sum((np.dot(values,weights))) / np.sum(weights)
logging.debug('Step 3: calculate E(X)')
expectation = expected_value(X,p)
logging.debug(f'E(X)={expectation}')
# Step 4: calculate E(X^2)
logging.debug('Step 4: calculate E(X^2)')
# add normalize='index'
contingency_pct = pd.crosstab(df['Observed'],df['Expected'],normalize='index')
logging.debug(f'contingency_pct:{contingency_pct}')
# Step 5: calculate E(X - X_avg)^2
logging.debug('Step 5: calculate E(X - X_avg)^2')
The dataset that I am using is:
X,P
8,1/8
12,1/6
16,3/8
20,1/4
24,1/12
Expected:
E(X) = 16
E(X^2) = 276
E(X - X_avg)^2 = 20
Actual:
Traceback (most recent call last):
File "/Users/evangertis/development/PythonAutomation/Statistics/expectation.py", line 35, in <module>
expectation = expected_value(X,p)
File "/Users/evangertis/development/PythonAutomation/Statistics/expectation.py", line 32, in expected_value
return np.sum((np.dot(values,weights))) / np.sum(weights)
File "<__array_function__ internals>", line 5, in sum
File "/usr/local/lib/python3.9/site-packages/numpy/core/fromnumeric.py", line 2259, in sum
return _wrapreduction(a, np.add, 'sum', axis, dtype, out, keepdims=keepdims,
File "/usr/local/lib/python3.9/site-packages/numpy/core/fromnumeric.py", line 86, in _wrapreduction
return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
TypeError: cannot perform reduce with flexible type
Your problem is in step 1, so I took the liberty of rewriting it:
# Step 1.1: read csv in the right way
probabilityCSV = open('probability.csv')
df = pd.read_csv(probabilityCSV)
df["P"] = df.P.str.split("/", expand=True)[0].astype(int) / df.P.str.split("/", expand=True)[1].astype(int)
df:
X P
0 8 0.125000
1 12 0.166667
2 16 0.375000
3 20 0.250000
4 24 0.083333
The second step is right:
# Step 2: convert dataframe to ndarray
X = df['X'].to_numpy()
p = df['P'].to_numpy()
X, p:
(array([ 8, 12, 16, 20, 24]),
array([0.125 , 0.16666667, 0.375 , 0.25 , 0.08333333]))
After this you correctly defined the function:
def expected_value(values, weights):
    return np.sum((np.dot(values,weights))) / np.sum(weights)
You can use this function to compute E(X), E(X^2) and E(X - X_avg)^2. In particular:
expected_value(X,p)
# returns E(X) = 16.0
expected_value(X**2, p)
# returns E(X^2) = 276.0
expected_value((X - expected_value(X, p))**2, p)
# returns E(X - X_avg)^2 = 20.0
# note: subtract the weighted mean E(X), not X.mean(); here X.mean() happens
# to equal E(X) = 16, so both give 20.0, but in general they differ
The error occurred because your df["P"] column was a string column: values like "1/8" are read as text, and NumPy cannot sum over a flexible (string) dtype, hence the TypeError.
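As a sanity check, the three results are consistent with the identity E(X - X_avg)^2 = E(X^2) - E(X)^2 = 276 - 16^2 = 20. Putting the pieces together, a minimal corrected version of the whole script might look like this (a sketch, assuming probability.csv contains exactly the X and P columns shown above):
import pandas as pd
import numpy as np

# read the csv and convert fraction strings like "1/8" to floats
df = pd.read_csv('probability.csv')
num_den = df['P'].str.split('/', expand=True).astype(int)
df['P'] = num_den[0] / num_den[1]

X = df['X'].to_numpy()
p = df['P'].to_numpy()

def expected_value(values, weights):
    # weighted average; dividing by the weight sum guards against
    # probabilities that do not sum exactly to 1
    return np.dot(values, weights) / np.sum(weights)

e_x = expected_value(X, p)             # E(X) = 16.0
e_x2 = expected_value(X**2, p)         # E(X^2) = 276.0
var = expected_value((X - e_x)**2, p)  # E(X - X_avg)^2 = 20.0
print(e_x, e_x2, var)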
I have this code to estimate a Tobit regression model in Python. The code is organized in three parts: data definition, building the estimator, and estimation.
import numpy as np
import pandas as pd
from scipy.optimize import minimize
# define the dependent variable and independent variables
X = data.iloc[:, 1:]
y = data.iloc[:, 0]
# Add a column of ones to the independent variables for the constant term
X = np.c_[np.ones(X.shape[0]), X]
# Define the likelihood function for the Tobit model
def likelihood(params, y, X, lower, upper):
    beta = params[:-1]
    sigma = params[-1]
    mu = X @ beta
    prob = (1 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-0.5 * ((y - mu) / sigma)**2))
    prob[y < lower] = 0
    prob[y > upper] = 0
    return -np.log(prob).sum()
# Set the initial values for the parameters and the lower and upper bounds for censoring
params_init = np.random.normal(size=X.shape[1] + 1)
bounds = [(None, None) for i in range(X.shape[1])] + [(1e-10, None)]
# Perform the MLE estimation
res = minimize(likelihood, params_init, args=(y, X, 0, 100), bounds=bounds, method='L-BFGS-B')
# Extract the estimated parameters and their standard errors
params = res.x
stderr = np.sqrt(np.diag(res.hess_inv))
# Print the results
print(f'Coefficients: {params[:-1]}')
print(f'Standard Errors: {stderr[:-1]}')
print(f'Sigma: {params[-1]:.4f}')
Why am I getting this error message?
Thank you.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-245-5f39f416cc07> in <module>
31 # Extract the estimated parameters and their standard errors
32 params = res.x
---> 33 stderr = np.sqrt(np.diag(res.hess_inv))
34
35 # Print the results
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/overrides.py in diag(*args, **kwargs)
/opt/anaconda3/lib/python3.8/site-packages/numpy/lib/twodim_base.py in diag(v, k)
307 return diagonal(v, k)
308 else:
--> 309 raise ValueError("Input must be 1- or 2-d.")
310
311
ValueError: Input must be 1- or 2-d.
EDIT: If you want to look at the kind of data I'm dealing with, you can simulate it using these lines of code I just wrote:
data = pd.DataFrame()
# Append 'interview probabilities' for individuals with and without disabilities
interview_prob_disabled = np.random.normal(38.63, 28.72, 619)
interview_prob_enabled = np.random.normal(44.27, 28.19, 542)
interview_prob = np.append(interview_prob_disabled, interview_prob_enabled)
# Clip the variable to [0, 100] and round it, so it is neither negative, above 100, nor fractional
interview_prob = np.clip(interview_prob, 0, 100)
interview_prob = np.round(interview_prob)
# Add the 'interview probabilities' variable to the dataframe
data['Interview Probabilities'] = interview_prob
# Add other variables such as age, gender, employment status, education, etc.
data['Age'] = np.random.randint(18, 65, size=len(interview_prob))
data['Gender'] = np.random.choice(['Male', 'Female'], size=len(interview_prob))
data['Employment Status'] = np.random.choice(['Employed', 'Unemployed', 'Retired'], size=len(interview_prob))
data['Education Level'] = np.random.choice(['High School', 'College', 'Vocational', 'Graduate School'], size=len(interview_prob))
# Add a 'disability status' variable as a dummy
data['Disability Status'] = np.append(np.repeat('Disabled', 619), np.repeat('Non-disabled', 542))
# Categorical variables
data['Gender'] = data['Gender'].map({'Male': 0, 'Female': 1})
data['Employment Status'] = data['Employment Status'].map({'Employed': 0, 'Unemployed': 1, 'Retired': 2})
data['Education Level'] = data['Education Level'].map({'High School': 0, 'College': 1, 'Vocational': 2, 'Graduate School': 3})
data['Disability Status'] = data['Disability Status'].map({'Disabled': 1, 'Non-disabled': 0})
# Print the df
data
The problem is that your solver, L-BFGS-B, returns a LbfgsInvHessProduct object (a linear operator) from .hess_inv, rather than the NumPy array a solver like BFGS would give.
One solution to your problem is to use res.hess_inv.todense() instead.
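For instance, a minimal sketch of the fix, keeping the rest of your estimation code unchanged:
# under L-BFGS-B, res.hess_inv is a LbfgsInvHessProduct (a linear operator);
# densify it before taking the diagonal
stderr = np.sqrt(np.diag(res.hess_inv.todense()))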
I am using deepgraph in Python to compute correlation coefficients for large matrices. The output is a multi-index data frame:
s  t
0  1   -0.006066
   2    0.094063
   3   -0.025529
   4    0.074080
   5    0.035490
   6    0.005221
   7    0.032064
I want to add a column with corresponding p-values.
The original code with input example is obtained from https://deepgraph.readthedocs.io/en/latest/tutorials/pairwise_correlations.html
The code surrounded by hashtags is my approach to get p-values.
I want to merge the separate edge lists later on.
#!/bin/python
import os
from multiprocessing import Pool
import numpy as np
import pandas as pd
import deepgraph as dg
from numpy.random import RandomState
from scipy.stats import pearsonr, spearmanr
prng = RandomState(0)
n_features = int(5e3)
n_samples = int(1e2)
X = prng.randint(100, size=(n_features, n_samples)).astype(np.float64)
# Spearman's correlation coefficients
X = X.argsort(axis=1).argsort(axis=1)
# whiten variables for fast parallel computation later on
X = (X - X.mean(axis=1, keepdims=True)) / X.std(axis=1, keepdims=True)
# save in binary format
np.save('samples', X)
# parameters (change these to control RAM usage)
step_size = 1e5
n_processes = 100
# load samples as memory-map
X = np.load('samples.npy', mmap_mode='r')
# create node table that stores references to the mem-mapped samples
v = pd.DataFrame({'index': range(X.shape[0])})
# connector function to compute pairwise pearson correlations
def corr(index_s, index_t):
    features_s = X[index_s]
    features_t = X[index_t]
    corr = np.einsum('ij,ij->i', features_s, features_t) / n_samples
    return corr
#################################
def p_Val(index_s, index_t):
    features_s = X[index_s]
    features_t = X[index_t]
    p = spearmanr(features_s, features_t)[1]
    return p
#################################
# index array for parallelization
pos_array = np.array(np.linspace(0, n_features*(n_features-1)//2, n_processes), dtype=int)
# parallel computation
def create_ei(i):
    from_pos = pos_array[i]
    to_pos = pos_array[i+1]
    # initiate DeepGraph
    g = dg.DeepGraph(v)
    # create edges
    g.create_edges(connectors=corr, step_size=step_size, from_pos=from_pos, to_pos=to_pos)
    # store edge table
    g.e.to_pickle('tmp/correlations/{}_corr.pickle'.format(str(i).zfill(3)))
    #################################
    gp = dg.DeepGraph(v)
    # create edges
    gp.create_edges(connectors=p_Val, step_size=step_size, from_pos=from_pos, to_pos=to_pos)
    # store edge table
    gp.e.to_pickle('tmp/correlations/{}_pval.pickle'.format(str(i).zfill(3)))
    #################################
#################################
# computation
if __name__ == '__main__':
    os.makedirs("tmp/correlations", exist_ok=True)
    indices = np.arange(0, n_processes - 1)
    p = Pool()
    for _ in p.imap_unordered(create_ei, indices):
        pass
    # store correlation values
    files = os.listdir('tmp/correlations/')
    files.sort()
    for f in files:
        et = pd.read_pickle('tmp/correlations/{}'.format(f))
        print(et)
I get the following error:
Traceback (most recent call last):
File "/lib/python3.9/multiprocessing/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "pairwise_corr.py", line 64, in create_ei
gp.create_edges(connectors=p_Val, step_size=step_size, from_pos=from_pos, to_pos=to_pos)
File "/lib/python3.9/site-packages/deepgraph/deepgraph.py", line 616, in create_edges
self.e = _matrix_iterator(
File "/lib/python3.9/site-packages/deepgraph/deepgraph.py", line 4875, in _matrix_iterator
ei = _select_and_return(vi, sources_k, targets_k, ft_feature,
File "/lib/python3.9/site-packages/deepgraph/deepgraph.py", line 5339, in _select_and_return
ei = pd.DataFrame({col: data[col] for col in coldtypedic})
File "/lib/python3.9/site-packages/pandas/core/frame.py", line 614, in __init__
mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
File "/lib/python3.9/site-packages/pandas/core/internals/construction.py", line 464, in dict_to_mgr
return arrays_to_mgr(
File "/lib/python3.9/site-packages/pandas/core/internals/construction.py", line 124, in arrays_to_mgr
arrays = _homogenize(arrays, index, dtype)
File "/lib/python3.9/site-packages/pandas/core/internals/construction.py", line 589, in _homogenize
val = sanitize_array(
File "/lib/python3.9/site-packages/pandas/core/construction.py", line 576, in sanitize_array
subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d)
File "/lib/python3.9/site-packages/pandas/core/construction.py", line 627, in _sanitize_ndim
raise ValueError("Data must be 1-dimensional")
ValueError: Data must be 1-dimensional
Any suggestions?
Thanks!
I was able to solve it. The problem is that spearmanr, called on the two 2-D feature blocks, returns a 2-D matrix of p-values rather than the 1-D array (one value per edge) that deepgraph expects, and pandas cannot build an edge column from 2-D data. Computing the p-value row by row fixes it (and since the samples were already rank-transformed above, pearsonr on the ranks is equivalent to Spearman's correlation):
def p_Val(index_s, index_t):
    features_s = X[index_s]
    features_t = X[index_t]
    p = [pearsonr(features_s[i, :], features_t[i, :])[1] for i in range(len(features_s))]
    p_val = np.asarray(p)
    return p_val
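To merge the separate edge lists afterwards (a sketch, assuming the chunk file naming used above and that each chunk's corr and pval pickles share the same s/t index):
import os
import pandas as pd

# pair up the *_corr.pickle and *_pval.pickle files chunk by chunk
corr_files = sorted(f for f in os.listdir('tmp/correlations')
                    if f.endswith('_corr.pickle'))
chunks = []
for cf in corr_files:
    pf = cf.replace('_corr.pickle', '_pval.pickle')
    corr_et = pd.read_pickle('tmp/correlations/' + cf)
    pval_et = pd.read_pickle('tmp/correlations/' + pf)
    # both tables are indexed by the node pair (s, t),
    # so a column-wise concat aligns correlation and p-value
    chunks.append(pd.concat([corr_et, pval_et], axis=1))
edges = pd.concat(chunks)
print(edges)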
I am trying to do some nls regression using R within Python. I am getting stuck with an RRuntimeError; I am at a point well outside my expertise and have struggled for a few days to get this to work, so I would appreciate some help.
This is my csv of data:
http://www.sharecsv.com/s/4cdd4f832b606d6616260f9dc0eedf38/ratedata.csv
This is my code:
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
pandas2ri.activate()
dfData = pd.read_csv('C:\\Users\\nick\\Desktop\\ratedata.csv')
rdf = pandas2ri.py2ri(dfData)
a = 0.5
b = 1.1
count = rdf.rx(True, 'Trials')
rates = rdf.rx(True, 'Successes')
base = importr('base', robject_translations={'with': '_with'})
stats = importr('stats', robject_translations={'format_perc': '_format_perc'})
my_formula = stats.as_formula('rates ~ 1-(1/(10^(a * count ^ (b-1))))')
d = ro.ListVector({'a': a, 'b': b})
fit = stats.nls(my_formula, weights=count, start=d)
Everything runs fine apart from:
fit = stats.nls(my_formula, weights=count, start=d)
I am getting the following traceback:
---------------------------------------------------------------------------
RRuntimeError Traceback (most recent call last)
<ipython-input-12-3f7fcd7d7851> in <module>()
6 d = ro.ListVector({'a': a, 'b': b})
7
----> 8 fit = stats.nls(my_formula, weights=count, start=d)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\rpy2\robjects\functions.py in __call__(self, *args, **kwargs)
176 v = kwargs.pop(k)
177 kwargs[r_k] = v
--> 178 return super(SignatureTranslatedFunction, self).__call__(*args, **kwargs)
179
180 pattern_link = re.compile(r'\\link\{(.+?)\}')
~\AppData\Local\Continuum\anaconda3\lib\site-packages\rpy2\robjects\functions.py in __call__(self, *args, **kwargs)
104 for k, v in kwargs.items():
105 new_kwargs[k] = conversion.py2ri(v)
--> 106 res = super(Function, self).__call__(*new_args, **new_kwargs)
107 res = conversion.ri2ro(res)
108 return res
RRuntimeError: Error in (function (formula, data = parent.frame(), start, control = nls.control(), :
parameters without starting value in 'data': rates, count
I would be eternally thankful if anyone can see where I am going wrong, or can offer advice. All I want is the two numbers from that formula back in Python so I can use those to construct some confidence intervals.
Thank you
Consider incorporating all your formula variables into a single dataframe and using the data argument. The as_formula call looks in the R environment, but rates and count live in the Python scope; hence, keep all items in the same object. Then run your nls with either the pandas dataframe or the R dataframe:
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
base = importr('base', robject_translations={'with': '_with'})
stats = importr('stats', robject_translations={'format_perc': '_format_perc'})
a = 0.05
b = 1.1
d = ro.ListVector({'a': a, 'b': b})
dfData = pd.read_csv('Input.csv')
dfData['count'] = dfData['Trials'].astype('float')
dfData['rates'] = dfData['Successes'] / dfData['Trials']
dfData['a'] = a
dfData['b'] = b
pandas2ri.activate()
rdf = pandas2ri.py2ri(dfData)
my_formula = stats.as_formula('rates ~ 1-(1/(10^(a * count ^ (b-1))))')
# WITH PANDAS DATAFRAME
fit = stats.nls(formula=my_formula, data=dfData, weights=dfData['count'], start=d)
print(fit)
# WITH R DATAFRAME
fit = stats.nls(formula=my_formula, data=rdf, weights=rdf.rx(True, 'count'), start=d)
print(fit)
Alternatively, you can use robjects.globalenv and omit the data argument:
ro.globalenv['rates'] = dfData['rates']
ro.globalenv['count'] = dfData['count']
ro.globalenv['a'] = dfData['a']
ro.globalenv['b'] = dfData['b']
fit = stats.nls(formula=my_formula, weights=dfData['count'], start=d)
print(fit)
# Nonlinear regression model
# model: rates ~ 1 - (1/(10^(a * count^(b - 1))))
# data: parent.frame()
# a b
# 0.01043 1.24943
# weighted residual sum-of-squares: 14.37
# Number of iterations to convergence: 6
# Achieved convergence tolerance: 9.793e-07
# To return parameters
num = fit.rx('m')[0].names.index('getPars')
obj = fit.rx('m')[0][num]()
print(obj[0])
# 0.010425686223717435
print(obj[1])
# 1.2494303314553932
Equivalently in R:
dfData <- read.csv('Input.csv')
a <- .05
b <- 1.1
d <- list(a=a, b=b)
dfData$count <- dfData$Trials
dfData$rates <- dfData$Successes / dfData$Trials
dfData$a <- a
dfData$b <- b
my_formula <- stats::as.formula("rates ~ 1-(1/(10^(a * count ^ (b-1))))")
fit <- stats::nls(my_formula, data=dfData, weights=dfData$count, start=d)
print(fit)
# Nonlinear regression model
# model: rates ~ 1 - (1/(10^(a * count^(b - 1))))
# data: dfData
# a b
# 0.01043 1.24943
# weighted residual sum-of-squares: 14.37
# Number of iterations to convergence: 6
# Achieved convergence tolerance: 9.793e-07
# To return parameters
fit$m$getPars()['a']
# 0.01042569
fit$m$getPars()['b']
# 1.24943
I am practicing with SciPy and I encountered an error when trying to use fmin_slsqp. I set up a problem in which I want to maximize an objective function, U, given a set of constraints.
I have two control variables, x[0,t] and x[1,t] and, as you can see, they are indexed by t (time periods). The objective function is:
def obj_fct(x, alpha, beta, Al):
    U = 0
    x[1,0] = x0
    for t in trange:
        U = U - beta**t * ( (Al[t]*L)**(1-alpha) * x[1,t]**alpha - x[0,t])
    return U
The constraints are defined over these two variables and one of them links the variables from one period (t) to another (t-1).
def constr(x, alpha, beta, Al):
    return np.array([
        x[0,t],
        x[1,0] - x0,
        x[1,t] - x[0,t] - (1-delta)*x[1,t-1]
    ])
Finally, here is the use of fmin_slsqp:
sol = fmin_slsqp(obj_fct, x_init, f_eqcons=constr, args=(alpha,beta,Al))
Leaving aside the fact that there are better ways to solve such dynamic problems, my question is about the syntax. When running this simple code, I get the following error:
Traceback (most recent call last):
File "xxx", line 34, in <module>
sol = fmin_slsqp(obj_fct, x_init, f_eqcons=constr, args=(alpha,beta,Al))
File "D:\Anaconda3\lib\site-packages\scipy\optimize\slsqp.py", line 207, in fmin_slsqp
constraints=cons, **opts)
File "D:\Anaconda3\lib\site-packages\scipy\optimize\slsqp.py", line 311, in _minimize_slsqp
meq = sum(map(len, [atleast_1d(c['fun'](x, *c['args'])) for c in cons['eq']]))
File "D:\Anaconda3\lib\site-packages\scipy\optimize\slsqp.py", line 311, in <listcomp>
meq = sum(map(len, [atleast_1d(c['fun'](x, *c['args'])) for c in cons['eq']]))
File "xxx", line 30, in constr
x[0,t],
IndexError: too many indices for array
[Finished in 0.3s with exit code 1]
What am I doing wrong?
The initial part of the code, assigning values to the parameters, is:
from scipy.optimize import fmin_slsqp
import numpy as np
T = 30
beta = 0.96
L = 1
x0 = 1
gl = 0.02
alpha = 0.3
delta = 0.05
x_init = np.array([1,0.1])
A_l0 = 1000
Al = np.zeros((T+1,1))
Al[1] = A_l0
trange = np.arange(1,T+1,1, dtype='Int8') # does not include period zero
for t in trange: Al[t] = A_l0*(1 + gl)**(t-1)
The array x passed to your objective and constraint functions will be a one-dimensional array (just like your x_init is). You can't index a one-dimensional array with two indices, so expressions such as x[1,0] and x[0,t] will generate an error.
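A common workaround (a sketch, assuming each control has T+1 periods, so the flat decision vector has length 2*(T+1)) is to keep x one-dimensional for the optimizer and reshape it inside your functions; the same reshape would go at the top of constr, and x_init must be flat and of matching length:
def obj_fct(x_flat, alpha, beta, Al):
    # reshape the flat vector the optimizer passes in into the 2 x (T+1) controls
    x = x_flat.reshape(2, T+1)   # row 0 holds x[0,t], row 1 holds x[1,t]
    U = 0
    for t in trange:
        U = U - beta**t * ((Al[t]*L)**(1-alpha) * x[1,t]**alpha - x[0,t])
    return U

# flat initial guess matching the reshape
x_init = 0.1 * np.ones(2 * (T + 1))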
I need to make 200 evenly spaced bins and have my data sorted into them, so that I can make a histogram out of the data. Can someone help me write a script that makes 200 bins and sorts the data into them?
This is my current code:
#!/usr/bin/python
import operator
import matplotlib.pyplot as plt
import numpy as np
l=[]
with open("testdata") as f:
line = f.next()
f.next()# skip headers
nat = int(line.split()[0])
print nat
for line in f:
if line.strip():
if line.strip():
l.append(map(float,line.split()[1:]))
b = 0
a = 1
for b in range(53):
for a in range(b+1,54):
vector1 = (l[b][0],l[b][1],l[b][2])
vector2 = (l[a][0],l[a][1],l[a][2])
x = vector1
y = vector2
vector3 = list(np.array(x) - np.array(y))
dotProduct = reduce( operator.add, map( operator.mul, vector3, vector3))
dp = dotProduct**.5
print dp
#data = dp
#num_bins = 200 # <- number of bins for the histogram
#plt.hist(data, num_bins)
#plt.show()
Errors:
/usr/lib64/python2.6/site-packages/matplotlib/backends/backend_gtk.py:621: DeprecationWarning: Use the new widget gtk.Tooltip
self.tooltips = gtk.Tooltips()
Traceback (most recent call last):
File "vector_final", line 42, in <module>
plt.hist(data, num_bins)
File "/usr/lib64/python2.6/site-packages/matplotlib/pyplot.py", line 2008, in hist
ret = ax.hist(x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, **kwargs)
File "/usr/lib64/python2.6/site-packages/matplotlib/axes.py", line 7098, in hist
w = [None]*len(x)
TypeError: len() of unsized object
You are pretty close. The only thing you are missing is storing your data and passing it to the histogram function correctly.
#!/usr/bin/python
import operator
import matplotlib.pyplot as plt
import numpy as np

l=[]
with open("testdata") as f:
    line = f.next()
    f.next()  # skip headers
    nat = int(line.split()[0])
    print nat
    for line in f:
        # keep the stripped line, and only store it if there is data on the line
        cleaned = line.strip()
        if cleaned:
            # convert to float and drop the label in the first index
            l.append(map(float,cleaned.split()[1:]))

# create a list to store our calculations in
distances = []
num_vects = len(l)
for b in range(num_vects-1):
    for a in range(b+1,num_vects):
        vector1 = (l[b][0],l[b][1],l[b][2])
        vector2 = (l[a][0],l[a][1],l[a][2])
        x = vector1
        y = vector2
        vector3 = list(np.array(x) - np.array(y))
        dotProduct = reduce( operator.add, map( operator.mul, vector3, vector3))
        dp = dotProduct**.5
        # store each distance in the list of calculated distances
        distances.append(dp)

# plot histogram
num_bins = 200  # <- number of bins for the histogram
# keep the useful data returned by the histogram function
(n, bins, patches) = plt.hist(distances, num_bins)
plt.show()
plt.show()
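As a side note, the pairwise distances themselves can be computed without the explicit double loop; a sketch using SciPy (assuming the coordinates were read into l as above):
from scipy.spatial.distance import pdist

coords = np.array(l)[:, :3]   # one x, y, z row per point
distances = pdist(coords)     # condensed 1-D array of all pairwise Euclidean distances
(n, bins, patches) = plt.hist(distances, 200)
plt.show()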