I am writing a program to compute the expected value E(X), the second moment E(X^2), and the variance E[(X - X_avg)^2]. I have written the following program:
# program : expectation value
import csv
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import seaborn as sns
import matplotlib.pyplot as plt
import logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
# Step 1: read csv
probabilityCSV = open('probability.csv')
df = pd.read_csv(probabilityCSV)
# Step 2: convert dataframe to ndarray
# https://stackoverflow.com/questions/13187778/convert-pandas-dataframe-to-numpy-array
X = df['X'].to_numpy()
p = df['P'].to_numpy()
# Step 3: calculate E(X)
# https://www.statology.org/expected-value-in-python/
def expected_value(values, weights):
    """Return the weighted average of ``values`` using ``weights``.

    Computes ``sum(values * weights) / sum(weights)``. Dividing by the total
    weight means the weights do not have to be normalized to sum to one.

    Args:
        values: Numeric array-like of outcomes.
        weights: Numeric array-like of the same length, one weight
            (probability) per outcome.

    Returns:
        The weighted average as a NumPy scalar.

    Note:
        No type conversion is done here: both inputs must already be numeric.
    """
    weighted_total = np.sum(np.dot(values, weights))
    return weighted_total / np.sum(weights)
logging.debug('Step 3: calculate E(X)')
expectation = expected_value(X,p)
# Step 4: calculate E(X^2)
logging.debug('Step 4: calculate E(X^2)')
# add normalize='index'
contingency_pct = pd.crosstab(df['Observed'],df['Expected'],normalize='index')
# Step 5: calculate E(X - X_avg)^2
logging.debug('Step 5: calculate E(X - X_avg)^2')
The dataset that I am using is:
E(X) = 16
E(X^2) = 276
E(X- X_avg)^2 =20
Traceback (most recent call last):
File "/Users/evangertis/development/PythonAutomation/Statistics/expectation.py", line 35, in <module>
expectation = expected_value(X,p)
File "/Users/evangertis/development/PythonAutomation/Statistics/expectation.py", line 32, in expected_value
return np.sum((np.dot(values,weights))) / np.sum(weights)
File "<__array_function__ internals>", line 5, in sum
File "/usr/local/lib/python3.9/site-packages/numpy/core/fromnumeric.py", line 2259, in sum
return _wrapreduction(a, np.add, 'sum', axis, dtype, out, keepdims=keepdims,
File "/usr/local/lib/python3.9/site-packages/numpy/core/fromnumeric.py", line 86, in _wrapreduction
return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
TypeError: cannot perform reduce with flexible type
Your problem is the step 1, so I took the liberty of rewriting it:
# Step 1.1: read csv in the right way
# Open the file in a context manager so the handle is closed after parsing.
with open('probability.csv') as probabilityCSV:
    df = pd.read_csv(probabilityCSV)
# "P" is read as text fractions (e.g. "1/8"): split once into numerator and
# denominator columns, convert both to int, then divide to get a float.
fraction_parts = df["P"].str.split("/", expand=True).astype(int)
df["P"] = fraction_parts[0] / fraction_parts[1]
0 8 0.125000
1 12 0.166667
2 16 0.375000
3 20 0.250000
4 24 0.083333
The second step is right:
# Step 2: convert dataframe to ndarray
X = df['X'].to_numpy()
p = df['P'].to_numpy()
X, p:
(array([ 8, 12, 16, 20, 24]),
array([0.125 , 0.16666667, 0.375 , 0.25 , 0.08333333]))
After this you correctly defined the function:
def expected_value(values, weights):
    """Return the weighted average of ``values`` using ``weights``.

    Computes ``sum(values * weights) / sum(weights)``. Dividing by the total
    weight means the weights do not have to be normalized to sum to one.

    Args:
        values: Numeric array-like of outcomes.
        weights: Numeric array-like of the same length, one weight
            (probability) per outcome.

    Returns:
        The weighted average as a NumPy scalar.

    Note:
        No type conversion is done here: both inputs must already be numeric.
    """
    weighted_total = np.sum(np.dot(values, weights))
    return weighted_total / np.sum(weights)
You can use this function to compute E(X), E(X^2) and E(X - X_avg)^2. In particular:
mean_x = expected_value(X, p)
# returns E(X) = 16.0
expected_value(X**2, p)
# returns E(X^2) = 276.0
# Centre on the probability-weighted mean E(X); the unweighted X.mean() only
# matches it when the distribution happens to be symmetric, as it is here.
expected_value((X - mean_x)**2, p)
# returns E(X - X_avg)^2 = 20.0
The error has occurred because your df["P"] column is a string column.
I have this code to estimate a model using a tobit regression in Python. This is the code which is parsed in three parts: data definition, the estimator builder and estimation.
import numpy as np
from scipy.optimize import minimize
# define the dependent variable and independent variables
X = data.iloc[:, 1:]
y = data.iloc[:, 0]
# Add a column of ones to the independent variables for the constant term
X = np.c_[np.ones(X.shape[0]), X]
# Define the likelihood function for the Tobit model
def likelihood(params, y, X, lower, upper):
    """Return the negative Gaussian log-likelihood minimized by the solver.

    Args:
        params: 1-D array holding the regression coefficients followed by
            sigma (the error standard deviation) as its last element.
        y: Observed dependent variable, one value per row of ``X``.
        X: Design matrix whose columns match the coefficients in ``params``.
        lower: Lower censoring limit; observations below it get zero density.
        upper: Upper censoring limit; observations above it get zero density.

    Returns:
        The negative sum of the per-observation log densities (``inf`` when
        any observation lies outside ``[lower, upper]``).
    """
    beta = params[:-1]
    sigma = params[-1]
    y = np.asarray(y)
    # Linear predictor: matrix product of the design matrix and coefficients.
    mu = X @ beta
    # Work in log space so very small densities do not underflow to log(0).
    standardized = (y - mu) / sigma
    log_prob = -0.5 * standardized**2 - np.log(sigma * np.sqrt(2 * np.pi))
    # Zero density (log density of -inf) outside the censoring limits.
    outside = (y < lower) | (y > upper)
    log_prob = np.where(outside, -np.inf, log_prob)
    # NOTE(review): observations exactly at the limits are treated as
    # uncensored normal draws; a textbook Tobit likelihood would use the
    # normal CDF for them -- confirm which model is intended.
    return -log_prob.sum()
# Set the initial values for the parameters and the lower and upper bounds for censoring
params_init = np.random.normal(size=X.shape[1] + 1)
bounds = [(None, None) for i in range(X.shape[1])] + [(1e-10, None)]
# Perform the MLE estimation
res = minimize(likelihood, params_init, args=(y, X, 0, 100), bounds=bounds, method='L-BFGS-B')
# Extract the estimated parameters and their standard errors
params = res.x
stderr = np.sqrt(np.diag(res.hess_inv))
# Print the results
print(f'Coefficients: {params[:-1]}')
print(f'Standard Errors: {stderr[:-1]}')
print(f'Sigma: {params[-1]:.4f}')
Why am I getting this error message?
Thank you.
ValueError Traceback (most recent call last)
<ipython-input-245-5f39f416cc07> in <module>
31 # Extract the estimated parameters and their standard errors
32 params = res.x
---> 33 stderr = np.sqrt(np.diag(res.hess_inv))
35 # Print the results
/opt/anaconda3/lib/python3.8/site-packages/numpy/core/overrides.py in diag(*args, **kwargs)
/opt/anaconda3/lib/python3.8/site-packages/numpy/lib/twodim_base.py in diag(v, k)
307 return diagonal(v, k)
308 else:
--> 309 raise ValueError("Input must be 1- or 2-d.")
ValueError: Input must be 1- or 2-d.
EDIT: If you want to look at the type of data I'm dealing with, you can simulate it using these lines of code I just wrote:
# Simulate the data set: one row per individual, with the censored outcome
# in the first column and numerically encoded covariates after it.
n_disabled = 619
n_non_disabled = 542
data = pd.DataFrame()
# Append 'interview probabilities' for individuals with and without disabilities
interview_prob_disabled = np.random.normal(38.63, 28.72, n_disabled)
interview_prob_enabled = np.random.normal(44.27, 28.19, n_non_disabled)
interview_prob = np.append(interview_prob_disabled, interview_prob_enabled)
# Censor the variable to [0, 100] and store it as whole numbers
# (np.round alone would leave the values as floats)
interview_prob = np.clip(interview_prob, 0, 100)
interview_prob = np.round(interview_prob).astype(int)
# Add the 'interview probabilities' variable to the dataframe
data['Interview Probabilities'] = interview_prob
# Add other variables such as age, gender, employment status, education, etc.
data['Age'] = np.random.randint(18, 65, size=len(interview_prob))
data['Gender'] = np.random.choice(['Male', 'Female'], size=len(interview_prob))
data['Employment Status'] = np.random.choice(['Employed', 'Unemployed', 'Retired'], size=len(interview_prob))
data['Education Level'] = np.random.choice(['High School', 'College', 'Vocational', 'Graduate School'], size=len(interview_prob))
# Add a 'disability status' variable as a dummy
data['Disability Status'] = np.append(np.repeat('Disabled', n_disabled), np.repeat('Non-disabled', n_non_disabled))
# Categorical variables: every category that can be drawn above needs a code,
# because Series.map turns unmapped values into NaN
data['Gender'] = data['Gender'].map({'Male': 0, 'Female': 1})
data['Employment Status'] = data['Employment Status'].map({'Employed': 0, 'Unemployed': 1, 'Retired': 2})
data['Education Level'] = data['Education Level'].map({'High School': 0, 'College': 1, 'Vocational': 2, 'Graduate School': 3})
data['Disability Status'] = data['Disability Status'].map({'Disabled': 1, 'Non-disabled': 0})
# Print the df
The problem is that your solver, L-BFGS-B, returns an LbfgsInvHessProduct object (a linear operator) in .hess_inv rather than a NumPy array (which a solver such as BFGS would give).
One solution to your problem is to convert it to a dense array first, i.e. stderr = np.sqrt(np.diag(res.hess_inv.todense())).
I am using deepgraph in python to compute correlation coefficients for large matrices. The output gives a multi-index data frame:
s t
0 1 -0.006066
2 0.094063
3 -0.025529
4 0.074080
5 0.035490
6 0.005221
7 0.032064
I want to add a column with corresponding p-values.
The original code with input example is obtained from https://deepgraph.readthedocs.io/en/latest/tutorials/pairwise_correlations.html
The code surrounded by hashtags is my approach to get p-values.
I want to merge the separate edge lists later on.
import os
from multiprocessing import Pool
import numpy as np
import pandas as pd
import deepgraph as dg
from numpy.random import RandomState
from scipy.stats import pearsonr, spearmanr
prng = RandomState(0)
n_features = int(5e3)
n_samples = int(1e2)
X = prng.randint(100, size=(n_features, n_samples)).astype(np.float64)
# Spearman's correlation coefficients
X = X.argsort(axis=1).argsort(axis=1)
# whiten variables for fast parallel computation later on
X = (X - X.mean(axis=1, keepdims=True)) / X.std(axis=1, keepdims=True)
# save in binary format
np.save('samples', X)
# parameters (change these to control RAM usage)
step_size = 1e5
n_processes = 100
# load samples as memory-map
X = np.load('samples.npy', mmap_mode='r')
# create node table that stores references to the mem-mapped samples
v = pd.DataFrame({'index': range(X.shape[0])})
# connector function to compute pairwise pearson correlations
def corr(index_s, index_t):
    """Return the pairwise correlation for each (source, target) index pair.

    Args:
        index_s: Indices of the source rows of the module-level ``X``.
        index_t: Indices of the target rows of ``X``, aligned with ``index_s``.

    Returns:
        1-D array with one coefficient per index pair.
    """
    features_s = X[index_s]
    features_t = X[index_t]
    # Row-wise dot product of the paired rows, averaged over the samples.
    # This relies on the rows of X having been standardized beforehand.
    return np.einsum('ij,ij->i', features_s, features_t) / n_samples
def p_Val(index_s, index_t):
features_s = X[index_s]
features_t = X[index_t]
p = spearmanr(features_s, features_t)[1]
return p
# index array for parallelization
pos_array = np.array(np.linspace(0, n_features*(n_features-1)//2, n_processes), dtype=int)
# parallel computation
def create_ei(i):
from_pos = pos_array[i]
to_pos = pos_array[i+1]
# initiate DeepGraph
g = dg.DeepGraph(v)
# create edges
g.create_edges(connectors=corr, step_size=step_size, from_pos=from_pos, to_pos=to_pos)
# store edge table
gp = dg.DeepGraph(v)
# create edges
gp.create_edges(connectors=p_Val, step_size=step_size, from_pos=from_pos, to_pos=to_pos)
# store edge table
# computation
if __name__ == '__main__':
os.makedirs("tmp/correlations", exist_ok=True)
indices = np.arange(0, n_processes - 1)
p = Pool()
for _ in p.imap_unordered(create_ei, indices):
# store correlation values
files = os.listdir('tmp/correlations/')
for f in files:
et = pd.read_pickle('tmp/correlations/{}'.format(f))
I get the following error:
Traceback (most recent call last):
File "/lib/python3.9/multiprocessing/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "pairwise_corr.py", line 64, in create_ei
gp.create_edges(connectors=p_Val, step_size=step_size, from_pos=from_pos, to_pos=to_pos)
File "/lib/python3.9/site-packages/deepgraph/deepgraph.py", line 616, in create_edges
self.e = _matrix_iterator(
File "/lib/python3.9/site-packages/deepgraph/deepgraph.py", line 4875, in _matrix_iterator
ei = _select_and_return(vi, sources_k, targets_k, ft_feature,
File "/lib/python3.9/site-packages/deepgraph/deepgraph.py", line 5339, in _select_and_return
ei = pd.DataFrame({col: data[col] for col in coldtypedic})
File "/lib/python3.9/site-packages/pandas/core/frame.py", line 614, in __init__
mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
File "/lib/python3.9/site-packages/pandas/core/internals/construction.py", line 464, in dict_to_mgr
return arrays_to_mgr(
File "/lib/python3.9/site-packages/pandas/core/internals/construction.py", line 124, in arrays_to_mgr
arrays = _homogenize(arrays, index, dtype)
File "/lib/python3.9/site-packages/pandas/core/internals/construction.py", line 589, in _homogenize
val = sanitize_array(
File "/lib/python3.9/site-packages/pandas/core/construction.py", line 576, in sanitize_array
subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d)
File "/lib/python3.9/site-packages/pandas/core/construction.py", line 627, in _sanitize_ndim
raise ValueError("Data must be 1-dimensional")
ValueError: Data must be 1-dimensional
Any suggestions?
I was able to solve it with
def p_Val(index_s, index_t):
    """Return the Pearson p-value for each (source, target) index pair.

    Args:
        index_s: Indices of the source rows of the module-level ``X``.
        index_t: Indices of the target rows of ``X``, aligned with ``index_s``.

    Returns:
        1-D array with one p-value per index pair, which is the shape the
        edge table needs (one scalar per edge).
    """
    features_s = X[index_s]
    features_t = X[index_t]
    # pearsonr works on one pair of 1-D vectors at a time, so evaluate it
    # pair by pair and keep only the p-value (second element of the result).
    p = [pearsonr(row_s, row_t)[1] for row_s, row_t in zip(features_s, features_t)]
    return np.asarray(p)
I am trying to do some nls regression using R within Python. I am getting stuck with a RRuntimeError and am getting to a point where I am way outside my expertise and have struggled for a few days to get it to work so would appreciate some help.
This is my csv of data:
This is my code:
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
dfData = pd.read_csv('C:\\Users\\nick\\Desktop\\ratedata.csv')
rdf = pandas2ri.py2ri(dfData)
a = 0.5
b = 1.1
count = rdf.rx(True, 'Trials')
rates = rdf.rx(True, 'Successes')
base = importr('base', robject_translations={'with': '_with'})
stats = importr('stats', robject_translations={'format_perc': '_format_perc'})
my_formula = stats.as_formula('rates ~ 1-(1/(10^(a * count ^ (b-1))))')
d = ro.ListVector({'a': a, 'b': b})
fit = stats.nls(my_formula, weights=count, start=d)
Everything is compiling apart from:
fit = stats.nls(my_formula, weights=count, start=d)
I am getting the following traceback:
RRuntimeError Traceback (most recent call last)
<ipython-input-12-3f7fcd7d7851> in <module>()
6 d = ro.ListVector({'a': a, 'b': b})
----> 8 fit = stats.nls(my_formula, weights=count, start=d)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\rpy2\robjects\functions.py in __call__(self, *args, **kwargs)
176 v = kwargs.pop(k)
177 kwargs[r_k] = v
--> 178 return super(SignatureTranslatedFunction, self).__call__(*args, **kwargs)
180 pattern_link = re.compile(r'\\link\{(.+?)\}')
~\AppData\Local\Continuum\anaconda3\lib\site-packages\rpy2\robjects\functions.py in __call__(self, *args, **kwargs)
104 for k, v in kwargs.items():
105 new_kwargs[k] = conversion.py2ri(v)
--> 106 res = super(Function, self).__call__(*new_args, **new_kwargs)
107 res = conversion.ri2ro(res)
108 return res
RRuntimeError: Error in (function (formula, data = parent.frame(), start, control = nls.control(), :
parameters without starting value in 'data': rates, count
I would be eternally thankful if anyone can see where I am going wrong, or can offer advice. All I want is the two numbers from that formula back in Python so I can use those to construct some confidence intervals.
Thank you
Consider incorporating all your formula variables into a single dataframe and use the data argument. The as_formula call looks in the R environment but rates and count are in the Python scope. Hence, contain all items in same object. Then run your nls with either the Pandas dataframe or R dataframe:
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
base = importr('base', robject_translations={'with': '_with'})
stats = importr('stats', robject_translations={'format_perc': '_format_perc'})
a = 0.05
b = 1.1
d = ro.ListVector({'a': a, 'b': b})
dfData = pd.read_csv('Input.csv')
dfData['count'] = dfData['Trials'].astype('float')
dfData['rates'] = dfData['Successes'] / dfData['Trials']
dfData['a'] = a
dfData['b'] = b
rdf = pandas2ri.py2ri(dfData)
my_formula = stats.as_formula('rates ~ 1-(1/(10^(a * count ^ (b-1))))')
fit = stats.nls(formula=my_formula, data=dfData, weights=dfData['count'], start=d)
fit = stats.nls(formula=my_formula, data=rdf, weights=rdf.rx(True, 'count'), start=d)
Alternatively, you can use robjects.globalenv and not use data argument:
ro.globalenv['rates'] = dfData['rates']
ro.globalenv['count'] = dfData['count']
ro.globalenv['a'] = dfData['a']
ro.globalenv['b'] = dfData['b']
fit = stats.nls(formula=my_formula, weights=dfData['count'], start=d)
# Nonlinear regression model
# model: rates ~ 1 - (1/(10^(a * count^(b - 1))))
# data: parent.frame()
# a b
# 0.01043 1.24943
# weighted residual sum-of-squares: 14.37
# Number of iterations to convergence: 6
# Achieved convergence tolerance: 9.793e-07
# To return parameters
num = fit.rx('m')[0].names.index('getPars')
obj = fit.rx('m')[0][num]()
# 0.010425686223717435
# 1.2494303314553932
Equivalently in R:
dfData <- read.csv('Input.csv')
a <- .05
b <- 1.1
d <- list(a=a, b=b)
dfData$count <- dfData$Trials
dfData$rates <- dfData$Successes / dfData$Trials
dfData$a <- a
dfData$b <- b
my_formula <- stats::as.formula("rates ~ 1-(1/(10^(a * count ^ (b-1))))")
fit <- stats::nls(my_formula, data=dfData, weights=dfData$count, start=d)
# Nonlinear regression model
# model: rates ~ 1 - (1/(10^(a * count^(b - 1))))
# data: dfData
# a b
# 0.01043 1.24943
# weighted residual sum-of-squares: 14.37
# Number of iterations to convergence: 6
# Achieved convergence tolerance: 9.793e-07
# To return parameters
# 0.01042569
# 1.24943
I am practicing with SciPy and I encountered an error when trying to use fmin_slsqp. I set up a problem in which I want to maximize an objective function, U, given a set of constraints.
I have two control variables, x[0,t] and x[1,t] and, as you can see, they are indexed by t (time periods). The objective function is:
def obj_fct(x, alpha,beta,Al):
U = 0
x[1,0] = x0
for t in trange:
U = U - beta**t * ( (Al[t]*L)**(1-alpha) * x[1,t]**alpha - x[0,t])
return U
The constraints are defined over these two variables and one of them links the variables from one period (t) to another (t-1).
def constr(x,alpha,beta,Al):
return np.array([
x[1,0] - x0,
x[1,t] - x[0,t] - (1-delta)*x[1,t-1]
Finally, here is the use of fmin_slsqp:
sol = fmin_slsqp(obj_fct, x_init, f_eqcons=constr, args=(alpha,beta,Al))
Leaving aside the fact that there are better ways to solve such dynamic problems, my question is about the syntax. When running this simple code, I get the following error:
Traceback (most recent call last):
File "xxx", line 34, in <module>
sol = fmin_slsqp(obj_fct, x_init, f_eqcons=constr, args=(alpha,beta,Al))
File "D:\Anaconda3\lib\site-packages\scipy\optimize\slsqp.py", line 207, in fmin_slsqp
constraints=cons, **opts)
File "D:\Anaconda3\lib\site-packages\scipy\optimize\slsqp.py", line 311, in _minimize_slsqp
meq = sum(map(len, [atleast_1d(c['fun'](x, *c['args'])) for c in cons['eq']]))
File "D:\Anaconda3\lib\site-packages\scipy\optimize\slsqp.py", line 311, in <listcomp>
meq = sum(map(len, [atleast_1d(c['fun'](x, *c['args'])) for c in cons['eq']]))
File "xxx", line 30, in constr
IndexError: too many indices for array
[Finished in 0.3s with exit code 1]
What am I doing wrong?
The initial part of the code, assigning values to the parameters, is:
from scipy.optimize import fmin_slsqp
import numpy as np
T = 30
beta = 0.96
L = 1
x0 = 1
gl = 0.02
alpha = 0.3
delta = 0.05
x_init = np.array([1,0.1])
A_l0 = 1000
Al = np.zeros((T+1,1))
Al[1] = A_l0
trange = np.arange(1,T+1,1, dtype='Int8') # does not include period zero
for t in trange: Al[t] = A_l0*(1 + gl)**(t-1)
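The array x passed to your objective and constraint functions will be a one-dimensional array (just like your x_init is). You can't index a one-dimensional array with two indices, so expressions such as x[1,0] and x[0,t] will generate an error. Pass a flat x_init with one entry per control variable and period (length 2*(T+1)) and reshape it at the top of each function, e.g. x = x.reshape(2, -1), before indexing it with two indices.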
I need to make 200 evenly spaced bins and sort my data into them, so that I can make a histogram out of the data. Can someone help me write a script that creates the 200 bins and sorts the data into them?
This is my current code:
import operator
import matplotlib.pyplot as plt
import numpy as np
with open("testdata") as f:
line = f.next()
f.next()# skip headers
nat = int(line.split()[0])
print nat
for line in f:
if line.strip():
if line.strip():
b = 0
a = 1
for b in range(53):
for a in range(b+1,54):
vector1 = (l[b][0],l[b][1],l[b][2])
vector2 = (l[a][0],l[a][1],l[a][2])
x = vector1
y = vector2
vector3 = list(np.array(x) - np.array(y))
dotProduct = reduce( operator.add, map( operator.mul, vector3, vector3))
dp = dotProduct**.5
print dp
#data = dp
#num_bins = 200 # <- number of bins for the histogram
#plt.hist(data, num_bins)
/usr/lib64/python2.6/site-packages/matplotlib/backends/backend_gtk.py:621: DeprecationWarning: Use the new widget gtk.Tooltip
self.tooltips = gtk.Tooltips()
Traceback (most recent call last):
File "vector_final", line 42, in <module>
plt.hist(data, num_bins)
File "/usr/lib64/python2.6/site-packages/matplotlib/pyplot.py", line 2008, in hist
ret = ax.hist(x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, **kwargs)
File "/usr/lib64/python2.6/site-packages/matplotlib/axes.py", line 7098, in hist
w = [None]*len(x)
TypeError: len() of unsized object
You are pretty close. The only thing you are missing is storing your data and passing it to the histogram function correctly.
import operator
import matplotlib.pyplot as plt
import numpy as np
with open("testdata") as f:
    # next(f) works on both Python 2.6+ and Python 3 (f.next() is 2-only).
    line = next(f)
    next(f)  # skip headers
    nat = int(line.split()[0])
    print(nat)
    # NOTE(review): the statements that filled `l` were missing from this
    # snippet; they are reconstructed from the comments below -- confirm
    # against the real layout of "testdata".
    l = []
    for line in f:
        # store stripped line and only store if there is data on the line.
        cleaned = line.strip()
        if cleaned:
            # convert to float and remove characters in first index
            l.append([float(value) for value in cleaned.split()[1:]])

# create a list to store our calculations in
distances = []
num_vects = len(l)
# visit every unordered pair of vectors exactly once
for b in range(num_vects - 1):
    for a in range(b + 1, num_vects):
        vector1 = np.array(l[b][:3])
        vector2 = np.array(l[a][:3])
        # Euclidean distance between the two points
        dp = np.linalg.norm(vector1 - vector2)
        # store individual data point into the list of calculated distances
        distances.append(dp)

# plot histogram
num_bins = 200  # <- number of bins for the histogram
# store useful data returned by the histogram function
(n, bins, patches) = plt.hist(distances, num_bins)