I am trying to do some nls regression using R from within Python. I am stuck on an RRuntimeError; this is well outside my expertise, and after struggling for a few days to get it working I would appreciate some help.
This is my csv of data:
http://www.sharecsv.com/s/4cdd4f832b606d6616260f9dc0eedf38/ratedata.csv
This is my code:
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
pandas2ri.activate()
dfData = pd.read_csv('C:\\Users\\nick\\Desktop\\ratedata.csv')
rdf = pandas2ri.py2ri(dfData)
a = 0.5
b = 1.1
count = rdf.rx(True, 'Trials')
rates = rdf.rx(True, 'Successes')
base = importr('base', robject_translations={'with': '_with'})
stats = importr('stats', robject_translations={'format_perc': '_format_perc'})
my_formula = stats.as_formula('rates ~ 1-(1/(10^(a * count ^ (b-1))))')
d = ro.ListVector({'a': a, 'b': b})
fit = stats.nls(my_formula, weights=count, start=d)
Everything runs apart from this line:
fit = stats.nls(my_formula, weights=count, start=d)
I am getting the following traceback:
---------------------------------------------------------------------------
RRuntimeError Traceback (most recent call last)
<ipython-input-12-3f7fcd7d7851> in <module>()
6 d = ro.ListVector({'a': a, 'b': b})
7
----> 8 fit = stats.nls(my_formula, weights=count, start=d)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\rpy2\robjects\functions.py in __call__(self, *args, **kwargs)
176 v = kwargs.pop(k)
177 kwargs[r_k] = v
--> 178 return super(SignatureTranslatedFunction, self).__call__(*args, **kwargs)
179
180 pattern_link = re.compile(r'\\link\{(.+?)\}')
~\AppData\Local\Continuum\anaconda3\lib\site-packages\rpy2\robjects\functions.py in __call__(self, *args, **kwargs)
104 for k, v in kwargs.items():
105 new_kwargs[k] = conversion.py2ri(v)
--> 106 res = super(Function, self).__call__(*new_args, **new_kwargs)
107 res = conversion.ri2ro(res)
108 return res
RRuntimeError: Error in (function (formula, data = parent.frame(), start, control = nls.control(), :
parameters without starting value in 'data': rates, count
I would be eternally thankful if anyone can see where I am going wrong, or can offer advice. All I want is the two numbers from that formula back in Python so I can use those to construct some confidence intervals.
Thank you
Consider incorporating all of your formula variables into a single dataframe and using the data argument. The as_formula call looks in the R environment, but rates and count live in Python scope, so keep all the items in the same object. Then run your nls with either the pandas dataframe or the R dataframe:
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
base = importr('base', robject_translations={'with': '_with'})
stats = importr('stats', robject_translations={'format_perc': '_format_perc'})
a = 0.05
b = 1.1
d = ro.ListVector({'a': a, 'b': b})
dfData = pd.read_csv('Input.csv')
dfData['count'] = dfData['Trials'].astype('float')
dfData['rates'] = dfData['Successes'] / dfData['Trials']
dfData['a'] = a
dfData['b'] = b
pandas2ri.activate()
rdf = pandas2ri.py2ri(dfData)
my_formula = stats.as_formula('rates ~ 1-(1/(10^(a * count ^ (b-1))))')
# WITH PANDAS DATAFRAME
fit = stats.nls(formula=my_formula, data=dfData, weights=dfData['count'], start=d)
print(fit)
# WITH R DATAFRAME
fit = stats.nls(formula=my_formula, data=rdf, weights=rdf.rx(True, 'count'), start=d)
print(fit)
Alternatively, you can assign the variables into robjects.globalenv and omit the data argument:
ro.globalenv['rates'] = dfData['rates']
ro.globalenv['count'] = dfData['count']
ro.globalenv['a'] = dfData['a']
ro.globalenv['b'] = dfData['b']
fit = stats.nls(formula=my_formula, weights=dfData['count'], start=d)
print(fit)
# Nonlinear regression model
# model: rates ~ 1 - (1/(10^(a * count^(b - 1))))
# data: parent.frame()
# a b
# 0.01043 1.24943
# weighted residual sum-of-squares: 14.37
# Number of iterations to convergence: 6
# Achieved convergence tolerance: 9.793e-07
# To return parameters
num = fit.rx('m')[0].names.index('getPars')
obj = fit.rx('m')[0][num]()
print(obj[0])
# 0.010425686223717435
print(obj[1])
# 1.2494303314553932
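A simpler route to the same two numbers (a sketch, assuming the stats importr from above) is R's standard coef extractor, which rpy2 exposes directly:

coefs = stats.coef(fit)                     # named numeric vector c(a=..., b=...)
params = dict(zip(coefs.names, list(coefs)))
print(params['a'], params['b'])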
Equivalently in R:
dfData <- read.csv('Input.csv')
a <- .05
b <- 1.1
d <- list(a=a, b=b)
dfData$count <- dfData$Trials
dfData$rates <- dfData$Successes / dfData$Trials
dfData$a <- a
dfData$b <- b
my_formula <- stats::as.formula("rates ~ 1-(1/(10^(a * count ^ (b-1))))")
fit <- stats::nls(my_formula, data=dfData, weights=dfData$count, start=d)
print(fit)
# Nonlinear regression model
# model: rates ~ 1 - (1/(10^(a * count^(b - 1))))
# data: dfData
# a b
# 0.01043 1.24943
# weighted residual sum-of-squares: 14.37
# Number of iterations to convergence: 6
# Achieved convergence tolerance: 9.793e-07
# To return parameters
fit$m$getPars()['a']
# 0.01042569
fit$m$getPars()['b']
# 1.24943
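Since the original goal was confidence intervals back in Python, here is a minimal sketch of Wald-type intervals built from the fit, assuming the fit and stats objects from the Python code above. stats::coef and stats::vcov are standard R extractors; note that rpy2 flattens the R matrix column-major, so the diagonal of the 2x2 vcov sits at offsets 0 and 3:

import math
coefs = stats.coef(fit)                      # fitted a and b
vc = stats.vcov(fit)                         # 2x2 variance-covariance matrix
se = [math.sqrt(vc[0]), math.sqrt(vc[3])]    # standard errors from the diagonal
z = 1.96                                     # approximate 95% normal quantile
for name, est, s in zip(coefs.names, list(coefs), se):
    print('%s: %.5f +/- %.5f' % (name, est, z * s))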
I am writing a program to determine the expectation value E(X), the expectation E(X^2), and E[(X - X_avg)^2]. I have written the program like so:
# program : expectation value
import csv
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import seaborn as sns
import matplotlib.pyplot as plt
import logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
# Step 1: read csv
probabilityCSV = open('probability.csv')
df = pd.read_csv(probabilityCSV)
logging.debug(df['X'])
logging.debug(df['P'])
logging.debug(type(df['X']))
logging.debug(type(df['P']))
# Step 2: convert dataframe to ndarray
# https://stackoverflow.com/questions/13187778/convert-pandas-dataframe-to-numpy-array
X = df['X'].to_numpy()
p = df['P'].to_numpy()
logging.debug(f'X={X}')
logging.debug(f'p={p}')
# Step 3: calculate E(X)
# https://www.statology.org/expected-value-in-python/
def expected_value(values, weights):
    return np.sum((np.dot(values,weights))) / np.sum(weights)
logging.debug('Step 3: calculate E(X)')
expectation = expected_value(X,p)
logging.debug(f'E(X)={expectation}')
# Step 4: calculate E(X^2)
logging.debug('Step 4: calculate E(X^2)')
# add normalize='index'
contingency_pct = pd.crosstab(df['Observed'],df['Expected'],normalize='index')
logging.debug(f'contingency_pct:{contingency_pct}')
# Step 5: calculate E(X - X_avg)^2
logging.debug('Step 5: calculate E(X - X_avg)^2')
The dataset that I am using is:
X,P
8,1/8
12,1/6
16,3/8
20,1/4
24,1/12
Expected:
E(X) = 16
E(X^2) = 276
E(X- X_avg)^2 =20
Actual:
Traceback (most recent call last):
File "/Users/evangertis/development/PythonAutomation/Statistics/expectation.py", line 35, in <module>
expectation = expected_value(X,p)
File "/Users/evangertis/development/PythonAutomation/Statistics/expectation.py", line 32, in expected_value
return np.sum((np.dot(values,weights))) / np.sum(weights)
File "<__array_function__ internals>", line 5, in sum
File "/usr/local/lib/python3.9/site-packages/numpy/core/fromnumeric.py", line 2259, in sum
return _wrapreduction(a, np.add, 'sum', axis, dtype, out, keepdims=keepdims,
File "/usr/local/lib/python3.9/site-packages/numpy/core/fromnumeric.py", line 86, in _wrapreduction
return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
TypeError: cannot perform reduce with flexible type
Your problem is in step 1, so I took the liberty of rewriting it:
# Step 1.1: read csv in the right way
probabilityCSV = open('probability.csv')
df = pd.read_csv(probabilityCSV)
df["P"] = df.P.str.split("/", expand=True)[0].astype(int) / df.P.str.split("/", expand=True)[1].astype(int)
df:
X P
0 8 0.125000
1 12 0.166667
2 16 0.375000
3 20 0.250000
4 24 0.083333
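A shorter alternative sketch for the same parsing step is the standard library's fractions module, which handles strings like '1/8' directly:

from fractions import Fraction
df["P"] = df["P"].apply(lambda s: float(Fraction(s)))  # '1/8' -> 0.125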
The second step is correct:
# Step 2: convert dataframe to ndarray
X = df['X'].to_numpy()
p = df['P'].to_numpy()
X, p:
(array([ 8, 12, 16, 20, 24]),
array([0.125 , 0.16666667, 0.375 , 0.25 , 0.08333333]))
After this, you correctly defined the function:
def expected_value(values, weights):
    return np.sum(np.dot(values, weights)) / np.sum(weights)
You can use this function to compute E(X), E(X^2) and E(X - X_avg)^2. In particular:
expected_value(X,p)
# returns E(X) = 16.0
expected_value(X**2, p)
# returns E(X^2) = 276.0
expected_value((X-X.mean())**2, p)
# returns E(X - X_avg)^2 = 20.0
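As a quick sanity check (using the X and p arrays from above): the probabilities sum to 1, so expected_value reduces to a plain dot product, and the variance identity E(X^2) - E(X)^2 recovers the same 20:

assert np.isclose(p.sum(), 1.0)
print(np.dot(X, p))                       # 16.0
print(np.dot(X**2, p) - np.dot(X, p)**2)  # 276.0 - 256.0 = 20.0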
The error occurred because your df["P"] column was a string (object dtype) column, which NumPy cannot reduce with sum.
Minimal reproducible example:
import cudf
from cuml.neighbors import KNeighborsRegressor
d = {
'id':['a','b','c','d','e','f'],
'latitude':[50,-22,13,37,43,14],
'longitude':[3,-43,100,27,-4,121],
}
df = cudf.DataFrame(d)
knn = KNeighborsRegressor(n_neighbors = 4, metric = 'haversine')
knn.fit(df[['latitude','longitude']],df.index)
dists, nears = knn.kneighbors(df[['latitude','longitude']], return_distance = True)
This throws the error "number of landmark samples must be >= k". The whole trace is:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_33/1073358290.py in <module>
10 knn = KNeighborsRegressor(n_neighbors = 4, metric = 'haversine')
11 knn.fit(df[['latitude','longitude']],df.index)
---> 12 dists, nears = knn.kneighbors(df[['latitude','longitude']], return_distance = True)
/opt/conda/lib/python3.7/site-packages/cuml/internals/api_decorators.py in inner_get(*args, **kwargs)
584
585 # Call the function
--> 586 ret_val = func(*args, **kwargs)
587
588 return cm.process_return(ret_val)
cuml/neighbors/nearest_neighbors.pyx in cuml.neighbors.nearest_neighbors.NearestNeighbors.kneighbors()
cuml/neighbors/nearest_neighbors.pyx in cuml.neighbors.nearest_neighbors.NearestNeighbors._kneighbors()
cuml/neighbors/nearest_neighbors.pyx in cuml.neighbors.nearest_neighbors.NearestNeighbors._kneighbors_dense()
RuntimeError: exception occured! file=_deps/raft-src/cpp/include/raft/spatial/knn/detail/ball_cover.cuh line=326: number of landmark samples must be >= k
Obtained 64 stack frames
...
I have been trying hard to get around this error for days, but the only workaround I know is to convert the cuDF to a pandas dataframe and use sklearn, which works perfectly:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
d = {
'id':['a','b','c','d','e','f'],
'latitude':[50,-22,13,37,43,14],
'longitude':[3,-43,100,27,-4,121],
}
df = pd.DataFrame(d)
knn = KNeighborsRegressor(n_neighbors = 4, metric = 'haversine')
knn.fit(df[['latitude','longitude']],df.index)
dists, nears = knn.kneighbors(df[['latitude','longitude']], return_distance = True)
dists
gives us the distances array
Can you help me find a pure RAPIDS solution?
UPDATE: I found out that it works when the number of neighbors is <= (length of the total data) // 2.
UPDATE: It's a bug, and an appropriate issue has been opened here. We can pass algorithm='brute' as a workaround until the issue gets resolved.
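For reference, a minimal sketch of that workaround applied to the example above (assuming the algorithm parameter accepts 'brute' in your cuML version):

import cudf
from cuml.neighbors import KNeighborsRegressor

d = {
    'id': ['a', 'b', 'c', 'd', 'e', 'f'],
    'latitude': [50, -22, 13, 37, 43, 14],
    'longitude': [3, -43, 100, 27, -4, 121],
}
df = cudf.DataFrame(d)
# algorithm='brute' sidesteps the ball-cover index that raises the landmark error
knn = KNeighborsRegressor(n_neighbors=4, metric='haversine', algorithm='brute')
knn.fit(df[['latitude', 'longitude']], df.index)
dists, nears = knn.kneighbors(df[['latitude', 'longitude']], return_distance=True)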
I wrote this function to compute the normalized percentage correlation between two filter functions (with one shifted). The function works, but takes about 8 to 12 minutes depending on the number of elements in nbs. I would like to know if there is another way to make this operation faster. Here is my code:
import numpy as np
DT = 0.08
def corr_g(*nbs, Np=10000, sf=0.5):
    wb = 0.25  # bandwidth in Hz
    freq = (1/DT)*np.linspace(-0.5, 0.5-1/Np, Np)  # frequency vector
    dCg_norms = np.zeros((Np, len(nbs)))
    for idx, nb in enumerate(nbs):  # nb is the filter parameter
        d_k_vector = np.linspace(-Np*sf, Np*sf, Np)  # shift-index vector
        dCg = d_k_vector*0  # array to hold the correlation
        g = ((1+np.exp(-nb))**2)/((1+np.exp(-nb*(freq+wb)/wb))*(1+np.exp(nb*(freq-wb)/wb)))  # filter function
        for index2, d_k in enumerate(d_k_vector):  # loop through the shift indices
            for index, sth in enumerate(g):
                # form a new array from g using the shift index, keeping only values
                # within the limits of g, then accumulate the dot product
                if (index+d_k) < Np and (index+d_k) >= 0:
                    dCg[index2] += g[index] * g[index+int(d_k)]
        dCg_norm = dCg/np.max(dCg)*100  # normalized correlation
        dCg_norms[:, idx] = dCg_norm  # add to the preallocated array
    return dCg_norms
my_arr = corr_g(*[2,4,8,16])
import matplotlib.pyplot as plt
Np = 10000
DT = 0.08
d_k_vector = np.linspace(-5000, 5000, Np)
plt.plot(d_k_vector/(10000*DT)/0.25,my_arr[:,1])
You should not calculate the correlation yourself; it is better to use np.correlate(g, g, 'same'). There are small differences between your result and mine, and I am pretty sure the error is on your side.
def corr_g2(*nbs, Np=10000, sf=0.5):
    wb = 0.25  # bandwidth in Hz
    freq = (1/DT)*np.linspace(-0.5, 0.5-1/Np, Np)  # frequency vector
    dCg_norms = np.zeros((Np, len(nbs)))
    for idx, nb in enumerate(nbs):  # nb is the filter parameter
        g = ((1+np.exp(-nb))**2)/((1+np.exp(-nb*(freq+wb)/wb))*(1+np.exp(nb*(freq-wb)/wb)))  # filter function
        dCg = np.correlate(g, g, 'same')
        dCg_norm = dCg/np.max(dCg)*100  # normalized correlation
        dCg_norms[:, idx] = dCg_norm  # add to the preallocated array
    return dCg_norms
def main():
    my_arr = corr_g(*[2, 4], Np=Np)
    my_arr2 = corr_g2(*[2, 4], Np=Np)
    # import matplotlib.pyplot as plt
    # d_k_vector = np.linspace(-Np / 2, Np / 2 - 1, Np)
    # plt.plot(d_k_vector/(10000*DT)/0.25, my_arr[:, 1])
    # plt.plot(d_k_vector/(10000*DT)/0.25, my_arr2[:, 1])
    # plt.show()

if __name__ == '__main__':
    main()
Profiling results for Np=1000:
Line # Hits Time Per Hit % Time Line Contents
==============================================================
39 #do_profile()
40 def main():
41 1 14419637.0 14419637.0 100.0 my_arr = corr_g(*[2,4], Np=Np)
42 1 1598.0 1598.0 0.0 my_arr2 = corr_g2(*[2,4], Np=Np)
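If Np grows much larger still, an FFT-based autocorrelation is usually faster than np.correlate's direct method. A sketch, assuming SciPy is available and g is the filter vector from above (correlating g with itself equals convolving it with its reverse):

from scipy.signal import fftconvolve
dCg = fftconvolve(g, g[::-1], mode='same')  # matches np.correlate(g, g, 'same') up to floating-point error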
I am conducting the G-test in both R and Python, and I am getting different results; the results in Python appear to be wrong. Somehow I am misapplying the formula.
The data are:
prfs
      Sex
Pref   F  M
   B  29 17
   A   2 12
The R code is:
library(RVAideMemoire)
G.test(prfs)
G-test
data: prfs
G = 11.025, df = 1, p-value = 0.0008989
The Python code is:
stats.power_divergence(prfs, lambda_ = 'log-likelihood')
Power_divergenceResult(statistic=array([28.14366538, 0.86639163]), pvalue=array([1.12635722e-07, 3.51956200e-01]))
stats.power_divergence(prfs, lambda_ = 'log-likelihood', axis = None, ddof = 2)
Power_divergenceResult(statistic=29.07673602201342, pvalue=6.956736686069527e-08)
It's an old question, but the following answer may help:
obs = np.array([[29,17], [2,12]])
# G test with scipy:
from scipy.stats import *
g, p, dof, expctd = chi2_contingency(obs, lambda_="log-likelihood")
print("G={}; df={}; P={}".format(g, dof, p))
Output:
G=8.859368223179882; df=1; P=0.0029158847773319975
The values are similar to those obtained by the R method.
The reference for the above method is here.
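Note that scipy applies Yates' continuity correction to 2x2 tables by default, which is why the value above (G = 8.86) is lower than R's 11.025; passing correction=False should reproduce R's uncorrected statistic, and the same number falls out of the G formula computed by hand:

import numpy as np
from scipy.stats import chi2, chi2_contingency

obs = np.array([[29, 17], [2, 12]])

# uncorrected G test -- should match R's G.test (G = 11.025, p = 0.0009)
g, p, dof, expctd = chi2_contingency(obs, lambda_="log-likelihood", correction=False)
print(g, p)

# the same statistic by hand: G = 2 * sum(O * ln(O / E))
expected = np.outer(obs.sum(axis=1), obs.sum(axis=0)) / obs.sum()
G = 2 * np.sum(obs * np.log(obs / expected))
print(G, chi2.sf(G, df=1))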
I'm trying to write some Bayesian probit code using data augmentation. I can get it to work if I loop over the rows of the output matrix, but I'd like to vectorize it and do it all in one shot (presumably that's faster).
import numpy as np
from numpy import random
import statsmodels.api as sm
from scipy import stats
from scipy.stats import norm, truncnorm
##################################
### Create some simulated data ###
num_leg = 50
num_bills = 20
a = np.random.uniform(-1,1,num_bills).reshape(num_bills, 1)
b = np.random.uniform(-2,2,num_bills).reshape(num_bills, 1)
x = np.random.standard_normal(num_leg).reshape(num_leg, 1)
ystar_base = a + np.dot(b,x.T)
epsilon = np.random.standard_normal(num_leg * num_bills).reshape(num_bills, num_leg)
ystar = ystar_base + epsilon
y = 1*(ystar >0)
### Initialize some stuff I need ###
avec = [0]*num_bills # These are bill parameters
bvec = [0]*num_bills
betavec = [np.matrix(zip(avec,bvec))]
xvec = [0]*num_leg # these are legislator parameters
x_ones = np.ones(num_leg)
def init_y(mat):  # initialize a latent y matrix
    if mat == 1:
        return truncnorm.rvs(0, 10000)
    else:
        return truncnorm.rvs(-10000, 0)

vectorize_y = np.vectorize(init_y)
latent_y = np.matrix(vectorize_y(y))
burn = 500 # How long to run the MCMC
runs = 500
### define the functions ###
def sample_params(xnow, ynow):  # This is the function I'd like to vectorize
    if type(xnow) == list:
        xnow = np.array(xnow)
    if type(ynow) == list:
        ynow = np.array(ynow)
    ynow = ynow.T  # reshape(ynow.shape[0], 1)
    sigma = np.linalg.inv(np.dot(xnow.T, xnow))  ### This is the line that produces an error ###
    xy = np.dot(xnow.T, ynow)
    mu = np.dot(sigma, xy)  # this is just (x'x)inv x'y
    return np.random.multivariate_normal(np.array(mu).flatten(), sigma)
vecparams = np.vectorize(sample_params)
def get_mu(xnow, bnow):  # getting the updated mean to draw the latent ys
    if type(xnow) == list:
        xnow = np.array(xnow)
    if type(bnow) == list:
        bnow = np.array(bnow)
    mu = np.dot(xnow, bnow.T)
    mu = np.matrix(mu)
    return mu
def sample_y(mu, ynow):  # generate latent y matrix
    if ynow == 1:
        a, b = (0 - mu), (10000 - mu)
    else:
        a, b = (-10000 - mu), (0 - mu)
    return truncnorm.rvs(a, b)
vector_sample = np.vectorize(sample_y) # I'd like to be able to do something like this
### Here's the MCMC loop with the internal loop over rows(bills)
for i in range(burn+runs):
    this_beta = []
    this_x = []
    this_y = []
    for j in range(num_bills):  # I'd like to get rid of this loop
        ex = zip(x_ones, x)
        newbeta = sample_params(ex, latent_y[j])
        this_beta.append(newbeta)
    #ex = np.array(zip(x_ones, x))
    #this_beta = vecparams(ex, latent_y[:,])  # and call the vectorized function here
    betavec.append(this_beta)
    # Note, I can vectorize the latent outputs easily enough here
    mean = get_mu(ex, betavec[-1])
    latent_y = np.matrix(vector_sample(mean, np.matrix(y).T).T.reshape(latent_y.shape[0], latent_y.shape[1]))
### Now a bit of code to check to see if I've recovered what I want ###
test_beta = [zip(*(z)) for z in betavec[burn:]]
test_a = np.array([z[0] for z in test_beta])
test_b = np.array([z[1] for z in test_beta])
amean = test_a.sum(axis = 0)/float(runs)
bmean = test_b.sum(axis = 0)/float(runs)
print 'a mean'
print np.corrcoef([amean, np.array(a)])
print
print 'b mean'
print np.corrcoef([bmean, np.array(b)])
If I comment out the loop and use the commented out lines just above, I get the following error at the line I indicated earlier (the one that defines sigma):
LinAlgError: 0-dimensional array given. Array must be at least two-dimensional