statsmodels ValueError - python

I am having problem for my project, i have extracted data from twitter, saved it as csv and also performed data sentiment analysis but when i am trying to plot them i am unable to.
#statmodels OLS first
y, X = dmatrices('retweet_count_l ~ surge_pricing + free_rides + promo+ driver + food + controversy + regulations', data=training, return_type='dataframe')
# Define the model from above Patsy-created variables, using Statsmodels
print sm.OLS(y,X).fit().summary()
print sm.OLS(y,X).fit().params
print 'r sqd is : ', sm.OLS(y,X).fit().rsquared
rainbow = sm.stats.linear_rainbow(sm.OLS(y,X).fit())
print 'Rainbow Test for Linearity is ', rainbow
y_hat, X_hat = dmatrices('retweet_count_l ~ surge_pricing + free_rides + promo + driver + food + controversy + regulations', data=testing, return_type='dataframe')
y_pred = sm.OLS(y,X).fit().predict(X_hat)
testing['retweet_pred_smols'] = pd.Series(y_pred)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-7-b5c392dca77b> in <module>()
2 y, X = dmatrices('retweet_count_l ~ surge_pricing + free_rides + promo + driver + food + controversy + regulations', data=training, return_type='dataframe')
3 # Define the model from above Patsy-created variables, using Statsmodels
----> 4 model = sm.OLS(y,X)
5 results = model.fit()
6 print(results.summary())
/usr/lib/python2.7/dist-packages/statsmodels/regression/linear_model.pyc in __init__(self, endog, exog, missing, hasconst)
481 def __init__(self, endog, exog=None, missing='none', hasconst=None):
482 super(OLS, self).__init__(endog, exog, missing=missing,
--> 483 hasconst=hasconst)
484
485 def loglike(self, params):
/usr/lib/python2.7/dist-packages/statsmodels/regression/linear_model.pyc in __init__(self, endog, exog, weights, missing, hasconst)
383 weights = weights.squeeze()
384 super(WLS, self).__init__(endog, exog, missing=missing,
--> 385 weights=weights, hasconst=hasconst)
386 nobs = self.exog.shape[0]
387 weights = self.weights
/usr/lib/python2.7/dist-packages/statsmodels/regression/linear_model.pyc in __init__(self, endog, exog, **kwargs)
77 """
78 def __init__(self, endog, exog, **kwargs):
---> 79 super(RegressionModel, self).__init__(endog, exog, **kwargs)
80 self._data_attr.extend(['pinv_wexog', 'wendog', 'wexog', 'weights'])
81
/usr/lib/python2.7/dist-packages/statsmodels/base/model.pyc in __init__(self, endog, exog, **kwargs)
135 def __init__(self, endog, exog=None, **kwargs):
136 super(LikelihoodModel, self).__init__(endog, exog, **kwargs)
--> 137 self.initialize()
138
139 def initialize(self):
/usr/lib/python2.7/dist-packages/statsmodels/regression/linear_model.pyc in initialize(self)
86 # overwrite nobs from class Model:
87 self.nobs = float(self.wexog.shape[0])
---> 88 self.rank = rank(self.exog)
89 self.df_model = float(self.rank - self.k_constant)
90 self.df_resid = self.nobs - self.rank
/usr/lib/python2.7/dist-packages/statsmodels/tools/tools.pyc in rank(X, cond)
380 if len(X.shape) == 2:
381 D = svdvals(X)
--> 382 return int(np.add.reduce(np.greater(D / D.max(), cond).astype(np.int32)))
383 else:
384 return int(not np.alltrue(np.equal(X, 0.)))
/usr/local/lib/python2.7/dist-packages/numpy/core/_methods.pyc in _amax(a, axis, out, keepdims)
24 # small reductions
25 def _amax(a, axis=None, out=None, keepdims=False):
---> 26 return umr_maximum(a, axis, None, out, keepdims)
27
28 def _amin(a, axis=None, out=None, keepdims=False):
ValueError: zero-size array to reduction operation maximum which has no identity

I can reproduce the error with:
In [224]: np.array([]).max()
...
ValueError: zero-size array to reduction operation maximum which has no identity
In [225]:
So tracing D.max() back ...
sm.OLS(y,X)
X must be be zero-size.
So you need to examine X. What is it's .shape? Try printing it.

I had the same issue. It was because I had some 'nan' values in my columns. After filling them with '0's the problem was solved.

Related

OLS Statsmodels formula: Returns an ValueError: zero-size array to reduction operation maximum which has no identity

Hey I am doing multiple OLS regressions for some cross-sectional data iterating through the months. I encounter for the forth month a ValueError: zero-size array to reduction operation maximum which has no identity. But I do not know why. The data has no NaNs, I have tested this with dropna since it was suggested in another Question Link:
df_month.dropna(how='all')
And the zero in each months data is due to the normalization of the data, which does not cause any trouble during the first three iterations only in the fourth. What's also weird is that if I stop the loop before the OLS regression is executed in the fourth line and then run the regression again by hand in another cell it just works fine. Could this issue be due to the storing?
Here is my code an data to replicate the error:
import statsmodels.formula.api as smf #ols (minor letters)
import pandas as pd
import numpy as np
df = pd.read_csv('df_all.csv', index_col='Instrument', sep=',', decimal='.')
df.drop(columns='Unnamed: 0', inplace=True)
#Creates an array with all the years to iterate through single year
years = df_all['Date'].dt.year.unique()
df_store = pd.DataFrame(index=[], columns=['year', 'month', 'R2_adj'])
for year in years:
df_year = df_all[df_all['Date'].dt.year == year]
df_year_t1 = df_all[df_all['Date'].dt.year == year+1]
Jan_date = df_year['Date'][0]
year_start = df_year[(df_year['Date'] == Jan_date) & (df_year['HQ'] == 'United States of America')
& (df_year['ESG'] > 0)]
year_start_firms = year_start.index.unique()
df_year_firms = df_year[['Date', 'eTR', 'MC', 'ESG']].loc[year_start_firms]
df_year_t1 = df_year_t1[['Date', 'eTR', 'MC', 'ESG']]
print(year)
print(" ")
#Normalizes the ESG Scores to the interval (0; 1) and substitues it in; 0.5 is the mean
df = df_year_firms[['ESG']]
Normalized_ESG_year = (df - df.min()) / (df.max() - df.min())
df_year_firms_norm = df_year_firms
df_year_firms_norm[['ESG_norm']] = Normalized_ESG_year.values
df_year_firms_norm = df_year_firms_norm.drop('ESG', axis=1)
df_year_firms_norm = df_year_firms_norm.ffill(axis=0)
df_year_firms_norm.loc[:, 'Month'] = df_year_firms_norm['Date'].dt.month.values.reshape(len(df_year_firms_norm), 1)
df_year_t1.loc[:, 'Month'] = df_year_t1['Date'].dt.month.values.reshape(len(df_year_t1), 1)
data = pd.merge(df_year_firms_norm, df_year_t1.iloc[:, [0,1,-1]], how='inner', on=['Instrument', 'Month'], suffixes=('_t', '_t1'))
#Resets index
data.reset_index(inplace=True)
#Monthwise iteration within the year loop
for month in range(1, 13, 1):
df_month = data[data['Month'] == month]
#ols by statsmodels.formula.api = smf (ols with minor letters)
#DataFrame input
#smf not sm (statsmodels.formula.api instead of statsmodels.api)
ESG_ols = smf.ols(formula = 'eTR_t1 ~ ESG_norm', data=df_month).fit(cov_type='HAC',cov_kwds={'maxlags':0})
ESG_ols_tstat = ESG_ols.tvalues
ESG_ols_coeff = ESG_ols.params
results_df = pd.DataFrame({
'ESG_ols_coeff':ESG_ols_coeff, 'ESG_ols_tstat':ESG_ols_tstat},
index = ['Intercept', 'ESG'])
#Produces the table
df_output = summary_col(
results=[ESG_ols], stars=True, float_format='%0.4f',
model_names=['ESG_ols'],
info_dict={'N':lambda x: "{0:d}".format(int(x.nobs))},
regressor_order = ['Intercept', 'ESG_norm'])
helper = pd.DataFrame(data=[[year, month, ESG_ols.rsquared_adj]],
columns=['year', 'month', 'R2_adj'])
df_store = df_store.append(helper)
print(month)
break
df_store
I am also happy for any suggestions in efficiency!
As requested, here is the full Traceback:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_8408/1348884998.py in <module>
91 # break
92 #smf not sm (statsmodels.formula.api instead of statsmodels.api)
---> 93 ESG_ols = smf.ols(formula = 'eTR_t1 ~ ESG_norm', data=df_month).fit(cov_type='HAC',cov_kwds={'maxlags':1})
94 #!Assumption: maxlags=0 should be reasonable since we do not have any TS analysis, right?
95 ESG_ols_tstat = ESG_ols.tvalues
~\anaconda3\lib\site-packages\statsmodels\base\model.py in from_formula(cls, formula, data, subset, drop_cols, *args, **kwargs)
193 'formula': formula, # attach formula for unpckling
194 'design_info': design_info})
--> 195 mod = cls(endog, exog, *args, **kwargs)
196 mod.formula = formula
197
~\anaconda3\lib\site-packages\statsmodels\regression\linear_model.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
870 def __init__(self, endog, exog=None, missing='none', hasconst=None,
871 **kwargs):
--> 872 super(OLS, self).__init__(endog, exog, missing=missing,
873 hasconst=hasconst, **kwargs)
874 if "weights" in self._init_keys:
~\anaconda3\lib\site-packages\statsmodels\regression\linear_model.py in __init__(self, endog, exog, weights, missing, hasconst, **kwargs)
701 else:
702 weights = weights.squeeze()
--> 703 super(WLS, self).__init__(endog, exog, missing=missing,
704 weights=weights, hasconst=hasconst, **kwargs)
705 nobs = self.exog.shape[0]
~\anaconda3\lib\site-packages\statsmodels\regression\linear_model.py in __init__(self, endog, exog, **kwargs)
188 """
189 def __init__(self, endog, exog, **kwargs):
--> 190 super(RegressionModel, self).__init__(endog, exog, **kwargs)
191 self._data_attr.extend(['pinv_wexog', 'weights'])
192
~\anaconda3\lib\site-packages\statsmodels\base\model.py in __init__(self, endog, exog, **kwargs)
235
236 def __init__(self, endog, exog=None, **kwargs):
--> 237 super(LikelihoodModel, self).__init__(endog, exog, **kwargs)
238 self.initialize()
239
~\anaconda3\lib\site-packages\statsmodels\base\model.py in __init__(self, endog, exog, **kwargs)
75 missing = kwargs.pop('missing', 'none')
76 hasconst = kwargs.pop('hasconst', None)
---> 77 self.data = self._handle_data(endog, exog, missing, hasconst,
78 **kwargs)
79 self.k_constant = self.data.k_constant
~\anaconda3\lib\site-packages\statsmodels\base\model.py in _handle_data(self, endog, exog, missing, hasconst, **kwargs)
99
100 def _handle_data(self, endog, exog, missing, hasconst, **kwargs):
--> 101 data = handle_data(endog, exog, missing, hasconst, **kwargs)
102 # kwargs arrays could have changed, easier to just attach here
103 for key in kwargs:
~\anaconda3\lib\site-packages\statsmodels\base\data.py in handle_data(endog, exog, missing, hasconst, **kwargs)
670
671 klass = handle_data_class_factory(endog, exog)
--> 672 return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
673 **kwargs)
~\anaconda3\lib\site-packages\statsmodels\base\data.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
85 self.const_idx = None
86 self.k_constant = 0
---> 87 self._handle_constant(hasconst)
88 self._check_integrity()
89 self._cache = {}
~\anaconda3\lib\site-packages\statsmodels\base\data.py in _handle_constant(self, hasconst)
129 # detect where the constant is
130 check_implicit = False
--> 131 exog_max = np.max(self.exog, axis=0)
132 if not np.isfinite(exog_max).all():
133 raise MissingDataError('exog contains inf or nans')
<__array_function__ internals> in amax(*args, **kwargs)
~\anaconda3\lib\site-packages\numpy\core\fromnumeric.py in amax(a, axis, out, keepdims, initial, where)
2731 5
2732 """
-> 2733 return _wrapreduction(a, np.maximum, 'max', axis, None, out,
2734 keepdims=keepdims, initial=initial, where=where)
2735
~\anaconda3\lib\site-packages\numpy\core\fromnumeric.py in _wrapreduction(obj, ufunc, method, axis, dtype, out, **kwargs)
85 return reduction(axis=axis, out=out, **passkwargs)
86
---> 87 return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
88
89
ValueError: zero-size array to reduction operation maximum which has no identity
I was reluctant to use this approach but it works and I still get results even for those month where there is supposed to be a lack of data.
I found the idea here Overcom ValueError for empty array
try: #Catches the upcoming Value Error
results = smf.ols(formula = 'eTR_t1 ~ ESG_norm', data=df_month).fit(cov_type='HAC',cov_kwds={'maxlags':0})
except ValueError:
pass
Thank you #Josef for the suggestions and the help.

Regression ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data)

I have a data frame 'df' that has missing column values. I want to fill in the missing/NaN values in the Avg Monthly Long Distance Charges column through prediction (regression) using the other column values. Then, replace the NaN values with the new values found.
I received the following error message when executing my code. Is there something that I am doing wrong?
Data frame: 'df'
Customer ID,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,Longitude,Number of Referrals,Tenure in Months,Offer,Phone Service,Avg Monthly Long Distance Charges,Multiple Lines,Internet Service,Internet Type,Avg Monthly GB Download,Online Security,Online Backup,Device Protection Plan,Premium Tech Support,Streaming TV,Streaming Movies,Streaming Music,Unlimited Data,Contract,Paperless Billing,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status,Churn Category,Churn Reason
0002-ORFBO,Female,37,Yes,0,Frazier Park,93225,34.827662,-118.999073,2,9,None,Yes,42.39,No,Yes,Cable,16,No,Yes,No,Yes,Yes,No,No,Yes,One Year,Yes,Credit Card,65.6,593.3,0,0,381.51,974.81,Stayed,,
0003-MKNFE,Male,46,No,0,Glendale,91206,34.162515,-118.203869,0,9,None,Yes,10.69,Yes,Yes,Cable,10,No,No,No,No,No,Yes,Yes,No,Month-to-Month,No,Credit Card,-4,542.4,38.33,10,96.21,610.28,Stayed,,
0004-TLHLJ,Male,50,No,0,Costa Mesa,92627,33.645672,-117.922613,0,4,Offer E,Yes,33.65,No,Yes,Fiber Optic,30,No,No,Yes,No,No,No,No,Yes,Month-to-Month,Yes,Bank Withdrawal,73.9,280.85,0,0,134.6,415.45,Churned,Competitor,Competitor had better devices
0011-IGKFF,Male,78,Yes,0,Martinez,94553,38.014457,-122.115432,1,13,Offer D,Yes,27.82,No,Yes,Fiber Optic,4,No,Yes,Yes,No,Yes,Yes,No,Yes,Month-to-Month,Yes,Bank Withdrawal,98,1237.85,0,0,361.66,1599.51,Churned,Dissatisfaction,Product dissatisfaction
0013-EXCHZ,Female,75,Yes,0,Camarillo,93010,34.227846,-119.079903,3,3,None,Yes,7.38,No,Yes,Fiber Optic,11,No,No,No,Yes,Yes,No,No,Yes,Month-to-Month,Yes,Credit Card,83.9,267.4,0,0,22.14,289.54,Churned,Dissatisfaction,Network reliability
0013-MHZWF,Female,23,No,3,Midpines,95345,37.581496,-119.972762,0,9,Offer E,Yes,16.77,No,Yes,Cable,73,No,No,No,Yes,Yes,Yes,Yes,Yes,Month-to-Month,Yes,Credit Card,69.4,571.45,0,0,150.93,722.38,Stayed,,
0013-SMEOE,Female,67,Yes,0,Lompoc,93437,34.757477,-120.550507,1,71,Offer A,Yes,9.96,No,Yes,Fiber Optic,14,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Two Year,Yes,Bank Withdrawal,109.7,7904.25,0,0,707.16,8611.41,Stayed,,
0014-BMAQU,Male,52,Yes,0,Napa,94558,38.489789,-122.27011,8,63,Offer B,Yes,12.96,Yes,Yes,Fiber Optic,7,Yes,No,No,Yes,No,No,No,No,Two Year,Yes,Credit Card,84.65,5377.8,0,20,816.48,6214.28,Stayed,,
0015-UOCOJ,Female,68,No,0,Simi Valley,93063,34.296813,-118.685703,0,7,Offer E,Yes,10.53,No,Yes,DSL,21,Yes,No,No,No,No,No,No,Yes,Two Year,Yes,Bank Withdrawal,48.2,340.35,0,0,73.71,414.06,Stayed,,
0016-QLJIS,Female,43,Yes,1,Sheridan,95681,38.984756,-121.345074,3,65,None,Yes,28.46,Yes,Yes,Cable,14,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Two Year,Yes,Credit Card,90.45,5957.9,0,0,1849.9,7807.8,Stayed,,
0017-DINOC,Male,47,No,0,Rancho Santa Fe,92091,32.99356,-117.207121,0,54,None,No,,,Yes,Cable,10,Yes,No,No,Yes,Yes,No,No,Yes,Two Year,No,Credit Card,45.2,2460.55,0,0,0,2460.55,Stayed,,
0017-IUDMW,Female,25,Yes,2,Sunnyvale,94086,37.378541,-122.020456,2,72,None,Yes,16.01,Yes,Yes,Fiber Optic,59,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Two Year,Yes,Credit Card,116.8,8456.75,0,0,1152.72,9609.47,Stayed,,
0018-NYROU,Female,58,Yes,0,Antelope,95843,38.715498,-121.363411,0,5,None,Yes,18.65,No,Yes,Fiber Optic,10,No,No,No,No,No,No,No,Yes,Month-to-Month,Yes,Bank Withdrawal,68.95,351.5,0,0,93.25,444.75,Stayed,,
0019-EFAEP,Female,32,No,0,La Mesa,91942,32.782501,-117.01611,0,72,Offer A,Yes,2.25,Yes,Yes,Fiber Optic,16,Yes,Yes,Yes,No,Yes,No,No,Yes,Two Year,Yes,Bank Withdrawal,101.3,7261.25,0,0,162,7423.25,Stayed,,
0019-GFNTW,Female,39,No,0,Los Olivos,93441,34.70434,-120.02609,0,56,None,No,,,Yes,DSL,19,Yes,Yes,Yes,Yes,No,No,No,Yes,Two Year,No,Bank Withdrawal,45.05,2560.1,0,0,0,2560.1,Stayed,,
0020-INWCK,Female,58,Yes,2,Woodlake,93286,36.464635,-119.094348,9,71,Offer A,Yes,27.26,Yes,Yes,Fiber Optic,12,No,Yes,Yes,No,No,Yes,Yes,Yes,Two Year,Yes,Credit Card,95.75,6849.4,0,0,1935.46,8784.86,Stayed,,
0020-JDNXP,Female,52,Yes,1,Point Reyes Station,94956,38.060264,-122.830646,0,34,None,No,,,Yes,DSL,20,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,One Year,No,Credit Card,61.25,1993.2,0,0,0,1993.2,Stayed,,
0021-IKXGC,Female,72,No,0,San Marcos,92078,33.119028,-117.166036,0,1,Offer E,Yes,7.77,Yes,Yes,Fiber Optic,22,No,No,No,No,No,No,No,Yes,One Year,Yes,Bank Withdrawal,72.1,72.1,0,0,7.77,79.87,Joined,,
0022-TCJCI,Male,79,No,0,Daly City,94015,37.680844,-122.48131,0,45,None,Yes,10.67,No,Yes,DSL,17,Yes,No,Yes,No,No,Yes,No,Yes,One Year,No,Credit Card,62.7,2791.5,0,0,480.15,3271.65,Churned,Dissatisfaction,Limited range of services
My code:
# Let X = predictor variable and y = target variable
X2 = pd.DataFrame(df[['Monthly Charge', 'Total Revenue']])
y2 = pd.DataFrame(df[['Multiple Lines']])
# Add a constant variable to the predictor variables
X = sm.add_constant(X2)
model02 = sm.OLS(y2, X2).fit()
df['Multiple Lines'].fillna(sm.OLS(y2, X2).fit(), inplace=True)
Error Message: ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/var/folders/wv/42dn23fd1cb0czpvqdnb6zw00000gn/T/ipykernel_15181/1879921247.py in <module>
5 # Add a constant variable to the predictor variables
6 X = sm.add_constant(X2)
----> 7 model02 = sm.OLS(y2, X2).fit()
8 df['Multiple Lines'].fillna(sm.OLS(y2, X2).fit(), inplace=True)
~/opt/miniconda3/lib/python3.9/site-packages/statsmodels/regression/linear_model.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
888 "An exception will be raised in the next version.")
889 warnings.warn(msg, ValueWarning)
--> 890 super(OLS, self).__init__(endog, exog, missing=missing,
891 hasconst=hasconst, **kwargs)
892 if "weights" in self._init_keys:
~/opt/miniconda3/lib/python3.9/site-packages/statsmodels/regression/linear_model.py in __init__(self, endog, exog, weights, missing, hasconst, **kwargs)
715 else:
716 weights = weights.squeeze()
--> 717 super(WLS, self).__init__(endog, exog, missing=missing,
718 weights=weights, hasconst=hasconst, **kwargs)
719 nobs = self.exog.shape[0]
~/opt/miniconda3/lib/python3.9/site-packages/statsmodels/regression/linear_model.py in __init__(self, endog, exog, **kwargs)
189 """
190 def __init__(self, endog, exog, **kwargs):
--> 191 super(RegressionModel, self).__init__(endog, exog, **kwargs)
192 self._data_attr.extend(['pinv_wexog', 'wendog', 'wexog', 'weights'])
193
~/opt/miniconda3/lib/python3.9/site-packages/statsmodels/base/model.py in __init__(self, endog, exog, **kwargs)
265
266 def __init__(self, endog, exog=None, **kwargs):
--> 267 super().__init__(endog, exog, **kwargs)
268 self.initialize()
269
~/opt/miniconda3/lib/python3.9/site-packages/statsmodels/base/model.py in __init__(self, endog, exog, **kwargs)
90 missing = kwargs.pop('missing', 'none')
91 hasconst = kwargs.pop('hasconst', None)
---> 92 self.data = self._handle_data(endog, exog, missing, hasconst,
93 **kwargs)
94 self.k_constant = self.data.k_constant
~/opt/miniconda3/lib/python3.9/site-packages/statsmodels/base/model.py in _handle_data(self, endog, exog, missing, hasconst, **kwargs)
130
131 def _handle_data(self, endog, exog, missing, hasconst, **kwargs):
--> 132 data = handle_data(endog, exog, missing, hasconst, **kwargs)
133 # kwargs arrays could have changed, easier to just attach here
134 for key in kwargs:
~/opt/miniconda3/lib/python3.9/site-packages/statsmodels/base/data.py in handle_data(endog, exog, missing, hasconst, **kwargs)
671
672 klass = handle_data_class_factory(endog, exog)
--> 673 return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
674 **kwargs)
~/opt/miniconda3/lib/python3.9/site-packages/statsmodels/base/data.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
80 self.orig_endog = endog
81 self.orig_exog = exog
---> 82 self.endog, self.exog = self._convert_endog_exog(endog, exog)
83
84 self.const_idx = None
~/opt/miniconda3/lib/python3.9/site-packages/statsmodels/base/data.py in _convert_endog_exog(self, endog, exog)
505 exog = exog if exog is None else np.asarray(exog)
506 if endog.dtype == object or exog is not None and exog.dtype == object:
--> 507 raise ValueError("Pandas data cast to numpy dtype of object. "
508 "Check input data with np.asarray(data).")
509 return super(PandasData, self)._convert_endog_exog(endog, exog)
ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).
your y variable (y2) is of type object. Before feeding into the model, convert the y variable to int/float.
For example
model02 = sm.OLS(pd.get_dummies(y2), X2).fit()
pd.get_dummies is one option. find the one which is more appropriate to your use case . For ex: Label encoding

Keras inheriting built in layers?

I am new to Keras.
Keras's docs show how to make a custom layer where you can have full control over the trainable weights.
My question is how can one just extend an existing layer?
For example, the BatchNormalization layer does not have an activation option, where in practice one may often add an activation function following batch normalization.
This attempt does not work:
class BatchNormalizationActivation(keras.layers.BatchNormalization):
def __init__(self, bn_params={}, activation=keras.activations.relu, act_params={}):
super(BatchNormalizationActivation, self).__init__(**bn_params)
self.act = activation
def call(x):
x = super(BatchNormalizationActivation, self).call(x)
return self.act(x, **act_params)
BatchNormalizationActivation()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-182-6e4f0495a112> in <module>()
----> 1 BatchNormalizationActivation()
<ipython-input-181-2d5c8337234a> in __init__(self, bn_params, activation, act_params)
3
4 def __init__(self, bn_params={}, activation=keras.activations.relu, act_params={}):
----> 5 super(BatchNormalizationActivation, self).__init__(**bn_params)
6 self.act = activation
7
/usr/local/lib/python3.6/site-packages/tensorflow/python/keras/_impl/keras/layers/normalization.py in __init__(self, axis, momentum, epsilon, center, scale, beta_initializer, gamma_initializer, moving_mean_initializer, moving_variance_initializer, beta_regularizer, gamma_regularizer, beta_constraint, gamma_constraint, **kwargs)
105 beta_constraint=constraints.get(beta_constraint),
106 gamma_constraint=constraints.get(gamma_constraint),
--> 107 **kwargs
108 )
109
/usr/local/lib/python3.6/site-packages/tensorflow/python/layers/normalization.py in __init__(self, axis, momentum, epsilon, center, scale, beta_initializer, gamma_initializer, moving_mean_initializer, moving_variance_initializer, beta_regularizer, gamma_regularizer, beta_constraint, gamma_constraint, renorm, renorm_clipping, renorm_momentum, fused, trainable, virtual_batch_size, adjustment, name, **kwargs)
144 **kwargs):
145 super(BatchNormalization, self).__init__(
--> 146 name=name, trainable=trainable, **kwargs)
147 if isinstance(axis, list):
148 self.axis = axis[:]
/usr/local/lib/python3.6/site-packages/tensorflow/python/keras/_impl/keras/engine/base_layer.py in __init__(self, **kwargs)
147 super(Layer, self).__init__(
148 name=name, dtype=dtype, trainable=trainable,
--> 149 activity_regularizer=kwargs.get('activity_regularizer'))
150 self._uses_inputs_arg = True
151
/usr/local/lib/python3.6/site-packages/tensorflow/python/layers/base.py in __init__(self, trainable, name, dtype, activity_regularizer, **kwargs)
130 self._graph = None # Will be set at build time.
131 self._dtype = None if dtype is None else dtypes.as_dtype(dtype).name
--> 132 self._call_fn_args = estimator_util.fn_args(self.call)
133 self._compute_previous_mask = ('mask' in self._call_fn_args or
134 hasattr(self, 'compute_mask'))
/usr/local/lib/python3.6/site-packages/tensorflow/python/estimator/util.py in fn_args(fn)
60 args = tf_inspect.getfullargspec(fn).args
61 if _is_bounded_method(fn):
---> 62 args.remove('self')
63 return tuple(args)
64
ValueError: list.remove(x): x not in list

statsmodel: simulate data and run simple linear regression

I'm new to python statsmodels package. I'm trying to simulate some data linearly related to log(x) and run a simple linear regression using statsmodels formula interface. Here are the codes:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
B0 = 3
B1 = 0.5
x = np.linspace(10, 1e4, num = 1000)
epsilon = np.random.normal(0,3, size=1000)
y=B0 + B1*np.log(x)+epsilon
df1 = pd.DataFrame({'Y':y, 'X':x})
model = smf.OLS ('Y~np.log(X)', data=df1).fit()
I got error below:
ValueError Traceback (most recent call last)
<ipython-input-34-c0ab32ca2acf> in <module>()
7 y=B0 + B1*np.log(X)+epsilon
8 df1 = pd.DataFrame({'Y':y, 'X':X})
----> 9 smf.OLS ('Y~np.log(X)', data=df1)
/Users/tiger/anaconda/lib/python3.5/site-packages/statsmodels/regression/linear_model.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
689 **kwargs):
690 super(OLS, self).__init__(endog, exog, missing=missing,
--> 691 hasconst=hasconst, **kwargs)
692 if "weights" in self._init_keys:
693 self._init_keys.remove("weights")
/Users/tiger/anaconda/lib/python3.5/site-packages/statsmodels/regression/linear_model.py in __init__(self, endog, exog, weights, missing, hasconst, **kwargs)
584 weights = weights.squeeze()
585 super(WLS, self).__init__(endog, exog, missing=missing,
--> 586 weights=weights, hasconst=hasconst, **kwargs)
587 nobs = self.exog.shape[0]
588 weights = self.weights
/Users/tiger/anaconda/lib/python3.5/site-packages/statsmodels/regression/linear_model.py in __init__(self, endog, exog, **kwargs)
89 """
90 def __init__(self, endog, exog, **kwargs):
---> 91 super(RegressionModel, self).__init__(endog, exog, **kwargs)
92 self._data_attr.extend(['pinv_wexog', 'wendog', 'wexog', 'weights'])
93
/Users/tiger/anaconda/lib/python3.5/site-packages/statsmodels/base/model.py in __init__(self, endog, exog, **kwargs)
184
185 def __init__(self, endog, exog=None, **kwargs):
--> 186 super(LikelihoodModel, self).__init__(endog, exog, **kwargs)
187 self.initialize()
188
/Users/tiger/anaconda/lib/python3.5/site-packages/statsmodels/base/model.py in __init__(self, endog, exog, **kwargs)
58 hasconst = kwargs.pop('hasconst', None)
59 self.data = self._handle_data(endog, exog, missing, hasconst,
---> 60 **kwargs)
61 self.k_constant = self.data.k_constant
62 self.exog = self.data.exog
/Users/tiger/anaconda/lib/python3.5/site-packages/statsmodels/base/model.py in _handle_data(self, endog, exog, missing, hasconst, **kwargs)
82
83 def _handle_data(self, endog, exog, missing, hasconst, **kwargs):
---> 84 data = handle_data(endog, exog, missing, hasconst, **kwargs)
85 # kwargs arrays could have changed, easier to just attach here
86 for key in kwargs:
/Users/tiger/anaconda/lib/python3.5/site-packages/statsmodels/base/data.py in handle_data(endog, exog, missing, hasconst, **kwargs)
562 exog = np.asarray(exog)
563
--> 564 klass = handle_data_class_factory(endog, exog)
565 return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
566 **kwargs)
/Users/tiger/anaconda/lib/python3.5/site-packages/statsmodels/base/data.py in handle_data_class_factory(endog, exog)
551 else:
552 raise ValueError('unrecognized data structures: %s / %s' %
--> 553 (type(endog), type(exog)))
554 return klass
555
ValueError: unrecognized data structures: <class 'str'> / <class 'NoneType'>
I checked the documentations and everything seems to be right. Spent long time trying to understand why I got these errors but could not figure out. Help is very much appreciated.
In statsmodels.formula.api the ols method is lowercase.
In statsmodels.api the OLS is all caps.
In your case you need...
model = smf.ols('Y~np.log(X)', data=df1).fit()

Negative binomial model cannot find starting position to sample

I am having difficulties running a PYMC3 model when the observed data is discrete. Oddly, if the observed data contains the value zero (0.), the model will run.
I've read in other posts that that suggest using
start = pm.find_MAP(fmin=scipy.optimize.fmin_powell) but that does not resolve the issue.
pymc3.__version__ = '3.0'
theano.__version__ = '0.7.0.dev-RELEASE'
numpy.__version__ = '1.8.0rc1'
Python 2.7.10
See iPython notebook
The code and error are below.
import pymc3 as pm
data = [6.0,12.0,12.0,46.0,5.0,11.0,11.0,39.0,4.0,10.0,25.0,11.0,8.0,5.0,10.0,2.0,30.0,21.0]
with pm.Model() as model:
alpha = pm.Uniform('alpha', lower=0, upper=100)
mu = pm.Uniform('mu', lower=0, upper=100)
y_pred = pm.NegativeBinomial('y_pred', mu=mu, alpha=alpha)
y_est = pm.NegativeBinomial('y_est',
mu=mu,
alpha=alpha,
observed=data)
start = pm.find_MAP()
step = pm.Metropolis()
trace = pm.sample(20000, step, start, progressbar=True)
The error I get is:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-5-b9f2264fccfc> in <module>()
14 observed=data)
15
---> 16 start = pm.find_MAP()
17
18 step = pm.Metropolis()
/Library/Python/2.7/site-packages/pymc3/tuning/starting.pyc in find_MAP(start, vars, fmin, return_raw, disp, model, *args, **kwargs)
79 if 'fprime' in getargspec(fmin).args:
80 r = fmin(logp_o, bij.map(
---> 81 start), fprime=grad_logp_o, disp=disp, *args, **kwargs)
82 else:
83 r = fmin(logp_o, bij.map(start), disp=disp, *args, **kwargs)
/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/scipy/optimize/optimize.pyc in fmin_bfgs(f, x0, fprime, args, gtol, norm, epsilon, maxiter, full_output, disp, retall, callback)
775 'return_all': retall}
776
--> 777 res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)
778
779 if full_output:
/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/scipy/optimize/optimize.pyc in _minimize_bfgs(fun, x0, args, jac, callback, gtol, norm, eps, maxiter, disp, return_all, **unknown_options)
830 else:
831 grad_calls, myfprime = wrap_function(fprime, args)
--> 832 gfk = myfprime(x0)
833 k = 0
834 N = len(x0)
/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/scipy/optimize/optimize.pyc in function_wrapper(*wrapper_args)
279 def function_wrapper(*wrapper_args):
280 ncalls[0] += 1
--> 281 return function(*(wrapper_args + args))
282
283 return ncalls, function_wrapper
/Library/Python/2.7/site-packages/pymc3/tuning/starting.pyc in grad_logp_o(point)
74
75 def grad_logp_o(point):
---> 76 return nan_to_num(-dlogp(point))
77
78 # Check to see if minimization function actually uses the gradient
/Library/Python/2.7/site-packages/pymc3/blocking.pyc in __call__(self, x)
117
118 def __call__(self, x):
--> 119 return self.fa(self.fb(x))
/Library/Python/2.7/site-packages/pymc3/model.pyc in __call__(self, state)
397
398 def __call__(self, state):
--> 399 return self.f(**state)
400
401 class LoosePointFunc(object):
/Library/Python/2.7/site-packages/theano/compile/function_module.pyc in __call__(self, *args, **kwargs)
862 node=self.fn.nodes[self.fn.position_of_error],
863 thunk=thunk,
--> 864 storage_map=getattr(self.fn, 'storage_map', None))
865 else:
866 # old-style linkers raise their own exceptions
/Library/Python/2.7/site-packages/theano/gof/link.pyc in raise_with_op(node, thunk, exc_info, storage_map)
312 # extra long error message in that case.
313 pass
--> 314 reraise(exc_type, exc_value, exc_trace)
315
316
/Library/Python/2.7/site-packages/theano/compile/function_module.pyc in __call__(self, *args, **kwargs)
850 t0_fn = time.time()
851 try:
--> 852 outputs = self.fn()
853 except Exception:
854 if hasattr(self.fn, 'position_of_error'):
ValueError: Input dimension mis-match. (input[0].shape[0] = 1, input[4].shape[0] = 18)
Apply node that caused the error: Elemwise{Composite{Switch(i0, i1, Switch(i2, Switch(i3, i1, i4), i1))}}(TensorConstant{(1,) of 0}, TensorConstant{(1,) of 0}, Elemwise{mul,no_inplace}.0, InplaceDimShuffle{x}.0, TensorConstant{[ 6. 12... 30. 21.]})
Toposort index: 33
Inputs types: [TensorType(int8, vector), TensorType(int8, (True,)), TensorType(int8, (True,)), TensorType(int8, (True,)), TensorType(float64, vector)]
Inputs shapes: [(1,), (1,), (1,), (1,), (18,)]
Inputs strides: [(1,), (1,), (1,), (1,), (8,)]
Inputs values: [array([0], dtype=int8), array([0], dtype=int8), array([1], dtype=int8), array([0], dtype=int8), 'not shown']
Outputs clients: [[Sum{acc_dtype=float64}(Elemwise{Composite{Switch(i0, i1, Switch(i2, Switch(i3, i1, i4), i1))}}.0)]]
HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

Categories

Resources