I am working on an assignment for Coursera's Machine Learning: Regression course. I am using the kc_house_data.gl/ dataset and GraphLab Create. I am adding new variables to train_data and test_data that are combinations of old variables. Then I take the mean of all these variables. These are the variables I am adding:
bedrooms_squared = bedrooms * bedrooms
bed_bath_rooms = bedrooms*bathrooms
log_sqft_living = log(sqft_living)
lat_plus_long = lat + long
Here is my code:
train_data['bedrooms_squared'] = train_data['bedrooms'].apply(lambda x: x**2)
test_data['bedrooms_squared'] = test_data['bedrooms'].apply(lambda x: x**2)
# create the remaining 3 features in both TEST and TRAIN data
train_data['bed_bath_rooms'] = train_data.apply(lambda row: row['bedrooms'] * row['bathrooms'])
test_data['bed_bath_rooms'] = test_data.apply(lambda row: row['bedrooms'] * row['bathrooms'])
train_data['log_sqft_living'] = train_data['sqft_living'].apply(lambda x: log(x))
test_data['log_sqft_living'] = test_data['bedrooms'].apply(lambda x: log(x))
train_data['lat_plus_long'] = train_data.apply(lambda row: row['lat'] + row['long'])
train_data['lat_plus_long'] = train_data.apply(lambda row: row['lat'] + row['long'])
test_data['bedrooms_squared'].mean()
test_data['bed_bath_rooms'].mean()
test_data['log_sqft_living'].mean()
test_data['lat_plus_long'].mean()
This is the error I'm getting:
RuntimeError: Runtime Exception. Exception in python callback function evaluation:
ValueError('math domain error',):
Traceback (most recent call last):
File "graphlab\cython\cy_pylambda_workers.pyx", line 426, in graphlab.cython.cy_pylambda_workers._eval_lambda
File "graphlab\cython\cy_pylambda_workers.pyx", line 169, in graphlab.cython.cy_pylambda_workers.lambda_evaluator.eval_simple
File "<ipython-input-13-1cdbcd5f5d9b>", line 5, in <lambda>
ValueError: math domain error
I have no idea what this means. Any idea on what caused it and how I fix it? Thanks.
Your problem is that log is receiving a negative number.
log is defined only for numbers greater than zero.
You need to check your values.
Please add/learn exceptions to make your code more robust:
try:
train_data['log_sqft_living'] = train_data['sqft_living'].apply(lambda x: log(x))
test_data['log_sqft_living'] = test_data['bedrooms'].apply(lambda x: log(x))
train_data['lat_plus_long'] = train_data.apply(lambda row: row['lat'] + row['long'])
train_data['lat_plus_long'] = train_data.apply(lambda row: row['lat'] + row['long'])
test_data['bedrooms_squared'].mean()
test_data['bed_bath_rooms'].mean()
test_data['log_sqft_living'].mean()
test_data['lat_plus_long'].mean()
except e as Exception:
print "ERROR in function:", e
Related
'def MSEn(dataset_mus, Mobj, Scales=3, Methodx='coarse', RadNew=0, Plotx=False):
class MSobject:
x=(5,2,[3.2])
Mobj = MSobject(x)
dataset_mus=np.squeeze(dataset_mus)
Func2 = globals()[Methodx.lower()]
MSx = np.zeros(Scales)
for T in range(1,Scales+1):
print(' .', end='')
Temp = Func2(Sig,T)
MSx[T-1] = Temp2
CI = sum(MSx)
if np.any(np.isnan(MSx)):
print('Some entropy values may be undefined.')
if Plotx:
figure()
ax1 = axes()
ax1.plot(np.arange(1,Scales+1), MSx, color=(8/255, 63/255, 77/255), linewidth=3)
ax1.scatter(np.arange(1,Scales+1), MSx, 60, color=(1,0,1))
ax1.set_xlabel('Scale Factor',fontsize=12,fontweight='bold',color=(7/255, 54/255, 66/255))
ax1.set_ylabel('Entropy Value',fontsize=12,fontweight='bold',color=(7/255, 54/255, 66/255))
ax1.set_title('Multiscale %s (%s-graining method)'%(Mobj.Func.__name__,Methodx),
fontsize=16,fontweight='bold',color=(7/255, 54/255, 66/255))
show()
return MSx, CI
MSEn(dataset_mus,Mobj.x)'
error
'---------------------------------------------------------------------------
NameError Traceback (most recent call last)
/var/folders/pm/zc3gg3ts2x17sm0r11qd66k00000gn/T/ipykernel_89686/2723706301.py in
30 show()
31 return MSx, CI
---> 32 MSEn(dataset_mus,Mobj.x)
NameError: name 'Mobj' is not defined' '
Mobj needs to be defined.
Mobj= EH.MSobject('SampEn',m=2,r=0.15)
this will create a sample entropy object with the given parameters and then you can call MSen function to calculate multiscale entropy.
I am working on the same problem right now. But my code works for a large data but throws error for smalller data.
this is the original repo i'm trying to run in my computer: https://github.com/kreamkorokke/cs244-final-project
import os
import matplotlib.pyplot as plt
import argparse
from attacker import check_attack_type
IMG_DIR = "./plots"
def read_lines(f, d):
lines = f.readlines()[:-1]
for line in lines:
typ, time, num = line.split(',')
if typ == 'seq':
d['seq']['time'].append(float(time))
d['seq']['num'].append(float(num))
elif typ == 'ack':
d['ack']['time'].append(float(time))
d['ack']['num'].append(float(num))
else:
raise "Unknown type read while parsing log file: %s" % typ
def main():
parser = argparse.ArgumentParser(description="Plot script for plotting sequence numbers.")
parser.add_argument('--save', dest='save_imgs', action='store_true',
help="Set this to true to save images under specified output directory.")
parser.add_argument('--attack', dest='attack',
nargs='?', const="", type=check_attack_type,
help="Attack name (used in plot names).")
parser.add_argument('--output', dest='output_dir', default=IMG_DIR,
help="Directory to store plots.")
args = parser.parse_args()
save_imgs = args.save_imgs
output_dir = args.output_dir
attack_name = args.attack
if save_imgs and attack_name not in ['div', 'dup', 'opt'] :
print("Attack name needed for saving plot figures.")
return
normal_log = {'seq':{'time':[], 'num':[]}, 'ack':{'time':[], 'num':[]}}
attack_log = {'seq':{'time':[], 'num':[]}, 'ack':{'time':[], 'num':[]}}
normal_f = open('log.txt', 'r')
attack_f = open('%s_attack_log.txt' % attack_name, 'r')
read_lines(normal_f, normal_log)
read_lines(attack_f, attack_log)
if attack_name == 'div':
attack_desc = 'ACK Division'
elif attack_name == 'dup':
attack_desc = 'DupACK Spoofing'
elif attack_name == 'opt':
attack_desc = 'Optimistic ACKing'
else:
raise 'Unknown attack type: %s' % attack_name
norm_seq_time, norm_seq_num = normal_log['seq']['time'], normal_log['seq']['num']
norm_ack_time, norm_ack_num = normal_log['ack']['time'], normal_log['ack']['num']
atck_seq_time, atck_seq_num = attack_log['seq']['time'], attack_log['seq']['num']
atck_ack_time, atck_ack_num = attack_log['ack']['time'], attack_log['ack']['num']
plt.plot(norm_seq_time, norm_seq_num, 'b^', label='Regular TCP Data Segments')
plt.plot(norm_ack_time, norm_ack_num, 'bx', label='Regular TCP ACKs')
plt.plot(atck_seq_time, atck_seq_num, 'rs', label='%s Attack Data Segments' % attack_desc)
plt.plot(atck_ack_time, atck_ack_num, 'r+', label='%s Attack ACKs' % attack_desc)
plt.legend(loc='upper left')
x = max(max(norm_seq_time, norm_ack_time),max(atck_seq_time, atck_ack_time))
y = max(max(norm_seq_num, norm_ack_num),max(atck_seq_num, atck_ack_num))
plt.xlim(0, x)
plt.ylim(0,y)
plt.xlabel('Time (s)')
plt.ylabel('Sequence Number (Bytes)')
if save_imgs:
# Save images to figure/
if not os.path.exists(output_dir):
os.makedirs(output_dir)
plt.savefig(output_dir + "/" + attack_name)
else:
plt.show()
normal_f.close()
attack_f.close()
if __name__ == "__main__":
main()
after running this i get this error
Traceback (most recent call last):
File "plot.py", line 85, in <module>
main()
File "plot.py", line 66, in main
plt.xlim(0, a)
File "/usr/lib/python3/dist-packages/matplotlib/pyplot.py", line 1427, in xlim
ret = ax.set_xlim(*args, **kwargs)
File "/usr/lib/python3/dist-packages/matplotlib/axes/_base.py", line 3267, in set_xlim
reverse = left > right
TypeError: '>' not supported between instances of 'int' and 'list'
Done! Please check ./plots for all generated plots.
how can i solve this problem? or better yet if there is another way of running this project? i installed matplotlib via pip3 install matplotlib command (same with scapy) and my main python version is python2 right now but i run the project with python3, could the issue be about this? what am i missing? or is it about mininet itself?
The problem is in this line
x = max(max(norm_seq_time, norm_ack_time),max(atck_seq_time, atck_ack_time))
IIUC, you wanna assign to x the maximum value among all those four lists. However, when you pass two lists to the max function, such as max(norm_seq_time, norm_ack_time), it will return the list it considers the greater one, and not the highest value considering both lists.
Instead, you can do something like:
x = max(norm_seq_time + norm_ack_time + atck_seq_time + atck_ack_time)
This will concatenate the four lists into a single one. Then, the function will return the highest value among all of them. You might wanna do that to the calculation of y as well.
If this is not what you wanted, or if you have any further issues, please let us know.
with the help of a friend we solved this problem by changing a part in code into this:
max_norm_seq_time = max(norm_seq_time) if len(norm_seq_time) > 0 else 0
max_norm_ack_time = max(norm_ack_time) if len(norm_ack_time) > 0 else 0
max_atck_seq_time = max(atck_seq_time) if len(atck_seq_time) > 0 else 0
max_atck_ack_time = max(atck_ack_time) if len(atck_ack_time) > 0 else 0
x = max((max_norm_seq_time, max_norm_ack_time,\
max_atck_seq_time, max_atck_ack_time))
plt.xlim([0,x])
max_norm_seq_num = max(norm_seq_num) if len(norm_seq_num) > 0 else 0
max_norm_ack_num = max(norm_ack_num) if len(norm_ack_num) > 0 else 0
max_atck_seq_num = max(atck_seq_num) if len(atck_seq_num) > 0 else 0
max_atck_ack_num = max(atck_ack_num) if len(atck_ack_num) > 0 else 0
plt.ylim([0, max((max_norm_seq_num, max_norm_ack_num,\
max_atck_seq_num, max_atck_ack_num))])
```
writing here just in case anyone else needs it.
This is my code:
p = range(0,3)
q = range(0,3)
d = range(0,3)
s = range(30,31)
P =range(0,1)
D = range(1,2)
Q = range(0,1)
lowest_aic= None
lowest_parm = None
lowest_param_seasonal = None
pdq = list(itertools.product(p, d, q))
seasonal_pdq = list(itertools.product(P,D,Q,s))
for param in pdq:
for param_seasonal in seasonal_pdq:
try:
mod = sm.tsa.statespace.SARIMAX(data,order=param,seasonal_order=param_seasonal)
results = mod.fit()
current_aic = results.aic
if (lowest_aic == None):
lowest_aic = results.aic
if (current_aic <= lowest_aic):
lowest_aic = current_aic
lowest_parm = param
lowest_param_seasonal = param_seasonal
#print('SARIMA{}x{} - AIC:{}'.format(param, param_seasonal, results.aic))
print('SARIMA{},{} - AIC:{}'.format(param, param_seasonal, results.aic))
except:
continue
print('SARIMA{}x{} - AIC:{}'.format(param, param_seasonal, results.aic))
syhat_sar_c = results.predict(len(df), len(df)+6, typ='levels',dynamic=False)
print(np.array(syhat_sar))
I am trying to implement a grid search for my SARIMA model.
I would like to know why try block is not getting executed, because of which it is throwing, UnboundLocal error.
It would be great if the explanation would be detailed.
This is the traceback I am getting when I run this code:
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-12-2669d247586e> in <module>()
35
36
---> 37 print('SARIMA{}x{} - AIC:{}'.format(param, param_seasonal, results.aic))
38 syhat_sar_c = results.predict(len(df), len(df)+6, typ='levels',dynamic=False)
39 print(np.array(syhat_sar))
NameError: name 'results' is not defined
There are three possibilities:
pdq is empty, so the outer for loop exits immediately and the try is never executed.
seasonal_pdq is empty, so the inner for loop exits immediately and the try is never executed.
The try is executed, but it terminates before the variable you're depending on has been bound.
The rudimentary debugging techniques you could use to narrow this down would be:
Print pdq to make sure it's not empty.
Print seasonal_pdq to make sure it's not empty.
Change the continue to a raise so that the exception is raised instead of ignored. That way you can see what it is and exactly what line it came from.
How to debug "NameError: global name 'X' is not defined" in Python? I am pretty much new in Python. I am using jupyter_notebook with Python 2.7 to execute code. I am facing following error.
My code:
logFile = "NASAlog.txt"
def parseLogs():
parsed_logs=(sc
.textFile(logFile)
.map(parseApacheLogLine)
.cache())
access_logs = (parsed_logs
.filter(lambda s: s[1] == 1)
.map(lambda s: s[0])
.cache())
failed_logs = (parsed_logs
.filter(lambda s: s[1] == 0)
.map(lambda s: s[0]))
failed_logs_count = failed_logs.count()
if failed_logs_count > 0:
print 'Number of invalid logline: %d' % failed_logs.count()
for line in failed_logs.take(20):
print 'Invalid logline: %s' % line
print 'Read %d lines, successfully parsed %d lines, failed to parse %d lines' % (parsed_logs.count(), access_logs.count(), failed_logs.count())
return parsed_logs, access_logs, failed_logs
parsed_logs, access_logs, failed_logs = parseLogs()
ERROR
> NameError Traceback (most recent call last)
> <ipython-input-18-b365aa793252> in <module>()
> 24 return parsed_logs, access_logs, failed_logs
> 25
> ---> 26 parsed_logs, access_logs, failed_logs = parseLogs()
>
> <ipython-input-18-b365aa793252> in parseLogs()
> 2
> 3 def parseLogs():
> ----> 4 parsed_logs=(sc
> 5 .textFile(logFile)
> 6 .map(parseApacheLogLine)
>
> NameError: global name 'sc' is not defined
The problem is that you did never define sc. Therefore python can't find it. (Makes sense, doesn't it?)
Now there are several possible reasons:
- python is case-sensitive. Did you somewhere define SC instead of sc? ... Or Sc instead of sc?
You defined sc in another function (-> you defined it in a function outside parseLogs()). If you only define it there the variable will be local and just be available to the code inside the function. Add the line global sc to the first line of your function to make it accessible everywhere in you whole code.
You simply did not define sc.
I am receiving an error when I try to fit some data I have to a VAR Time series model in Python via statsmodels, the documentation of which is available here:
The data I have is available in a dataframe df_IBM_training which can be seen below:
date sym open high low close newscount
6 2014.08.05 IBM 189.30 189.3000 186.4100 187.0800 4
9 2014.08.06 IBM 185.80 186.8800 184.4400 185.9000 0
12 2014.08.07 IBM 186.56 186.8800 1.0000 184.2800 2
15 2014.08.08 IBM 183.32 186.6800 183.3200 186.5499 18
The model VAR I want to build looks like this, the regressors of which I try to create in the code below. I also try to search for the ideal model order in the code below, which is where I get the error. Each coefficient in the equation below, examples include α1,1, γ1,11 is associated with a regressor in the code:
Δlog(C_t) = α1,1(log(C_t - 1) − log(O_t-1))
+ α1,2(log(C_t - 1) − log(H_t-1))
+ α1,3(log(C_t - 1) − log(L_t-1))
+ γ1,11Δlog(C_t − 1)
+ γ1,12Δlog(O_t − 1)
+ γ1,13Δlog(H_t − 1)
+ γ1,14Δlog(L_t − 1)
+ εt
My code is as follows. For some reason, I get the following error in the line model.select_order(8):
numpy.linalg.linalg.linalgerror 7-th leading minor not positive semi-definite
#VAR regressors
df_IBM_training['log_ret0'] = np.log(df_IBM_training.close) - np.log(df_IBM_training.close.shift(1))
df_IBM_training['log_ret1'] = np.log(df_IBM_training.open) - np.log(df_IBM_training.open.shift(1))
df_IBM_training['log_ret2'] = np.log(df_IBM_training.high) - np.log(df_IBM_training.high.shift(1))
df_IBM_training['log_ret3'] = np.log(df_IBM_training.low) - np.log(df_IBM_training.low.shift(1))
df_IBM_training = df_IBM_training[np.isfinite(df_IBM_training['log_ret3'])]
regressor_1 = np.log(df_IBM_training['close']) - np.log(df_IBM_training['open'])
regressor_2 = np.log(df_IBM_training['close']) - np.log(df_IBM_training['high'])
regressor_3 = np.log(df_IBM_training['close']) - np.log(df_IBM_training['low'])
regressor_4 = df_IBM_training['log_ret0']
regressor_5 = df_IBM_training['log_ret1']
regressor_6 = df_IBM_training['log_ret2']
regressor_7 = df_IBM_training['log_ret3']
X_IBM = [regressor_1, regressor_2, regressor_3, regressor_4,regressor_5, regressor_6, regressor_7]
X_IBM = np.array(X_IBM)
X_IBM = X_IBM.T
model = statsmodels.tsa.api.VAR(X_IBM)
#The line below is where the error arises
model.select_order(8)
Edit: Traceback Error below:
Traceback (most recent call last):
File "TimeSeries.py", line 70, in <module>
model.select_order(8)
File "C:\Python34\lib\site-packages\statsmodels\tsa\vector_ar\var_model.py", line 505, in select_order
for k, v in iteritems(result.info_criteria):
File "C:\Python34\lib\site-packages\statsmodels\base\wrapper.py", line 35, in __getattribute__
obj = getattr(results, attr)
File "C:\Python34\lib\site-packages\statsmodels\tools\decorators.py", line 94, in __get__
_cachedval = self.fget(obj)
File "C:\Python34\lib\site-packages\statsmodels\tsa\vector_ar\var_model.py", line 1468, in info_criteria
ld = logdet_symm(self.sigma_u_mle)
File "C:\Python34\lib\site-packages\statsmodels\tools\linalg.py", line 213, in logdet_symm
c, _ = linalg.cho_factor(m, lower=True)
File "C:\Python34\lib\site-packages\scipy\linalg\decomp_cholesky.py", line 132, in cho_factor
check_finite=check_finite)
File "C:\Python34\lib\site-packages\scipy\linalg\decomp_cholesky.py", line 30, in _cholesky
raise LinAlgError("%d-th leading minor not positive definite" % info)
numpy.linalg.linalg.LinAlgError: 5-th leading minor not positive definite