Strange variable behaviour in Python - python

My first post here.
So I'm loading data into a variable called f1_data, then passing it to the pm.removeDC() function to do some signal processing, and storing the result in the same variable. But then I want to replace only column 8 with the original f1_data, which I saved as raw_data, and I can't figure out why it doesn't work. Here are the functions. Can anyone help?
inside file pm.py
def removeDC(data):
# High-pass filter columns 1 through 8 of `data` and return `data`.
# NOTE: each filtered column is assigned back into `data[:, i]`, so the object
# passed in is modified in place and the returned object is that same object,
# not a new one.
# define the filter
butter_order = 2
hp_cutoff_Hz = 1.0
# The cutoff is divided by fs_Hz / 2.0 before being handed to butter().
# fs_Hz is not defined inside this function - presumably a module-level
# sampling rate in pm.py; `signal` is presumably scipy.signal (confirm).
b, a = signal.butter(butter_order, hp_cutoff_Hz/(fs_Hz / 2.0), 'highpass')
# range(1,9) visits column indices 1..8; column 0 is left untouched.
for i in range(1,9):
# Overwrite column i with its filtered version.
data[:,i] = signal.lfilter(b, a, data[:,i], 0)
return (data)
def get_epoch1(data, t_sec, epoch, f_tup, col):
# Select the samples whose time in `t_sec` lies between epoch[0][0] and
# epoch[0][1] (both ends inclusive), plot column `col` of the selected rows
# against the selected times, show the figure, and return the tuple
# (boolean mask, selected times, selected rows).
#f_tup = (f_wdir, f_name, f_columns, out_save, out_dir, out_number, fig_width)
# f_name is read here but not used anywhere else in this function.
f_name = f_tup[1]
fig_width = f_tup[6]
# Element-wise mask: True where the time stamp falls inside the epoch.
epoch_boolvector = (t_sec >= epoch[0][0]) & (t_sec <= epoch[0][1])
epoch_timescale = t_sec[epoch_boolvector]
epoch_data = data[epoch_boolvector]
plt.figure(figsize=(fig_width,8), dpi=96)
plt.plot(epoch_timescale, epoch_data[:,col]);
# Clamp the x axis to the first and last selected time stamps; this indexing
# fails if the mask selected no samples.
plt.xlim(epoch_timescale[0], epoch_timescale[-1])
plt.show()
return (epoch_boolvector, epoch_timescale, epoch_data)
inside main file
#load the whole data
(f1_data, f1_data_indices, f1_timescale) = pm.load_data(f1_wdir, f1_name)
raw_data = f1_data[:] #create copy of f1_data
(f1ep1_boolvector, f1ep1_timescale, f1ep1_data) = pm.get_epoch1(f1_data, f1_timescale, f1_epochs[1], f1_tup, 8)
#--- filter data to remove DC (1Hz)
f1_data = pm.removeDC(f1_data)
# replace only channel 8 with original data
f1_data[:,8] = raw_data[:,8]
(f1ep2_boolvector, f1ep2_timescale, f1ep2_data) = pm.get_epoch1(f1_data, f1_timescale, f1_epochs[1], f1_tup, 8)

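The solution is to import copy and use the copy.deepcopy function. With a NumPy array (which f1_data appears to be), raw_data = f1_data[:] does not copy anything: the slice is a view that shares its memory with f1_data, so the in-place filtering done by pm.removeDC() also changes raw_data.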
For further info check this link:
docs.python.org/2/library/copy.html
When I have raw_data = f1_data[:] I get, after pm.removeDC():
raw_data is f1_data: False
(raw_data == f1_data).all(): True
But when I have raw_data = copy.deepcopy(f1_data) I get, after pm.removeDC():
raw_data is f1_data: False
(raw_data == f1_data).all(): False

Related

Is there a proper way to append JSON Data to a Numpy array

I am trying to add data that I am reading from a series of JSON files to a NumPy array (or whatever data collection would work best). My idea is to sort a collection of episodes of a TV show by episode title.
The problem I have encountered is actually creating the collection from the data.
The intent is to have a collection of the items found within the for loop, [a,b,c,d], for each episode of the show.
Is a Numpy array the best way to go about making this collection, or should I use something else?
season1 = open('THEJSONFILES\seasonone.json', 'r')
season_array = np.array(['episodeTitle','seasonNum', 'episodeNum', 'plotContents'])
def ReadTheDarnJsonFile(jsonTitle):
# Read the open file object `jsonTitle`, parse its text as JSON, and print the
# title, season number, episode number and plot of every entry under the
# "episodes" key. Nothing is returned and season_array is never modified: the
# only line that touches it is commented out.
seasondata = jsonTitle.read()
# `j` is not defined in this snippet - presumably `import json as j` (confirm).
seasonobj = j.loads(seasondata)
# NOTE(review): this local name shadows the built-in `list` inside the function.
list = (seasonobj['episodes'])
for i in range(len(list)):
# .get() yields None for a missing key, which str() turns into the text "None".
a = str(list[i].get('title'))
b = str(list[i].get('seasonNumber'))
c = str(list[i].get('episodeNumber'))
d = str(list[i].get('plot'))
print(a, b, c, d)
print("----------------")
# np.append(season_array, [a,b,c,d]) this is not correct
ReadTheDarnJsonFile(season1)
print(season_array)
Two notes. First, I would avoid using list as a variable name because it is the name of a built-in type in Python, and reusing it hides the built-in. Second, I would recommend using a custom class for your data for maximum readability.
season1 = open('THEJSONFILES\seasonone.json', 'r')
season_array = np.array(['episodeTitle','seasonNum', 'episodeNum', 'plotContents'])
class episode:
    """Plain record for one episode: title, season number, episode number, plot."""

    def __init__(self,title,seasonNumber,episodeNumber,plot):
        # Store the four values unchanged under attributes of the same names.
        self.title, self.seasonNumber = title, seasonNumber
        self.episodeNumber, self.plot = episodeNumber, plot

    def summary(self):
        """Print three lines: a "Season N Episode M" header, the title, the plot."""
        header = " ".join(
            ("Season", str(self.seasonNumber), "Episode", str(self.episodeNumber))
        )
        for line in (header, self.title, self.plot):
            print(line)
def ReadTheDarnJsonFile(jsonTitle):
    """Parse one season's JSON file into a list of ``episode`` objects.

    ``jsonTitle`` is an open, readable file object whose text is a JSON
    document with a top-level "episodes" list of objects. Returns a new list
    holding one ``episode`` per entry, in file order. Every field is passed
    through str(), so a key missing from an entry becomes the text "None".

    ``j`` is expected to be the json module (``import json as j``).
    """
    seasonobj = j.loads(jsonTitle.read())
    season_array = []
    # Walk the parsed entries themselves; indexing the built-in ``list`` type
    # here would not read the episode data at all.
    for entry in seasonobj['episodes']:
        season_array.append(episode(
            str(entry.get('title')),
            str(entry.get('seasonNumber')),
            str(entry.get('episodeNumber')),
            str(entry.get('plot')),
        ))
    return season_array
# Build the episode list for the season file opened above and print a
# summary of each episode. The function call must sit on a single line.
season_array = ReadTheDarnJsonFile(season1)
for item in season_array:
    item.summary()
Here is what I ended up doing.
import json as j
import pandas as pd
emptyArray = []
season1 = open('THEJSONFILES\seasonone.json', 'r')
season2 = open('THEJSONFILES\seasontwo.json', 'r')
season3 = open('THEJSONFILES\seasonthree.json', 'r')
season4 = open('THEJSONFILES\seasonfour.json', 'r')
season5 = open('THEJSONFILES\seasonfive.json', 'r')
season6 = open('THEJSONFILES\seasonsix.json', 'r')
season7 = open('THEJSONFILES\seasonseven.json', 'r')
columnData = ["episodeTitle", "seasonIndex", "episodeIndex", "plot", "imageURL"]
finalDf = pd.DataFrame
def ReadTheDarnJsonFile(jsonTitle):
    """Load one season's JSON file into a DataFrame and append it to emptyArray.

    ``jsonTitle`` is an open, readable file object containing a JSON document
    with a top-level "episodes" list. One row is written per episode, with the
    columns named in the module-level ``columnData``; each value is passed
    through str(). The finished frame is appended to the module-level
    ``emptyArray``; nothing is returned.
    """
    season_frame = pd.DataFrame(columns = columnData)
    episodes = j.loads(jsonTitle.read())['episodes']
    # JSON keys in the same order as the frame's columns.
    json_keys = ('title', 'seasonNumber', 'episodeNumber', 'plot', 'image')
    for row_number, entry in enumerate(episodes):
        values = [str(entry.get(key)) for key in json_keys]
        season_frame.loc[row_number] = pd.Series(values, index=(season_frame.columns))
    emptyArray.append(season_frame)
ReadTheDarnJsonFile(season1)
ReadTheDarnJsonFile(season2)
ReadTheDarnJsonFile(season3)
ReadTheDarnJsonFile(season4)
ReadTheDarnJsonFile(season5)
ReadTheDarnJsonFile(season6)
ReadTheDarnJsonFile(season7)
finalDf = pd.concat(emptyArray)
print(emptyArray)
holyOutput = finalDf.sort_values(by=['episodeTitle'])
holyOutput.reset_index(inplace=True)
holyOutput.to_json("P:\\ProjectForStarWarsCloneWarsJson\JSON\OutputJsonV2.json")

Python 'list.insert()' only saves the last result of calculation loop

I was making my automatic stock strategy yield calculation program with Python. Here's my code:
import FinanceDataReader as fdr
import numpy as np
# ...(more modules for python)
pd.options.display.float_format = '{:.5f}'.format
file_list = os.listdir('/home/sejahui/projects/stock_data_excel')
for i in range(20):
os.chdir('/home/sejahui/projects/stock_data_excel')
odd = file_list[i]
data = pd.read_excel('/home/sejahui/projects/stock_data_excel/'+str(odd))
def calMACD(data, short=5, long=25, signal=9):
# Add moving-average, MACD, signal-line and Buy/Sell columns to `data` and
# return it. The columns are assigned on the object that was passed in, so the
# caller's frame is modified and the return value is that same object.
# `short`, `long` and `signal` are the spans handed to ewm().
# NOTE(review): the result of sort_index() is discarded; unless inplace=True is
# passed it returns a sorted copy, so this line likely has no effect - confirm.
data.sort_index()
data['MVA_25']=data['Close'].ewm(span=long, adjust=False).mean()
data['MVA_5']=data['Close'].ewm(span=short, adjust=False).mean()
# MACD = short-span EWM of Close minus long-span EWM of Close.
data['MACD']=data['Close'].ewm(span=short, adjust=False).mean() - data['Close'].ewm(span=long, adjust=False).mean()
data['Signal']=data['MACD'].ewm(span=signal, adjust=False).mean( )
#data['Buy_sign']=(data['MACD']-data['Signal']) >=600
# 'Buy' where MACD exceeds the signal line by at least 451, otherwise 'Sell'.
data['Buy_sign']=np.where(data['MACD']-data['Signal'] >=451, 'Buy' , 'Sell' )
#data['Target_1']=(data['Close']-data['Close'].shift(1))/data['Close'].shift(1)*100
#data['Target_1']=np.where(data['Buy_sign']=='Buy', (data['Change'])+1,1)
#data['Target_2']=np.where(data['Buy_sign']=='Sell', (data['Change'])+1,1)
#data['Real_world']= 1000000*data['Target_1']
#data['Real_world_2']= 1000000*data['Target_2']
#data['Condition'] = np.where(data['Real_world']<1000000, data['Real_world']-data['Real_world'].shift(-2),1)
##data['Condition_2'] = np.where(data['Real_world']<1000000, data['Target_1'].shift(-2),1)
#data['Moneyflow'] =
#plt.plot(data['Date'], data['Real_world'])
#data[data.Buy_sign !='Sell']
'''
data['Target_1']=np.where(data['Buy_sign']=='Buy', data['Change'],1)
data['Target_2']=np.where(data['Buy_sign']=='Sell', data ['Change'],1)
data['Yield']=np.where(data['Buy_sign']=='Sell', data['Target_1']/data['Target_2'],1 )
'''
'''
data['Result']=data['Target_1'].cumprod()
data['Result_2']=data['Target_2'].cumprod()
data['??????'] = data['Result'] - data['Result_2']
'''
return data
Adjusted = calMACD(data)
Adjusted.drop(['Change'], axis=1, inplace = True)
Filtered = Adjusted[Adjusted.Buy_sign!='Sell'].copy()
#print(Filtered)
#Filtered = (Adjusted.Buy_sign =='Buy') #(Adjusted.Condition = 1.0)
#Master = Adjusted.loc[Adjusted,['Date','Buy_sign','Target_1','Real_world',]]
#print(Adjusted)
def backtester(Filtered):
# Add 'Change', 'Real_world' and 'Condition' columns to `Filtered` and return
# it. The columns are assigned on the object that was passed in.
# Change = ratio of each Close to the previous row's Close
# ((Close - previous) / previous + 1); the first row has no previous value.
Filtered['Change'] = ((Filtered['Close'] - Filtered['Close'].shift(1)) / Filtered['Close'].shift(1))+1
#data['Target_1']=np.where(data['Buy_sign']=='Buy', (data['Change'])+1,1)
Filtered['Real_world'] = 1000000*Filtered['Change']
#Filtered['Condition'] = np.where(Filtered['Real_world']<1000000, Filtered['Real_world'].shift(-2)-Filtered['Real_world'],1)
# Where Real_world is below 1000000, take the Change value two rows further
# down; everywhere else use 1.
Filtered['Condition'] = np.where(Filtered['Real_world']<1000000, Filtered['Change'].shift(-2),1)
#Filtered['Target_1'] = np.where(Filtered['Buy_sign']=='Buy', (Filtered['Change'])+1,1)
#Filtered['Condition'] = np.where(Filtered['Real_world']<1000000, Filtered['Real_world'].shift(-2)-Filtered['Real_world'],1)
return Filtered
s = backtester(Filtered)
e = s[s.Condition!=1.00000]
x = e.dropna()
y = x['Condition']
list_1 = []
write_wb = Workbook()
write_ws = write_wb.create_sheet('MACD&Signal gap data sheet')
write_ws = write_wb.active
write_ws['A1'] = 'Name'
write_ws['B1'] = 'Profit'
try:
print(geometric_mean(y)*1000000*12)
except StatisticsError as e:
print ('Sell is empty':',odd)
else:
d = (geometric_mean(y)*1000000*12)
print(d,odd)
list_1.insert(i,d)
Print(list_1)
Here's the part where I'm troubling with:
s = backtester(Filtered)
e = s[s.Condition!=1.00000]
x = e.dropna()
y = x['Condition']
list_1 = []
try:
print(geometric_mean(y)*1000000*12)
except StatisticsError as e:
print ('Sell is empty':',odd)
else:
d = (geometric_mean(y)*1000000*12)
print(d)
list_1.insert(d)
print(list_1)
When I run the code I am having problems with, the list only saves the last result of the try/except/else block. My intention was to save all the results. What should I change to save all of them?
Here's the output of the list:
[11772769.197974786]
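Your problem is that you are using insert instead of append. The main difference is that insert takes the position as its first argument and the element as its second (list_1.insert(index, element)), while append takes only the element and always adds it to the end of the list. Note that the position is required and has no default, so list_1.insert(d) with a single argument raises a TypeError rather than inserting at index 0. Also make sure list_1 = [] is created once, before the loop: if it is re-created on every iteration, the list will only ever hold the last result.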
To fix that simply use append instead.
else:
d = (geometric_mean(y)*1000000*12)
print(d)
list_1.append(d)
You want to use append, not insert. see Python Data Structures
Change list_1.insert(d) to list_1.append(d)
insert needs an explicit index as its first argument, whereas append simply adds the item to the end of the list each time.
Edit: Just noticed your answer is in the question title.

How to read large NetCDF data sets without using a for - Python

Good morning. I have a problem reading a large netCDF file in Python that contains meteorological information. I have to go through that information to assemble rows and then insert them into a database, but going through and assembling the information takes too long. I know there must be more efficient ways to do the same thing; currently I access the data through nested for loops, as in the code below.
content = nc.Dataset(pathFile+file)
XLONG, XLAT = content.variables["XLONG"], content.variables["XLAT"]
Times = content.variables["Times"] #Horas formar b 'b
RAINC = content.variables["RAINC"] #Lluvia
Q2 = content.variables["Q2"] #Humedad especifica
T2 = content.variables["T2"] #Temperatura
U10 = content.variables["U10"] #Viento zonal
V10 = content.variables["V10"] #Viento meridional
SWDOWN = content.variables["SWDOWN"] #Radiacion incidente
PSFC = content.variables["PSFC"] #Presion de la superficie
SST = content.variables["SST"] #Temperatura de la superficie del mar
CLDFRA = content.variables["CLDFRA"] #Fraccion de nubes
for c2 in range(len(XLONG[0])):
for c3 in range(len(XLONG[0][c2])):
position += 1
for hour in range(len(Times)):
dateH = getDatetimeInit(dateFormatFile.hour) if hour == 0 else getDatetimeForHour(hour, dateFormatFile.hour)
hourUTC = getHourUTC(hour)
RAINH = str(RAINC[hour][0][c2][c3])
Q2H = str(Q2[hour][0][c2][c3])
T2H = str(convertKelvinToCelsius(T2[hour][0][c2][c3]))
U10H = str(U10[hour][0][c2][c3])
V10H = str(V10[hour][0][c2][c3])
SWDOWNH = str(SWDOWN[hour][0][c2][c3])
PSFCH = str(PSFC[hour][0][c2][c3])
SSTH = str(SST[hour][0][c2][c3])
CLDFRAH = str(CLDFRA[hour][0][c2][c3] )
rowData = [idRun, functions.IDMODEL, idTime, position, dateH.year, dateH.month, dateH.day, dateH.hour, RAINH, Q2H, T2H, U10H, V10H, SWDOWNH, PSFCH, SSTH, CLDFRAH]
dataProcess.append(rowData)
I would use NumPy. Let us assume you have netCDF with 2 variables, "t2" and "slp". Then you could use the following code to vectorize your data:
#!//usr/bin/env ipython
# ---------------------
import numpy as np
from netCDF4 import Dataset
# ---------------------
filein = 'test.nc'
ncin = Dataset(filein);
tair = ncin.variables['t2'][:];
slp = ncin.variables['slp'][:];
ncin.close();
# -------------------------
tairseries = np.reshape(tair,(np.size(tair),1));
slpseries = np.reshape(slp,(np.size(slp),1));
# --------------------------
## if you want characters:
#tairseries = np.array([str(val) for val in tairseries]);
#slpseries = np.array([str(val) for val in slpseries]);
# --------------------------
rowdata = np.concatenate((tairseries,slpseries),axis=1);
# if you want characters, do this in the end:
row_asstrings = [[str(vv) for vv in val] for val in rowdata]
# ---------------------------
Nevertheless, I have a feeling that using strings is not a very good idea. In my example, the conversion from numerical arrays to strings took quite a long time, and therefore I did not apply it before the concatenation.
If you want also some time/location information, you can do like this:
#!//usr/bin/env ipython
# ---------------------
import numpy as np
from netCDF4 import Dataset
# ---------------------
filein = 'test.nc'
ncin = Dataset(filein);
xin = ncin.variables['lon'][:]
yin = ncin.variables['lat'][:]
timein = ncin.variables['time'][:]
tair = ncin.variables['t2'][:];
slp = ncin.variables['slp'][:];
ncin.close();
# -------------------------
tairseries = np.reshape(tair,(np.size(tair),1));
slpseries = np.reshape(slp,(np.size(slp),1));
# --------------------------
## if you want characters:
#tairseries = np.array([str(val) for val in tairseries]);
#slpseries = np.array([str(val) for val in slpseries]);
# --------------------------
rowdata = np.concatenate((tairseries,slpseries),axis=1);
# if you want characters, do this in the end:
#row_asstrings = [[str(vv) for vv in val] for val in rowdata]
# ---------------------------
# =========================================================
nx = np.size(xin);ny = np.size(yin);ntime = np.size(timein);
xm,ym = np.meshgrid(xin,yin);
xmt = np.tile(xm,(ntime,1,1));ymt = np.tile(ym,(ntime,1,1))
timem = np.tile(timein[:,np.newaxis,np.newaxis],(1,ny,nx));
xvec = np.reshape(xmt,(np.size(tair),1));yvec = np.reshape(ymt,(np.size(tair),1));timevec = np.reshape(timem,(np.size(tair),1)); # to make sure that array's size match, I am using the size of one of the variables
rowdata = np.concatenate((xvec,yvec,timevec,tairseries,slpseries),axis=1);
In any case, with variable sizes (744,150,150), it took less than 2 seconds to vectorize 2 variables.

Saving data from Arduino using Python - loss of data

With the help of the web, I have created code that collects data from an Arduino Uno and saves it to a CSV file.
The data collected are raw values from MEMS accelerometers.
The problem with the code is that very often I lose a lot of data, if not all of it, when I terminate Python. I noticed that at random times the output CSV file has zero bytes.
A temporary solution is to start Arduino's "Serial monitor". This way most of the measured data is saved.
import serial
import time
import csv
import numpy as np
import pandas as pd
timeHr = []
timeT = []
mem1xD = []
mem1yD = []
mem1zD = []
#
mem2xD = []
mem2yD = []
mem2zD = []
arduinoData = serial.Serial('COM4',9600)
df = pd.DataFrame({
'timeHr':0,
'timeT':0,
'mem1xD':0,
'mem1yD':0,
'mem1zD':0,
'mem2xD':0,
'mem2yD':0,
'mem2zD':0,
},
index=[0]
)
while True:
while (arduinoData.inWaiting()==0):
pass
arduinoString = arduinoData.readline().decode("utf-8")
dataArray = arduinoString.split(",")
timehr = dataArray[0]
time = float(dataArray[1])/1000
mem1x = float(dataArray[2])
mem1y = float(dataArray[3])
mem1z = float(dataArray[4])
#
mem2x = float(dataArray[5])
mem2y = float(dataArray[6])
mem2z = float(dataArray[7])
timeHr.append(timehr)
timeT.append(time)
mem1xD.append(mem1x)
mem1yD.append(mem1y)
mem1zD.append(mem1z)
#
mem2xD.append(mem2x)
mem2yD.append(mem2y)
mem2zD.append(mem2z)
df = pd.DataFrame({
'timeHr':timeHr,
'timeT':timeT,
'mem1xD':mem1xD,
'mem1yD':mem1yD,
'mem1zD':mem1zD,
'mem2xD':mem2xD,
'mem2yD':mem2yD,
'mem2zD':mem2zD,
}
)
df.to_csv(r'time4.csv')
You need to append new data to the CSV file instead of rewriting it. Passing mode='a' to pd.DataFrame.to_csv will allow you to do that.
import time
# Name the output file after the start time (whole seconds since the epoch).
tStart = str(time.time()).split('.')[0]
fileOut = tStart+'.csv'
# Acquisition loop: read one comma-separated line from the serial port, parse
# its eight fields, keep them in the running lists, and append exactly one row
# to the CSV file, so everything read so far is already on disk if the script
# is terminated.
while True:
    # Wait until the port reports pending bytes.
    while (arduinoData.inWaiting()==0):
        pass
    arduinoString = arduinoData.readline().decode("utf-8")
    dataArray = arduinoString.split(",")
    timehr = dataArray[0]
    # Stored as timeSec rather than `time`, which would rebind the name of the
    # `time` module imported above. The raw field is divided by 1000.
    timeSec = float(dataArray[1])/1000
    mem1x = float(dataArray[2])
    mem1y = float(dataArray[3])
    mem1z = float(dataArray[4])
    #
    mem2x = float(dataArray[5])
    mem2y = float(dataArray[6])
    mem2z = float(dataArray[7])
    timeHr.append(timehr)
    timeT.append(timeSec)
    mem1xD.append(mem1x)
    mem1yD.append(mem1y)
    mem1zD.append(mem1z)
    #
    mem2xD.append(mem2x)
    mem2yD.append(mem2y)
    mem2zD.append(mem2z)
    # One-row frame holding only the sample just read. Appending the whole
    # accumulated history with mode='a' on every pass would write all earlier
    # rows again each time. The index is the running sample number.
    df = pd.DataFrame({
        'timeHr':[timehr],
        'timeT':[timeSec],
        'mem1xD':[mem1x],
        'mem1yD':[mem1y],
        'mem1zD':[mem1z],
        'mem2xD':[mem2x],
        'mem2yD':[mem2y],
        'mem2zD':[mem2z],
        },
        index=[len(timeHr) - 1]
    )
    df.to_csv(fileOut,mode='a', header=False)

Why is a variable not getting assigned to when assignments exist in both clauses of an if-else block?

Suppose we have something like:
if True:
r = 0
else:
r = 1
print(r)
Why would we get UnboundLocalError: local variable 'r' referenced before assignment?
The actual code is shown below:
def rasterize_dot_verify_args(callable, parent):
# Validate `callable`: raise ValueError if it has no __call__ attribute, or if
# inspect.signature() reports more than one parameter for it. `parent` is
# accepted but not examined. Returns None when the checks pass.
if not hasattr(callable, "__call__"):
raise ValueError()
import inspect
siggy = inspect.signature(callable)
if (len(siggy.parameters) > 1):
raise ValueError()
def rasterize(callable, xparent, make_copy :bool = False):
# Recursively apply `callable` to nested iterables: an object without __iter__
# is returned as-is; for an iterable, each child is rasterized first and
# `callable` is then applied to the resulting map object.
# When make_copy is true, the work is done on a deep copy of `xparent`.
rasterize_dot_verify_args(callable, xparent)
iparent = xparent
if make_copy:
import copy
iparent = copy.deepcopy(xparent)
if hasattr(iparent, "__iter__"):
in_kids = iter(iparent)
# NOTE(review): when iparent is already an iterator, iter(iparent) normally
# returns iparent itself, so this comparison is False and no branch assigns r;
# the `return r` below then raises UnboundLocalError.
if in_kids != iparent:
# The recursive call does not forward make_copy, so children are processed
# with its default (False).
lamby = lambda p, *, c=callable: rasterize(c, p)
out_kids = map(lamby, in_kids)
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
r = callable(out_kids) # !!!!!!!!!!!!!!!!!!!!
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
else:
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
r = iparent # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
return r
import itertools as itts
sixify = lambda obj, *, itts=itts: itts.repeat(obj, 6)
inputs = map(sixify, range(1, 5))
# inputs = (_ for _ in [
# itts.repeat(1, 6),
# itts.repeat(2, 6),
# itts.repeat(3, 6),
# itts.repeat(4, 6)
# ])
print(rasterize(list, inputs))
I'm forced to add a little bit of text here because, "It looks like your post is mostly code; please add some more details."
oh dear.... even more text is needed.
r would not be assigned when hasattr(iparent, "__iter__") is True and in_kids != iparent is False. You should add an else block to the if in_kids != iparent: statement to assign r a value.
A better characterization of your code would be:
if test1:
if test2:
r = something
else:
r = something
Which should make it obvious how you'd get the error you got.

Categories

Resources