I have started using Pyomo to solve optimization problems. I have a bit of an issue with accessing variables that use two indices. I can easily print the solution, but I want to store the index-dependent variable values in a pd.DataFrame to analyze the results further. I have written the following code, but it takes forever to store the variables. Is there a faster way?
df_results = pd.DataFrame()
df_variables = pd.DataFrame()
results.write()
instance.solutions.load_from(results)
for v in instance.component_objects(Var, active=True):
    print("Variable", v)
    varobject = getattr(instance, str(v))
    frequency = np.empty([len(price_dict)])
    for index in varobject:
        exist = False
        two = False
        if index is not None:
            if type(index) is int:
                #For time index t (0:8760 hours of year)
                exists = True  #does a index exist
                frequency[index] = float(varobject[index].value)
            else:
                #For components (names)
                if type(index) is str:
                    print(index)
                    print(varobject[index].value)
                else:
                    #for all index with two indices
                    two = True  #is index of two indices
                    if index[1] in df_variables.columns:
                        df_variables[index[0], str(index[1]) + '_' + str(v)] = varobject[index].value
                    else:
                        df_variables[index[1]] = np.nan
                        df_variables[index[0], str(index[1]) + '_' + str(v)] = varobject[index].value
        else:
            # If no index exist, simple print the variable value
            print(varobject.value)
    if not(exists):
        if not(two):
            df_variable = pd.Series(frequency, name=str(v))
            df_results = pd.concat([df_results, df_variable], axis=1)
            df_variable.drop(df_variable.index, inplace=True)
        else:
            df_results = pd.concat([df_results, df_variable], axis=1)
            df_variable.drop(df_variable.index, inplace=True)
With some more work and fewer DataFrame operations, I have solved the issue with the following code. Thanks to BlackBear for the comment.
df_results = pd.DataFrame()
df_variables = pd.DataFrame()
results.write()
instance.solutions.load_from(results)
for v in instance.component_objects(Var, active=True):
    print("Variable", v)
    varobject = getattr(instance, str(v))
    frequency = np.empty([20, len(price_dict)])
    exist = False
    two = False
    list_index = []
    dict_position = {}
    count = 0
    for index in varobject:
        if index is not None:
            if type(index) is int:
                #For time index t (0:8760 hours of year)
                exist = True  #does a index exist
                frequency[0, index] = float(varobject[index].value)
            else:
                #For components (names)
                if type(index) is str:
                    print(index)
                    print(varobject[index].value)
                else:
                    #for all index with two indices
                    exist = True
                    two = True  #is index of two indices
                    if index[1] in list_index:
                        position = dict_position[index[1]]
                        frequency[position, index[0]] = varobject[index].value
                    else:
                        dict_position[index[1]] = count
                        list_index.append(index[1])
                        print(list_index)
                        frequency[count, index[0]] = varobject[index].value
                        count += 1
        else:
            # If no index exist, simple print the variable value
            print(varobject.value)
    if exist:
        if not(two):
            frequency = np.transpose(frequency)
            df_variable = pd.Series(frequency[:, 0], name=str(v))
            df_results = pd.concat([df_results, df_variable], axis=1)
            df_variable.drop(df_variable.index, inplace=True)
        else:
            for i in range(count):
                df_variable = pd.Series(frequency[i, :], name=str(v) + '_' + list_index[i])
                df_results = pd.concat([df_results, df_variable], axis=1)
                df_variable.drop(df_variable.index, inplace=True)
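For reference, a more compact approach is to build each column straight from the variable's index/value mapping via extract_values() and let pandas do the reshaping. This is only a sketch, assuming instance is the solved model and that two-index variables are indexed as (time, component); it does not special-case scalar or string-indexed variables.

import pandas as pd
from pyomo.environ import Var

frames = []
for v in instance.component_objects(Var, active=True):
    # extract_values() returns a dict {index: value} for the variable
    s = pd.Series(v.extract_values(), name=str(v))
    if isinstance(s.index, pd.MultiIndex):
        # (time, component) index -> one column per component
        s = s.unstack(level=1)
        s.columns = [f"{v}_{c}" for c in s.columns]
    frames.append(s)
df_results = pd.concat(frames, axis=1)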
I have a list called fdt_frozen, which is a list of multiple data frames.
I'm trying to generate a match score using pairwise combinations for each data frame in the list. I have tested the logic below and it runs fine when I subset the list down to a single data frame (for example, fdt_frozen[6]).
But it does not work when I try to loop the match-score logic over all data frames in the list: I get an "index not callable" error when I use a for loop.
Could anyone please help me put this entire logic inside a for loop so that it runs over all data frames in the list?
a = np.asmatrix(fdt_frozen[6])
compare_all = []
for i in range(a.shape[1]):
    compare = [1 if a[x[0], i] == a[x[1], i] else 0 for x in combinations(range(a.shape[0]), 2)]
    compare_all.append(compare)
compare1 = pd.DataFrame(compare_all)
compare1 = compare1.T
compare1.columns = fdt_single.columns
compare_all1 = []
for i in range(a.shape[1]):
    compare = [1 if pd.isnull(a[x[0], i]) and pd.isnull(a[x[1], i]) else 0 for x in combinations(range(a.shape[0]), 2)]
    compare_all1.append(compare)
compare2 = pd.DataFrame(compare_all1)
compare2 = compare2.T
compare2.columns = fdt_single.columns
compare2[compare2 == 1] = np.nan
compare = compare1 + compare2
combinations = list(itertools.combinations(range(a.shape[0]), 2))
combinations = [a[x[0], 0] + '-' + a[x[1], 0] for x in combinations]
compare.index = combinations
compare = compare.drop("Material", axis=1)
combinations = compare.index
MFDETAILED = pd.DataFrame({'COMBINATION': combinations,
                           'MS': round(((compare.sum(axis=1, skipna=True)) / (~compare.isna()).sum(axis=1)) * 100, 0)})
MFDETAILED = pd.concat([MFDETAILED, compare], axis=1)
MF = MFDETAILED.iloc[:, [0, 1]]
feature_names = compare.columns
MF['MF'] = compare.apply(lambda x: ', '.join(feature_names[x == 1]), axis=1)
feature_names = compare.columns
MF['NMF'] = compare.apply(lambda x: ', '.join(feature_names[x == 0]), axis=1)
MF[['LOOKUP', 'MAT']] = MF['COMBINATION'].str.split("-", expand=True)
MF = MF.drop(['COMBINATION'], axis=1)
MF = MF[["LOOKUP", "MAT", "MS", "MF", "NMF"]]
MF.columns = ["LOOKUP MATERIAL", "MATERIAL", "MATCH SCORE", "MATCH FEATURES", "NON MATCH FEATURES"]
ja = MF[["LOOKUP MATERIAL", "MATERIAL", "MATCH SCORE", "MATCH FEATURES", "NON MATCH FEATURES"]]
ja[["LOOKUP MATERIAL", "MATERIAL"]] = ja[["MATERIAL", "LOOKUP MATERIAL"]].values
cols = ['Material', 'Material', '100', 'All', 'None']
MF_SELF = pd.DataFrame(columns=cols)
MF_SELF['Material'] = fdt_frozen[6]['Material']
MF_SELF['Material'] = fdt_frozen[6]['Material']
MF_SELF['100'] = 100
MF_SELF['All'] = 'All'
MF_SELF['None'] = 'None'
MF_SELF.columns = ["LOOKUP MATERIAL", "MATERIAL", "MATCH SCORE", "MATCH FEATURES", "NON MATCH FEATURES"]
MF = pd.concat([MF, ja, MF_SELF], axis=0)
MF = MF[MF['MATCH SCORE'] >= 70]
MF = MF.reset_index(drop=True)
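A likely cause of the "not callable" error is that the name combinations is rebound first to a list and then to the DataFrame index (combinations = compare.index), which shadows itertools.combinations on the next pass of an outer loop. Wrapping the logic in a function with non-clashing names and calling it once per data frame avoids this. Below is only a condensed sketch of the scoring step, assuming the first column of each data frame is Material; the MATCH FEATURES / NON MATCH FEATURES columns and the self-match rows would follow the same per-data-frame pattern.

import itertools
import numpy as np
import pandas as pd

def match_scores(fdt):
    """Pairwise match score for one data frame (first column assumed to be Material)."""
    a = np.asarray(fdt, dtype=object)
    rows = []
    for i, j in itertools.combinations(range(a.shape[0]), 2):
        feats_i, feats_j = a[i, 1:], a[j, 1:]
        both_nan = pd.isnull(feats_i) & pd.isnull(feats_j)   # features ignored in the score
        matches = (feats_i == feats_j) & ~both_nan
        considered = (~both_nan).sum()
        score = round(matches.sum() / considered * 100, 0) if considered else np.nan
        rows.append({'LOOKUP MATERIAL': a[i, 0],
                     'MATERIAL': a[j, 0],
                     'MATCH SCORE': score})
    return pd.DataFrame(rows)

# Run the same logic for every data frame in the list
all_scores = [match_scores(fdt) for fdt in fdt_frozen]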
Dataset: https://raw.githubusercontent.com/Kuntal-G/Machine-Learning/master/R-machine-learning/data/banknote-authentication.csv
How can I calculate the conditional entropy and find the best information gain from a dataset like this?
The code for calculating entropy:
def entropy(column):
    """Calculates the entropy"""
    values, counts = np.unique(column, return_counts=True)
    entropy_val = 0
    for i in range(len(counts)):
        entropy_val += (
            (-counts[i] / sum(counts)) * math.log2(counts[i] / (sum(counts)))
        )
    return entropy_val
where 'column' is a feature of the dataframe, for example df[0].
I'm a little stuck as to where to go from here... Can anyone point me in the right direction? My end goal is finding the best information gain.
entropy_vals = {}
entropy_vals = entropy(X[0]), entropy(X[1]), entropy(X[2]), entropy(X[3]), entropy(y)
print(entropy_vals)
df = pd.read_csv('data_banknote_authentication.txt', header=None)
print(df)
y = df.iloc[:, -1]
X = df.iloc[:, :4]

def count_labels(rows):
    """Counts number of each unique value in selected column."""
    counts = {}
    for row in rows:
        label = row
        if label not in counts:
            counts[label] = 1
        else:
            counts[label] += 1
    return counts

def entropy(column):
    """Calculates the entropy"""
    values, counts = np.unique(column, return_counts=True)
    entropy_val = 0
    for i in range(len(counts)):
        entropy_val += (
            (-counts[i] / sum(counts)) * math.log2(counts[i] / (sum(counts)))
        )
    return entropy_val

entropy_vals = {}
entropy_vals = entropy(X[0]), entropy(X[1]), entropy(X[2]), entropy(X[3]), entropy(y)
print(entropy_vals)

def check_unique(data):
    label_col = data[data.columns[-1]]
    print(label_col)
    unique_features = np.unique(label_col)
    if len(unique_features) == 1:
        return True
    else:
        return False

def categorize_data(data):
    label_col = data[data.columns[-1]]
    values, counts = np.unique(label_col, return_counts=True)
    print(values, counts)
    index = counts.argmax()
    category = values[index]
    return category

def split(data):
    x_less = data[data <= np.mean(data)]
    x_greater = data[data > np.mean(data)]
    return x_less, x_greater
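To get from entropy to information gain: the conditional entropy of the label given a split is the size-weighted average of the label entropy within each branch, and the gain is the parent entropy minus that. A minimal sketch, reusing the entropy function above and splitting each continuous feature at its mean (the same rule as split); the column names are whatever read_csv produced (0-3 for the features, the last column for the label).

def information_gain(feature, labels):
    """Gain of a mean-threshold split on one continuous feature."""
    threshold = np.mean(feature)
    left = labels[feature <= threshold]
    right = labels[feature > threshold]
    # conditional entropy H(labels | split)
    cond_entropy = (len(left) / len(labels)) * entropy(left) \
                 + (len(right) / len(labels)) * entropy(right)
    return entropy(labels) - cond_entropy

# Evaluate every feature and pick the split with the best gain
gains = {col: information_gain(X[col], y) for col in X.columns}
best_feature = max(gains, key=gains.get)
print(gains, best_feature)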
I have a data frame that basically consists of three columns: group, timestamp, value.
I created the following for loop that will iterate through the data frame and run tests to see if the values are acceptable or not. For example, if not enough time has passed between the timestamps to account for the value, then it is tagged as potentially bad data.
The only caveat here is that values should not always be compared to the previous value, but rather the last 'good' value within the group. Thus the reason I went with the loop.
I'm wondering if there is a better way to do this without the loop, or whether there are inefficiencies in the loop that could be fixed to speed it up?
dfy = pd.DataFrame(index=dfx.index, columns=['gvalue', 'quality'])

for row in df.itertuples():
    thisgroup = row[1]
    thistimestamp = row[2]
    thisvalue = row[3]

    qualitytag = ''
    qualitytest = True

    if prevgroup == thisgroup:
        ts_gap = thistimestamp - goodtimestamp
        hour_gap = (thisvalue - goodvalue) * 3600

        if hour_gap < 0:
            qualitytag = 'H'
            qualitytest = False
        elif hour_gap > ts_gap:
            qualitytag = 'A'
            qualitytest = False
        elif hour_gap >= 86400:
            qualitytag = 'U'
            qualitytest = False

    #if tests pass, update good values
    if qualitytest:
        goodvalue = thisvalue
        goodtimestamp = thistimestamp

    #save good values to y dataframe
    dfy.iat[row[0], 0] = goodvalue
    dfy.iat[row[0], 1] = qualitytag

    prevgroup = thisgroup

df = dfx.join(dfy)
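Because each row's check depends on the last accepted value within the same group, the logic is inherently sequential and hard to vectorise completely. One inefficiency that is easy to remove, though, is the per-cell dfy.iat[...] assignment, which is slow in pandas; collecting the results in plain Python lists and building the result columns once at the end is usually noticeably faster. A sketch of that change, assuming the first three columns of df are group, timestamp and value, that timestamps are numeric, and that the first row of each group is accepted as good, as in the original:

import pandas as pd

gvalues, tags = [], []
prevgroup = goodvalue = goodtimestamp = None

for thisgroup, thistimestamp, thisvalue in df.itertuples(index=False):
    qualitytag, qualitytest = '', True
    if prevgroup == thisgroup:
        ts_gap = thistimestamp - goodtimestamp
        hour_gap = (thisvalue - goodvalue) * 3600
        if hour_gap < 0:
            qualitytag, qualitytest = 'H', False
        elif hour_gap > ts_gap:
            qualitytag, qualitytest = 'A', False
        elif hour_gap >= 86400:
            qualitytag, qualitytest = 'U', False
    # update the last good value exactly as the original loop does
    if qualitytest:
        goodvalue, goodtimestamp = thisvalue, thistimestamp
    gvalues.append(goodvalue)
    tags.append(qualitytag)
    prevgroup = thisgroup

# build the result columns once instead of writing cell by cell
dfy = pd.DataFrame({'gvalue': gvalues, 'quality': tags}, index=df.index)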
I have coded the following for loop. The main idea is that for each occurrence of 'D' in the column 'A_D', it looks for all the possible cases where some specific conditions hold. When all the conditions are verified, a value is added to a list.
a = []
for i in df.index:
    if df['A_D'][i] == 'D':
        if df['TROUND_ID'][i] == ' ':
            vb = df[(df['O_D'] == df['O_D'][i])
                    & (df['A_D'] == 'A')
                    & (df['Terminal'] == df['Terminal'][i])
                    & (df['Operator'] == df['Operator'][i])]
            number = df['number_ac'][i]
            try:  ## if all the conditions above are verified a value is added to a list
                x = df.START[i] - pd.Timedelta(int(number), unit='m')
                value = vb.loc[(vb.START - x).abs().idxmin()].FlightID
            except:  ## if they are not verified, a string is added to the list
                value = 'No_link_found'
        else:
            value = 'Has_link'
    else:
        value = 'IsArrival'
    a.append(value)
My main problem is that df has millions of rows, therefore this for loop is way too time consuming. Is there any vectorized solution where I do not need to use a for loop?
An initial set of improvements: use apply rather than a loop; create a second dataframe at the start containing only the rows where df["A_D"] == "A"; and vectorise the calculation of x.
arr = df[df["A_D"] == "A"]
# if the next line is slow, apply it only to those rows where x is needed
df["x"] = df.START - pd.to_timedelta(df["number_ac"].astype(int), unit="m")

def link_func(row):
    if row["A_D"] != "D":
        return "IsArrival"
    if row["TROUND_ID"] != " ":
        return "Has_link"
    vb = arr[(arr["O_D"] == row["O_D"])
             & (arr["Terminal"] == row["Terminal"])
             & (arr["Operator"] == row["Operator"])]
    try:
        return vb.loc[(vb.START - row["x"]).abs().idxmin()].FlightID
    except:
        return "No_link_found"

df["a"] = df.apply(link_func, axis=1)
Using apply is apparently more efficient but does not automatically vectorise the calculation. But finding a value in arr based on each row of df is inherently time consuming, however efficiently it is implemented. Consider whether the two parts of the original dataframe (where df["A_D"] == "A" and df["A_D"] == "D", respectively) can be reshaped into a wide format somehow.
EDIT: You might be able to speed up the querying of arr by storing query strings in df, like this:
df["query_string"] = ('O_D == "' + df["O_D"]
+ '" & Terminal == "' + df["Terminal"]
+ '" & Operator == "' + df["Operator"] + '"')
def link_func(row):
vb = arr.query(row["query_string"])
try:
row["a"] = vb.loc[(vb.START - row["x"]).abs().idxmin()].FlightID
except:
row["a"] = "No_link_found"
df.query('(A_D == "D") & (TROUND_ID == " ")').apply(link_func, axis=1)
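A fully vectorised alternative worth exploring (only a sketch, not tested against the real data) is pd.merge_asof, which can find the arrival with the nearest START for every departure in one pass, matching on the three key columns. It assumes START is a datetime column, that x has been computed as above, and that df has a FlightID column as implied by the original loop; unmatched departures come back as NaN rather than 'No_link_found'.

import pandas as pd

# Departures that still need a link; keep the original row labels in a column
dep = (df[(df["A_D"] == "D") & (df["TROUND_ID"] == " ")]
       .reset_index()
       .sort_values("x"))
arr_sorted = arr.sort_values("START")

linked = pd.merge_asof(dep, arr_sorted,
                       left_on="x", right_on="START",
                       by=["O_D", "Terminal", "Operator"],
                       direction="nearest",
                       suffixes=("", "_arr"))

# FlightID_arr is the FlightID of the nearest matching arrival
df.loc[linked["index"], "a"] = linked["FlightID_arr"].values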
I have a dataframe:
import pandas as pd
df = pd.DataFrame({'start' : [5, 10, '$%%', 20], 'stop' : [10, 20, 30, 40]})
df['length_of_region'] = pd.Series([0 for i in range(0, len(df['start']))])
I want to calculate the region length only for rows with non-zero numeric start values, and for rows where the value is not valid, skip the calculation and add an error note. Here is what I have so far:
df['Notes'] = pd.Series(["" for i in range(0, len(df['region_name']))])
for i in range(0, len(df['start'])):
    if pd.isnull(df['start'][i]) == True:
        df['Notes'][i] += 'Error: Missing value for chromosome start at region %s, required value;' % (df['region_name'][i])
        df['critical_error'][i] = True
        num_error = num_error + 1
    else:
        try:
            #print (df['start'][i]).isnumeric()
            start = int(df['start'][i])
            #print start
            #print df['start'][i]
            if start == 0:
                raise ValueError
        except:
            df['Notes'][i] += 'Error: Chromosome start should be a non zero number at region %s; ' % (df['region_name'][i])
            #print df['start'][i]
            df['critical_error'][i] = True
            num_error = num_error + 1

for i in range(0, len(df['start'])):
    if df['critical_error'][i] == True:
        continue
    df['length_of_region'][i] = (df['stop'][i] - df['start'][i]) + 1.0
However, pandas stores df['start'] as str, and even though I use int to convert it, I get the following error:
df['length_of_region'][i] = (df['stop'][i] - df['start'][i]) + 1.0
TypeError: unsupported operand type(s) for -: 'numpy.int64' and 'str'
What am I missing here? Thanks for your time!
You can define a custom function to do the calculation then apply that function to each row.
def calculate_region_length(x):
    start_val = x[0]
    stop_val = x[1]
    try:
        start_val = float(start_val)
        return (stop_val - start_val) + 1.0
    except ValueError:
        return None
The custom function accepts the start and stop values of a row as input. It tests whether the start value can be converted to a float; if it cannot, None is returned. This way, if '1' is stored as a string it can still be converted to a float and won't be skipped, whereas '$%%' in your example cannot and will return None.
Next you call the custom function for each row:
df['length_of_region'] = df[['start', 'stop']].apply(lambda x: calculate_region_length(x), axis=1)
This will create your new column with (stop - start) + 1.0 for rows where start is not a non-convertible string and None where start is a string that cannot be converted to a number.
You can then update the Notes field based on rows where None is returned to identify the regions where a start value is missing:
df.loc[df['length_of_region'].isnull(), 'Notes'] = df['region_name']
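For completeness, the same idea can be done fully vectorised, without apply: coerce the start column to numeric so that invalid entries like '$%%' become NaN, then compute the length only where a usable value exists. A minimal sketch:

import numpy as np
import pandas as pd

# Invalid start values ('$%%', missing, etc.) become NaN
start_num = pd.to_numeric(df['start'], errors='coerce')

# Length only where start is a usable non-zero number, NaN otherwise
df['length_of_region'] = np.where(start_num.notna() & (start_num != 0),
                                  (df['stop'] - start_num) + 1.0,
                                  np.nan)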
After staring at the code for quite some time, I found a simple and elegant fix: write the converted start value back into df['start'][i] inside the try/except block, as follows:
for i in range(0, len(df['start'])):
    if pd.isnull(df['start'][i]) == True:
        df['Notes'][i] += 'Error: Missing value for chromosome start at region %s, required value;' % (df['region_name'][i])
        df['critical_error'][i] = True
        num_error = num_error + 1
    else:
        try:
            start = int(df['start'][i])
            df['start'][i] = start
            if start == 0:
                raise ValueError
        except:
            df['Notes'][i] += 'Error: Chromosome start should be a non zero number at region %s; ' % (df['region_name'][i])
            #print df['start'][i]
            df['critical_error'][i] = True
            num_error = num_error + 1

for i in range(0, len(df['start'])):
    if df['critical_error'][i] == True:
        continue
    df['length_of_region'][i] = (df['stop'][i] - df['start'][i]) + 1.0
Writing the start value back into the dataframe stores it as an int, so length_of_region is calculated only for the rows with numeric values.