I am working on an optimization problem and having difficulty setting up two constraints together in Python. Below, I simplify my problem to the calculation of area and volume. Only length can be changed; the other parameters must remain the same.
Constraint 1: Maximum area should be 40000m2
Constraint 2: Minimum volume should be 50000m3
Below, I can set values in the dataframe following each constraint one by one. How do I modify the code so that both constraints (1 & 2) are met at the same time?
Many Thanks for your time and support!
import pandas as pd
import numpy as np
df = pd.DataFrame({'Name': ['A', 'B', 'C', 'D'],
                   'Length': [1000, 2000, 3000, 5000],
                   'Width': [5, 12, 14, 16],
                   'Depth': [15, 10, 15, 18]})
area = (df['Length'])*(df['Width'])
volume = (df['Length'])*(df['Width'])*(df['Depth'])
print(area)
print(volume)
#Width and Depth are constants; only Length can be changed
#Constraint 1: Maximum area should be 40000m2
#Calculation of length parameter by using maximum area, with other given parameters
Constraint_length_a = 40000/ df['Width']
#Constraint 2: Minimum volume should be 50000m3
#Calculation of length parameter by using minimum volume, with other given parameters
Constraint_length_v = 50000/ ((df['Width'])*(df['Depth']))
#Setting Length values considering constraint 1
df.at[0, 'Length']=Constraint_length_a[0]
df.at[1, 'Length']=Constraint_length_a[1]
df.at[2, 'Length']=Constraint_length_a[2]
df.at[3, 'Length']=Constraint_length_a[3]
#Setting Length values considering constraint 2
df.at[0, 'Length']=Constraint_length_v[0]
df.at[1, 'Length']=Constraint_length_v[1]
df.at[2, 'Length']=Constraint_length_v[2]
df.at[3, 'Length']=Constraint_length_v[3]
I believe the code below solves the problem you are facing.
If I can help any further, let me know.
import pandas as pd
import numpy as np
df = pd.DataFrame({'Name': ['A', 'B', 'C', 'D'],
                   'Length': [1000, 2000, 3000, 5000],
                   'Width': [5, 12, 14, 16],
                   'Depth': [15, 10, 15, 18]})
area = (df['Length'])*(df['Width'])
volume = (df['Length'])*(df['Width'])*(df['Depth'])
def constraint1(df, col, n):
    # Set Length for the first n+1 rows from the maximum-area constraint
    df.loc[:n, 'Length'] = 40000 / df.loc[:n, col]
    return df

def constraint2(df, col, col1, n):
    # Set Length for the first n+1 rows from the minimum-volume constraint
    df.loc[:n, 'Length'] = 50000 / (df.loc[:n, col] * df.loc[:n, col1])
    return df
If you want to apply it across the whole column, then:
def constraint1a(df, col):
    # Replace Length across the whole column using the maximum-area constraint
    df['Length'] = 40000 / df[col]
    return df

def constraint2a(df, col, col1):
    # Replace Length across the whole column using the minimum-volume constraint
    df['Length'] = 50000 / (df[col] * df[col1])
    return df
df = constraint1(df, 'Width', 3)
df1 = constraint2(df, 'Width','Depth', 3)
df2 = constraint1a(df, 'Width')
df3 = constraint2a(df, 'Width','Depth')
Adding the conditions I left out the first time
def constraint1(df, col, col1):
    # col = 'Length', col1 = 'Width': cap Length wherever the area exceeds 40000
    l = []
    for x, w in zip(df[col], df[col1]):
        if x * w > 40000:
            l.append(40000 / w)
        else:
            l.append(x)
    df[col] = l
    return df

def constraint2(df, col, col1, col2):
    # col = 'Length', col1 = 'Width', col2 = 'Depth': raise Length wherever the volume is below 50000
    l = []
    for x, w, d in zip(df[col], df[col1], df[col2]):
        if x * w * d < 50000:
            l.append(50000 / (w * d))
        else:
            l.append(x)
    df[col] = l
    return df
df1 = constraint1(df, 'Length', 'Width')
df2 = constraint2(df, 'Length', 'Width', 'Depth')
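If you want to enforce both constraints in a single vectorized step rather than one after the other, here is a minimal sketch (assuming you start again from the original df, and that for each row the minimum-volume length does not exceed the maximum-area length; otherwise that row cannot satisfy both constraints):
import numpy as np
# Length bounds implied by each constraint
max_len_area = 40000 / df['Width']                    # area <= 40000   =>  Length <= 40000 / Width
min_len_volume = 50000 / (df['Width'] * df['Depth'])  # volume >= 50000 =>  Length >= 50000 / (Width * Depth)
# A row can satisfy both constraints only if the lower bound is below the upper bound
feasible = min_len_volume <= max_len_area
# Clip Length into the feasible interval [min_len_volume, max_len_area]
df['Length'] = np.clip(df['Length'], min_len_volume, max_len_area)
print(df[feasible])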
import numpy as np
import h5py
x1 = [0, 1, 2, 3, 4]
y1 = ['a', 'b', 'c', 'd', 'e']
z1 = [5, 6, 7, 8, 9]
namesList = ['ID', 'Name', 'Path']
ds_dt = np.dtype({'names': namesList, 'formats': ['S32'] * 3})
rec_arr = np.rec.fromarrays([x1, y1, z1], dtype=ds_dt)
test = [[], [], []]
hdf5_file = h5py.File("test.h5", "w")
structure = hdf5_file.create_group('structure')
structure.create_dataset('images', data=test, compression='gzip', maxshape=(None,3))
structure['images'].resize((structure['images'].shape[0] + rec_arr.shape[0]), axis=0)
structure['images'][-rec_arr.shape[0]:] = rec_arr
I am starting with an empty dataset and I am trying to add data to that dataset. When I view the file, nothing has been added and the dataset is empty. How do I fix this?
We can add the data directly to the .h5 file when creating the new dataset. The following code worked for me to write rec_arr to the file, and I added the 'with' statement to ensure it is closed properly.
import numpy as np
import h5py
x1 = [0, 1, 2, 3, 4]
y1 = ['a', 'b', 'c', 'd', 'e']
z1 = [5, 6, 7, 8, 9]
namesList = ['ID', 'Name', 'Path']
ds_dt = np.dtype({'names': namesList, 'formats': ['S32'] * 3})
rec_arr = np.rec.fromarrays([x1, y1, z1], dtype=ds_dt)
with h5py.File("test.h5", "w") as hdf5_file:
structure = hdf5_file.create_group('structure')
structure.create_dataset('images', data=rec_arr, compression='gzip', maxshape=(rec_arr.shape))
What you need is a resizable dataset. You define one by using the maxshape=() parameter; None means unlimited length. The example below shows how to create a resizable dataset. It starts with the data from your question and the first answer. After it exits the 1st with/as: block, there is a 2nd with/as: block that reopens the file (in append mode), extends the dataset, and adds 5 more rows of data.
Also, I modified the dtype definition used for the recarray and the resulting dataset. The previous code had all string values; I changed the 1st and 3rd columns to use integers (to match the data values). It demonstrates how to mix datatypes in a recarray. I also removed the create_group() call. Groups are not required (unless you want to use them to organize your datasets).
import numpy as np
import h5py
x1 = [0, 1, 2, 3, 4]
y1 = ['a', 'b', 'c', 'd', 'e']
z1 = [5, 6, 7, 8, 9]
namesList = ['ID', 'Name', 'Path']
ds_dt = np.dtype({'names': namesList, 'formats': [int, 'S32', int] })
rec_arr = np.rec.fromarrays([x1, y1, z1], dtype=ds_dt)
with h5py.File("test.h5", "w") as h5f:
h5f.create_dataset('data', data=rec_arr, maxshape=(None,),
compression='gzip' )
x2 = [ i for i in range(10,15)]
y2 = [chr(i) for i in range(102,107)]
z2 = [ i for i in range(15,20)]
rec_arr = np.rec.fromarrays([x2, y2, z2], dtype=ds_dt)
with h5py.File("test.h5", "a") as h5f:
ds_len = h5f['data'].shape[0]
arr_len = rec_arr.shape[0]
h5f['data'].resize(ds_len+arr_len,axis=0)
h5f['data'][arr_len:ds_len+arr_len] = rec_arr
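As a quick check, you can reopen the file read-only and confirm the dataset now holds all 10 rows (a small sketch, not required for the append to work):
with h5py.File("test.h5", "r") as h5f:
    print(h5f['data'].shape)   # expected: (10,)
    print(h5f['data'][:])      # the original 5 rows followed by the appended 5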
I have a MultiIndex table in pandas and I need to generate string values instead of numbers. I need the data to be random letters from a to z. How do I do it?
import pandas as pd
import numpy as np
index = pd.MultiIndex.from_product([[2020], [1, 2, 3, 4]],
                                   names=['year', 'q'])
columns = pd.MultiIndex.from_product([['Items1', 'Items2', 'Items3'], ['new', 'old']],
                                     names=['subject', 'type'])
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 20
data += 50
Ldata = pd.DataFrame(data, index=index, columns=columns)
Ldata
I figured it out.
import string
np.random.seed(123)
data = np.random.choice(list(string.ascii_lowercase), (4, 6))
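To put the random letters back into the same MultiIndex frame, a minimal sketch reusing the index and columns defined above:
Ldata = pd.DataFrame(data, index=index, columns=columns)
print(Ldata)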
I have n dataframes. I want to apply a min-max scaler to each of them.
from sklearn import preprocessing
x = df1.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
#df11= pd.DataFrame(x_scaled)
x_scaled1 = pd.DataFrame(x_scaled, columns = df1.columns)
This works for one df. I tried several ways to extend it to all of them; everything failed.
Please list out the methods you have tried for your problem.
However, you can store the dataframe objects in a list and then scale each dataframe as you loop through the list. I am doing it for two dataframes, but the same applies for any number of them.
import numpy as np
import pandas as pd
df1 = {'col1': np.random.randint(1, 26, 10), 'col2': np.random.randint(1, 26, 10)}
df2 = {'col1': np.random.randint(1, 26, 10), 'col2': np.random.randint(1, 26, 10)}
df1 = pd.DataFrame(df1)
df2 = pd.DataFrame(df2)
from sklearn import preprocessing
df_list = [df1,df2]
def loop_scaler(df_list):
    scaled_df_list = []
    for df in df_list:
        df_array = df.values
        min_max_scaler = preprocessing.MinMaxScaler()
        df_scaled = min_max_scaler.fit_transform(df_array)
        df_scaled = pd.DataFrame(df_scaled, columns=df.columns)
        scaled_df_list.append(df_scaled)
    return scaled_df_list
You can print and look at the scaled output:
scaled_df_list = loop_scaler(df_list)
for scaled_df in scaled_df_list:
    print(scaled_df.head(5))
or
scaled_df1, scaled_df2 = loop_scaler(df_list)
print(scaled_df1)
print(scaled_df2)
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
new_data = pd.DataFrame(data)
scaler = MinMaxScaler()
transformed_data = pd.DataFrame(scaler.fit_transform(new_data))
This may help to solve your problem.
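If you also want to keep the original column names and index after scaling, a small variation of the same idea (passing them through when rebuilding the DataFrame) is:
transformed_data = pd.DataFrame(scaler.fit_transform(new_data),
                                columns=new_data.columns,
                                index=new_data.index)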
I have a dataset of 3 parameters 'A', 'B', 'C' in a .TXT file. After I print them as 24x20 matrices, I need to collect the 1st elements of 'A', 'B', 'C' and put them in long arrays in a pandas dataframe, then the 2nd elements of each, then the 3rd, and so on up to the 480th elements.
My data in the txt file is the following:
id_set: 000
A: -2.46882615679
B: -2.26408246559
C: -325.004619528
I already made a pandas dataframe that includes 3 columns 'A', 'B', 'C' and an index, and defined functions (mkdf and print_df, used below) to print the 24x20 matrix the right way. A simple example with 2x2 matrices:
1st cycle: A = [1,2,    B = [4,5,    C = [8,9,
                3,4]         6,7]        10,11]
2nd cycle: A = [0,8,    B = [1,9,    C = [10,1,
                2,5]         4,8]         2,7]
Reshape to this form:
A(1,1),B(1,1),C(1,1),A(1,2),B(1,2),C(1,2),.....
Result= [1,4,8,2,5,9,3,6,10,4,7,11] #1st cycle
[0,1,10,8,9,1,2,4,2,5,8,7] #2nd cycle
My script is the following:
import numpy as np
import pandas as pd
import os
def normalize(value, min_value, max_value, min_norm, max_norm):
    new_value = ((max_norm - min_norm) * ((value - min_value) / (max_value - min_value))) + min_norm
    return new_value
dft = pd.read_csv('D:\mc25.TXT', header=None)
id_set = dft[dft.index % 4 == 0].astype('int').values
A = dft[dft.index % 4 == 1].values
B = dft[dft.index % 4 == 2].values
C = dft[dft.index % 4 == 3].values
data = {'A': A[:,0], 'B': B[:,0], 'C': C[:,0]}
df = pd.DataFrame(data, columns=['A','B','C'], index = id_set[:,0])
#next iteration create all plots, change the number of cycles
cycles = int(len(df)/480)
print(cycles)
for cycle in range(0, 10):
    count = '{:04}'.format(cycle)
    j = cycle * 480
    for i in df:
        try:
            os.mkdir(i)
        except:
            pass
        min_val = df[i].min()
        min_nor = -1
        max_val = df[i].max()
        max_nor = 1
        ordered_data = mkdf(df.iloc[j:j+480][i])
        csv = print_df(ordered_data)
        # Print .csv files containing the matrix of each parameter, named by cycle
        csv.to_csv(f'{i}/{i}{count}.csv', header=None, index=None)
        if 'C' in i:
            min_nor = -40
            max_nor = 150
            # Applying normalization for C between [-40, +150]
            new_value3 = normalize(df['C'].iloc[j:j+480], min_val, max_val, -40, 150)
            df3 = print_df(mkdf(new_value3))
            df3.to_csv(f'{i}/norm{i}{count}.csv', header=None, index=None)
        else:
            # Applying normalization for A, B between [-1, +1]
            new_value1 = normalize(df['A'].iloc[j:j+480], min_val, max_val, -1, 1)
            new_value2 = normalize(df['B'].iloc[j:j+480], min_val, max_val, -1, 1)
            df1 = print_df(mkdf(new_value1))
            df2 = print_df(mkdf(new_value2))
            df1.to_csv(f'{i}/norm{i}{count}.csv', header=None, index=None)
            df2.to_csv(f'{i}/norm{i}{count}.csv', header=None, index=None)
Note 2: I provided a dataset in a text file for 3 cycles:
Text dataset
I am not sure if I understood your question fully, but this is a solution:
Convert your dataframe to a 2D numpy array (with .values or .to_numpy(); as_matrix() is deprecated in recent pandas), then use ravel() to get a vector of size 480 * 3. Then loop over your cycles and use vstack to stack the rows onto your result. Here is the code with your example data:
import numpy as np
import pandas as pd

A = [[1, 2, 3, 4], [10, 20, 30, 40]]
B = [[4, 5, 6, 7], [40, 50, 60, 70]]
C = [[8, 9, 10, 11], [80, 90, 100, 110]]
cycles = 2
for cycle in range(cycles):
    data = {'A': A[cycle], 'B': B[cycle], 'C': C[cycle]}
    df = pd.DataFrame(data)
    D = df.to_numpy().ravel()
    if cycle == 0:
        Results = np.array(D)
    else:
        Results = np.vstack((Results, D))
# Output: Results = array([[  1,   4,   8,   2,   5,   9,   3,   6,  10,   4,   7,  11],
#                          [ 10,  40,  80,  20,  50,  90,  30,  60, 100,  40,  70, 110]])
np.savetxt("Results.csv", Results, delimiter=",")
Is this what you wanted?
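If the cycles are already available as arrays, a loop-free alternative is also possible. This is a sketch assuming A, B and C each hold one row per cycle: stack them along a last axis and reshape, so the values interleave in the A, B, C order you described.
import numpy as np

A = np.array([[1, 2, 3, 4], [10, 20, 30, 40]])
B = np.array([[4, 5, 6, 7], [40, 50, 60, 70]])
C = np.array([[8, 9, 10, 11], [80, 90, 100, 110]])

# shape (cycles, n, 3) -> (cycles, n * 3): A(1), B(1), C(1), A(2), B(2), C(2), ...
Results = np.stack([A, B, C], axis=-1).reshape(A.shape[0], -1)
print(Results)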
I have a dataframe with users, score, times, where each user's different scores and the number of times they received it are listed:
user1, 1, 4
user1, 7, 2
user2, 3, 1
user2, 10, 2
and so on.
I'd like to calculate for each user the median of the scores.
For that I guess I should create a row-duplicated df, such as -
user1,1
user1,1
user1,1
user1,1
user1,7
user1,7
user2,3
user2,10
user2,10
and then use groupby and apply to calculate the median somehow?
My questions -
Is this the correct approach? my df is very large so the solution has to be time efficient.
If this is indeed the way to go - can you please advise how? It keeps failing for me whatever I try to do.
I believe you need a weighted median. I used the function weighted_median from here; you can also try wquantile's weighted.median, but it interpolates in a slightly different way, so you may get unexpected results:
import numpy as np
import pandas as pd
# from here: https://stackoverflow.com/a/32921444/3025981, CC BY-SA by Afshin # SE
def weighted_median(values, weights):
    ''' compute the weighted median of values list. The
    weighted median is computed as follows:
    1- sort both lists (values and weights) based on values.
    2- select the 0.5 point from the weights and return the corresponding values as results
    e.g. values = [1, 3, 0] and weights=[0.1, 0.3, 0.6] assuming weights are probabilities.
    sorted values = [0, 1, 3] and corresponding sorted weights = [0.6, 0.1, 0.3] the 0.5 point on
    weight corresponds to the first item which is 0. so the weighted median is 0.'''
    # convert the weights into probabilities
    sum_weights = sum(weights)
    weights = np.array([(w * 1.0) / sum_weights for w in weights])
    # sort values and weights based on values
    values = np.array(values)
    sorted_indices = np.argsort(values)
    values_sorted = values[sorted_indices]
    weights_sorted = weights[sorted_indices]
    # select the median point
    it = np.nditer(weights_sorted, flags=['f_index'])
    accumulative_probability = 0
    median_index = -1
    while not it.finished:
        accumulative_probability += it[0]
        if accumulative_probability > 0.5:
            median_index = it.index
            return values_sorted[median_index]
        elif accumulative_probability == 0.5:
            median_index = it.index
            it.iternext()
            next_median_index = it.index
            return np.mean(values_sorted[[median_index, next_median_index]])
        it.iternext()
    return values_sorted[median_index]
# end from
def wmed(group):
    return weighted_median(group['score'], group['times'])
import pandas as pd
df = pd.DataFrame([
    ['user1', 1, 4],
    ['user1', 7, 2],
    ['user2', 3, 1],
    ['user2', 10, 2]
], columns=['user', 'score', 'times'])
groups = df.groupby('user')
groups.apply(wmed)
# user
# user1 1
# user2 10
# dtype: int64
df = pd.DataFrame({'user': ['user1', 'user1', 'user2', 'user2'],
                   'score': [1, 7, 3, 10],
                   'times': [4, 2, 1, 2]})
# Create dictionary of empty lists keyed on user.
scores = {user: [] for user in df.user.unique()}
# Expand list of scores for each user using a list comprehension.
_ = [scores[row.user].extend([row.score] * row.times) for row in df.itertuples()]
>>> scores
{'user1': [1, 1, 1, 1, 7, 7], 'user2': [3, 10, 10]}
# Now you can use a dictionary comprehension to calculate the median score of each user.
>>> {user: np.median(scores[user]) for user in scores}
{'user1': 1.0, 'user2': 10.0}
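If you prefer to stay inside pandas, here is a sketch of the row-duplication idea from the question: index.repeat expands each row by its times count, and a groupby median then follows directly (this computes the plain median of the expanded rows, which matches the weighted median for this data):
expanded = df.loc[df.index.repeat(df['times'])]
medians = expanded.groupby('user')['score'].median()
print(medians)
# user1     1.0
# user2    10.0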