ContainerAlreadyContains error when doing an optimization with cobra - python

I am trying to run an optimization with cobra to create a coop-medium for two organisms. For this, I set up constraints and objectives like it was explained in the docu. Sadly, my code produces a ContainerAlreadyContains error, and I have no idea where it comes from; an internet search did not help. I was understanding that my newly defined constraints would be added to those already in the model or maybe they would overwrite the old constraints if affecting the same component, or is this incorrect?
I am using two other models, but managed to produce the same error with the cobra testmodel with the code posted below. Using python 3.7.0 and cobra 0.19.0. Hopefully someone can help me; thanks in any case!
import cobra
import copy
import cobra.test
def optimize_model(inmodel, medium, biomass, verbose=False):
model = copy.deepcopy(inmodel)
metabolites = list(medium.keys())
reactions = [r.id for r in model.reactions]
#variables
thetas = {}
for m in metabolites:
var = model.problem.Variable(name="theta_{}".format(m), lb=0, type="binary")
thetas["theta_{}".format(m)] = var
#constraints
constraints = []
for m in metabolites:
try:
const = model.problem.Constraint(model.reactions.get_by_id(m).flux_expression +
model.reactions.get_by_id(m).lower_bound*thetas["theta_"+m],
lb=model.reactions.get_by_id(m).lower_bound,
name="V_COOPM_{}".format(m))
constraints.add_cons_vars(const)
except:
pass
VBM_COMPM = model.optimize().objective_value / 10
cost = model.problem.Constraint(biomass.flux_expression, lb=VBM_COMPM)
constraints.append(cost)
#objective
obj = model.problem.Objective(sum(thetas[t] for t in thetas.keys()),
direction="max")
model.add_cons_vars(constraints)
model.objective = obj
model.solver.update()
status = model.optimize()
medium = {
'EX_ala__L_e': 'Alanine', 'EX_arg__L_e': 'Arginine',
'EX_cys__L_e': 'Cysteine', 'EX_glu__L_e': 'Glutamic acid',
'EX_gly_e': 'Glycine', 'EX_his__L_e': 'Histidine',
'EX_leu__L_e': 'Leucine', 'EX_lys__L_e': 'Lysine', 'EX_orn_e': 'Ornithine',
'EX_phe__L_e': 'Phenylalanine', 'EX_pro__L_e': 'Proline',
'EX_ser__L_e': 'Serine', 'EX_thr__L_e': 'Threonine',
'EX_trp__L_e': 'Tryptophane', 'EX_val__L_e': 'Valine',
'EX_cit_e': 'citric acid', 'EX_fum_e': 'Fumaric acid',
'EX_male_e': 'maleic acid', 'EX_pyr_e': 'pyruvic acid',
'EX_succ_e': 'succinic acid', 'EX_glc__D_e': 'glucose',
'EX_urea_e': 'Urea', 'EX_na1_e': 'Sodium', 'EX_cl_e': 'Chloride',
'EX_k_e': 'Potassium', 'EX_pi_e': 'Phosphate', 'EX_mg2_e': 'Magnesium',
'EX_so4_e': 'Sulphate', 'EX_ca2_e': 'Calcium', 'EX_zn2_e': 'ZnCl2',
'EX_mn2_e': 'MnCl2', 'EX_cobalt2_e': 'CoCl2', 'EX_cu2_e': 'CuCl2',
'EX_ni2_e': 'NiCl2', 'EX_mobd_e': 'MoNa2O4', 'EX_adocbl_e': 'Cyanocobalamine',
'EX_4abz_e': 'p-Aminobenzoic acid', 'EX_btn_e': 'Biotin', 'EX_nac_e': 'Nicotinic acid',
'EX_pnto__R_e': 'Ca-D-Pantothenic acid', 'EX_pydam_e': 'Pyridoxamine-2HCl',
'EX_thm_e': 'Thiamine-dichloride', 'EX_ribflv_e': 'Riboflavin', 'EX_o2_e': 'Oxygen',
'EX_fe2_e': 'Fe3+', 'EX_h2o_e': 'Water', 'EX_co2_e': 'Co2'
}
model = cobra.test.create_test_model("textbook")
for r in model.reactions:
if r.id == "Biomass_Ecoli_core":
biomass = r
break
optimize_model(model, medium, biomass, True)

See here for the change that worked:
https://github.com/opencobra/cobrapy/issues/1026

Related

How to get multiple random values from a specified area in a dictionary

I am trying to create a workout simulator. If the user wants to target two areas, I want to be able to take 3 exercises from each section and then combine them into their own set. How would I take 3 random exercises? I have tried using random.sample with no luck
musclegroup_exercises = {
'legs': {"squat", "calf raises", "hamstring curls", "deadlifts", "walking lunges"},
'chest': {"barbell bench", "pushups", "cable fly", "dumbbell fly", "dumbbell incline bench press"},
'arms': {"bicep curls", "kickbacks", "tricep pushdown", "reverse curls", "hammer curl"},
'shoulders':{"shoulder press", "lateral raise", "barbell shrug", "bent over reverse flys", "push press"},
'back':{"dumbbell rows", "back extension", "pull ups", "lat pull downs", "machine seated row"},
'core':{"sit ups", "crunches", "russian twists", "bicycles", "planks"},
}
print('Here are the possible muscle groups you can target: Legs, Chest, Arms, Shoulders, Back, Core')
print('Here are the possible intensity levels: Easy, Medium, Hard')
num = int(input('Would you like to target one or two muscle groups? '))
if num == 1:
musclegroup = input('What muscle group would you like to target? ')
if num == 2:
musclegroup1 = input('What is the first musclegroup you would like to target? ')
musclegroup2 = input('What is the second musclegroup you would like to target? ')
intensity = input('What intensity level would you like? ')
if intensity == 'Easy':
rate = '65%'
if intensity == 'Medium':
rate = '80%'
if intensity == 'Hard':
rate = '90%'
def createworkout1(y):
for exercise in musclegroup_exercises[musclegroup.lower()]:
print(exercise)
def createworkout2(j,k):
import random
half1 = random.sample(musclegroup_exercises[musclegroup1.lower()].items(),3)
You can use shuffle:
from itertools import chain
from random import shuffle
musclegroup_exercises = {
'legs': {"squat", "calf raises", "hamstring curls", "deadlifts", "walking lunges"},
'chest': {"barbell bench", "pushups", "cable fly", "dumbbell fly", "dumbbell incline bench press"},
'arms': {"bicep curls", "kickbacks", "tricep pushdown", "reverse curls", "hammer curl"},
'shoulders':{"shoulder press", "lateral raise", "barbell shrug", "bent over reverse flys", "push press"},
'back':{"dumbbell rows", "back extension", "pull ups", "lat pull downs", "machine seated row"},
'core':{"sit ups", "crunches", "russian twists", "bicycles", "planks"},
}
def shuffled_group(group):
group = [i for i in group]
shuffle(group)
return group
selected_groups = ["back", "core"] # This should come from your input, though, not be hardcoded.
targets = list(chain(*(shuffled_group(v)[:3] for k, v in musclegroup_exercises.items()) if k in selected_groups))
This could also work, though sample is being deprecated for sets so you have to do some more work:
list(chain(*(sample(list(v), 3) for k, v in musclegroup_exercises.items() if k in selected_groups)))
You can use random.sample(). Pass as parameters the list from which to get the elements and the length of the new list.
import random
musclegroup_exercises = {
"legs":["squat", "calf raises", "hamstring curls", "deadlifts", "walking lunges"],
"chest":["barbell bench", "pushups", "cable fly", "dumbbell fly", "dumbbell incline bench press"],
"arms":["bicep curls", "kickbacks", "tricep pushdown", "reverse curls", "hammer curl"],
"shoulders":["shoulder press", "lateral raise", "barbell shrug", "bent over reverse flys", "push press"],
"back":["dumbbell rows", "back extension", "pull ups", "lat pull downs", "machine seated row"],
"core":["sit ups", "crunches", "russian twists", "bicycles", "planks"],
}
random_exercises = {}
exercises_per_group = 3
for exercise_type in musclegroup_exercises:
exercises = exercise = random.sample(musclegroup_exercises[exercise_type], exercises_per_group)
random_exercises[exercise_type] = exercises
NOTE: Always import the packages you need at the beginning of the program, do not import them in the meantime inside an if or functions.
To take the exercises from only one area, you can simply change the key you use to access to musclegroup_exercises:
import random
musclegroup_exercises = {
"legs":["squat", "calf raises", "hamstring curls", "deadlifts", "walking lunges"],
"chest":["barbell bench", "pushups", "cable fly", "dumbbell fly", "dumbbell incline bench press"],
"arms":["bicep curls", "kickbacks", "tricep pushdown", "reverse curls", "hammer curl"],
"shoulders":["shoulder press", "lateral raise", "barbell shrug", "bent over reverse flys", "push press"],
"back":["dumbbell rows", "back extension", "pull ups", "lat pull downs", "machine seated row"],
"core":["sit ups", "crunches", "russian twists", "bicycles", "planks"],
}
exercises_per_group = 3
exercise_type = "legs"
exercises = random.sample(musclegroup_exercises[exercise_type], exercises_per_group)
Maybe use random.choices since sample is deprecated:
exercices = [j for z in [random.choices(tuple(musclegroup_exercises[i]),k=3) for i in (musclegroup1, musclegroup2)] for j in z]
Or in more understandable form:
exercices = []
for i in (musclegroup1, musclegroup2):
exercices.extend(random.choices(tuple(musclegroup_exercises[i]), k=3))

Difficulties to get the correct posterior value in a Naive Bayes Implementation

For studying purposes, I've tried to implement this "lesson" using python but "without" sckitlearn or something similar.
My attempt code is the follow:
import pandas, math
training_data = [
['A great game','Sports'],
['The election was over','Not sports'],
['Very clean match','Sports'],
['A clean but forgettable game','Sports'],
['It was a close election','Not sports']
]
text_to_predict = 'A very close game'
data_frame = pandas.DataFrame(training_data, columns=['data','label'])
data_frame = data_frame.applymap(lambda s:s.lower() if type(s) == str else s)
text_to_predict = text_to_predict.lower()
labels = data_frame.label.unique()
word_frequency = data_frame.data.str.split(expand=True).stack().value_counts()
unique_words_set = set()
unique_words = data_frame.data.str.split().apply(unique_words_set.update)
total_unique_words = len(unique_words_set)
word_frequency_per_labels = []
for l in labels:
word_frequency_per_label = data_frame[data_frame.label == l].data.str.split(expand=True).stack().value_counts()
for w, f in word_frequency_per_label.iteritems():
word_frequency_per_labels.append([w,f,l])
word_frequency_per_labels_df = pandas.DataFrame(word_frequency_per_labels, columns=['word','frequency','label'])
laplace_smoothing = 1
results = []
for l in labels:
p = []
total_words_in_label = word_frequency_per_labels_df[word_frequency_per_labels_df.label == l].frequency.sum()
for w in text_to_predict.split():
x = (word_frequency_per_labels_df.query('word == #w and label == #l').frequency.to_list()[:1] or [0])[0]
p.append((x + laplace_smoothing) / (total_words_in_label + total_unique_words))
results.append([l,math.prod(p)])
print(results)
result = pandas.DataFrame(results, columns=['labels','posterior']).sort_values('posterior',ascending = False).labels.iloc[0]
print(result)
In the blog lesson their results are:
But my result were:
[['sports', 4.607999999999999e-05], ['not sports', 1.4293831139825827e-05]]
So, what did I do wrong in my python implementation? How can I get the same results?
Thanks in advance
You haven't multiplied by the priors p(Sport) = 3/5 and p(Not Sport) = 2/5. So just updating your answers by these ratios will get you to the correct result. Everything else looks good.
So for example you implement p(a|Sports) x p(very|Sports) x p(close|Sports) x p(game|Sports) in your math.prod(p) calculation but this ignores the term p(Sport). So adding this in (and doing the same for the not sport condition) fixes things.
In code this can be achieved by:
prior = (data_frame.label == l).mean()
results.append([l,prior*math.prod(p)])
the answer by #nick is correct and should be awarded the bounty.
Here an alternative implementation (from scratch, not using pandas) that also supports normalization of probabilities and words not in training set
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Dict, Set
def tokenize(text: str):
return [word.lower() for word in text.split()]
def normalize(result: Dict[str, float]):
total = sum([v for v in result.values()])
for k in result.keys():
result[k] /= total
#dataclass
class Model:
labels: Set[str] = field(default_factory=set)
words: Set[str] = field(default_factory=set)
prob_labels: Dict[str,float] = field(default_factory=lambda: defaultdict(float)) # P(label)
prob_words: Dict[str,Dict[str,float]] = field(default_factory=lambda: defaultdict(lambda: defaultdict(float))) # P(word | label) as prob_words[label][word]
def predict(self, text: str, norm=True) -> Dict[str, float]: # P(label | text) as model.predict(text)[label]
result = {label: self.prob_labels[label] for label in self.labels}
for word in tokenize(text):
for label in self.labels:
if word in self.words:
result[label] *= self.prob_words[label][word]
if norm:
normalize(result)
return result
def train(self, data):
prob_words_denominator = defaultdict(int)
for row in data:
text = row[0]
label = row[1].lower()
self.labels.add(label)
self.prob_labels[label] += 1.0
for word in tokenize(text):
self.words.add(word)
self.prob_words[label][word] += 1.0
prob_words_denominator[label] += 1.0
for label in self.labels:
self.prob_labels[label] /= len(data)
for word in self.words:
self.prob_words[label][word] = (self.prob_words[label][word] + 1.0) / (prob_words_denominator[label] + len(self.words))
training_data = [
['A great game','Sports'],
['The election was over','Not sports'],
['Very clean match','Sports'],
['A clean but forgettable game','Sports'],
['It was a close election','Not sports']
]
text_to_predict = 'A very close game'
model = Model()
model.train(training_data)
print(model.predict(text_to_predict, norm=False))
print(model.predict(text_to_predict))
print(model.predict("none of these words is in training data"))
output:
{'sports': 2.7647999999999997e-05, 'not sports': 5.7175324559303314e-06}
{'sports': 0.8286395560004286, 'not sports': 0.1713604439995714}
{'sports': 0.6, 'not sports': 0.4}

Make sentence from value of dictionary

link for original txt file
https://medusa.ugent.be/en/exercises/187053144/description/wM6YaQUbWdHKPhQX/media/ICD.txt
This is what I got:
given_string = 'You are what you eat.'
dictionary ={'D89.1': 'Cryoglobulinemia', 'M87.332': 'Other secondary osteonecrosis of left radius', 'M25.57': 'Pain in ankle and joints of foot', 'H59.111': 'Intraoperative hemorrhage and hematoma of right eye and adnexa complicating an ophthalmic procedure', 'I82.5Z9': 'Chronic embolism and thrombosis of unspecified deep veins of unspecified distal lower extremity', 'T38.3X': 'Poisoning by, adverse effect of and underdosing of insulin and oral hypoglycemic [antidiabetic] drugs', 'H95.52': 'Postprocedural hematoma of ear and mastoid process following other procedure', 'Q90.1': 'Trisomy 21, mosaicism (mitotic nondisjunction)', 'X83.8': 'Intentional self-harm by other specified means', 'H02.145': 'Spastic ectropion of left lower eyelid', 'M67.341': 'Transient synovitis, right hand', 'P07.32': 'Preterm newborn, gestational age 29 completed weeks', 'R44.8': 'Other symptoms and signs involving general sensations and perceptions', 'R03.1': 'Nonspecific low blood-pressure reading', 'Q03': 'Congenital hydrocephalus', 'C11.0': 'Malignant neoplasm of superior wall of nasopharynx', 'C44.4': 'Other and unspecified malignant neoplasm of skin of scalp and neck', 'N48.5': 'Ulcer of penis', 'T50.2X1': 'Poisoning by carbonic-anhydrase inhibitors, benzothiadiazides and other diuretics, accidental (unintentional)', 'V92.13': 'Drowning and submersion due to being thrown overboard by motion of other powered watercraft', 'D30.0': 'Benign neoplasm of kidney', 'M08.06': 'Unspecified juvenile rheumatoid arthritis, knee', 'T41.5X4': 'Poisoning by therapeutic gases, undetermined', 'T59.3X2': 'Toxic effect of lacrimogenic gas, intentional self-harm', 'S84.91': 'Injury of unspecified nerve at lower leg level, right leg', 'Z80.4': 'Family history of malignant neoplasm of genital organs', 'M05.34': 'Rheumatoid heart disease with rheumatoid arthritis of hand', 'Y36.531': 'War operations involving thermal radiation effect of nuclear weapon, civilian', 'H59.88': 'Other intraoperative complications of eye and adnexa, not elsewhere classified', 'R29.91': 'Unspecified symptoms and signs involving the musculoskeletal system', 'M71.139': 'Other infective bursitis, unspecified wrist', 'S00.441': 'External constriction of right ear', 'V04': 'Pedestrian injured in collision with heavy transport vehicle or bus', 'C92.1': 'Chronic myeloid leukemia, BCR/ABL-positive', 'I82.60': 'Acute embolism and thrombosis of unspecified veins of upper extremity', 'I75.89': 'Atheroembolism of other site', 'S51.031': 'Puncture wound without foreign body of right elbow', 'Z01.110': 'Encounter for hearing examination following failed hearing screening', 'I06.8': 'Other rheumatic aortic valve diseases', 'Z68.25': 'Body mass index (BMI) 25.0-25.9, adult', 'A66': 'Yaws', 'S78.921': 'Partial traumatic amputation of right hip and thigh, level unspecified', 'F44': 'Dissociative and conversion disorders', 'O87.8': 'Other venous complications in the puerperium', 'K04.3': 'Abnormal hard tissue formation in pulp', 'V38.7': 'Person on outside of three-wheeled motor vehicle injured in noncollision transport accident in traffic accident', 'V36.1': 'Passenger in three-wheeled motor vehicle injured in collision with other nonmotor vehicle in nontraffic accident', 'B94.9': 'Sequelae of unspecified infectious and parasitic disease', 'K50.911': "Crohn's disease, unspecified, with rectal bleeding", 'S00.52': 'Blister (nonthermal) of lip and oral cavity', 'T43.1': 'Poisoning by, adverse effect of and underdosing of monoamine-oxidase-inhibitor antidepressants', 'B99.8': 'Other infectious disease', 'S97.12': 'Crushing injury of lesser toe(s)', 'S02.69': 'Fracture of mandible of other specified site', 'V29.10': 'Motorcycle passenger injured in collision with unspecified motor vehicles in nontraffic accident', 'Z68.35': 'Body mass index (BMI) 35.0-35.9, adult', 'A81.2': 'Progressive multifocal leukoencephalopathy', 'V44.4': 'Person boarding or alighting a car injured in collision with heavy transport vehicle or bus', 'M62.51': 'Muscle wasting and atrophy, not elsewhere classified, shoulder', 'M62.151': 'Other rupture of muscle (nontraumatic), right thigh', 'V52.2': 'Person on outside of pick-up truck or van injured in collision with two- or three-wheeled motor vehicle in nontraffic accident', 'E09.622': 'Drug or chemical induced diabetes mellitus with other skin ulcer', 'S43.492': 'Other sprain of left shoulder joint', 'M08.212': 'Juvenile rheumatoid arthritis with systemic onset, left shoulder', 'R00.0': 'Tachycardia, unspecified', 'G21.8': 'Other secondary parkinsonism', 'W58.01': 'Bitten by alligator', 'D46.1': 'Refractory anemia with ring sideroblasts', 'H61.32': 'Acquired stenosis of external ear canal secondary to inflammation and infection', 'H95.0': 'Recurrent cholesteatoma of postmastoidectomy cavity', 'Z72.4': 'Inappropriate diet and eating habits', 'Z68.41': 'Body mass index (BMI) 40.0-44.9, adult', 'S20.172': 'Other superficial bite of breast, left breast', 'I63.232': 'Cerebral infarction due to unspecified occlusion or stenosis of left carotid arteries', 'M14.811': 'Arthropathies in other specified diseases classified elsewhere, right shoulder', 'E13.41': 'Other specified diabetes mellitus with diabetic mononeuropathy', 'H02.53': 'Eyelid retraction', 'V95.49': 'Other spacecraft accident injuring occupant', 'D74.0': 'Congenital methemoglobinemia', 'D60.1': 'Transient acquired pure red cell aplasia', 'T52.1X2': 'Toxic effect of benzene, intentional self-harm', 'O71.2': 'Postpartum inversion of uterus', 'M08.439': 'Pauciarticular juvenile rheumatoid arthritis, unspecified wrist', 'M01.X72': 'Direct infection of left ankle and foot in infectious and parasitic diseases classified elsewhere', 'H95.3': 'Accidental puncture and laceration of ear and mastoid process during a procedure', 'C74.92': 'Malignant neoplasm of unspecified part of left adrenal gland', 'G00': 'Bacterial meningitis, not elsewhere classified', 'M19.011': 'Primary osteoarthritis, right shoulder', 'G72.49': 'Other inflammatory and immune myopathies, not elsewhere classified', 'Z68.34': 'Body mass index (BMI) 34.0-34.9, adult', 'V86.64': 'Passenger of military vehicle injured in nontraffic accident', 'L20.9': 'Atopic dermatitis, unspecified', 'S65.51': 'Laceration of blood vessel of other and unspecified finger', 'B67.1': 'Echinococcus granulosus infection of lung', 'S08.81': 'Traumatic amputation of nose', 'Z36.5': 'Encounter for antenatal screening for isoimmunization', 'S59.22': 'Salter-Harris Type II physeal fracture of lower end of radius', 'M66.359': 'Spontaneous rupture of flexor tendons, unspecified thigh', 'I69.919': 'Unspecified symptoms and signs involving cognitive functions following unspecified cerebrovascular disease', 'I25.700': 'Atherosclerosis of coronary artery bypass graft(s), unspecified, with unstable angina pectoris', 'V24.0': 'Motorcycle driver injured in collision with heavy transport vehicle or bus in nontraffic accident', 'S53.025': 'Posterior dislocation of left radial head', 'Q72.819': 'Congenital shortening of unspecified lower limb', 'G44.82': 'Headache associated with sexual activity', 'M93.2': 'Osteochondritis dissecans', 'V44.6': 'Car passenger injured in collision with heavy transport vehicle or bus in traffic accident', 'O90.89': 'Other complications of the puerperium, not elsewhere classified', 'T83.518': 'Infection and inflammatory reaction due to other urinary catheter', 'Z02.9': 'Encounter for administrative examinations, unspecified', 'S55.091': 'Other specified injury of ulnar artery at forearm level, right arm'}
Each character of the string must be replaced by randomly choosing among all possible Hippocrates-codes that encode the character, and return result contain code where character is in, and index of character in value
so. this is the answer that I supposed to get
A66.0 M62.51.29 V44.6.68 H95.3.70 M08.06.26 S51.031.39 V92.13.17 V95.49.25 P07.32.46 C11.0.44 V04.45 E13.41.30 G21.8.5 R00.0.4 V52.2.54 B67.1.38 V24.0.43 M01.X72.10 C74.92.35 G72.49.35 Z68.41.24
and, this is the answer that i got.
F44.6.4 S78.922.3 W36.1.17 S93.121.2 E10.32.39 A00.1.12 S90.464.3 T37.1X.9 T43.2.17 W24.0.3 Q60.3.5 V59.9.14 S66.911.5 W93.42 V14.1.34 Y92.139.14 T21.06.12 T65.89.6 Q95.3.4 S85.161.16 S93.121.7 T37.1X.18 V49.60.23 T37.1X5.7 F98.29.16 J10.89.14
for get that I wrote code like this
import re
import random
class Hippocrates:
def __init__(self, code):
self.code = code
def description(self, x):
line_list = []
split_point = []
k = []
v = []
with open(self.code) as f:
for line in f:
for i in line:
if i == " ":
split_point.append(line.find(i))
with open(self.code) as f:
for line in f:
line_list.append(line.rstrip())
for i in line_list:
a = i.split(" ", 1)
k.append(a[0])
v.append(a[1])
d = dict(zip(k, v))
for key, value in d.items():
if x == key:
return d[key]
else:
raise ValueError('invalid ICD-code')
def character(self, numb):
line_list = []
split_point = []
k = []
v = []
with open(self.code) as f:
for line in f:
for i in line:
if i == " ":
split_point.append(line.find(i))
with open(self.code) as f:
for line in f:
line_list.append(line.rstrip())
for i in line_list:
a = i.split(" ", 1)
k.append(a[0])
v.append(a[1])
d = dict(zip(k, v))
rev = numb[::-1]
revs = rev.split('.',1)
r1 =(revs[1][::-1])
r2 = (revs[0][::-1])
for key, value in d.items():
if r1 == key:
answer = d[key]
result = answer[int(r2)]
return result
else:
raise ValueError('invalid Hippocrates-code')
def codes(self, char):
line_list = []
split_point = []
k = []
v = []
r_v = []
code_result = []
des_result = []
des_result2 = []
location = []
final = []
with open(self.code) as f:
for line in f:
for i in line:
if i == " ":
split_point.append(line.find(i))
with open(self.code) as f:
for line in f:
line_list.append(line.rstrip())
for i in line_list:
a = i.split(" ", 1)
k.append(a[0])
v.append(a[1])
d = dict(zip(k, v))
for i in v:
for x in i:
if x == char:
r_v.append(i)
for key, value in d.items():
for i in r_v:
if i == value:
code_result.append(key)
for key in d.keys():
for i in code_result:
if i == key:
des_result.append(d[i])
for i in des_result:
if i not in des_result2:
des_result2.append(i)
for i in des_result2:
regex = re.escape(char)
a = [m.start() for m in re.finditer(regex,i)]
location.append(a)
location = (sum(location,[]))
for i in range(len(code_result)):
answer = (str(code_result[i]) +'.'+ str(location[i]))
final.append(answer)
return (set(final))
def encode(self, plaintxt):
line_list = []
split_point = []
#key of dictionary
k = []
#value of dictionary
v = []
#description that contain character with index
r = []
#list of possible choice
t = []
#randomly choosen result from t
li_di = []
#descriptoin
des = []
#index of char in description
index_char = []
#answer to print
resul = []
dictlist = []
answers = []
with open(self.code) as f:
for line in f:
for i in line:
if i == " ":
split_point.append(line.find(i))
with open(self.code) as f:
for line in f:
line_list.append(line.rstrip())
for i in line_list:
a = i.split(" ", 1)
k.append(a[0])
v.append(a[1])
d = dict(zip(k, v))
print(d)
for key, value in d.items():
for i in plaintxt:
if i in value:
answer = d[key] +':'+ str(d[key].index(i))
r.append(answer)
print(r)
a = len(plaintxt)
b=0
for i in range(len(r)):
t.append(r[b::a])
b+=1
if b == len(plaintxt):
break
for i in t:
li_di.append(random.choice(i))
for i in li_di:
sep = i.split(":", 1)
des.append(sep[0])
index_char.append(sep[1])
print(index_char)
for i in des:
for key, value in d.items():
if i == value:
resul.append(key)
print(resul)
for i in range(len(resul)):
answers.append(resul[i]+'.'+index_char[i]+'')
return(" ".join(answers))
the codes that represent character in given_string should be in same order with, original given string, but i messed it up. how can i fix this?
This should work for your encode function:
def encode(self, plaintxt):
code_map = {}
codes = []
with open(self.code) as f:
for line in f:
line = line.rstrip().split(' ', 1)
code_map[line[0]] = line[1]
for ch in plaintxt:
matches = []
for key, value in code_map.items():
pos = -1
while True:
pos = value.find(ch, pos + 1)
if pos != -1:
matches.append((key, pos))
else:
break
if not matches:
raise ValueError(f'Character {ch} cannot be encoded as there are no matches')
code_tuple = random.choice(matches)
code, idx = code_tuple
codes.append(f'{code}.{idx}')
return ' '.join(codes)
Edit: I updated this to make it more space-efficient, by getting rid of char_map and appending codes as it goes
First, it creates a dict of keys as codes and values as the corresponding strings. Then it iterates through the given plaintxt string, and searches all of the values of the dict for matches (including multiple matches in a single value), and adds this to a matches list of tuples, where each tuple contains a suitable code and the index of the match. If there are no matches, it raises a ValueError as soon as it runs into an issue. It chooses randomly from each list of tuples to choose some code and index pair, and appends this to a list on the fly, and then at the end it joins this list to make your encoded string.
If memory is not a problem, I think you should build an index of possible choices of each character from the dictionary. Here is an example code:
import random
def build_char_codes(d):
result = {}
for key, val in d.items():
for i in range(len(val)):
ch = val[i]
if ch not in result:
result[ch] = {key: [i]}
else:
result[ch][key] = result[ch].get(key, []) + [i]
return result
def get_code(ch, char_codes):
key = random.sample(char_codes[ch].keys(), 1)[0]
char_pos = random.choice(char_codes[ch][key])
code = '{}.{}'.format(key, char_pos)
return code
char_codes = build_char_codes(dictionary)
given_string = 'You are what you eat.'
codes = [get_code(ch, char_codes) for ch in given_string]
print(' '.join(codes))
Notes:
char_codes index all possible choices of each character in the dictionary
it sample all the key in dictionary first (uniformly random), and then it sample the position in the string (uniformly random). But it is not sampling uniformly among all the possible choices of a character.
In preparation for the transformation, you could create a dictionary with each letter in the ICD description mapping to a list of codes that contain it at various indexes.
Then, the transformation process would simply be a matter of picking one of the code.index from the entry in the dictionary for each letter in the given string:
preparation ...
with open(fileName,'r') as f:
icd = [line.split(" ",1) for line in f.read().split("\n")]
icdLetters = dict() # list of ICD codes with index for each possible letter
for code,description in icd:
for i,letter in enumerate(description):
icdLetters.setdefault(letter,[]).append(f"{code}.{i}")
transformation....
import random
given_string = 'You are what you eat.'
result = [ random.choice(icdLetters.get(c,["-"])) for c in given_string ]
output:
print(result)
['A66.0', 'T80.22.35', 'S53.136.34', 'C40.90.33', 'S53.136.43', 'Z96.621.12', 'B57.30.24', 'H59.121.55', 'V14.1.43', 'S93.121.47', 'H59.121.9', 'V04.92.17', 'T80.22.80', 'O16.1.22', 'T25.61.10', 'S53.136.34', 'F44.6.32', 'M67.232.29', 'M89.771.34', 'S93.121.7', 'Z68.36.29']
If you want to save some memory, your dictionary could store indexes in the main list of icd codes and descriptions instead of the formatted values:
with open(fileName,'r') as f:
icd = [line.split(" ",1) for line in f.read().split("\n")]
icdLetters = dict()
for codeIndex,(code,description) in enumerate(icd):
for letterIndex,letter in enumerate(description):
icdLetters.setdefault(letter,[]).append((codeIndex,letterIndex))
import random
def letterToCode(letter):
if letter not in icdLetters: return "-"
codeIndex,letterIndex = random.choice(icdLetters[letter])
return f"{icd[codeIndex][0]}.{letterIndex}"
given_string = 'You are what you eat.'
result = [ letterToCode(c) for c in given_string ]

How do I consolidate my code to have one variable that changes for each input?

I'm building a program that will take the skillsets of different candidates for a given job, and check to see if they have the required skills. I have figured out how to make this work, but I don't know how to do it without writing "candidate1", "candidate2" etc. Is there a more efficient way to do this?:
list_of_qualities = ['Experience in Cold Calling', 'Experience in Door to
Door Sales', 'Experience in Account Management','Experience in Warm Leads','Experience in Presenting', 'Experience in Negotiation',\'Experience in Leadership', 'Experience in Closing']
cold_calling = list_of_qualities[0]
door_to_door = list_of_qualities[1]
account_management = list_of_qualities[2]
warm_leads = list_of_qualities[3]
presenting = list_of_qualities[4]
negotiation = list_of_qualities[5]
leadership = list_of_qualities[6]
closing = list_of_qualities[7]
required_qualities = [cold_calling, presenting, account_management, leadership, closing]
candidate1 = [cold_calling, presenting, account_management, leadership, closing, door_to_door]
candidate2 = [cold_calling, warm_leads, account_management, leadership]
candidate3 = [cold_calling, account_management]
matched_qualities1 = []
matched_qualities2 = []
matched_qualities3 = []
lacking_qualities1 = []
lacking_qualities2 = []
lacking_qualities3 = []
print("To view and apply for your job, candidates must have the following skillset:")
print(required_qualities)
print(" ")
print("The candidates have the following matching skills:")
Candidate 1
for i in candidate1:
if i in required_qualities:
matched_qualities1.append(i)
print("Candidate 1:", matched_qualities1)
for i in required_qualities:
if i not in candidate1:
lacking_qualities1.append(i)
Check if candidate 1 has all skills or not
if len(lacking_qualities1) == 0:
print(" This candidate has all of the required skills")
else:
print(" lacking:", lacking_qualities1)
There are several options. You can have a list of lists, or a dataframe. An array is also an option, but not really appropriate. Some other things to simplify your code:
You can do cold_calling,door_to_door,account_management,warm_leads,presenting,negotiation,
leadership,closing = list_of_qualities
Also, you can replace your for-loop with lacking_qualities1=[quality in required_qualities if not quality in candidate1]
If you create a list of candidates, you can do
lacking_qualities_list_of_list=[
[quality in required_qualities if not quality in candidate]
for candidate in list_of_candidates]

How to speed up ElasticSearch indexing?

I am a beginner with elasticsearch and i have to write 1-million random events into an Elastic search cluster (hosted on the cloud), with a python script...
es = Elasticsearch(
[host_name],
port=9243,
http_auth=("*****","*******"),
use_ssl=True,
verify_certs=True,
ca_certs=certifi.where(),
sniff_on_start=True
)
Here's my code for the indexing:
for i in range(1000000):
src_centers=['data center a','data center b','data center c','data center d','data center e']
transfer_src = np.random.choice(src_centers, p=[0.3, 0.175, 0.175, 0.175, 0.175])
dst_centers = [x for x in src_centers if x != transfer_src]
transfer_dst = np.random.choice(dst_centers)
final_transfer_status = ['transfer-success','transfer-failure']
transfer_starttime = generate_timestamp()
file_size=random.choice(range(1024,10000000000))
ftp={
'event_type': 'transfer-queued',
'uuid': uuid.uuid4(),
'src_site' : transfer_src,
'dst_site' : transfer_dst,
'timestamp': transfer_starttime,
'bytes' : file_size
}
print(i)
es.index(index='ft_initial', id=(i+1), doc_type='initial_transfer_details', body= ftp)
transfer_status = ['transfer-success', 'transfer-failure']
final_status = np.random.choice(transfer_status, p=[0.95,0.05])
ftp['event_type'] = final_status
if (final_status=='transfer-failure'):
time_delay = 10
else :
time_delay = int(transfer_time(file_size)) # ranges roughly from 0-10000 s
ftp['timestamp'] = transfer_starttime + timedelta(seconds=time_delay)
es.index(index='ft_final', id=(i+1), doc_type='final_transfer_details', body=ftp)
Is there any alternate way to speed up the process??
Any help/pointers will be appreciated. Thanks.
Use bulks, otherwise you have a lot of overhead for each single request: https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html
Change the refresh rate, ideally disable it totally until you're done: https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-update-settings.html#bulk
Use monitoring (there's a free basic license) to see what is actually the bottleneck (IO, memory, CPU): https://www.elastic.co/guide/en/x-pack/current/xpack-monitoring.html

Categories

Resources