Python: Replace value with the value from a dictionary key value pair - python

I have been racking my brain on this for hours now. I'm trying to replace the offense number which is 1-30 to its corresponding offense type i.e. stealing, embezzlement, Burglary, etc. and then sort that into a list.
Here is a sample of the output I currently have:
offense # : Victim Total
1 189
10 712
11 1844
12 184
13 147
14 4364
15 595
16 175
17 387
18 2893
2 597
20 661
Here is what code I have thus far. The offense_map dictionary is what I would like to use to replace the 1-30 in the output to the offense type. Then sort the list in descending order from the largest victim count (right column) to the least. I am working with ~100,000 rows of data so efficiency is important for this program.
from collections import Counter

# Build {report_no: [zip_code, offense, 'VIC', 'VIC', ...]} from incidents.csv,
# then append one 'VIC' marker per victim row found in details.csv.
crime_dict = dict()
with open('incidents.csv', mode="r") as incidents_f:
    for line in incidents_f:
        line_1st = line.strip().split(",")
        if line_1st[0].upper() == "REPORT_NO":  # header row
            continue
        report_no = line_1st[0]
        offense = line_1st[3]
        zip_code = line_1st[4]
        if len(zip_code) < 5:  # normalize short/missing ZIP codes
            zip_code = "99999"
        if report_no in crime_dict:
            # BUG in original: list.append() returns None, so chaining
            # .append(zip_code).append(offense) raised AttributeError.
            crime_dict[report_no].extend([zip_code, offense])
        else:
            crime_dict[report_no] = [zip_code, offense]
# (the original wrote `incidents_f.close` without parentheses, which never
# closed the file; the with-statements above and below handle that now)

with open('details.csv', mode='r') as details_f:
    for line in details_f:
        line_1st = line.strip().split(",")
        if line_1st[0].upper() == "REPORT_NO":
            continue
        report_no = line_1st[0]
        involvement = line_1st[1]
        if involvement.upper() != 'VIC':  # only victims are counted
            continue
        crime_dict[report_no].append(involvement.upper())

offense_map = {'1':'Homicide','2':'Rape','3':'Robbery','4':'Assault','5':'Burglary','6':'Stealing','7':'Auto Theft','8':'Non Agg Assault','9':'Arson','10':'Forgery','11':'Fraud','12':'Embezzlement','13':'Stolen Property','14':'Property Damage','15':'Weapons Law Violation','16':'Prostitution','17':'Sex Offense Other','18':'Possession/Sale/Dist','20':'Family Offense','21':'DUI','22':'Liquor Law Violation','24':'Disorderly','25':'Loitering','26':'Misc Violation','29':'Missing/Runaway','30':'Casualty/Suicide'}

# Tally victims per offense number. v[1] is the offense code, not a ZIP, so
# the loop variable is named accordingly (the original shadowed builtin zip).
victims_by_offense = {}
for report_no, v in crime_dict.items():
    offense = v[1]
    if offense not in victims_by_offense:
        victims_by_offense[offense] = 0
    victims_by_offense[offense] += v.count('VIC')
for offense in sorted(victims_by_offense):
    print(offense, victims_by_offense[offense])

To get a list of keys in victims_by_offense in descending order of Victim Total:
# Offense-number -> victim-total mapping, hard-coded from the run above.
victims_by_offense = {
    '1': 189, '10': 712, '11': 1844, '12': 184, '13': 147, '14': 4364,
    '15': 595, '16': 175, '17': 387, '18': 2893, '2': 597, '20': 661,
}
# Offense numbers ordered from the largest victim total down to the smallest.
sorted_keys = sorted(victims_by_offense,
                     key=lambda offense: victims_by_offense[offense],
                     reverse=True)
Then
# Print offense names (looked up via offense_map) with their victim totals,
# largest first; note the loop variable `zip` shadows the builtin zip().
for zip in sorted_keys:
print(offense_map[zip], victims_by_offense[zip])
I get
('Property Damage', 4364)
('Possession/Sale/Dist', 2893)
('Fraud', 1844)
('Forgery', 712)
('Family Offense', 661)
('Rape', 597)
('Weapons Law Violation', 595)
('Sex Offense Other', 387)
('Homicide', 189)
('Embezzlement', 184)
('Prostitution', 175)
('Stolen Property', 147)
('Homicide', 189)
('Embezzlement', 184)
('Prostitution', 175)
('Stolen Property', 147)

I tweaked your code a bit to use csv.reader objects instead of stripping and splitting yourself, as well as changed your data structure to be
crimes = {report_no: {'offense': offense_number,
'zip': zip_code,
'victims': victim_count},
...}
but I think it works much better this way.
import csv
# BUG in the original: `import itemgetter` fails (itemgetter is not a module);
# it lives in the operator module, and the sort below used operator.itemgetter
# without ever importing operator.
from operator import itemgetter

# crimes = {report_no: {'offense': offense_number, 'zip': zip_code, 'victims': n}}
crimes = dict()

# Build `crimes` with zero-count victims from the incident rows.
with open("incidents.csv") as f:
    reader = csv.reader(f)
    headers = next(reader)  # skip the header row
    for report_no, _, _, offense, zip_code, *_ in reader:
        if len(zip_code) < 5:  # normalize short/missing ZIP codes
            zip_code = "99999"
        # (dropped the original's unused `report = (zip_code, offense)` tuple)
        crimes[report_no] = {'offense': offense,
                             'zip': zip_code,
                             'victims': 0}

# Parse victims information: one increment per VIC row.
with open("details.csv") as f:
    reader = csv.reader(f)
    headers = next(reader)
    for report_no, involvement, *_ in reader:
        if involvement.upper() == "VIC":
            crimes[report_no]['victims'] += 1

offense_map = {'1': 'Homicide',
               '2': 'Rape',
               '3': 'Robbery',
               '4': 'Assault',
               '5': 'Burglary',
               '6': 'Stealing',
               '7': 'Auto Theft',
               '8': 'Non Agg Assault',
               '9': 'Arson',
               '10': 'Forgery',
               '11': 'Fraud',
               '12': 'Embezzlement',
               '13': 'Stolen Property',
               '14': 'Property Damage',
               '15': 'Weapons Law Violation',
               '16': 'Prostitution',
               '17': 'Sex Offense Other',
               '18': 'Possession/Sale/Dist',
               '20': 'Family Offense',
               '21': 'DUI',
               '22': 'Liquor Law Violation',
               '24': 'Disorderly',
               '25': 'Loitering',
               '26': 'Misc Violation',
               '29': 'Missing/Runaway',
               '30': 'Casualty/Suicide'}

counts = {name: 0 for name in offense_map.values()}
# Accumulate victims per offense *name* rather than offense number.
for crime_info in crimes.values():  # the original line was missing its ':'
    try:
        offense_name = offense_map[crime_info['offense']]
        counts[offense_name] += crime_info['victims']
    except KeyError:
        # offense number with no entry in offense_map
        print("No such offense: {}".format(crime_info['offense']))

# Largest victim totals first.
for k, v in sorted(counts.items(), key=itemgetter(1), reverse=True):
    print(k, v)

Related

How to add on dictionaries?

Sorry for the vague question, I really don't know what to make of this.
This is my code:
# Sample data; written to area.txt so the rest of the script has a file to read.
area = """AREA,POPULATION,CHILD
ARKANSAS,2000,20
TEXAS,50,5"""

def createFiles():
    """Write the sample `area` text out to area.txt."""
    # the original opened/closed the file by hand; a with-block cannot leak it
    with open('area.txt', 'w') as x:
        x.write(area)

createFiles()

city = {}
total = 0
# Parse area.txt into {place: {'Population': int, 'Children': int}}.
with open('area.txt', 'r') as file:
    next(file)  # skip the AREA,POPULATION,CHILD header line
    for line in file:
        data = line.strip().split(',')
        place = data[0]
        city[place] = {}
        city[place]['Population'] = int(data[1])
        city[place]['Children'] = int(data[2])
print(city)

choose = input('Choose area ARKANSAS/TEXAS: ')
addPopu = input('Add population: ')
addChild = input('Add Children: ')
For reasons, the variable "area" is going to be a txt file named "area.txt"
for example I choose area TEXAS, TEXAS has 50 Population and 5 Children.
I add 10 in Population and 5 on Children so it's value will change to 60 Population and 10 children, How do I do this?
my Output should look like this:
{'ARKANSAS': {'Population': 2000, 'Children': 20}, 'TEXAS': {'Population': 50, 'Children': 5}}
Choose area ARKANSAS/TEXAS: TEXAS
Add population: 10
Add Children: 5
{'ARKANSAS': {'Population': 2000, 'Children': 20}, 'TEXAS': {'Population': 60, 'Children': 10}}
This should do that
# Fold the user's increments into the chosen area's record.
city[choose]['Population'] += int(addPopu)
city[choose]['Children'] += int(addChild)

# Rewrite area.txt so the updated totals persist across runs.
with open('area.txt', 'w') as file:
    rows = ["AREA,POPULATION,CHILD\n"]
    for place in city:
        rows.append(f"{place},{city[place]['Population']},{city[place]['Children']}\n")
    file.writelines(rows)
print(city)

How do I fix: "TypeError: cannot unpack non-iterable NoneType object"

I've been searching through stackoverflow and other various sites, but I've been unable to resolve this error for about a week now.
I'm trying the get the minimum and maximum values from each country within the dictionary. The key of the dictionary is the region. I'm unsure of where the type error is but, I'd appreciate it if someone could help.
Here's the error:
min_tup, max_tup = get_min_max(D,region,option)
File "proj08.py", line 107, in get_min_max
return min[0], max[0]
UnboundLocalError: local variable 'min' referenced before assignment
Here's the sample input:
Region,option: North America , 2
Here's the documentation explaining the function and .csv
https://www.cse.msu.edu/~cse231/Online/Projects/Project08/Project08.pdf
https://www.cse.msu.edu/~cse231/Online/Projects/Project08/data_short.csv
Here's the code:
import csv
from operator import itemgetter
# do NOT import sys
# The seven world regions the program accepts.
REGION_LIST = ['East Asia & Pacific',
               'Europe & Central Asia',
               'Latin America & Caribbean',
               'Middle East & North Africa',
               'North America',
               'South Asia',
               'Sub-Saharan Africa']

# Menu prompt. In the original paste this string literal was split across raw
# source lines (a SyntaxError); it is rejoined into one literal here.
PROMPT = ("\nSpecify a region from this list or 'q' to quit -- \n"
          "East Asia & Pacific,Europe & Central Asia,Latin America & Caribbean,"
          "Middle East & North Africa,North America,South Asia,"
          "Sub-Saharan Africa: ")
def open_file():
    """Keep prompting for a filename until one opens successfully; return the handle."""
    while True:
        filename = input("Input a file: ")
        try:
            return open(filename, "r")
        except FileNotFoundError:
            print("Invalid filename, please try again.")
def read_file(fp):
    """Parse an open CSV file of country data into a dict keyed by region.

    Returns {region: [[country, electricity, fertility, gdp, life_expectancy], ...]}.
    Rows with any empty field, a missing column, or non-numeric data are skipped.
    """
    reader = csv.reader(fp)
    next(reader, None)  # skip the header row
    regions = dict()
    for line in reader:
        if "" in line:  # any empty field disqualifies the row
            continue
        try:
            country = line[0].strip()
            electricity = float(line[2])
            fertility = float(line[3])
            # BUG in original: this line read `gdp = float(line[4])` but the
            # row list used the undefined name GDP, raising NameError (which
            # the except clauses below did not catch).
            gdp = float(line[4])
            life_expectancy = float(line[5])
            region = line[6]
        except (IndexError, ValueError):
            continue
        row = [country, electricity, fertility, gdp, life_expectancy]
        regions.setdefault(region, []).append(row)
    return regions
def get_min_max(Dict, region, option):
    """Return ((country, value), (country, value)) for the min and max of a column.

    option (int or numeric str) selects the value within each country row:
    1 = electricity access, 2 = fertility rate, 3 = GDP per capita,
    4 = life expectancy.  Returns None when the region is unknown/empty or
    the option is invalid, so callers must check before unpacking.
    """
    try:
        column = int(option)  # main() passes the menu choice as a string
    except (TypeError, ValueError):
        return None
    if column not in (1, 2, 3, 4) or not Dict.get(region):
        return None
    # Original bugs fixed here: tuple(k, x) raised TypeError (tuple takes one
    # argument), `min`/`max` shadowed the builtins and could be referenced
    # before assignment, the region was tested against k[0] (the key's first
    # character), and int options never matched the string argument.
    pairs = sorted(((row[0], row[column]) for row in Dict[region]),
                   key=itemgetter(1))
    return pairs[0], pairs[-1]
# NOTE(review): quoted as posted — the indentation was flattened and the two
# format strings below were split mid-literal by the paste, so this function
# is a SyntaxError as shown.  Comments only; code left byte-identical.
def display_all_countries(D, region):
if region in REGION_LIST:
# contradictory guard: 'all' is not in REGION_LIST, so this branch can never run
if region == 'all':
print("\nDisplaying {} Region:\n".format(region))
print("{:32s}{:>20s}{:>20s}{:>17s}{:>18s}".format(
"Country", "Electricity Access", "Fertility rate", "GDP
per capita", "Life expectancy"))
for k, v in D.items():
if region in v[0]:
country = v[0]
electricity = v[1]
fertility = v[2]
gdp = v[3]
life = v[4]
tup = (country, electricity, fertility, gdp, life)
# sorted() returns a new list; this result is discarded
sorted(tup, key=itemgetter(0), reverse=True)
print("{:32s}{:>20.2f}{:>20.2f}{:>17.2f}
{:>18.2f}".format(
tup[0], tup[1], tup[2], tup[3], tup[4]))
if region not in REGION_LIST:
return None
# Stub for menu option 6 (Top 10 countries in the world by GDP per capita);
# not yet implemented.
def get_top10(D):
pass
# Course-provided helper (its docstring says DO NOT CHANGE): prints the
# six-entry menu shown before each prompt.
def display_options():
"""
DO NOT CHANGE
Display menu of options for program
"""
OPTIONS = """\nMenu
1: Minimum and Maximum Countries Access to Electricity
2: Minimum and Maximum Countries Fertility Rate
3: Minimum and Maximum Countries GDP per Capita
4: Minimum and Maximum Countries Life Expectancy
5: List of countries in a region
6: Top 10 Countries in the world by GDP per Capita\n"""
print(OPTIONS)
def main():
    """Entry point: load the CSV, then fetch min/max fertility for North America."""
    file = open_file()
    # The interactive menu loop from the assignment is stubbed out below.  In
    # the original paste the '#' markers were lost on the wrapped lines, which
    # made the file a SyntaxError; restored as proper comments here.
    # while True:
    #     user = input(PROMPT)
    #     if user == "Q" or user == "q":
    #         break
    #     if user in REGION_LIST:
    #         print("\nRegion: ".format(user))
    #         display_options()
    region = 'North America'
    option = '2'  # NOTE: passed as a string; get_min_max must tolerate that
    superD = read_file(file)
    mina = get_min_max(superD, region, option)
    # print(mina)

if __name__ == '__main__':
    main()
The error is telling you that you can't use unpacking assignment such as
x, y = function()
Because the function returned something that can't be unpacked (None, in this case)
This means that your function returned None somehow. We can't say for sure without a reusable example, but I would guess that its because of the first if condition in your function, which can return None.
Although it is allowed, it is generally not a great idea to have multiple different return types in a python function. This is because the caller has to know how to handle different things that the function might do, instead of being able to trust that the function will work and give them a good answer (assuming of course, that they are using correct inputs.)

Make sentence from value of dictionary

link for original txt file
https://medusa.ugent.be/en/exercises/187053144/description/wM6YaQUbWdHKPhQX/media/ICD.txt
This is what I got:
given_string = 'You are what you eat.'
dictionary ={'D89.1': 'Cryoglobulinemia', 'M87.332': 'Other secondary osteonecrosis of left radius', 'M25.57': 'Pain in ankle and joints of foot', 'H59.111': 'Intraoperative hemorrhage and hematoma of right eye and adnexa complicating an ophthalmic procedure', 'I82.5Z9': 'Chronic embolism and thrombosis of unspecified deep veins of unspecified distal lower extremity', 'T38.3X': 'Poisoning by, adverse effect of and underdosing of insulin and oral hypoglycemic [antidiabetic] drugs', 'H95.52': 'Postprocedural hematoma of ear and mastoid process following other procedure', 'Q90.1': 'Trisomy 21, mosaicism (mitotic nondisjunction)', 'X83.8': 'Intentional self-harm by other specified means', 'H02.145': 'Spastic ectropion of left lower eyelid', 'M67.341': 'Transient synovitis, right hand', 'P07.32': 'Preterm newborn, gestational age 29 completed weeks', 'R44.8': 'Other symptoms and signs involving general sensations and perceptions', 'R03.1': 'Nonspecific low blood-pressure reading', 'Q03': 'Congenital hydrocephalus', 'C11.0': 'Malignant neoplasm of superior wall of nasopharynx', 'C44.4': 'Other and unspecified malignant neoplasm of skin of scalp and neck', 'N48.5': 'Ulcer of penis', 'T50.2X1': 'Poisoning by carbonic-anhydrase inhibitors, benzothiadiazides and other diuretics, accidental (unintentional)', 'V92.13': 'Drowning and submersion due to being thrown overboard by motion of other powered watercraft', 'D30.0': 'Benign neoplasm of kidney', 'M08.06': 'Unspecified juvenile rheumatoid arthritis, knee', 'T41.5X4': 'Poisoning by therapeutic gases, undetermined', 'T59.3X2': 'Toxic effect of lacrimogenic gas, intentional self-harm', 'S84.91': 'Injury of unspecified nerve at lower leg level, right leg', 'Z80.4': 'Family history of malignant neoplasm of genital organs', 'M05.34': 'Rheumatoid heart disease with rheumatoid arthritis of hand', 'Y36.531': 'War operations involving thermal radiation effect of nuclear weapon, civilian', 'H59.88': 'Other intraoperative complications 
of eye and adnexa, not elsewhere classified', 'R29.91': 'Unspecified symptoms and signs involving the musculoskeletal system', 'M71.139': 'Other infective bursitis, unspecified wrist', 'S00.441': 'External constriction of right ear', 'V04': 'Pedestrian injured in collision with heavy transport vehicle or bus', 'C92.1': 'Chronic myeloid leukemia, BCR/ABL-positive', 'I82.60': 'Acute embolism and thrombosis of unspecified veins of upper extremity', 'I75.89': 'Atheroembolism of other site', 'S51.031': 'Puncture wound without foreign body of right elbow', 'Z01.110': 'Encounter for hearing examination following failed hearing screening', 'I06.8': 'Other rheumatic aortic valve diseases', 'Z68.25': 'Body mass index (BMI) 25.0-25.9, adult', 'A66': 'Yaws', 'S78.921': 'Partial traumatic amputation of right hip and thigh, level unspecified', 'F44': 'Dissociative and conversion disorders', 'O87.8': 'Other venous complications in the puerperium', 'K04.3': 'Abnormal hard tissue formation in pulp', 'V38.7': 'Person on outside of three-wheeled motor vehicle injured in noncollision transport accident in traffic accident', 'V36.1': 'Passenger in three-wheeled motor vehicle injured in collision with other nonmotor vehicle in nontraffic accident', 'B94.9': 'Sequelae of unspecified infectious and parasitic disease', 'K50.911': "Crohn's disease, unspecified, with rectal bleeding", 'S00.52': 'Blister (nonthermal) of lip and oral cavity', 'T43.1': 'Poisoning by, adverse effect of and underdosing of monoamine-oxidase-inhibitor antidepressants', 'B99.8': 'Other infectious disease', 'S97.12': 'Crushing injury of lesser toe(s)', 'S02.69': 'Fracture of mandible of other specified site', 'V29.10': 'Motorcycle passenger injured in collision with unspecified motor vehicles in nontraffic accident', 'Z68.35': 'Body mass index (BMI) 35.0-35.9, adult', 'A81.2': 'Progressive multifocal leukoencephalopathy', 'V44.4': 'Person boarding or alighting a car injured in collision with heavy transport vehicle 
or bus', 'M62.51': 'Muscle wasting and atrophy, not elsewhere classified, shoulder', 'M62.151': 'Other rupture of muscle (nontraumatic), right thigh', 'V52.2': 'Person on outside of pick-up truck or van injured in collision with two- or three-wheeled motor vehicle in nontraffic accident', 'E09.622': 'Drug or chemical induced diabetes mellitus with other skin ulcer', 'S43.492': 'Other sprain of left shoulder joint', 'M08.212': 'Juvenile rheumatoid arthritis with systemic onset, left shoulder', 'R00.0': 'Tachycardia, unspecified', 'G21.8': 'Other secondary parkinsonism', 'W58.01': 'Bitten by alligator', 'D46.1': 'Refractory anemia with ring sideroblasts', 'H61.32': 'Acquired stenosis of external ear canal secondary to inflammation and infection', 'H95.0': 'Recurrent cholesteatoma of postmastoidectomy cavity', 'Z72.4': 'Inappropriate diet and eating habits', 'Z68.41': 'Body mass index (BMI) 40.0-44.9, adult', 'S20.172': 'Other superficial bite of breast, left breast', 'I63.232': 'Cerebral infarction due to unspecified occlusion or stenosis of left carotid arteries', 'M14.811': 'Arthropathies in other specified diseases classified elsewhere, right shoulder', 'E13.41': 'Other specified diabetes mellitus with diabetic mononeuropathy', 'H02.53': 'Eyelid retraction', 'V95.49': 'Other spacecraft accident injuring occupant', 'D74.0': 'Congenital methemoglobinemia', 'D60.1': 'Transient acquired pure red cell aplasia', 'T52.1X2': 'Toxic effect of benzene, intentional self-harm', 'O71.2': 'Postpartum inversion of uterus', 'M08.439': 'Pauciarticular juvenile rheumatoid arthritis, unspecified wrist', 'M01.X72': 'Direct infection of left ankle and foot in infectious and parasitic diseases classified elsewhere', 'H95.3': 'Accidental puncture and laceration of ear and mastoid process during a procedure', 'C74.92': 'Malignant neoplasm of unspecified part of left adrenal gland', 'G00': 'Bacterial meningitis, not elsewhere classified', 'M19.011': 'Primary osteoarthritis, right 
shoulder', 'G72.49': 'Other inflammatory and immune myopathies, not elsewhere classified', 'Z68.34': 'Body mass index (BMI) 34.0-34.9, adult', 'V86.64': 'Passenger of military vehicle injured in nontraffic accident', 'L20.9': 'Atopic dermatitis, unspecified', 'S65.51': 'Laceration of blood vessel of other and unspecified finger', 'B67.1': 'Echinococcus granulosus infection of lung', 'S08.81': 'Traumatic amputation of nose', 'Z36.5': 'Encounter for antenatal screening for isoimmunization', 'S59.22': 'Salter-Harris Type II physeal fracture of lower end of radius', 'M66.359': 'Spontaneous rupture of flexor tendons, unspecified thigh', 'I69.919': 'Unspecified symptoms and signs involving cognitive functions following unspecified cerebrovascular disease', 'I25.700': 'Atherosclerosis of coronary artery bypass graft(s), unspecified, with unstable angina pectoris', 'V24.0': 'Motorcycle driver injured in collision with heavy transport vehicle or bus in nontraffic accident', 'S53.025': 'Posterior dislocation of left radial head', 'Q72.819': 'Congenital shortening of unspecified lower limb', 'G44.82': 'Headache associated with sexual activity', 'M93.2': 'Osteochondritis dissecans', 'V44.6': 'Car passenger injured in collision with heavy transport vehicle or bus in traffic accident', 'O90.89': 'Other complications of the puerperium, not elsewhere classified', 'T83.518': 'Infection and inflammatory reaction due to other urinary catheter', 'Z02.9': 'Encounter for administrative examinations, unspecified', 'S55.091': 'Other specified injury of ulnar artery at forearm level, right arm'}
Each character of the string must be replaced by randomly choosing among all possible Hippocrates-codes that encode that character; each token in the result consists of the code whose description contains the character, followed by the index of the character within that description.
so. this is the answer that I supposed to get
A66.0 M62.51.29 V44.6.68 H95.3.70 M08.06.26 S51.031.39 V92.13.17 V95.49.25 P07.32.46 C11.0.44 V04.45 E13.41.30 G21.8.5 R00.0.4 V52.2.54 B67.1.38 V24.0.43 M01.X72.10 C74.92.35 G72.49.35 Z68.41.24
And this is the answer that I got:
F44.6.4 S78.922.3 W36.1.17 S93.121.2 E10.32.39 A00.1.12 S90.464.3 T37.1X.9 T43.2.17 W24.0.3 Q60.3.5 V59.9.14 S66.911.5 W93.42 V14.1.34 Y92.139.14 T21.06.12 T65.89.6 Q95.3.4 S85.161.16 S93.121.7 T37.1X.18 V49.60.23 T37.1X5.7 F98.29.16 J10.89.14
To get that, I wrote code like this:
import re
import random
# NOTE(review): quoted as posted — the indentation was flattened by the paste,
# so this class is not runnable as shown.  Comments only; code byte-identical.
# Every method re-reads and re-parses the ICD file from scratch; `split_point`
# is populated in each method but never used afterwards.
class Hippocrates:
# Store the path of the ICD code file.
def __init__(self, code):
self.code = code
# Return the description text for the exact ICD code `x`; otherwise raise.
def description(self, x):
line_list = []
split_point = []
k = []
v = []
with open(self.code) as f:
for line in f:
for i in line:
if i == " ":
split_point.append(line.find(i))
with open(self.code) as f:
for line in f:
line_list.append(line.rstrip())
for i in line_list:
a = i.split(" ", 1)
k.append(a[0])
v.append(a[1])
d = dict(zip(k, v))
# presumably a for/else in the original: raise only if no key matched
for key, value in d.items():
if x == key:
return d[key]
else:
raise ValueError('invalid ICD-code')
# Return the single character at index r2 of the description for code r1,
# where `numb` is a "CODE.index" string split on its *last* dot.
def character(self, numb):
line_list = []
split_point = []
k = []
v = []
with open(self.code) as f:
for line in f:
for i in line:
if i == " ":
split_point.append(line.find(i))
with open(self.code) as f:
for line in f:
line_list.append(line.rstrip())
for i in line_list:
a = i.split(" ", 1)
k.append(a[0])
v.append(a[1])
d = dict(zip(k, v))
# reverse + split + reverse isolates the text after the final '.'
rev = numb[::-1]
revs = rev.split('.',1)
r1 =(revs[1][::-1])
r2 = (revs[0][::-1])
for key, value in d.items():
if r1 == key:
answer = d[key]
result = answer[int(r2)]
return result
else:
raise ValueError('invalid Hippocrates-code')
# Return the set of "CODE.index" strings for every occurrence of `char`
# across all descriptions.
def codes(self, char):
line_list = []
split_point = []
k = []
v = []
r_v = []
code_result = []
des_result = []
des_result2 = []
location = []
final = []
with open(self.code) as f:
for line in f:
for i in line:
if i == " ":
split_point.append(line.find(i))
with open(self.code) as f:
for line in f:
line_list.append(line.rstrip())
for i in line_list:
a = i.split(" ", 1)
k.append(a[0])
v.append(a[1])
d = dict(zip(k, v))
# collect every description containing the character (duplicates possible)
for i in v:
for x in i:
if x == char:
r_v.append(i)
for key, value in d.items():
for i in r_v:
if i == value:
code_result.append(key)
for key in d.keys():
for i in code_result:
if i == key:
des_result.append(d[i])
for i in des_result:
if i not in des_result2:
des_result2.append(i)
# find every index of the character within each matching description
for i in des_result2:
regex = re.escape(char)
a = [m.start() for m in re.finditer(regex,i)]
location.append(a)
location = (sum(location,[]))
for i in range(len(code_result)):
answer = (str(code_result[i]) +'.'+ str(location[i]))
final.append(answer)
return (set(final))
# Encode `plaintxt` as space-separated "CODE.index" tokens; this is the
# method the question says produces codes in the wrong order.
def encode(self, plaintxt):
line_list = []
split_point = []
# keys of the code dictionary
k = []
# values (descriptions) of the code dictionary
v = []
# "description:index" strings for characters found in descriptions
r = []
# list of possible choices per character
t = []
# randomly chosen result from t
li_di = []
# description part of each chosen entry
des = []
# index of the character within its description
index_char = []
# codes to print
resul = []
dictlist = []
answers = []
with open(self.code) as f:
for line in f:
for i in line:
if i == " ":
split_point.append(line.find(i))
with open(self.code) as f:
for line in f:
line_list.append(line.rstrip())
for i in line_list:
a = i.split(" ", 1)
k.append(a[0])
v.append(a[1])
d = dict(zip(k, v))
print(d)
for key, value in d.items():
for i in plaintxt:
if i in value:
answer = d[key] +':'+ str(d[key].index(i))
r.append(answer)
print(r)
# stride through r to regroup candidates per plaintext position
a = len(plaintxt)
b=0
for i in range(len(r)):
t.append(r[b::a])
b+=1
if b == len(plaintxt):
break
for i in t:
li_di.append(random.choice(i))
for i in li_di:
sep = i.split(":", 1)
des.append(sep[0])
index_char.append(sep[1])
print(index_char)
for i in des:
for key, value in d.items():
if i == value:
resul.append(key)
print(resul)
for i in range(len(resul)):
answers.append(resul[i]+'.'+index_char[i]+'')
return(" ".join(answers))
The codes that represent the characters in given_string should be in the same order as the original string, but I messed that up. How can I fix this?
This should work for your encode function:
def encode(self, plaintxt):
    """Encode *plaintxt* as space-separated 'CODE.index' tokens.

    For each character, collect every (code, position) pair whose description
    contains it, then pick one uniformly at random.  Raises ValueError for a
    character that appears in no description.
    """
    descriptions = {}
    with open(self.code) as handle:
        for raw in handle:
            parts = raw.rstrip().split(' ', 1)
            descriptions[parts[0]] = parts[1]
    pieces = []
    for ch in plaintxt:
        candidates = [(code, index)
                      for code, text in descriptions.items()
                      for index, letter in enumerate(text)
                      if letter == ch]
        if not candidates:
            raise ValueError(f'Character {ch} cannot be encoded as there are no matches')
        chosen_code, chosen_index = random.choice(candidates)
        pieces.append(f'{chosen_code}.{chosen_index}')
    return ' '.join(pieces)
Edit: I updated this to make it more space-efficient, by getting rid of char_map and appending codes as it goes
First, it creates a dict of keys as codes and values as the corresponding strings. Then it iterates through the given plaintxt string, and searches all of the values of the dict for matches (including multiple matches in a single value), and adds this to a matches list of tuples, where each tuple contains a suitable code and the index of the match. If there are no matches, it raises a ValueError as soon as it runs into an issue. It chooses randomly from each list of tuples to choose some code and index pair, and appends this to a list on the fly, and then at the end it joins this list to make your encoded string.
If memory is not a problem, I think you should build an index of possible choices of each character from the dictionary. Here is an example code:
import random

def build_char_codes(d):
    """Index every character of every description: {char: {code: [positions, ...]}}."""
    index = {}
    for code, description in d.items():
        for position, ch in enumerate(description):
            per_code = index.setdefault(ch, {})
            per_code.setdefault(code, []).append(position)
    return index
def get_code(ch, char_codes):
    """Return a random 'CODE.position' encoding for character *ch*.

    char_codes is the {char: {code: [positions]}} index from build_char_codes.
    The code is picked uniformly first, then a position within it — so the
    overall sampling is not uniform across all (code, position) pairs.
    """
    # BUG in original: random.sample(dict.keys(), 1) requires a sequence and
    # raises TypeError on Python 3.11+; use random.choice over a list instead.
    key = random.choice(list(char_codes[ch]))
    char_pos = random.choice(char_codes[ch][key])
    code = '{}.{}'.format(key, char_pos)
    return code
# `dictionary` is the large ICD-code -> description mapping quoted in the
# question; build the per-character index once, then encode the string.
char_codes = build_char_codes(dictionary)
given_string = 'You are what you eat.'
codes = [get_code(ch, char_codes) for ch in given_string]
print(' '.join(codes))
Notes:
char_codes index all possible choices of each character in the dictionary
it samples a key from the dictionary first (uniformly at random), and then samples a position within that description (uniformly at random). Note that this is not uniform sampling across all possible (code, position) choices for a character.
In preparation for the transformation, you could create a dictionary with each letter in the ICD description mapping to a list of codes that contain it at various indexes.
Then, the transformation process would simply be a matter of picking one of the code.index from the entry in the dictionary for each letter in the given string:
preparation ...
# Build {letter: ["CODE.index", ...]} from the ICD file in one pass;
# `fileName` must already hold the path to ICD.txt.
with open(fileName,'r') as f:
icd = [line.split(" ",1) for line in f.read().split("\n")]
icdLetters = dict() # for each letter, every "CODE.index" place it occurs
for code,description in icd:
for i,letter in enumerate(description):
icdLetters.setdefault(letter,[]).append(f"{code}.{i}")
transformation....
# Encode by picking one random "CODE.index" entry per character;
# characters absent from every description fall back to "-".
import random
given_string = 'You are what you eat.'
result = [ random.choice(icdLetters.get(c,["-"])) for c in given_string ]
output:
print(result)
['A66.0', 'T80.22.35', 'S53.136.34', 'C40.90.33', 'S53.136.43', 'Z96.621.12', 'B57.30.24', 'H59.121.55', 'V14.1.43', 'S93.121.47', 'H59.121.9', 'V04.92.17', 'T80.22.80', 'O16.1.22', 'T25.61.10', 'S53.136.34', 'F44.6.32', 'M67.232.29', 'M89.771.34', 'S93.121.7', 'Z68.36.29']
If you want to save some memory, your dictionary could store indexes in the main list of icd codes and descriptions instead of the formatted values:
# Memory-leaner variant: store (codeIndex, letterIndex) pairs pointing into
# the parsed `icd` list instead of pre-formatted strings; format lazily below.
with open(fileName,'r') as f:
icd = [line.split(" ",1) for line in f.read().split("\n")]
icdLetters = dict()
for codeIndex,(code,description) in enumerate(icd):
for letterIndex,letter in enumerate(description):
icdLetters.setdefault(letter,[]).append((codeIndex,letterIndex))
import random
# Return "CODE.index" for one random occurrence of `letter`, or "-" if absent.
def letterToCode(letter):
if letter not in icdLetters: return "-"
codeIndex,letterIndex = random.choice(icdLetters[letter])
return f"{icd[codeIndex][0]}.{letterIndex}"
given_string = 'You are what you eat.'
result = [ letterToCode(c) for c in given_string ]

Parse vertical text in a file with repeated block

What is the best way for parsing below file? The blocks repeat multiple times.
The expected result is output to CSV file as:
{Place: REGION-1, Host: ABCD, Area: 44...}
I tried the code below, but it only processes the first block and then finishes.
# NOTE(review): quoted as posted (indentation flattened).  The reported
# symptom — only the first block is handled — follows from the control flow:
# myDict is only printed when a blank line is reached, it is re-created only
# on *_RV lines, and the 'Interface' branch can touch myDict before any
# *_RV line has defined it.
with open('/tmp/t2.txt', 'r') as input_data:
for line in input_data:
if re.findall('(.*_RV)\n',line):
myDict={}
myDict['HOST'] = line[6:]
continue
elif re.findall('Interface(.*)\n',line):
myDict['INTF'] = line[6:]
elif len(line.strip()) == 0:
print(myDict)
Text file is below.
Instance REGION-1:
ABCD_RV
Interface: fastethernet01/01
Last state change: 0h54m44s ago
Sysid: 01441
Speaks: IPv4
Topologies:
ipv4-unicast
SAPA: point-to-point
Area Address(es):
441
IPv4 Address(es):
1.1.1.1
EFGH_RV
Interface: fastethernet01/01
Last state change: 0h54m44s ago
Sysid: 01442
Speaks: IPv4
Topologies:
ipv4-unicast
SAPA: point-to-point
Area Address(es):
442
IPv4 Address(es):
1.1.1.2
Instance REGION-2:
IJKL_RV
Interface: fastethernet01/01
Last state change: 0h54m44s ago
Sysid: 01443
Speaks: IPv4
Topologies:
ipv4-unicast
SAPA: point-to-point
Area Address(es):
443
IPv4 Address(es):
1.1.1.3
Or if you prefer an ugly regex route:
import re

# One pre-compiled pattern per line shape in the input file.
region_re = re.compile("^Instance\s+([^:]+):.*")
host_re = re.compile("^\s+(.*?)_RV.*")
interface_re = re.compile("^\s+Interface:\s+(.*?)\s+")
other_re = re.compile("^\s+([^\s]+).*?:\s+([^\s]*){0,1}")

myDict = {}
extra = None  # key whose value continues on the following line
with open('/tmp/t2.txt', 'r') as input_data:
    for line in input_data:
        if extra:
            # previous line ended with a bare "Key:"; this line is its value
            myDict[extra] = line.strip()
            extra = None
            continue
        match = region_re.match(line)
        if match:
            # new region: flush the record in progress, start a fresh one
            if len(myDict) > 1:
                print(myDict)
            myDict = {'Place': match.group(1)}
            continue
        match = host_re.match(line)
        if match:
            # new host within the same region: flush, keep the Place
            if len(myDict) > 1:
                print(myDict)
            myDict = {'Place': myDict['Place'], 'Host': match.group(1)}
            continue
        match = interface_re.match(line)
        if match:
            myDict['INTF'] = match.group(1)
            continue
        match = other_re.match(line)
        if match:
            key, value = match.groups()
            if value:
                myDict[key] = value
            else:
                extra = key  # value arrives on the next line
# dump out the final record
if len(myDict) > 1:
    print(myDict)
output:
{'Place': 'REGION-1', 'Host': 'ABCD', 'INTF': 'fastethernet01/01', 'Last': '0h54m44s', 'Sysid': '01441', 'Speaks': 'IPv4', 'Topologies': 'ipv4-unicast', 'SAPA': 'point-to-point', 'Area': '441', 'IPv4': '1.1.1.1'}
{'Place': 'REGION-1', 'Host': 'EFGH', 'INTF': 'fastethernet01/01', 'Last': '0h54m44s', 'Sysid': '01442', 'Speaks': 'IPv4', 'Topologies': 'ipv4-unicast', 'SAPA': 'point-to-point', 'Area': '442', 'IPv4': '1.1.1.2'}
{'Place': 'REGION-2', 'Host': 'IJKL', 'INTF': 'fastethernet01/01', 'Last': '0h54m44s', 'Sysid': '01443', 'Speaks': 'IPv4', 'Topologies': 'ipv4-unicast', 'SAPA': 'point-to-point', 'Area': '443', 'IPv4': '1.1.1.3'}
This doesn't use much regex and could be more optimized. Hope it helps!
import re
import pandas as pd
from collections import defaultdict

# Matches the "Instance REGION-..." block headers, case-insensitively.
_level_1 = re.compile(r'instance region.*', re.IGNORECASE)

with open('stack_formatting.txt') as f:
    data = f.readlines()

# Reshape the raw lines into {region header: {host line: "key: value;..."}}
# so each host's attributes can be split apart later.
data_blocks = defaultdict(lambda: defaultdict(str))
header = None
instance = None
for raw_line in data:
    raw_line = raw_line.strip()
    if _level_1.match(raw_line):
        header = raw_line
        continue
    if "_RV" in raw_line:
        instance = raw_line
    elif raw_line.endswith(":"):
        # a bare "Key:" whose value continues on the next line — no separator
        data_blocks[header][instance] += raw_line
    else:
        data_blocks[header][instance] += raw_line + ";"

def parse_text(data_blocks):
    """
    Generate a dict which could be converted easily to a pandas dataframe
    :param data_blocks: splittable data
    :return: dict with row values for every column
    """
    final_data = defaultdict(list)
    for region_key, hosts in data_blocks.items():
        for host_key in hosts:
            final_data['instance'].append(region_key)
            final_data['sub_instance'].append(host_key)
            for item in data_blocks[region_key][host_key].split(";"):
                print(item)
                if item.isspace() or len(item) == 0:
                    continue
                name, value = re.split(r':\s*', item)
                final_data[name].append(value)
    return final_data

print(pd.DataFrame(parse_text(data_blocks)))
This worked for me but it's not pretty:
# NOTE(review): `input_data` must already be the file's *text*; in the
# question it was an open file object, which has no .rstrip() — confirm.
text=input_data
text=text.rstrip(' ').rstrip('\n').strip('\n')
# Prepare a CSV-ish layout by replacing every known header with a comma.
text=text.replace('Instance REGION-1:',',')
text=text.replace('Instance REGION-2:',',')
text=text.replace('Interface:',',')
text=text.replace('Last state change:',',')
text=text.replace('Sysid:',',')
text=text.replace('Speaks:',',')
text=text.replace('Topologies:',',')
text=text.replace('SAPA:',',')
text=text.replace('Area Address(es):',',')
text=text.replace('IPv4 Address(es):',',')
# Strip leading whitespace, because it interferes with the split on '\n\n'.
lines=[x.lstrip(' ') for x in text.split('\n')]
clean_text=''
# Rebuild the text now that the leading whitespace is gone.
for line in lines:
clean_text+=line+'\n'
# Split the data into groups, one blank-line-separated entry each.
entries=clean_text.split('\n\n')
# Collapse each entry onto a single line so it can be split like CSV.
entry_lines=[x.replace('\n',' ') for x in entries]
# Transposed DataFrame used as a column-per-entry accumulator.
df=pd.DataFrame(columns=['Instance REGION','Interface',
'Last state change','Sysid','Speaks',
'Topologies','SAPA','Area Address(es)',
'IPv4 Address(es)']).T
# Fill one DataFrame column per entry.
count=0
for line in entry_lines:
data=line[1:].split(',') #split like a CSV on commas
data=[x.lstrip(' ').rstrip(' ') for x in data] #trim leading/trailing whitespace
df[count]=data #one column per entry
count+=1 #increment the count
df=df.T #transpose back so rows are entries again
Output looks like this for me
Edit: Also, since you have various answers here, I tested the performance of mine. Its runtime grows mildly exponentially, as described by the equation y = 100.97e^(0.0003x)
Here are my timeit results.
Entries Milliseconds
18 49
270 106
1620 394
178420 28400

Doing operations on a large data set

I have to perform some analysis on a PSL record which contains information on DNA sequence fragments. Basically I have to find entries that are from the same read in the same contig (these are both values in the PSL entry). The problem is the PSL records are large (10-30 Mb text documents). I wrote a program that works on short records and on the long records given enough time but it took way longer than specified. I was told the program shouldn't take more than ~15 seconds. Mine took over 15 minutes.
PSL records look like this:
275 11 0 0 0 0 0 0 - M02034:35:000000000-A7UU0:1:1101:19443:1992/2 286 0 286 NODE_406138_length_13407_cov_13.425076 13465 408 694 1 286, 0, 408,
171 5 0 0 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:13497:2001/2 294 0 176 NODE_500869_length_34598_cov_30.643419 34656 34334 34510 1 176, 0, 34334,
188 14 0 10 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:18225:2002/1 257 45 257 NODE_455027_length_12018_cov_13.759444 12076 11322 11534 1 212, 45, 11322,
My code looks like this:
import sys
class PSLreader:
    '''
    Class to provide reading of a file containing psl alignments
    formatted sequences:
    object instantiation:
    myPSLreader = PSLreader(<file name>):
    object attributes:
    fname: the initial file name
    methods:
    readPSL() : reads psl file, yielding those alignments that are within the first or last
    1000 nt
    readPSLpairs() : prints per-contig counts of read1 alignments that have a
    matching read2 alignment on the same contig
    Author: David Bernick
    Date: May 12, 2013
    '''

    def __init__(self, fname=''):
        '''constructor: saves attribute fname '''
        self.fname = fname

    def doOpen(self):
        '''Return an open handle: stdin when no filename was given.'''
        # BUG FIX: original used `self.fname is ''` — identity comparison with a
        # literal is undefined behaviour; use equality.
        if self.fname == '':
            return sys.stdin
        return open(self.fname)

    def readPSL(self):
        '''
        using filename given in init, returns each filtered psl records
        that contain alignments that are within the terminal 1000nt of
        the target. Incomplete psl records are discarded.
        If filename was not provided, stdin is used.

        This method selects for alignments that may be part of a circle:
        Illumina pairs aligned to the top strand would have read1(+) and read2(-).
        For the bottom strand, read1(-) and read2(+). So for potential
        circularity keep any read(+) near the 3' end, or any read(-) near the 5' end.
        '''
        nearEnd = 1000  # this constant determines "near the end"
        with self.doOpen() as fileH:
            for line in fileH:
                pslList = line.split()
                if len(pslList) < 17:
                    continue  # incomplete record — discard
                tSize = int(pslList[14])
                tStart = int(pslList[15])
                strand = pslList[8]
                # (+) reads must start within nearEnd of the 3' terminus
                if strand.startswith('+') and (tSize - tStart > nearEnd):
                    continue
                # (-) reads must start within nearEnd of the 5' terminus
                if strand.startswith('-') and (tStart > nearEnd):
                    continue
                yield line

    def readPSLpairs(self):
        '''
        Print {contig: count}, where count is the number of read1 alignments
        that have at least one read2 alignment with the same read name on the
        same contig.

        PERFORMANCE FIX: the original compared every read1 against every read2
        (O(n^2) — minutes on 10-30 MB files). The read2 side is reduced to a
        set of (name, contig) keys, making each read1 lookup O(1).
        '''
        read1 = []
        read2_keys = set()
        for psl in self.readPSL():
            parsed = psl.split()
            mate = parsed[9][-1]            # trailing '1'/'2' of the query name
            if mate == '1':
                read1.append(parsed)
            elif mate == '2':
                read2_keys.add((parsed[9][:-1], parsed[13]))
        output = {}
        for psl1 in read1:
            key = (psl1[9][:-1], psl1[13])
            if key in read2_keys:
                # each read1 contributes at most once (original broke out of
                # the inner loop after the first match)
                output[psl1[13]] = output.get(psl1[13], 0) + 1
        print(output)
# Drive the reader over the local PSL file; prints the per-contig pair counts.
PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
PSL_obj.readPSLpairs()
I was given some example code that looks like this:
def doSomethingPairwise(a):
    """Print the group *a* once for every read-1/read-2 item pair whose
    second field (instrument) matches.

    :param a: list of three lists — a[1] holds the '1' items, a[2] the '2' items
    """
    for leftItem in a[1]:
        for rightItem in a[2]:
            # BUG FIX: `is` compares identity and only "works" for interned
            # strings; equality of values needs `==`.
            if leftItem[1] == rightItem[1]:
                print(a)
thisStream = [['David', 'guitar', 1], ['David', 'guitar', 2],
['John', 'violin', 1], ['John', 'oboe', 2],
['Patrick', 'theremin', 1], ['Patrick', 'lute',2] ]
thisGroup = None
thisGroupList = [ [], [], [] ]
for name, instrument, num in thisStream:
if name != thisGroup:
doSomethingPairwise(thisGroupList)
thisGroup = name
thisGroupList = [ [], [], [] ]
thisGroupList[num].append([name, instrument, num])
doSomethingPairwise(thisGroupList)
But when I tried to implement it my program still took a long time. Am I thinking about this the wrong way? I realize the nested loop is slow but I don't see an alternative.
Edit: I figured it out, the data was presorted which made my brute force solution very impractical and unnecessary.
I hope this helps you; note that the question would benefit from a better example input file.
#is better create PSLRecord class
class PSLRecord:
    """One PSL alignment line, exposing the standard PSL columns as
    string attributes (matches, misMatches, ..., tStarts).

    Only as many attributes are set as the line has whitespace-separated
    fields; a short line simply leaves the trailing attributes undefined.
    """

    def __init__(self, line):
        column_names = ("matches", "misMatches", "repMatches", "nCount",
                        "qNumInsert", "qBaseInsert", "tNumInsert",
                        "tBaseInsert", "strand", "qName", "qSize", "qStart",
                        "qEnd", "tName", "tSize", "tStart", "tEnd",
                        "blockCount", "blockSizes", "qStarts", "tStarts")
        # zip stops at the shorter sequence, matching the original
        # dict(zip(...)) / __dict__.update behaviour exactly.
        for name, value in zip(column_names, line.split()):
            setattr(self, name, value)
class PSLreader:
    """Stream PSLRecord objects from a file (or stdin) and group them
    by (query name without mate digit, target name)."""

    def __init__(self, fname=''):
        # fname == '' means "read from stdin"
        self.fname = fname

    def doOpen(self):
        """Return an open handle: stdin when no filename was given."""
        # BUG FIX: original used `self.fname is ''` — identity comparison with
        # a literal is unreliable; use equality.
        if self.fname == '':
            return sys.stdin
        return open(self.fname)

    def readPSL(self):
        """Yield one PSLRecord per input line."""
        with self.doOpen() as fileH:
            for line in fileH:
                yield PSLRecord(line)

    def readPSLpairs(self):
        """Return {(qName-without-mate-digit, tName): [PSLRecord, ...]}.

        OP requirement: the trailing '1'/'2' mate marker is stripped from
        qName so both mates of a pair share the same key.
        """
        dictpsl = {}
        for pslrc in self.readPSL():
            key = (pslrc.qName[:-1], pslrc.tName)
            # setdefault replaces the original `if not key in` two-step insert
            dictpsl.setdefault(key, []).append(pslrc)
        return dictpsl
#Function filter .... is better out and self-contained
def f_filter(pslrec, nearEnd=1000):
    """Keep only alignments near the relevant terminus of the target.

    (+) strand alignments must start within *nearEnd* nt of the 3' end;
    (-) strand alignments within *nearEnd* nt of the 5' end. Records on any
    other strand value pass through unfiltered, as in the original.
    """
    if pslrec.strand.startswith('+'):
        return int(pslrec.tSize) - int(pslrec.tStart) <= nearEnd
    if pslrec.strand.startswith('-'):
        return int(pslrec.tStart) <= nearEnd
    return True
# Count, per contig, the filtered read-1/read-2 alignment pairs.
PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
# read dictionary of pairs
dictpsl = PSL_obj.readPSLpairs()

from itertools import product
# product from itertools: (1) x (2,3) = (1,2),(1,3)

output = {}
for key, v in dictpsl.items():
    name, contig = key
    # filtered alignments on the principal strand (read 1)
    strand_princ = [pslrec for pslrec in v if f_filter(pslrec) and
                    pslrec.qName[-1] == '1']
    # filtered alignments on the secondary strand (read 2)
    strand_sec = [pslrec for pslrec in v if f_filter(pslrec) and
                  pslrec.qName[-1] == '2']
    # Grouping above means far fewer comparisons here.
    for pslrec_princ, pslrec_sec in product(strand_princ, strand_sec):
        # BUG FIX: the original set output[contig] = 1 and then ALSO
        # incremented, over-counting every contig's first pair by one.
        output[contig] = output.get(contig, 0) + 1
Note: 10-30 MB isn't a large file, if you ask me.

Categories

Resources