How to create a nested dictionary from a text file - python

So, my file looks like this :
Intestinal infectious diseases (001-003)
001 Cholera
002 Typhoid and paratyphoid fevers
003 Other salmonella infections
Tuberculosis (004-006)
004 Primary tuberculous infection
005 Pulmonary tuberculosis
006 Other respiratory tuberculosis
.
.
.
I'm supposed to make a nested dictionary with the disease group as keys and the dictionary containing the disease code and name, as value for the first dictionary. I'm having some trouble separating the disease codes into their own disease groups. Here's what I've done so far:
import json
icd9_encyclopedia={}
lines = []
f = open("icd9_info.txt", 'r')
for line in f:
line = line.rstrip("\n")
if line[0].isnumeric() == True:
icd9_encyclopedia[line] = ???
f.close()

solution
import itertools
from pathlib import Path
# load text lines
lines = Path('data.txt').read_text().split('\n')
# build output dictionary
icd9_encyclopedia = {
# build single group dictionary
group_name: {
int(code): disease_name
# split each disease line into code and text name
for disease_string in disease_strings
for (code, _, disease_name) in [disease_string.partition(' ')]
}
# get groups separated by an empty line
# isolate first item in each group as its name
for x, (group_name, *disease_strings) in itertools.groupby(lines, bool) if x
}
result
{'Intestinal infectious diseases (001-003)': {1: 'Cholera',
2: 'Typhoid and paratyphoid '
'fevers',
3: 'Other salmonella infections'},
'Tuberculosis (004-006)': {4: 'Primary tuberculous infection',
5: 'Pulmonary tuberculosis',
6: 'Other respiratory tuberculosis'}}

Here's another take on the problem that uses just basic Python:
from pprint import pprint
icd9_encyclopedia={}
key = None
item = {}
with open("icd9_info.txt") as f:
for line in f:
line = line.strip()
if not line[0].isdigit():
# Start a new item
if key:
# Store the prior item in the main dictionary
icd9_encyclopedia[key] = item
# Initialize the new item
key = line
item = {}
else:
# A detail entry - add it to the current item
num, rest = line.split(' ', 1)
item[num] = rest
# Store the final item to the dictionary
if key:
icd9_encyclopedia[key] = item
pprint(icd9_encyclopedia)
Result:
{'Intestinal infectious diseases (001-003)': {'001': 'Cholera',
'002': 'Typhoid and paratyphoid '
'fevers',
'003': 'Other salmonella '
'infections'},
'Tuberculosis (004-006)': {'004': 'Primary tuberculous infection',
'005': 'Pulmonary tuberculosis',
'006': 'Other respiratory tuberculosis'}}

I used defaultdict to easily make a nested dictionary, as follows:
from collections import defaultdict
icd9_encyclopedia = defaultdict(dict)
disease_group = ""
with open("icd9_info.txt", 'r') as f:
for line in [i[:-1] for i in f.readlines()]: # [:-1] to remove '\n' for each line
if line == "": # skip if blank line
continue
if not line[0].isdigit():
disease_group = line # temporarily save current disease group name for the following lines
else:
code, name = line.split(maxsplit=1)
icd9_encyclopedia[disease_group][code] = name
for key, value in icd9_encyclopedia.items():
print(key, value)
#Intestinal infectious diseases (001-003) {'001': 'Cholera', '002': 'Typhoid and paratyphoid fevers', '003': 'Other salmonella infections'}
#Tuberculosis (004-006) {'004': 'Primary tuberculous infection', '005': 'Pulmonary tuberculosis', '006': 'Other respiratory tuberculosis'}
You can see more detail about defaultdict here: https://www.geeksforgeeks.org/defaultdict-in-python/

validInt checks weather the data is a valid integer
def validInt(data):
try:
int(data)
except Exception as e:
return False
pass
return True
encyclo = {}
with open("file.data",'r') as f:
lines = f.readlines()
for line in lines:
if len(line.strip()) == 0:#line should not be empty
continue
first = line.split(' ')[0]
if validInt(first):
di = encyclo[list(encyclo.keys())[-1]] # returns a dictionary
di[first] = line[len(first):] # inserting data to dictionary len(first) is used to skip the numeric part
else:
encyclo[line] = {}
for key, value in encyclo.items():#displaying data
print(key, value)
$ python3 test.py
Intestinal infectious diseases (001-003)
{'001': ' Cholera\n', '002': ' Typhoid and paratyphoid fevers\n', '003': ' Other salmonella infections\n'}
Tuberculosis (004-006)
{'004': ' Primary tuberculous infection\n', '005': ' Pulmonary tuberculosis\n', '006': ' Other respiratory tuberculosis\n'}

Related

Make sentence from value of dictionary

link for original txt file
https://medusa.ugent.be/en/exercises/187053144/description/wM6YaQUbWdHKPhQX/media/ICD.txt
This is what I got:
given_string = 'You are what you eat.'
dictionary ={'D89.1': 'Cryoglobulinemia', 'M87.332': 'Other secondary osteonecrosis of left radius', 'M25.57': 'Pain in ankle and joints of foot', 'H59.111': 'Intraoperative hemorrhage and hematoma of right eye and adnexa complicating an ophthalmic procedure', 'I82.5Z9': 'Chronic embolism and thrombosis of unspecified deep veins of unspecified distal lower extremity', 'T38.3X': 'Poisoning by, adverse effect of and underdosing of insulin and oral hypoglycemic [antidiabetic] drugs', 'H95.52': 'Postprocedural hematoma of ear and mastoid process following other procedure', 'Q90.1': 'Trisomy 21, mosaicism (mitotic nondisjunction)', 'X83.8': 'Intentional self-harm by other specified means', 'H02.145': 'Spastic ectropion of left lower eyelid', 'M67.341': 'Transient synovitis, right hand', 'P07.32': 'Preterm newborn, gestational age 29 completed weeks', 'R44.8': 'Other symptoms and signs involving general sensations and perceptions', 'R03.1': 'Nonspecific low blood-pressure reading', 'Q03': 'Congenital hydrocephalus', 'C11.0': 'Malignant neoplasm of superior wall of nasopharynx', 'C44.4': 'Other and unspecified malignant neoplasm of skin of scalp and neck', 'N48.5': 'Ulcer of penis', 'T50.2X1': 'Poisoning by carbonic-anhydrase inhibitors, benzothiadiazides and other diuretics, accidental (unintentional)', 'V92.13': 'Drowning and submersion due to being thrown overboard by motion of other powered watercraft', 'D30.0': 'Benign neoplasm of kidney', 'M08.06': 'Unspecified juvenile rheumatoid arthritis, knee', 'T41.5X4': 'Poisoning by therapeutic gases, undetermined', 'T59.3X2': 'Toxic effect of lacrimogenic gas, intentional self-harm', 'S84.91': 'Injury of unspecified nerve at lower leg level, right leg', 'Z80.4': 'Family history of malignant neoplasm of genital organs', 'M05.34': 'Rheumatoid heart disease with rheumatoid arthritis of hand', 'Y36.531': 'War operations involving thermal radiation effect of nuclear weapon, civilian', 'H59.88': 'Other intraoperative complications of eye and adnexa, not elsewhere classified', 'R29.91': 'Unspecified symptoms and signs involving the musculoskeletal system', 'M71.139': 'Other infective bursitis, unspecified wrist', 'S00.441': 'External constriction of right ear', 'V04': 'Pedestrian injured in collision with heavy transport vehicle or bus', 'C92.1': 'Chronic myeloid leukemia, BCR/ABL-positive', 'I82.60': 'Acute embolism and thrombosis of unspecified veins of upper extremity', 'I75.89': 'Atheroembolism of other site', 'S51.031': 'Puncture wound without foreign body of right elbow', 'Z01.110': 'Encounter for hearing examination following failed hearing screening', 'I06.8': 'Other rheumatic aortic valve diseases', 'Z68.25': 'Body mass index (BMI) 25.0-25.9, adult', 'A66': 'Yaws', 'S78.921': 'Partial traumatic amputation of right hip and thigh, level unspecified', 'F44': 'Dissociative and conversion disorders', 'O87.8': 'Other venous complications in the puerperium', 'K04.3': 'Abnormal hard tissue formation in pulp', 'V38.7': 'Person on outside of three-wheeled motor vehicle injured in noncollision transport accident in traffic accident', 'V36.1': 'Passenger in three-wheeled motor vehicle injured in collision with other nonmotor vehicle in nontraffic accident', 'B94.9': 'Sequelae of unspecified infectious and parasitic disease', 'K50.911': "Crohn's disease, unspecified, with rectal bleeding", 'S00.52': 'Blister (nonthermal) of lip and oral cavity', 'T43.1': 'Poisoning by, adverse effect of and underdosing of monoamine-oxidase-inhibitor antidepressants', 'B99.8': 'Other infectious disease', 'S97.12': 'Crushing injury of lesser toe(s)', 'S02.69': 'Fracture of mandible of other specified site', 'V29.10': 'Motorcycle passenger injured in collision with unspecified motor vehicles in nontraffic accident', 'Z68.35': 'Body mass index (BMI) 35.0-35.9, adult', 'A81.2': 'Progressive multifocal leukoencephalopathy', 'V44.4': 'Person boarding or alighting a car injured in collision with heavy transport vehicle or bus', 'M62.51': 'Muscle wasting and atrophy, not elsewhere classified, shoulder', 'M62.151': 'Other rupture of muscle (nontraumatic), right thigh', 'V52.2': 'Person on outside of pick-up truck or van injured in collision with two- or three-wheeled motor vehicle in nontraffic accident', 'E09.622': 'Drug or chemical induced diabetes mellitus with other skin ulcer', 'S43.492': 'Other sprain of left shoulder joint', 'M08.212': 'Juvenile rheumatoid arthritis with systemic onset, left shoulder', 'R00.0': 'Tachycardia, unspecified', 'G21.8': 'Other secondary parkinsonism', 'W58.01': 'Bitten by alligator', 'D46.1': 'Refractory anemia with ring sideroblasts', 'H61.32': 'Acquired stenosis of external ear canal secondary to inflammation and infection', 'H95.0': 'Recurrent cholesteatoma of postmastoidectomy cavity', 'Z72.4': 'Inappropriate diet and eating habits', 'Z68.41': 'Body mass index (BMI) 40.0-44.9, adult', 'S20.172': 'Other superficial bite of breast, left breast', 'I63.232': 'Cerebral infarction due to unspecified occlusion or stenosis of left carotid arteries', 'M14.811': 'Arthropathies in other specified diseases classified elsewhere, right shoulder', 'E13.41': 'Other specified diabetes mellitus with diabetic mononeuropathy', 'H02.53': 'Eyelid retraction', 'V95.49': 'Other spacecraft accident injuring occupant', 'D74.0': 'Congenital methemoglobinemia', 'D60.1': 'Transient acquired pure red cell aplasia', 'T52.1X2': 'Toxic effect of benzene, intentional self-harm', 'O71.2': 'Postpartum inversion of uterus', 'M08.439': 'Pauciarticular juvenile rheumatoid arthritis, unspecified wrist', 'M01.X72': 'Direct infection of left ankle and foot in infectious and parasitic diseases classified elsewhere', 'H95.3': 'Accidental puncture and laceration of ear and mastoid process during a procedure', 'C74.92': 'Malignant neoplasm of unspecified part of left adrenal gland', 'G00': 'Bacterial meningitis, not elsewhere classified', 'M19.011': 'Primary osteoarthritis, right shoulder', 'G72.49': 'Other inflammatory and immune myopathies, not elsewhere classified', 'Z68.34': 'Body mass index (BMI) 34.0-34.9, adult', 'V86.64': 'Passenger of military vehicle injured in nontraffic accident', 'L20.9': 'Atopic dermatitis, unspecified', 'S65.51': 'Laceration of blood vessel of other and unspecified finger', 'B67.1': 'Echinococcus granulosus infection of lung', 'S08.81': 'Traumatic amputation of nose', 'Z36.5': 'Encounter for antenatal screening for isoimmunization', 'S59.22': 'Salter-Harris Type II physeal fracture of lower end of radius', 'M66.359': 'Spontaneous rupture of flexor tendons, unspecified thigh', 'I69.919': 'Unspecified symptoms and signs involving cognitive functions following unspecified cerebrovascular disease', 'I25.700': 'Atherosclerosis of coronary artery bypass graft(s), unspecified, with unstable angina pectoris', 'V24.0': 'Motorcycle driver injured in collision with heavy transport vehicle or bus in nontraffic accident', 'S53.025': 'Posterior dislocation of left radial head', 'Q72.819': 'Congenital shortening of unspecified lower limb', 'G44.82': 'Headache associated with sexual activity', 'M93.2': 'Osteochondritis dissecans', 'V44.6': 'Car passenger injured in collision with heavy transport vehicle or bus in traffic accident', 'O90.89': 'Other complications of the puerperium, not elsewhere classified', 'T83.518': 'Infection and inflammatory reaction due to other urinary catheter', 'Z02.9': 'Encounter for administrative examinations, unspecified', 'S55.091': 'Other specified injury of ulnar artery at forearm level, right arm'}
Each character of the string must be replaced by randomly choosing among all possible Hippocrates-codes that encode the character, and return result contain code where character is in, and index of character in value
so. this is the answer that I supposed to get
A66.0 M62.51.29 V44.6.68 H95.3.70 M08.06.26 S51.031.39 V92.13.17 V95.49.25 P07.32.46 C11.0.44 V04.45 E13.41.30 G21.8.5 R00.0.4 V52.2.54 B67.1.38 V24.0.43 M01.X72.10 C74.92.35 G72.49.35 Z68.41.24
and, this is the answer that i got.
F44.6.4 S78.922.3 W36.1.17 S93.121.2 E10.32.39 A00.1.12 S90.464.3 T37.1X.9 T43.2.17 W24.0.3 Q60.3.5 V59.9.14 S66.911.5 W93.42 V14.1.34 Y92.139.14 T21.06.12 T65.89.6 Q95.3.4 S85.161.16 S93.121.7 T37.1X.18 V49.60.23 T37.1X5.7 F98.29.16 J10.89.14
for get that I wrote code like this
import re
import random
class Hippocrates:
def __init__(self, code):
self.code = code
def description(self, x):
line_list = []
split_point = []
k = []
v = []
with open(self.code) as f:
for line in f:
for i in line:
if i == " ":
split_point.append(line.find(i))
with open(self.code) as f:
for line in f:
line_list.append(line.rstrip())
for i in line_list:
a = i.split(" ", 1)
k.append(a[0])
v.append(a[1])
d = dict(zip(k, v))
for key, value in d.items():
if x == key:
return d[key]
else:
raise ValueError('invalid ICD-code')
def character(self, numb):
line_list = []
split_point = []
k = []
v = []
with open(self.code) as f:
for line in f:
for i in line:
if i == " ":
split_point.append(line.find(i))
with open(self.code) as f:
for line in f:
line_list.append(line.rstrip())
for i in line_list:
a = i.split(" ", 1)
k.append(a[0])
v.append(a[1])
d = dict(zip(k, v))
rev = numb[::-1]
revs = rev.split('.',1)
r1 =(revs[1][::-1])
r2 = (revs[0][::-1])
for key, value in d.items():
if r1 == key:
answer = d[key]
result = answer[int(r2)]
return result
else:
raise ValueError('invalid Hippocrates-code')
def codes(self, char):
line_list = []
split_point = []
k = []
v = []
r_v = []
code_result = []
des_result = []
des_result2 = []
location = []
final = []
with open(self.code) as f:
for line in f:
for i in line:
if i == " ":
split_point.append(line.find(i))
with open(self.code) as f:
for line in f:
line_list.append(line.rstrip())
for i in line_list:
a = i.split(" ", 1)
k.append(a[0])
v.append(a[1])
d = dict(zip(k, v))
for i in v:
for x in i:
if x == char:
r_v.append(i)
for key, value in d.items():
for i in r_v:
if i == value:
code_result.append(key)
for key in d.keys():
for i in code_result:
if i == key:
des_result.append(d[i])
for i in des_result:
if i not in des_result2:
des_result2.append(i)
for i in des_result2:
regex = re.escape(char)
a = [m.start() for m in re.finditer(regex,i)]
location.append(a)
location = (sum(location,[]))
for i in range(len(code_result)):
answer = (str(code_result[i]) +'.'+ str(location[i]))
final.append(answer)
return (set(final))
def encode(self, plaintxt):
line_list = []
split_point = []
#key of dictionary
k = []
#value of dictionary
v = []
#description that contain character with index
r = []
#list of possible choice
t = []
#randomly choosen result from t
li_di = []
#descriptoin
des = []
#index of char in description
index_char = []
#answer to print
resul = []
dictlist = []
answers = []
with open(self.code) as f:
for line in f:
for i in line:
if i == " ":
split_point.append(line.find(i))
with open(self.code) as f:
for line in f:
line_list.append(line.rstrip())
for i in line_list:
a = i.split(" ", 1)
k.append(a[0])
v.append(a[1])
d = dict(zip(k, v))
print(d)
for key, value in d.items():
for i in plaintxt:
if i in value:
answer = d[key] +':'+ str(d[key].index(i))
r.append(answer)
print(r)
a = len(plaintxt)
b=0
for i in range(len(r)):
t.append(r[b::a])
b+=1
if b == len(plaintxt):
break
for i in t:
li_di.append(random.choice(i))
for i in li_di:
sep = i.split(":", 1)
des.append(sep[0])
index_char.append(sep[1])
print(index_char)
for i in des:
for key, value in d.items():
if i == value:
resul.append(key)
print(resul)
for i in range(len(resul)):
answers.append(resul[i]+'.'+index_char[i]+'')
return(" ".join(answers))
the codes that represent character in given_string should be in same order with, original given string, but i messed it up. how can i fix this?
This should work for your encode function:
def encode(self, plaintxt):
code_map = {}
codes = []
with open(self.code) as f:
for line in f:
line = line.rstrip().split(' ', 1)
code_map[line[0]] = line[1]
for ch in plaintxt:
matches = []
for key, value in code_map.items():
pos = -1
while True:
pos = value.find(ch, pos + 1)
if pos != -1:
matches.append((key, pos))
else:
break
if not matches:
raise ValueError(f'Character {ch} cannot be encoded as there are no matches')
code_tuple = random.choice(matches)
code, idx = code_tuple
codes.append(f'{code}.{idx}')
return ' '.join(codes)
Edit: I updated this to make it more space-efficient, by getting rid of char_map and appending codes as it goes
First, it creates a dict of keys as codes and values as the corresponding strings. Then it iterates through the given plaintxt string, and searches all of the values of the dict for matches (including multiple matches in a single value), and adds this to a matches list of tuples, where each tuple contains a suitable code and the index of the match. If there are no matches, it raises a ValueError as soon as it runs into an issue. It chooses randomly from each list of tuples to choose some code and index pair, and appends this to a list on the fly, and then at the end it joins this list to make your encoded string.
If memory is not a problem, I think you should build an index of possible choices of each character from the dictionary. Here is an example code:
import random
def build_char_codes(d):
result = {}
for key, val in d.items():
for i in range(len(val)):
ch = val[i]
if ch not in result:
result[ch] = {key: [i]}
else:
result[ch][key] = result[ch].get(key, []) + [i]
return result
def get_code(ch, char_codes):
key = random.sample(char_codes[ch].keys(), 1)[0]
char_pos = random.choice(char_codes[ch][key])
code = '{}.{}'.format(key, char_pos)
return code
char_codes = build_char_codes(dictionary)
given_string = 'You are what you eat.'
codes = [get_code(ch, char_codes) for ch in given_string]
print(' '.join(codes))
Notes:
char_codes index all possible choices of each character in the dictionary
it sample all the key in dictionary first (uniformly random), and then it sample the position in the string (uniformly random). But it is not sampling uniformly among all the possible choices of a character.
In preparation for the transformation, you could create a dictionary with each letter in the ICD description mapping to a list of codes that contain it at various indexes.
Then, the transformation process would simply be a matter of picking one of the code.index from the entry in the dictionary for each letter in the given string:
preparation ...
with open(fileName,'r') as f:
icd = [line.split(" ",1) for line in f.read().split("\n")]
icdLetters = dict() # list of ICD codes with index for each possible letter
for code,description in icd:
for i,letter in enumerate(description):
icdLetters.setdefault(letter,[]).append(f"{code}.{i}")
transformation....
import random
given_string = 'You are what you eat.'
result = [ random.choice(icdLetters.get(c,["-"])) for c in given_string ]
output:
print(result)
['A66.0', 'T80.22.35', 'S53.136.34', 'C40.90.33', 'S53.136.43', 'Z96.621.12', 'B57.30.24', 'H59.121.55', 'V14.1.43', 'S93.121.47', 'H59.121.9', 'V04.92.17', 'T80.22.80', 'O16.1.22', 'T25.61.10', 'S53.136.34', 'F44.6.32', 'M67.232.29', 'M89.771.34', 'S93.121.7', 'Z68.36.29']
If you want to save some memory, your dictionary could store indexes in the main list of icd codes and descriptions instead of the formatted values:
with open(fileName,'r') as f:
icd = [line.split(" ",1) for line in f.read().split("\n")]
icdLetters = dict()
for codeIndex,(code,description) in enumerate(icd):
for letterIndex,letter in enumerate(description):
icdLetters.setdefault(letter,[]).append((codeIndex,letterIndex))
import random
def letterToCode(letter):
if letter not in icdLetters: return "-"
codeIndex,letterIndex = random.choice(icdLetters[letter])
return f"{icd[codeIndex][0]}.{letterIndex}"
given_string = 'You are what you eat.'
result = [ letterToCode(c) for c in given_string ]

add values to a list from specific part of a text file

I am having this text
/** Goodmorning
Alex
Dog
House
Red
*/
/** Goodnight
Maria
Cat
Office
Green
*/
I would like to have Alex , Dog , House and red in one list and Maria,Cat,office,green in an other list.
I am having this code
with open(filename) as f :
for i in f:
if i.startswith("/** Goodmorning"):
#add files to list
elif i.startswith("/** Goodnight"):
#add files to other list
So, is there any way to write the script so it can understands that Alex belongs in the part of the text that has Goodmorning?
I'd recommend you to use dict, where "section name" will be a key:
with open(filename) as f:
result = {}
current_list = None
for line in f:
if line.startswith("/**"):
current_list = []
result[line[3:].strip()] = current_list
elif line != "*/":
current_list.append(line.strip())
Result:
{'Goodmorning': ['Alex', 'Dog', 'House', 'Red'], 'Goodnight': ['Maria', 'Cat', 'Office', 'Green']}
To search which key one of values belongs you can use next code:
search_value = "Alex"
for key, values in result.items():
if search_value in values:
print(search_value, "belongs to", key)
break
I would recommend to use Regular expressions. In python there is a module for this called re
import re
s = """/** Goodmorning
Alex
Dog
House
Red
*/
/** Goodnight
Maria
Cat
Office
Green
*/"""
pattern = r'/\*\*([\w \n]+)\*/'
word_groups = re.findall(pattern, s, re.MULTILINE)
d = {}
for word_group in word_groups:
words = word_group.strip().split('\n\n')
d[words[0]] = words[1:]
print(d)
Output:
{'Goodmorning': ['Alex', 'Dog', 'House', 'Red'], 'Goodnight':
['Maria', 'Cat', 'Office', 'Green']}
expanding on Olvin Roght (sorry can't comment - not enough reputation) I would keep a second dictionary for the reverse lookup
with open(filename) as f:
key_to_list = {}
name_to_key = {}
current_list = None
current_key = None
for line in f:
if line.startswith("/**"):
current_list = []
current_key = line[3:].strip()
key_to_list[current_key] = current_list
elif line != "*/":
current_name=line.strip()
name_to_key[current_name]=current_key
current_list.append(current_name)
print key_to_list
print name_to_key['Alex']
alternative is to convert the dictionary afterwards:
name_to_key = {n : k for k in key_to_list for n in key_to_list[k]}
(i.e if you want to go with the regex version from ashwani)
Limitation is that this only permits one membership per name.

Parse vertical text in a file with repeated block

What is the best way for parsing below file? The blocks repeat multiple times.
The expected result is output to CSV file as:
{Place: REGION-1, Host: ABCD, Area: 44...}
I tried the code below, but it only iterates first blocks and than finishes.
with open('/tmp/t2.txt', 'r') as input_data:
for line in input_data:
if re.findall('(.*_RV)\n',line):
myDict={}
myDict['HOST'] = line[6:]
continue
elif re.findall('Interface(.*)\n',line):
myDict['INTF'] = line[6:]
elif len(line.strip()) == 0:
print(myDict)
Text file is below.
Instance REGION-1:
ABCD_RV
Interface: fastethernet01/01
Last state change: 0h54m44s ago
Sysid: 01441
Speaks: IPv4
Topologies:
ipv4-unicast
SAPA: point-to-point
Area Address(es):
441
IPv4 Address(es):
1.1.1.1
EFGH_RV
Interface: fastethernet01/01
Last state change: 0h54m44s ago
Sysid: 01442
Speaks: IPv4
Topologies:
ipv4-unicast
SAPA: point-to-point
Area Address(es):
442
IPv4 Address(es):
1.1.1.2
Instance REGION-2:
IJKL_RV
Interface: fastethernet01/01
Last state change: 0h54m44s ago
Sysid: 01443
Speaks: IPv4
Topologies:
ipv4-unicast
SAPA: point-to-point
Area Address(es):
443
IPv4 Address(es):
1.1.1.3
Or if you prefer an ugly regex route:
import re
region_re = re.compile("^Instance\s+([^:]+):.*")
host_re = re.compile("^\s+(.*?)_RV.*")
interface_re = re.compile("^\s+Interface:\s+(.*?)\s+")
other_re = re.compile("^\s+([^\s]+).*?:\s+([^\s]*){0,1}")
myDict = {}
extra = None
with open('/tmp/t2.txt', 'r') as input_data:
for line in input_data:
if extra: # value on next line from key
myDict[extra] = line.strip()
extra = None
continue
region = region_re.match(line)
if region:
if len(myDict) > 1:
print(myDict)
myDict = {'Place': region.group(1)}
continue
host = host_re.match(line)
if host:
if len(myDict) > 1:
print(myDict)
myDict = {'Place': myDict['Place'], 'Host': host.group(1)}
continue
interface = interface_re.match(line)
if interface:
myDict['INTF'] = interface.group(1)
continue
other = other_re.match(line)
if other:
groups = other.groups()
if groups[1]:
myDict[groups[0]] = groups[1]
else:
extra = groups[0]
# dump out final one
if len(myDict) > 1:
print(myDict)
output:
{'Place': 'REGION-1', 'Host': 'ABCD', 'INTF': 'fastethernet01/01', 'Last': '0h54m44s', 'Sysid': '01441', 'Speaks': 'IPv4', 'Topologies': 'ipv4-unicast', 'SAPA': 'point-to-point', 'Area': '441', 'IPv4': '1.1.1.1'}
{'Place': 'REGION-1', 'Host': 'EFGH', 'INTF': 'fastethernet01/01', 'Last': '0h54m44s', 'Sysid': '01442', 'Speaks': 'IPv4', 'Topologies': 'ipv4-unicast', 'SAPA': 'point-to-point', 'Area': '442', 'IPv4': '1.1.1.2'}
{'Place': 'REGION-2', 'Host': 'IJKL', 'INTF': 'fastethernet01/01', 'Last': '0h54m44s', 'Sysid': '01443', 'Speaks': 'IPv4', 'Topologies': 'ipv4-unicast', 'SAPA': 'point-to-point', 'Area': '443', 'IPv4': '1.1.1.3'}
This doesn't use much regex and could be more optimized. Hope it helps!
import re
import pandas as pd
from collections import defaultdict
_level_1 = re.compile(r'instance region.*', re.IGNORECASE)
with open('stack_formatting.txt') as f:
data = f.readlines()
"""
Format data so that it could be split easily
"""
data_blocks = defaultdict(lambda: defaultdict(str))
header = None
instance = None
for line in data:
line = line.strip()
if _level_1.match(line):
header = line
else:
if "_RV" in line:
instance = line
elif not line.endswith(":"):
data_blocks[header][instance] += line + ";"
else:
data_blocks[header][instance] += line
def parse_text(data_blocks):
"""
Generate a dict which could be converted easily to a pandas dataframe
:param data_blocks: splittable data
:return: dict with row values for every column
"""
final_data = defaultdict(list)
for key1 in data_blocks.keys():
for key2 in data_blocks.get(key1):
final_data['instance'].append(key1)
final_data['sub_instance'].append(key2)
for items in data_blocks[key1][key2].split(";"):
print(items)
if items.isspace() or len(items) == 0:
continue
a,b = re.split(r':\s*', items)
final_data[a].append(b)
return final_data
print(pd.DataFrame(parse_text(data_blocks)))
This worked for me but it's not pretty:
text=input_data
text=text.rstrip(' ').rstrip('\n').strip('\n')
#first I get ready to create a csv by replacing the headers for the data
text=text.replace('Instance REGION-1:',',')
text=text.replace('Instance REGION-2:',',')
text=text.replace('Interface:',',')
text=text.replace('Last state change:',',')
text=text.replace('Sysid:',',')
text=text.replace('Speaks:',',')
text=text.replace('Topologies:',',')
text=text.replace('SAPA:',',')
text=text.replace('Area Address(es):',',')
text=text.replace('IPv4 Address(es):',',')
#now I strip out the leading whitespace, cuz it messes up the split on '\n\n'
lines=[x.lstrip(' ') for x in text.split('\n')]
clean_text=''
#now that the leading whitespace is gone I recreate the text file
for line in lines:
clean_text+=line+'\n'
#Now split the data into groups based on single entries
entries=clean_text.split('\n\n')
#create one liners out of the entries so they can be split like csv
entry_lines=[x.replace('\n',' ') for x in entries]
#create a dataframe to hold the data for each line
df=pd.DataFrame(columns=['Instance REGION','Interface',
'Last state change','Sysid','Speaks',
'Topologies','SAPA','Area Address(es)',
'IPv4 Address(es)']).T
#now the meat and potatoes
count=0
for line in entry_lines:
data=line[1:].split(',') #split like a csv on commas
data=[x.lstrip(' ').rstrip(' ') for x in data] #get rid of extra leading/trailing whitespace
df[count]=data #create an entry for each split
count+=1 #incriment the count
df=df.T #transpose back to normal so it doesn't look weird
Output looks like this for me
Edit: Also, since you have various answers here, I test the performance of mine. It is mildly exponential as described by the equation y = 100.97e^(0.0003x)
Here are my timeit results.
Entries Milliseconds
18 49
270 106
1620 394
178420 28400

How to search for multiple data from multiple lines and store them in dictionary?

Say I have a file with the following:
/* Full name: abc */
.....
.....(.....)
.....(".....) ;
/* .....
/* .....
..... : "....."
}
"....., .....
Car : true ;
House : true ;
....
....
Age : 33
....
/* Full name: xyz */
....
....
Car : true ;
....
....
Age : 56
....
I am only interested in full name, car, house and age of each person. There are many other lines of data with different format between the variable/attritbute that I am interested.
My code so far:
import re
initial_val = {'House': 'false', 'Car': 'false'}
with open('input.txt') as f:
records = []
current_record = None
for line in f:
if not line.strip():
continue
elif current_record is None:
people_name = re.search('.+Full name ?: (.+) ', line)
if people_name:
current_record = dict(initial_val, Name = people_name.group(1))
else:
continue
elif current_record is not None:
house = re.search(' *(House) ?: ?([a-z]+)', line)
if house:
current_record['House'] = house.group(2)
car = re.search(' *(Car) ?: ?([a-z]+)', line)
if car:
current_record['Car'] = car.group(2)
people_name = re.search('.+Full name ?: (.+) ', line)
if people_name:
records.append(current_record)
current_record = dict(initial_val, Name = people_name.group(1))
print records
What I get:
[{'Name': 'abc', 'House': 'true', 'Car': 'true'}]
My question:
How am I suppose to extract the data and store it in a dictionary like:
{'abc': {'Car': true, 'House': true, 'Age': 33}, 'xyz':{'Car': true, 'House': false, 'Age': 56}}
My purpose:
check whether each person has car, house and age, if no then return false
The I could print them in a table like this:
Name Car House Age
abc true true 33
xyz true false 56
Note that I am using Python 2.7 and I do not know what is the actual value of each variable/attribute (Eg. abc, true, true, 33) of each person.
What is the best solution to my question? Thanks.
Well, you just have to keep track of the current record:
def parse_name(line):
# first remove the initial '/* ' and final ' */'
stripped_line = line.strip('/* ')
return stripped_line.split(':')[-1]
WANTED_KEYS = ('Car', 'Age', 'House')
# default values for when the lines are not present for a record
INITIAL_VAL = {'Car': False, 'House': False, Age: -1}
with open('the_filename') as f:
records = []
current_record = None
for line in f:
if not line.strip():
# skip empty lines
continue
elif current_record is None:
# first record in the file
if line.startswith('/*'):
current_record = dict(INITIAL_VAL, name=parse_name(line))
else:
# this should probably be an error in the file contents
continue
elif line.startswith('/*'):
# this means that the current record finished, and a new one is starting
records.append(current_record)
current_record = dict(INITIAL_VAL, name=parse_name(line))
else:
key, val = line.split(':')
if key.strip() in WANTED_KEYS:
# we want to keep track of this field
current_record[key.strip()] = val.strip()
# otherwise just ignore the line
print('Name\tCar\tHouse\tAge')
for record in records:
print(record['name'], record['Car'], record['House'], record['Age'], sep='\t')
Note that for Age you may want to convert it to an integer using int:
if key == 'Age':
current_record['Age'] = int(val)
The above code produces a list of dictionaries, but it is easy enough to convert it to a dictionary of dicts:
new_records = {r['name']: dict(r) for r in records}
for val in new_records.values():
del val['name']
After this new_records will be something like:
{'abc': {'Car': True, 'House': True, Age: 20}, ...}
If you have other lines with a different format in between the interesting ones you can simply write a function that returns True or False depending on whether the line is in the format you require and use it to filter the lines of the file:
def is_interesting_line(line):
if line.startswith('/*'):
return True
elif ':' in line:
return True
for line in filter(is_interesting_line, f):
# code as before
Change is_interesting_line to suit your needs. In the end, if you have to handle several different formats etc. maybe using a regex would be better, in that case you could do something like:
import re
LINE_REGEX = re.compile(r'(/\*.*\*/)|(\w+\s*:.*)| <other stuff>')
def is_interesting_line(line):
return LINE_REGEX.match(line) is not None
If you want you can obtain fancier formatting for the table, but you probably first need to determine the maximum length of the name etc. or you can use something like tabulate to do that for you.
For example something like (not tested):
max_name_length = max(max(len(r['name']) for r in records), 4)
format_string = '{:<{}}\t{:<{}}\t{}\t{}'
print(format_string.format('Name', max_name_length, 'Car', 5, 'House', 'Age'))
for record in records:
print(format_string.format(record['name'], max_name_length, record['Car'], 5, record['House'], record['Age']))

Python: Parse a list of strings into a dictionnary

This is somewhat complicated. I have a list that looks like this:
['19841018 ID1\n', ' Plunging oil... \n', 'cut in the price \n', '\n', '19841018 ID2\n', ' The U.S. dollar... \n', 'the foreign-exchange markets \n', 'late New York trading \n', '\n']
In my list, the '\n' is what separate a story. What I would like to do is to create a dictionary from the above list that would like this:
dict = {ID1: [19841018, 'Plunging oil... cut in the price'], ID2: [19841018, 'The U.S. dollar... the foreign-exchange markets']}
You can see that my KEY of my dictionnary is the ID and the items are the year and the combination of the stories. Is that doable?
My IDs, are in this format J00100394, J00384932. So they all start with J00.
The tricky part is split your list by any value, so i've take this part from here.Then i've parsed the list parts to built the res dict
>>> import itertools
>>> def isplit(iterable,splitters):
... return [list(g) for k,g in itertools.groupby(iterable,lambda x:x in splitters) if not k]
...
>>> l = ['19841018 ID1\n', ' Plunging oil... \n', 'cut in the price \n', '\n', '19841018 ID2\n', ' The U.S. dollar... \n', 'the foreign-exchange markets \n', 'late New York trading \n', '\n']
>>> res = {}
>>> for sublist in isplit(l,('\n',)):
... id_parts = sublist[0].split()
... story = ' '.join (sentence.strip() for sentence in sublist[1:])
... res[id_parts[1].strip()] = [id_parts[0].strip(), story]
...
>>> res
{'ID2': ['19841018', 'The U.S. dollar... the foreign-exchange markets late New York trading'], 'ID1': ['19841018', 'Plunging oil... cut in the price']}
I code an answer that use generator. The idea is that every time that start an id token the generator return the last key computed. You can costumize by change the check_fun() and how to mix the part of the description.
def trailing_carriage(s):
if s.endswith('\n'):
return s[:-1]
return s
def check_fun(s):
"""
:param s:Take a string s
:return: None if s dosn't match the ID rules. Otherwise return the
name,value of the token
"""
if ' ' in s:
id_candidate,name = s.split(" ",1)
try:
return trailing_carriage(name),int(id_candidate)
except ValueError:
pass
def parser_list(list, check_id_prefix=check_fun):
name = None #key dict
id_candidate = None
desc = "" #description string
for token in list:
check = check_id_prefix(token)
if check is not None:
if name is not None:
"""Return the previous coputed entry"""
yield name,id_val,desc
name,id_val = check
else:
"""Append the description"""
desc += trailing_carriage(token)
if name is not None:
"""Flush the last entry"""
yield name,id_val,desc
>>> list = ['19841018 ID1\n', ' Plunging oil... \n', 'cut in the price \n', '\n', '19841018 ID2\n', ' The U.S. dollar... \n', 'the foreign-exchange markets \n', 'late New York trading \n', '\n']
>>> print {k:[i,d] for k,i,d in parser_list(list)}
{'ID2': [19841018, ' Plunging oil... cut in the price The U.S. dollar... the foreign-exchange markets late New York trading '], 'ID1': [19841018, ' Plunging oil... cut in the price ']}

Categories

Resources