Python: Parse a list of strings into a dictionary

Python: Parse a list of strings into a dictionary - python

This is somewhat complicated. I have a list that looks like this:
['19841018 ID1\n', ' Plunging oil... \n', 'cut in the price \n', '\n', '19841018 ID2\n', ' The U.S. dollar... \n', 'the foreign-exchange markets \n', 'late New York trading \n', '\n']
In my list, a bare '\n' element is what separates one story from the next. What I would like to do is create a dictionary from the above list that would look like this:
dict = {ID1: [19841018, 'Plunging oil... cut in the price'], ID2: [19841018, 'The U.S. dollar... the foreign-exchange markets']}
You can see that the KEY of my dictionary is the ID, and the values are the year (date) and the combined text of the story. Is that doable?
My IDs, are in this format J00100394, J00384932. So they all start with J00.

The tricky part is splitting your list on a separator value, so I took that part from here. Then I parsed the resulting sublists to build the res dict.
>>> import itertools
>>> def isplit(iterable,splitters):
... return [list(g) for k,g in itertools.groupby(iterable,lambda x:x in splitters) if not k]
...
>>> l = ['19841018 ID1\n', ' Plunging oil... \n', 'cut in the price \n', '\n', '19841018 ID2\n', ' The U.S. dollar... \n', 'the foreign-exchange markets \n', 'late New York trading \n', '\n']
>>> res = {}
>>> for sublist in isplit(l,('\n',)):
... id_parts = sublist[0].split()
... story = ' '.join (sentence.strip() for sentence in sublist[1:])
... res[id_parts[1].strip()] = [id_parts[0].strip(), story]
...
>>> res
{'ID2': ['19841018', 'The U.S. dollar... the foreign-exchange markets late New York trading'], 'ID1': ['19841018', 'Plunging oil... cut in the price']}

I wrote an answer that uses a generator. The idea is that every time an ID token starts, the generator yields the previously computed entry. You can customize it by changing check_fun() and the way the parts of the description are combined.
def trailing_carriage(s):
    """Return ``s`` without its final newline character, if it has one."""
    return s[:-1] if s.endswith('\n') else s


def check_fun(s):
    """Recognise an ID header line of the form ``"<integer> <name>"``.

    :param s: Take a string s
    :return: None if s doesn't match the ID rules. Otherwise return the
        ``(name, value)`` pair of the token: ``value`` is the integer found
        before the first space, ``name`` is the rest of the line without
        its trailing newline.
    """
    if ' ' not in s:
        return None
    id_candidate, name = s.split(" ", 1)
    try:
        return trailing_carriage(name), int(id_candidate)
    except ValueError:
        # The text before the first space is not an integer, so this is an
        # ordinary story line rather than an ID header.
        return None


def parser_list(list, check_id_prefix=check_fun):
    """Yield one ``(name, id_val, description)`` tuple per entry in ``list``.

    An entry starts at every token for which ``check_id_prefix`` returns a
    ``(name, id_val)`` pair; all following tokens, up to the next such token,
    are concatenated (minus one trailing newline each) into the description.
    Tokens that appear before the first ID header belong to no entry and are
    ignored.

    :param list: iterable of strings to parse (the parameter name shadows
        the builtin; it is kept for backward compatibility)
    :param check_id_prefix: callable returning ``None`` for a plain line or
        a ``(name, id_val)`` pair for an ID header
    """
    name = None  # key of the entry currently being collected
    id_val = None
    parts = []  # description fragments of the current entry
    for token in list:
        check = check_id_prefix(token)
        if check is not None:
            if name is not None:
                # Return the previously computed entry
                yield name, id_val, "".join(parts)
            name, id_val = check
            # Start a fresh description; without this reset every entry
            # would also contain the text of all entries before it.
            parts = []
        else:
            # Append the description
            parts.append(trailing_carriage(token))
    if name is not None:
        # Flush the last entry
        yield name, id_val, "".join(parts)
>>> list = ['19841018 ID1\n', ' Plunging oil... \n', 'cut in the price \n', '\n', '19841018 ID2\n', ' The U.S. dollar... \n', 'the foreign-exchange markets \n', 'late New York trading \n', '\n']
>>> print {k:[i,d] for k,i,d in parser_list(list)}
{'ID2': [19841018, ' Plunging oil... cut in the price The U.S. dollar... the foreign-exchange markets late New York trading '], 'ID1': [19841018, ' Plunging oil... cut in the price ']}

Related

How to create a nested dictionary from a text file

So, my file looks like this :
Intestinal infectious diseases (001-003)
001 Cholera
002 Typhoid and paratyphoid fevers
003 Other salmonella infections
Tuberculosis (004-006)
004 Primary tuberculous infection
005 Pulmonary tuberculosis
006 Other respiratory tuberculosis
.
.
.
I'm supposed to make a nested dictionary with the disease group as keys and the dictionary containing the disease code and name, as value for the first dictionary. I'm having some trouble separating the disease codes into their own disease groups. Here's what I've done so far:
import json
icd9_encyclopedia={}
lines = []
f = open("icd9_info.txt", 'r')
for line in f:
line = line.rstrip("\n")
if line[0].isnumeric() == True:
icd9_encyclopedia[line] = ???
f.close()

solution
import itertools
from pathlib import Path
# load text lines
lines = Path('data.txt').read_text().split('\n')
# build output dictionary
icd9_encyclopedia = {
# build single group dictionary
group_name: {
int(code): disease_name
# split each disease line into code and text name
for disease_string in disease_strings
for (code, _, disease_name) in [disease_string.partition(' ')]
}
# get groups separated by an empty line
# isolate first item in each group as its name
for x, (group_name, *disease_strings) in itertools.groupby(lines, bool) if x
}
result
{'Intestinal infectious diseases (001-003)': {1: 'Cholera',
2: 'Typhoid and paratyphoid '
'fevers',
3: 'Other salmonella infections'},
'Tuberculosis (004-006)': {4: 'Primary tuberculous infection',
5: 'Pulmonary tuberculosis',
6: 'Other respiratory tuberculosis'}}

Here's another take on the problem that uses just basic Python:
from pprint import pprint
# Build {group name: {code: disease name}} from a file in which every group
# header line is followed by its "<code> <name>" detail lines.
icd9_encyclopedia = {}
key = None
item = {}
with open("icd9_info.txt") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank line: nothing to store, and line[0] below would raise
            # IndexError on an empty string.
            continue
        if not line[0].isdigit():
            # Start a new item
            if key:
                # Store the prior item in the main dictionary
                icd9_encyclopedia[key] = item
            # Initialize the new item
            key = line
            item = {}
        else:
            # A detail entry - add it to the current item
            num, rest = line.split(' ', 1)
            item[num] = rest
# Store the final item to the dictionary
if key:
    icd9_encyclopedia[key] = item
pprint(icd9_encyclopedia)
Result:
{'Intestinal infectious diseases (001-003)': {'001': 'Cholera',
'002': 'Typhoid and paratyphoid '
'fevers',
'003': 'Other salmonella '
'infections'},
'Tuberculosis (004-006)': {'004': 'Primary tuberculous infection',
'005': 'Pulmonary tuberculosis',
'006': 'Other respiratory tuberculosis'}}

I used defaultdict to easily make a nested dictionary, as follows:
from collections import defaultdict
# Nested mapping {group name: {code: disease name}}; defaultdict creates the
# inner dict the first time a group name is used.
icd9_encyclopedia = defaultdict(dict)
disease_group = ""
with open("icd9_info.txt", 'r') as f:
    for raw_line in f:
        # Strip only the line terminator. Slicing with [:-1] would also cut
        # the last real character when the final line has no newline.
        line = raw_line.rstrip('\n')
        if line == "":  # skip if blank line
            continue
        if not line[0].isdigit():
            # temporarily save current disease group name for the following lines
            disease_group = line
        else:
            code, name = line.split(maxsplit=1)
            icd9_encyclopedia[disease_group][code] = name
for key, value in icd9_encyclopedia.items():
    print(key, value)
#Intestinal infectious diseases (001-003) {'001': 'Cholera', '002': 'Typhoid and paratyphoid fevers', '003': 'Other salmonella infections'}
#Tuberculosis (004-006) {'004': 'Primary tuberculous infection', '005': 'Pulmonary tuberculosis', '006': 'Other respiratory tuberculosis'}
You can see more detail about defaultdict here: https://www.geeksforgeeks.org/defaultdict-in-python/

validInt checks whether the data is a valid integer
def validInt(data):
    """Return True if ``int(data)`` succeeds, otherwise False.

    Only the exceptions ``int()`` raises for unusable input are treated as
    "not an integer": ValueError (bad text), TypeError (unsupported type)
    and OverflowError (for example infinity).
    """
    try:
        int(data)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
# Build {group header line: {code: rest of line}} from "file.data".
# Header lines and detail text are stored exactly as read (the leading space
# and trailing newline are kept), only blank lines are skipped.
encyclo = {}
current = None  # dictionary of the group that is currently being filled
with open("file.data", 'r') as f:
    for line in f:
        if len(line.strip()) == 0:  # line should not be empty
            continue
        first = line.split(' ')[0]
        if validInt(first):
            if current is None:
                # A code line before any group header has no group to go to.
                raise ValueError('code line found before any group header: %r' % line)
            # len(first) is used to skip the numeric part
            current[first] = line[len(first):]
        else:
            # New group: remember its dictionary so that detail lines do not
            # need to rebuild the list of keys to find the latest group.
            current = encyclo[line] = {}
for key, value in encyclo.items():  # displaying data
    print(key, value)
$ python3 test.py
Intestinal infectious diseases (001-003)
{'001': ' Cholera\n', '002': ' Typhoid and paratyphoid fevers\n', '003': ' Other salmonella infections\n'}
Tuberculosis (004-006)
{'004': ' Primary tuberculous infection\n', '005': ' Pulmonary tuberculosis\n', '006': ' Other respiratory tuberculosis\n'}

Text Preprocessing Translation Error Python

I was trying to translate tweet text using a deep translator but I found some issues.
Before translating the texts, I did some text preprocessing such as cleaning, removing emoji, etc. These are the pre-processing functions I defined:
def deEmojify(text):
    """Return ``text`` with emoji and many other symbol characters removed.

    Every run of characters inside one of the ranges below is deleted.
    NOTE: the ranges are very broad. U+24C2..U+1F251 and U+10000..U+10FFFF
    also cover CJK ideographs, kana, Hangul and every supplementary-plane
    character, so this strips far more than emoji.
    """
    regrex_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, dingbats, arrows
        "\U00002702-\U000027B0"  # dingbats (already inside the range above)
        "\U000024C2-\U0001F251"  # circled letters up to enclosed ideographs (very broad)
        "\U0001f926-\U0001f937"  # gesture emoji
        "\U00010000-\U0010ffff"  # all supplementary planes
        "\u2640-\u2642"  # gender signs
        "\u2600-\u2B55"  # misc symbols up to heavy large circle
        "\u200d"  # zero width joiner
        "\u23cf"  # eject symbol
        "\u23e9"  # fast-forward symbol
        "\u231a"  # watch
        "\ufe0f"  # variation selector-16
        "\u3030"  # wavy dash
        "]+",
        re.UNICODE,
    )
    return regrex_pattern.sub(r'', text)
def cleaningText(text):
    """Return ``text`` stripped of tweet noise.

    Removes mentions, hashtags, the "RT" marker, links, digits and ASCII
    punctuation, turns newlines into spaces and trims surrounding spaces.
    Non-ASCII punctuation (for example the ellipsis character) is kept,
    because ``string.punctuation`` only lists ASCII characters.
    """
    # The mention pattern must start with '@'. It used to be a second copy
    # of the hashtag pattern, so mentions were never removed.
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # remove mentions
    text = re.sub(r'#[A-Za-z0-9]+', '', text)  # remove hashtag
    text = re.sub(r'RT[\s]', '', text)  # remove RT
    text = re.sub(r"http\S+", '', text)  # remove link
    text = re.sub(r"[!@#$]", '', text)  # remove leftover !, @, # and $ characters
    text = re.sub(r'[0-9]+', '', text)  # remove numbers
    text = text.replace('\n', ' ')  # replace new line into space
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove all punctuations
    text = text.strip(' ')  # remove characters space from both left and right text
    return text
def casefoldingText(text):
    """Return ``text`` with all characters converted to lower case."""
    return text.lower()
def tokenizingText(text):
    """Split the string ``text`` into tokens with ``word_tokenize``.

    Returns whatever ``word_tokenize`` returns; the sample output in the
    question shows a list of token strings.
    """
    return word_tokenize(text)
def filteringText(text):
    """Return the tokens of ``text`` that are not Indonesian stopwords.

    ``text`` is an iterable of tokens; the result is a new list in the same
    order, and it is empty when every token is a stopword.
    """
    # A set gives constant-time membership tests for each token.
    listStopwords = set(stopwords.words('indonesian'))
    return [txt for txt in text if txt not in listStopwords]
def stemmingText(text):
    """Return a list with the stem of every word in ``text``.

    A stemmer is created through ``StemmerFactory`` on each call.  Note that
    a stem can be an empty string: in the question's sample run the token
    '…' is stemmed to ''.
    """
    stemmer = StemmerFactory().create_stemmer()
    return [stemmer.stem(word) for word in text]
def convert_eng(text):
    """Translate a list of strings to English with ``translate_batch``.

    The translator rejects any item that is empty, whitespace-only,
    digit-only or not a string by raising NotValidPayload, so such items
    are dropped first.  The result therefore has one entry per *valid*
    input item, not necessarily one per input item.
    """
    valid = [
        item for item in text
        if isinstance(item, str) and item.strip() and not item.isdigit()
    ]
    return GoogleTranslator(source='auto', target='en').translate_batch(valid)
And here's the translate function :
def convert_eng(text):
    """Translate the single string ``text`` to English.

    The source language is auto-detected by the translator.
    """
    return GoogleTranslator(source='auto', target='en').translate(text)
this is an example of the expected result ( text in Indonesian)
text = '#jshuahaee Ketemu agnes mo lagii😍😍'
clean = cleaningText(text)
print('After cleaning ==> ', clean)
emoji = deEmojify(clean)
print('After emoji ==> ', emoji)
cf = casefoldingText(emoji)
print('After case folding ==> ', cf)
token = tokenizingText(cf)
print('After token ==> ', token)
filter= filteringText(token)
print('After filter ==> ', filter)
stem = stemmingText(filter)
print('After Stem ==> ', stem)
en = convert_eng(stem)
print('After translate ==> ', en)
Result :
After cleaning ==> Ketemu agnes mo lagii😍😍
After emoji ==> Ketemu agnes mo lagii
After case folding ==> ketemu agnes mo lagii
After token ==> ['ketemu', 'agnes', 'mo', 'lagii']
After filter ==> ['ketemu', 'agnes', 'mo', 'lagii']
After Stem ==> ['ketemu', 'agnes', 'mo', 'lagi']
After translate ==> ['meet', 'agnes', 'mo', 'again']
But I ran into a problem when a sentence contains dots (an ellipsis): the error happens because, after the stemming step, the list contains an empty string '' (I don't know what this is called).
text = 'News update Meski kurang diaspirasi Shoppee yg korea minded dalam waktu indonesa belaja di bulan November Lazada 1… '
clean = cleaningText(text)
print('After cleaning ==> ', clean)
emoji = deEmojify(clean)
print('After emoji ==> ', emoji)
cf = casefoldingText(emoji)
print('After case folding ==> ', cf)
token = tokenizingText(cf)
print('After token ==> ', token)
filter= filteringText(token)
print('After filter ==> ', filter)
stem = stemmingText(filter)
print('After Stem ==> ', stem)
en = convert_eng(stem)
print('After translate ==> ', en)
Result
After cleaning ==> News update Meski kurang diaspirasi Shoppee yg korea minded dalam waktu indonesa belaja di bulan November Lazada …
After emoji ==> News update Meski kurang diaspirasi Shoppee yg korea minded dalam waktu indonesa belaja di bulan November Lazada …
After case folding ==> news update meski kurang diaspirasi shoppee yg korea minded dalam waktu indonesa belaja di bulan november lazada …
After token ==> ['news', 'update', 'meski', 'kurang', 'diaspirasi', 'shoppee', 'yg', 'korea', 'minded', 'dalam', 'waktu', 'indonesa', 'belaja', 'di', 'bulan', 'november', 'lazada', '…']
After filter ==> ['news', 'update', 'diaspirasi', 'shoppee', 'yg', 'korea', 'minded', 'indonesa', 'belaja', 'november', 'lazada', '…']
After Stem ==> ['news', 'update', 'aspirasi', 'shoppee', 'yg', 'korea', 'minded', 'indonesa', 'baja', 'november', 'lazada', '']
This is the error message
NotValidPayload Traceback (most recent call last)
<ipython-input-40-cb9390422d3c> in <module>
14 print('After Stem ==> ', stem)
15
---> 16 en = convert_eng(stem)
17 print('After translate ==> ', en)
<ipython-input-28-28bc36c96914> in convert_eng(text)
8 return text
9 def convert_eng(text):
---> 10 text = GoogleTranslator(source='auto', target='en').translate_batch(text)
11 return text
C:\Python\lib\site-packages\deep_translator\google_trans.py in translate_batch(self, batch, **kwargs)
195 for i, text in enumerate(batch):
196
--> 197 translated = self.translate(text, **kwargs)
198 arr.append(translated)
199 return arr
C:\Python\lib\site-packages\deep_translator\google_trans.py in translate(self, text, **kwargs)
108 """
109
--> 110 if self._validate_payload(text):
111 text = text.strip()
112
C:\Python\lib\site-packages\deep_translator\parent.py in _validate_payload(payload, min_chars, max_chars)
44
45 if not payload or not isinstance(payload, str) or not payload.strip() or payload.isdigit():
---> 46 raise NotValidPayload(payload)
47
48 # check if payload contains only symbols
NotValidPayload: --> text must be a valid text with maximum 5000 character, otherwise it cannot be translated
My idea is to remove the '', i think that was the problem, but I have no idea how to do that.
Anyone, please kindly help me

You need to introduce a bit of error checking into your code, and only process an expected data type. Your convert_eng function (that uses GoogleTranslator#translate_batch) requires a list of non-blank strings as an argument (see if not payload or not isinstance(payload, str) or not payload.strip() or payload.isdigit(): part), and your stem contains an empty string as the last item in the list.
Besides, it is possible that filteringText(text) can return [] because all words can turn out to be stopwords. Also, do not use filter as a name of a variable, it is a built-in.
So, change
filter= filteringText(token)
print('After filter ==> ', filter)
stem = stemmingText(filter)
print('After Stem ==> ', stem)
to
filter1 = filteringText(token)
print('After filter ==> ', filter1)
if filter1:
    # Everything that depends on the stemmed list stays inside this branch:
    # when all tokens were stopwords there is no `stem` to translate.
    stem = stemmingText(filter1)
    print('After Stem ==> ', stem)
    # Drop blank and digit-only items, which the translator rejects.
    en = convert_eng([x for x in stem if x.strip() and not x.isdigit()])
    print('After translate ==> ', en)
I left out the isinstance(x, str) check because I assume you already know your list only contains strings.

Creating a vocabulary game

I am completely new to Python. I have been creating a vocabulary program, but I want it to mix the words, so that it also asks for the ones after the colon. So far it only ever shows the ones before the colon. How can I achieve this?
print()
print('Welcome to german vocabulary quiz!')
import random
answer = input('Ready? ')
¨
print('')
while answer=='y' or 'yes':
vocabDictionary={
'e Arbeit':'pracovat', 'oder':'nebo', 'r Abend':'večer', 'als':'jako', 'bitten':'prosit',
'buchstabieren':'hláskovat','wessen':'čí','r Koffer':'kufr','wer':'kdo','wem':'komu',
'wen':'koho','sehen':'vidět','e Tochter':'dcera','gruSen':'zdravit','warten':'čekat','sagen':'říkat',
'e Lehrerin':'učitelka','r Lehrer':'učitel','schreiben':'napsat','zeigen':'ukázat','stehen':'stát','suchen':'hledat',
'fahren':'jet','abfahren':'odjet','kommen':'přijít','hier'and'da':'tady','s Buch':'kniha',
'r Zug':'vlak','offnen':'otevřít','schlieSen':'zavřít','ab/biegen':'odbočit','e Ampel':'semafor',
'denn':'pak'and'potom','dorthin':'tam','až'and'dokud':'bis','zu':'k'and'ke','druben'and'gegenuber':'naproti','fremd':'cizí',
'r FuSganger':'chodec','gerade':'právě','geradeaus':'rovně','e Halstestelle':'zastávka','r Hauptbahnhof':'hlavní nádraží',
'ihnen':'vám','e Kreuzung':'křižovatka','links':'vlevo','nach links':'doleva','mit':'se'or's','nach':'do'or'po',
'rechts':'vpravo','e StraSe':'ulice'and'silnice','uberqueren':'přejít','ungefahr':'přibližně'or'asi',
'von hier':'odsud','weiter':'dál','zu FuS':'pěšky','aber':'ale','alles':'všechno','e Blume':'květina',
'brav':'hodný','ein bisschen':'trochu','faul':'líný','fleiSig':'pilný','e Freizeit':'volný čas','r FuSball':'fotbal',
'gern(e)':'rád','groS':'velký','haben':'mít','horen':'poslouchat','hubsch'and'schon':'hezký'or'pěkný','jetzt':'teď'or'nyní',
'e Journalistin':'novinářka','s Kaninchen':'králík','lernen':'učit se','lieb':'milý','lustig':'veselý',
'manchmal':'někdy'or'občas','nett':'milý'or'vlídný'or'hezký','noch':'ještě','nur':'jen','oft':'často',
'recht':'skutečně'or'opravdu'or'velmi','sauber':'čistý','sauber machen':'uklízet','schauen':'dívat se'or'podívat se',
'schlank':'štíhlý','sehr':'velmi','zehn':'deset','r Spaziergang':'procházka','einen Spaziergang machen':'jít na procházku',
'spielen':'hrát','studieren':'studovat','s Tier':'zvíře','treiben':'zabývat se'or'provozovat','e Zeit':'čas',
'Sport treiben':'sportovat','verheiratet':'ženatý'or'vdaná','r Unternhehmer':'podnikatel','zu Hause':'doma',
'ziemlich':'pořádně'or'značně','zwanzig':'dvacet','aus':'z','dann':'potom','dich':'tebe'or'tě',
'dir':'ti'or'tobě','e Entschuldigung':'omluva'or'prominutí','finden':'nacházet'or'shledávat','gehen':'jít',
'geil':'báječný'or'skvělý'or'super','heiSen':'jmenovat se','r Herr':'pán','e Frau':'paní','r Nachname':'příjmení',
'leider':'bohužel','r Tag':'den','viel':'hodně'and'hodně','was':'co','wie':'jak','woher':'odkud','wohnen':'bydlet',
'Tschechien':'Česko'
}
keyword_list=list(vocabDictionary.keys())
random.shuffle(keyword_list)
score=0
for keyword in keyword_list:
display='{}'
print(display.format(keyword))
userInputAnswer=input(': ')
print('')

vocabDictionary.keys() This code only returns the keys of a dictionary, which are the words before the :
To create a list containing both the keys and the values, you can use .values() to create another list, and add the two lists
keyword_list1=list(vocabDictionary.keys())
keyword_list2= list(vocabDictionary.values())
keyword_list = keyword_list1 + keyword_list2
Full codes below:
print('Welcome to german vocabulary quiz!')
import random
answer = input('Ready? ')
print('')
# The vocabulary never changes, so it is built once, before the game loop.
# NOTE: entries written as 'a'and'b' or 'a'or'b' are Python expressions, not
# lists of alternatives: `x and y` evaluates to y and `x or y` to x when both
# strings are non-empty, so only one of the words ends up in the dictionary.
vocabDictionary={
    'e Arbeit':'pracovat', 'oder':'nebo', 'r Abend':'večer', 'als':'jako', 'bitten':'prosit',
    'buchstabieren':'hláskovat','wessen':'čí','r Koffer':'kufr','wer':'kdo','wem':'komu',
    'wen':'koho','sehen':'vidět','e Tochter':'dcera','gruSen':'zdravit','warten':'čekat','sagen':'říkat',
    'e Lehrerin':'učitelka','r Lehrer':'učitel','schreiben':'napsat','zeigen':'ukázat','stehen':'stát','suchen':'hledat',
    'fahren':'jet','abfahren':'odjet','kommen':'přijít','hier'and'da':'tady','s Buch':'kniha',
    'r Zug':'vlak','offnen':'otevřít','schlieSen':'zavřít','ab/biegen':'odbočit','e Ampel':'semafor',
    'denn':'pak'and'potom','dorthin':'tam','až'and'dokud':'bis','zu':'k'and'ke','druben'and'gegenuber':'naproti','fremd':'cizí',
    'r FuSganger':'chodec','gerade':'právě','geradeaus':'rovně','e Halstestelle':'zastávka','r Hauptbahnhof':'hlavní nádraží',
    'ihnen':'vám','e Kreuzung':'křižovatka','links':'vlevo','nach links':'doleva','mit':'se'or's','nach':'do'or'po',
    'rechts':'vpravo','e StraSe':'ulice'and'silnice','uberqueren':'přejít','ungefahr':'přibližně'or'asi',
    'von hier':'odsud','weiter':'dál','zu FuS':'pěšky','aber':'ale','alles':'všechno','e Blume':'květina',
    'brav':'hodný','ein bisschen':'trochu','faul':'líný','fleiSig':'pilný','e Freizeit':'volný čas','r FuSball':'fotbal',
    'gern(e)':'rád','groS':'velký','haben':'mít','horen':'poslouchat','hubsch'and'schon':'hezký'or'pěkný','jetzt':'teď'or'nyní',
    'e Journalistin':'novinářka','s Kaninchen':'králík','lernen':'učit se','lieb':'milý','lustig':'veselý',
    'manchmal':'někdy'or'občas','nett':'milý'or'vlídný'or'hezký','noch':'ještě','nur':'jen','oft':'často',
    'recht':'skutečně'or'opravdu'or'velmi','sauber':'čistý','sauber machen':'uklízet','schauen':'dívat se'or'podívat se',
    'schlank':'štíhlý','sehr':'velmi','zehn':'deset','r Spaziergang':'procházka','einen Spaziergang machen':'jít na procházku',
    'spielen':'hrát','studieren':'studovat','s Tier':'zvíře','treiben':'zabývat se'or'provozovat','e Zeit':'čas',
    'Sport treiben':'sportovat','verheiratet':'ženatý'or'vdaná','r Unternhehmer':'podnikatel','zu Hause':'doma',
    'ziemlich':'pořádně'or'značně','zwanzig':'dvacet','aus':'z','dann':'potom','dich':'tebe'or'tě',
    'dir':'ti'or'tobě','e Entschuldigung':'omluva'or'prominutí','finden':'nacházet'or'shledávat','gehen':'jít',
    'geil':'báječný'or'skvělý'or'super','heiSen':'jmenovat se','r Herr':'pán','e Frau':'paní','r Nachname':'příjmení',
    'leider':'bohužel','r Tag':'den','viel':'hodně'and'hodně','was':'co','wie':'jak','woher':'odkud','wohnen':'bydlet',
    'Tschechien':'Česko'
}
# Ask for both directions: the German keys and the Czech values.
keyword_list1=list(vocabDictionary.keys())
keyword_list2= list(vocabDictionary.values())
keyword_list = keyword_list1 + keyword_list2
# `answer=='y' or 'yes'` is always true because the non-empty string 'yes' is
# truthy; a membership test compares the answer with both accepted spellings.
while answer in ('y', 'yes'):
    random.shuffle(keyword_list)
    score=0
    for keyword in keyword_list:
        display='{}'
        print(display.format(keyword))
        userInputAnswer=input(': ')
        print('')
        if keyword in vocabDictionary:
            # The prompt was a key: the reply must be its value.
            if userInputAnswer == vocabDictionary[keyword]:
                score += 1
        elif vocabDictionary.get(userInputAnswer) == keyword:
            # The prompt was a value: the reply must be a key mapping to it.
            score += 1
    print(score)
    # Ask again so that the player can stop after a round.
    answer = input('Ready? ')

Currently, you are only picking words from the keys (i.e. the part before the colon).
You could try this:
keyword_list_keys = list(vocabDictionary.keys())
keyword_list_values = list(vocabDictionary.values())
# random.shuffle works in place and returns None, so the combined list needs
# a name. Shuffling the temporary `keys + values` expression would throw the
# shuffled list away.
keyword_list = keyword_list_keys + keyword_list_values
random.shuffle(keyword_list)
Then you would have to differentiate depending on the two cases, to find the matching key/value.

Make sentence from value of dictionary

link for original txt file
https://medusa.ugent.be/en/exercises/187053144/description/wM6YaQUbWdHKPhQX/media/ICD.txt
This is what I got:
given_string = 'You are what you eat.'
dictionary ={'D89.1': 'Cryoglobulinemia', 'M87.332': 'Other secondary osteonecrosis of left radius', 'M25.57': 'Pain in ankle and joints of foot', 'H59.111': 'Intraoperative hemorrhage and hematoma of right eye and adnexa complicating an ophthalmic procedure', 'I82.5Z9': 'Chronic embolism and thrombosis of unspecified deep veins of unspecified distal lower extremity', 'T38.3X': 'Poisoning by, adverse effect of and underdosing of insulin and oral hypoglycemic [antidiabetic] drugs', 'H95.52': 'Postprocedural hematoma of ear and mastoid process following other procedure', 'Q90.1': 'Trisomy 21, mosaicism (mitotic nondisjunction)', 'X83.8': 'Intentional self-harm by other specified means', 'H02.145': 'Spastic ectropion of left lower eyelid', 'M67.341': 'Transient synovitis, right hand', 'P07.32': 'Preterm newborn, gestational age 29 completed weeks', 'R44.8': 'Other symptoms and signs involving general sensations and perceptions', 'R03.1': 'Nonspecific low blood-pressure reading', 'Q03': 'Congenital hydrocephalus', 'C11.0': 'Malignant neoplasm of superior wall of nasopharynx', 'C44.4': 'Other and unspecified malignant neoplasm of skin of scalp and neck', 'N48.5': 'Ulcer of penis', 'T50.2X1': 'Poisoning by carbonic-anhydrase inhibitors, benzothiadiazides and other diuretics, accidental (unintentional)', 'V92.13': 'Drowning and submersion due to being thrown overboard by motion of other powered watercraft', 'D30.0': 'Benign neoplasm of kidney', 'M08.06': 'Unspecified juvenile rheumatoid arthritis, knee', 'T41.5X4': 'Poisoning by therapeutic gases, undetermined', 'T59.3X2': 'Toxic effect of lacrimogenic gas, intentional self-harm', 'S84.91': 'Injury of unspecified nerve at lower leg level, right leg', 'Z80.4': 'Family history of malignant neoplasm of genital organs', 'M05.34': 'Rheumatoid heart disease with rheumatoid arthritis of hand', 'Y36.531': 'War operations involving thermal radiation effect of nuclear weapon, civilian', 'H59.88': 'Other intraoperative complications 
of eye and adnexa, not elsewhere classified', 'R29.91': 'Unspecified symptoms and signs involving the musculoskeletal system', 'M71.139': 'Other infective bursitis, unspecified wrist', 'S00.441': 'External constriction of right ear', 'V04': 'Pedestrian injured in collision with heavy transport vehicle or bus', 'C92.1': 'Chronic myeloid leukemia, BCR/ABL-positive', 'I82.60': 'Acute embolism and thrombosis of unspecified veins of upper extremity', 'I75.89': 'Atheroembolism of other site', 'S51.031': 'Puncture wound without foreign body of right elbow', 'Z01.110': 'Encounter for hearing examination following failed hearing screening', 'I06.8': 'Other rheumatic aortic valve diseases', 'Z68.25': 'Body mass index (BMI) 25.0-25.9, adult', 'A66': 'Yaws', 'S78.921': 'Partial traumatic amputation of right hip and thigh, level unspecified', 'F44': 'Dissociative and conversion disorders', 'O87.8': 'Other venous complications in the puerperium', 'K04.3': 'Abnormal hard tissue formation in pulp', 'V38.7': 'Person on outside of three-wheeled motor vehicle injured in noncollision transport accident in traffic accident', 'V36.1': 'Passenger in three-wheeled motor vehicle injured in collision with other nonmotor vehicle in nontraffic accident', 'B94.9': 'Sequelae of unspecified infectious and parasitic disease', 'K50.911': "Crohn's disease, unspecified, with rectal bleeding", 'S00.52': 'Blister (nonthermal) of lip and oral cavity', 'T43.1': 'Poisoning by, adverse effect of and underdosing of monoamine-oxidase-inhibitor antidepressants', 'B99.8': 'Other infectious disease', 'S97.12': 'Crushing injury of lesser toe(s)', 'S02.69': 'Fracture of mandible of other specified site', 'V29.10': 'Motorcycle passenger injured in collision with unspecified motor vehicles in nontraffic accident', 'Z68.35': 'Body mass index (BMI) 35.0-35.9, adult', 'A81.2': 'Progressive multifocal leukoencephalopathy', 'V44.4': 'Person boarding or alighting a car injured in collision with heavy transport vehicle 
or bus', 'M62.51': 'Muscle wasting and atrophy, not elsewhere classified, shoulder', 'M62.151': 'Other rupture of muscle (nontraumatic), right thigh', 'V52.2': 'Person on outside of pick-up truck or van injured in collision with two- or three-wheeled motor vehicle in nontraffic accident', 'E09.622': 'Drug or chemical induced diabetes mellitus with other skin ulcer', 'S43.492': 'Other sprain of left shoulder joint', 'M08.212': 'Juvenile rheumatoid arthritis with systemic onset, left shoulder', 'R00.0': 'Tachycardia, unspecified', 'G21.8': 'Other secondary parkinsonism', 'W58.01': 'Bitten by alligator', 'D46.1': 'Refractory anemia with ring sideroblasts', 'H61.32': 'Acquired stenosis of external ear canal secondary to inflammation and infection', 'H95.0': 'Recurrent cholesteatoma of postmastoidectomy cavity', 'Z72.4': 'Inappropriate diet and eating habits', 'Z68.41': 'Body mass index (BMI) 40.0-44.9, adult', 'S20.172': 'Other superficial bite of breast, left breast', 'I63.232': 'Cerebral infarction due to unspecified occlusion or stenosis of left carotid arteries', 'M14.811': 'Arthropathies in other specified diseases classified elsewhere, right shoulder', 'E13.41': 'Other specified diabetes mellitus with diabetic mononeuropathy', 'H02.53': 'Eyelid retraction', 'V95.49': 'Other spacecraft accident injuring occupant', 'D74.0': 'Congenital methemoglobinemia', 'D60.1': 'Transient acquired pure red cell aplasia', 'T52.1X2': 'Toxic effect of benzene, intentional self-harm', 'O71.2': 'Postpartum inversion of uterus', 'M08.439': 'Pauciarticular juvenile rheumatoid arthritis, unspecified wrist', 'M01.X72': 'Direct infection of left ankle and foot in infectious and parasitic diseases classified elsewhere', 'H95.3': 'Accidental puncture and laceration of ear and mastoid process during a procedure', 'C74.92': 'Malignant neoplasm of unspecified part of left adrenal gland', 'G00': 'Bacterial meningitis, not elsewhere classified', 'M19.011': 'Primary osteoarthritis, right 
shoulder', 'G72.49': 'Other inflammatory and immune myopathies, not elsewhere classified', 'Z68.34': 'Body mass index (BMI) 34.0-34.9, adult', 'V86.64': 'Passenger of military vehicle injured in nontraffic accident', 'L20.9': 'Atopic dermatitis, unspecified', 'S65.51': 'Laceration of blood vessel of other and unspecified finger', 'B67.1': 'Echinococcus granulosus infection of lung', 'S08.81': 'Traumatic amputation of nose', 'Z36.5': 'Encounter for antenatal screening for isoimmunization', 'S59.22': 'Salter-Harris Type II physeal fracture of lower end of radius', 'M66.359': 'Spontaneous rupture of flexor tendons, unspecified thigh', 'I69.919': 'Unspecified symptoms and signs involving cognitive functions following unspecified cerebrovascular disease', 'I25.700': 'Atherosclerosis of coronary artery bypass graft(s), unspecified, with unstable angina pectoris', 'V24.0': 'Motorcycle driver injured in collision with heavy transport vehicle or bus in nontraffic accident', 'S53.025': 'Posterior dislocation of left radial head', 'Q72.819': 'Congenital shortening of unspecified lower limb', 'G44.82': 'Headache associated with sexual activity', 'M93.2': 'Osteochondritis dissecans', 'V44.6': 'Car passenger injured in collision with heavy transport vehicle or bus in traffic accident', 'O90.89': 'Other complications of the puerperium, not elsewhere classified', 'T83.518': 'Infection and inflammatory reaction due to other urinary catheter', 'Z02.9': 'Encounter for administrative examinations, unspecified', 'S55.091': 'Other specified injury of ulnar artery at forearm level, right arm'}
Each character of the string must be replaced by a randomly chosen Hippocrates-code among all the codes that encode that character. Each code in the result consists of the ICD code whose description contains the character, followed by the index of the character in that description.
So this is the kind of answer that I am supposed to get:
A66.0 M62.51.29 V44.6.68 H95.3.70 M08.06.26 S51.031.39 V92.13.17 V95.49.25 P07.32.46 C11.0.44 V04.45 E13.41.30 G21.8.5 R00.0.4 V52.2.54 B67.1.38 V24.0.43 M01.X72.10 C74.92.35 G72.49.35 Z68.41.24
And this is the answer that I got:
F44.6.4 S78.922.3 W36.1.17 S93.121.2 E10.32.39 A00.1.12 S90.464.3 T37.1X.9 T43.2.17 W24.0.3 Q60.3.5 V59.9.14 S66.911.5 W93.42 V14.1.34 Y92.139.14 T21.06.12 T65.89.6 Q95.3.4 S85.161.16 S93.121.7 T37.1X.18 V49.60.23 T37.1X5.7 F98.29.16 J10.89.14
To get that, I wrote code like this:
import re
import random
class Hippocrates:
def __init__(self, code):
self.code = code
def description(self, x):
line_list = []
split_point = []
k = []
v = []
with open(self.code) as f:
for line in f:
for i in line:
if i == " ":
split_point.append(line.find(i))
with open(self.code) as f:
for line in f:
line_list.append(line.rstrip())
for i in line_list:
a = i.split(" ", 1)
k.append(a[0])
v.append(a[1])
d = dict(zip(k, v))
for key, value in d.items():
if x == key:
return d[key]
else:
raise ValueError('invalid ICD-code')
def character(self, numb):
line_list = []
split_point = []
k = []
v = []
with open(self.code) as f:
for line in f:
for i in line:
if i == " ":
split_point.append(line.find(i))
with open(self.code) as f:
for line in f:
line_list.append(line.rstrip())
for i in line_list:
a = i.split(" ", 1)
k.append(a[0])
v.append(a[1])
d = dict(zip(k, v))
rev = numb[::-1]
revs = rev.split('.',1)
r1 =(revs[1][::-1])
r2 = (revs[0][::-1])
for key, value in d.items():
if r1 == key:
answer = d[key]
result = answer[int(r2)]
return result
else:
raise ValueError('invalid Hippocrates-code')
def codes(self, char):
line_list = []
split_point = []
k = []
v = []
r_v = []
code_result = []
des_result = []
des_result2 = []
location = []
final = []
with open(self.code) as f:
for line in f:
for i in line:
if i == " ":
split_point.append(line.find(i))
with open(self.code) as f:
for line in f:
line_list.append(line.rstrip())
for i in line_list:
a = i.split(" ", 1)
k.append(a[0])
v.append(a[1])
d = dict(zip(k, v))
for i in v:
for x in i:
if x == char:
r_v.append(i)
for key, value in d.items():
for i in r_v:
if i == value:
code_result.append(key)
for key in d.keys():
for i in code_result:
if i == key:
des_result.append(d[i])
for i in des_result:
if i not in des_result2:
des_result2.append(i)
for i in des_result2:
regex = re.escape(char)
a = [m.start() for m in re.finditer(regex,i)]
location.append(a)
location = (sum(location,[]))
for i in range(len(code_result)):
answer = (str(code_result[i]) +'.'+ str(location[i]))
final.append(answer)
return (set(final))
# Encode `plaintxt` into a space-separated string of "<code>.<index>" tokens,
# using the code/description file at self.code.
# NOTE(review): indentation was lost in this listing, so the exact nesting of
# the loops and of the debug print() calls cannot be confirmed from here.
# NOTE(review): candidates are gathered with the dictionary as the outer loop
# and the text as the inner loop, and characters absent from a description are
# skipped, so the stride slicing r[b::a] below presumably does not line up with
# the characters of `plaintxt` -- likely the cause of the wrong output order;
# confirm.
def encode(self, plaintxt):
line_list = []
# collected below but never read afterwards
split_point = []
# keys of the dictionary
k = []
# values of the dictionary
v = []
# "description:index" strings for descriptions that contain a character
r = []
# lists of possible choices
t = []
# randomly chosen results from t
li_di = []
# descriptions
des = []
# index of the character in its description
index_char = []
# answer to print
resul = []
# never used
dictlist = []
answers = []
with open(self.code) as f:
for line in f:
for i in line:
if i == " ":
split_point.append(line.find(i))
with open(self.code) as f:
for line in f:
line_list.append(line.rstrip())
# split every line at its first space into code (k) and description (v)
for i in line_list:
a = i.split(" ", 1)
k.append(a[0])
v.append(a[1])
d = dict(zip(k, v))
print(d)
for key, value in d.items():
for i in plaintxt:
if i in value:
# str.index gives the first occurrence only
answer = d[key] +':'+ str(d[key].index(i))
r.append(answer)
print(r)
a = len(plaintxt)
b=0
# take every a-th candidate starting at offset b, for b = 0 .. len(plaintxt)-1
for i in range(len(r)):
t.append(r[b::a])
b+=1
if b == len(plaintxt):
break
for i in t:
li_di.append(random.choice(i))
for i in li_di:
# NOTE(review): splits at the first ':'; a description that itself contains ':' would be cut short -- confirm
sep = i.split(":", 1)
des.append(sep[0])
index_char.append(sep[1])
print(index_char)
# map each chosen description back to every code that has that description
for i in des:
for key, value in d.items():
if i == value:
resul.append(key)
print(resul)
for i in range(len(resul)):
answers.append(resul[i]+'.'+index_char[i]+'')
return(" ".join(answers))
The codes that represent the characters of the given string should be in the same order as the characters of the original string, but my encode method mixes up the order. How can I fix this?

This should work for your encode function:
def encode(self, plaintxt):
    """Encode ``plaintxt`` as ``<code>.<index>`` tokens, in input order.

    Each non-blank line of the file at ``self.code`` is split at its first
    space into a code and a description. For every character of
    ``plaintxt`` one occurrence of that character in any description is
    picked at random and emitted as ``"<code>.<position>"``.

    Args:
        plaintxt: the text to encode.

    Returns:
        The tokens joined by single spaces (``""`` for empty input).

    Raises:
        ValueError: if a character occurs in no description.
    """
    code_map = {}
    with open(self.code) as f:
        for line in f:
            line = line.rstrip()
            if not line:
                # Blank lines used to raise IndexError when split.
                continue
            key, _, text = line.partition(' ')
            code_map[key] = text

    # Remember the candidate list per distinct character so repeated
    # characters do not rescan every description.
    matches_by_char = {}
    codes = []
    for ch in plaintxt:
        matches = matches_by_char.get(ch)
        if matches is None:
            matches = [
                (key, pos)
                for key, text in code_map.items()
                for pos, current in enumerate(text)
                if current == ch
            ]
            matches_by_char[ch] = matches
        if not matches:
            raise ValueError(f'Character {ch} cannot be encoded as there are no matches')
        code, idx = random.choice(matches)
        codes.append(f'{code}.{idx}')
    return ' '.join(codes)
Edit: I updated this to make it more space-efficient, by getting rid of char_map and appending codes as it goes
First, it creates a dict of keys as codes and values as the corresponding strings. Then it iterates through the given plaintxt string, and searches all of the values of the dict for matches (including multiple matches in a single value), and adds this to a matches list of tuples, where each tuple contains a suitable code and the index of the match. If there are no matches, it raises a ValueError as soon as it runs into an issue. It chooses randomly from each list of tuples to choose some code and index pair, and appends this to a list on the fly, and then at the end it joins this list to make your encoded string.

If memory is not a problem, I think you should build an index of possible choices of each character from the dictionary. Here is an example code:
import random
def build_char_codes(d):
    """Index every character of every description in ``d``.

    Args:
        d: mapping of code -> description string.

    Returns:
        A nested dict ``{character: {code: [positions, ...]}}`` listing, for
        each character, the positions at which it occurs in each
        description. Positions are in ascending order.
    """
    result = {}
    for key, val in d.items():
        for i, ch in enumerate(val):
            # Append in place instead of rebuilding the list on every hit.
            result.setdefault(ch, {}).setdefault(key, []).append(i)
    return result
def get_code(ch, char_codes):
    """Pick a random ``<code>.<position>`` token for the character ``ch``.

    A code is chosen uniformly among the codes whose description contains
    ``ch``; then a position is chosen uniformly among that code's positions.

    Args:
        ch: the character to encode.
        char_codes: nested dict ``{character: {code: [positions, ...]}}``.

    Returns:
        The token as a string.

    Raises:
        KeyError: if ``ch`` has no entry in ``char_codes``.
    """
    positions_by_key = char_codes[ch]
    # random.sample() rejects dict views on Python 3.11+, so choose from a
    # list of the keys instead.
    key = random.choice(list(positions_by_key))
    char_pos = random.choice(positions_by_key[key])
    return '{}.{}'.format(key, char_pos)
# Example usage.
# NOTE(review): `dictionary` is not defined in this snippet; presumably it is
# the code -> description mapping -- confirm.
char_codes = build_char_codes(dictionary)
given_string = 'You are what you eat.'
# One token per character, in the order of the characters of given_string.
codes = [get_code(ch, char_codes) for ch in given_string]
print(' '.join(codes))
Notes:
char_codes index all possible choices of each character in the dictionary
It first samples a key (code) uniformly at random among the codes that contain the character, and then samples a position within that code's description uniformly at random. Note that this is not the same as sampling uniformly among all possible choices for a character.

In preparation for the transformation, you could create a dictionary with each letter in the ICD description mapping to a list of codes that contain it at various indexes.
Then, the transformation process would simply be a matter of picking one of the code.index from the entry in the dictionary for each letter in the given string:
preparation ...
# Read the file as [code, description] pairs. Rows without a description
# (blank lines, or the empty string left after the final newline) are
# dropped because they cannot be unpacked into two values below.
with open(fileName, 'r') as f:
    rows = (line.split(" ", 1) for line in f.read().split("\n"))
    icd = [row for row in rows if len(row) == 2]

# For each letter, the list of "code.index" tokens that can encode it.
icdLetters = dict()
for code, description in icd:
    for i, letter in enumerate(description):
        icdLetters.setdefault(letter, []).append(f"{code}.{i}")
transformation....
import random
given_string = 'You are what you eat.'
# One randomly chosen "code.index" token per character; a character with no
# entry in icdLetters becomes "-".
result = [ random.choice(icdLetters.get(c,["-"])) for c in given_string ]
output:
print(result)
['A66.0', 'T80.22.35', 'S53.136.34', 'C40.90.33', 'S53.136.43', 'Z96.621.12', 'B57.30.24', 'H59.121.55', 'V14.1.43', 'S93.121.47', 'H59.121.9', 'V04.92.17', 'T80.22.80', 'O16.1.22', 'T25.61.10', 'S53.136.34', 'F44.6.32', 'M67.232.29', 'M89.771.34', 'S93.121.7', 'Z68.36.29']
If you want to save some memory, your dictionary could store indexes in the main list of icd codes and descriptions instead of the formatted values:
import random

# Read the file as [code, description] pairs. Rows without a description
# (blank lines, or the empty string left after the final newline) are
# dropped because they cannot be unpacked into two values below.
with open(fileName, 'r') as f:
    rows = (line.split(" ", 1) for line in f.read().split("\n"))
    icd = [row for row in rows if len(row) == 2]

# For each letter, the (row index in icd, index in description) pairs at
# which it occurs; storing indexes avoids keeping formatted strings.
icdLetters = dict()
for codeIndex, (code, description) in enumerate(icd):
    for letterIndex, letter in enumerate(description):
        icdLetters.setdefault(letter, []).append((codeIndex, letterIndex))


def letterToCode(letter):
    """Return a random "code.index" token for ``letter``, or "-" if none exists."""
    if letter not in icdLetters:
        return "-"
    codeIndex, letterIndex = random.choice(icdLetters[letter])
    return f"{icd[codeIndex][0]}.{letterIndex}"


given_string = 'You are what you eat.'
result = [letterToCode(c) for c in given_string]

Matching states and cities with possibly multiple words

I have a Python list like the following elements:
['Alabama[edit]',
'Auburn (Auburn University)[1]',
'Florence (University of North Alabama)',
'Jacksonville (Jacksonville State University)[2]',
'Livingston (University of West Alabama)[2]',
'Montevallo (University of Montevallo)[2]',
'Troy (Troy University)[2]',
'Tuscaloosa (University of Alabama, Stillman College, Shelton State)[3][4]',
'Tuskegee (Tuskegee University)[5]',
'Alaska[edit]',
'Fairbanks (University of Alaska Fairbanks)[2]',
'Arizona[edit]',
'Flagstaff (Northern Arizona University)[6]',
'Tempe (Arizona State University)',
'Tucson (University of Arizona)',
'Arkansas[edit]',
'Arkadelphia (Henderson State University, Ouachita Baptist University)[2]',
'Conway (Central Baptist College, Hendrix College, University of Central Arkansas)[2]',
'Fayetteville (University of Arkansas)[7]']
The list is not complete, but is sufficient to give you an idea of what's in it.
The data is structured like this:
There is a name of a US state and following the state name, there are some names of cities IN THAT STATE. The state name, as you can see ends in "[edit]", and the cities' name either end in a bracket with a number (for example "1", or "[2]"), or with a university's name within parenthesis (for example "(University of North Alabama)").
(Find the full reference file for this problem here)
I ideally want a Python dictionary with the state names as the index, and all the cities' names in that state in a nested listed as a value to that particular index. So, for example the dictionary should be like:
{'Alabama': ['Auburn', 'Florence', 'Jacksonville'...], 'Arizona': ['Flagstaff', 'Temple', 'Tucson', ....], ......}
Now, I tried the following solution, to weed out the unnecessary parts:
import numpy as np
import pandas as pd
# NOTE(review): indentation was lost in this listing; the nesting described in
# the notes below is the presumed one -- confirm.
# NOTE(review): the function returns `ftext` (the raw lines), not the `dic` it
# builds and not the DataFrame its docstring describes.
def get_list_of_university_towns():
'''
Returns a DataFrame of towns and the states they are in from the
university_towns.txt list. The format of the DataFrame should be:
DataFrame( [ ["Michigan", "Ann Arbor"], ["Michigan", "Yipsilanti"] ],
columns=["State", "RegionName"] )
The following cleaning needs to be done:
1. For "State", removing characters from "[" to the end.
2. For "RegionName", when applicable, removing every character from " (" to the end.
3. Depending on how you read the data, you may need to remove newline character '\n'.
'''
# NOTE(review): the file handle is opened without `with` and never closed here.
fhandle = open("university_towns.txt")
ftext = fhandle.read().split("\n")
reftext = list()
for item in ftext:
# Keeps only the text before the first space, so multi-word names are cut to
# their first word; a multi-word state header then no longer ends with
# "[edit]" and is treated as a town.
reftext.append(item.split(" ")[0])
#pos = reftext[0].find("[")
#reftext[0] = reftext[0][:pos]
towns = list()
dic = dict()
for item in reftext:
# The first state is special-cased by its literal header text.
if item == "Alabama[edit]":
state = "Alabama"
elif item.endswith("[edit]"):
# A new header stores the towns gathered for the previous state.
# NOTE(review): nothing stores the towns of the last state after the loop.
dic[state] = towns
towns = list()
pos = item.find("[")
item = item[:pos]
state = item
else:
towns.append(item)
return ftext
get_list_of_university_towns()
A snippet of my output generated by my code looks like this:
{'Alabama': ['Auburn',
'Florence',
'Jacksonville',
'Livingston',
'Montevallo',
'Troy',
'Tuscaloosa',
'Tuskegee'],
'Alaska': ['Fairbanks'],
'Arizona': ['Flagstaff', 'Tempe', 'Tucson'],
'Arkansas': ['Arkadelphia',
'Conway',
'Fayetteville',
'Jonesboro',
'Magnolia',
'Monticello',
'Russellville',
'Searcy'],
'California': ['Angwin',
'Arcata',
'Berkeley',
'Chico',
'Claremont',
'Cotati',
'Davis',
'Irvine',
'Isla',
'University',
'Merced',
'Orange',
'Palo',
'Pomona',
'Redlands',
'Riverside',
'Sacramento',
'University',
'San',
'San',
'Santa',
'Santa',
'Turlock',
'Westwood,',
'Whittier'],
'Colorado': ['Alamosa',
'Boulder',
'Durango',
'Fort',
'Golden',
'Grand',
'Greeley',
'Gunnison',
'Pueblo,'],
'Connecticut': ['Fairfield',
'Middletown',
'New',
'New',
'New',
'Storrs',
'Willimantic'],
'Delaware': ['Dover', 'Newark'],
'Florida': ['Ave',
'Boca',
'Coral',
'DeLand',
'Estero',
'Gainesville',
'Orlando',
'Sarasota',
'St.',
'St.',
'Tallahassee',
'Tampa'],
'Georgia': ['Albany',
'Athens',
'Atlanta',
'Carrollton',
'Demorest',
'Fort',
'Kennesaw',
'Milledgeville',
'Mount',
'Oxford',
'Rome',
'Savannah',
'Statesboro',
'Valdosta',
'Waleska',
'Young'],
'Hawaii': ['Manoa'],
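But there is one error in the output: states with a space in their names (e.g. "North Carolina") are not included, and multi-word city names are cut off after the first word. I can see the reason behind it: I split each line on spaces and keep only the first word.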
I thought of using regular expressions, but since I have yet to study about them, I do not know how to form one. Any ideas as to how it could be done with or without the use of Regex?

Praise the power of regular expressions then:
# One match per state: the header text before "[edit]" plus everything up to
# the next header line (or the end of the text).
states_rx = re.compile(r'''
^
(?P<state>.+?)\[edit\]
(?P<cities>[\s\S]+?)
(?=^.*\[edit\]$|\Z)
''', re.MULTILINE | re.VERBOSE)
# The leading part of a line, up to the first parenthesis.
cities_rx = re.compile(r'''^[^()\n]+''', re.MULTILINE)
transformed = '\n'.join(lst_)
result = {}
for state in states_rx.finditer(transformed):
    towns = []
    for city in cities_rx.finditer(state.group('cities')):
        # The match may end in whitespace in front of "(", hence rstrip.
        towns.append(city.group(0).rstrip())
    result[state.group('state')] = towns
print(result)
This yields
{'Alabama': ['Auburn', 'Florence', 'Jacksonville', 'Livingston', 'Montevallo', 'Troy', 'Tuscaloosa', 'Tuskegee'], 'Alaska': ['Fairbanks'], 'Arizona': ['Flagstaff', 'Tempe', 'Tucson'], 'Arkansas': ['Arkadelphia', 'Conway', 'Fayetteville']}
Explanation:
The idea is to split the task up into several smaller tasks:
Join the complete list with \n
Separate states
Separate towns
Use a dict comprehension for all found items
First subtask
transformed = '\n'.join(your_list)
Second subtask
^ # match start of the line
(?P<state>.+?)\[edit\] # capture anything in that line up to [edit]
(?P<cities>[\s\S]+?) # afterwards match anything up to
(?=^.*\[edit\]$|\Z) # ... either another state or the very end of the string
See the demo on regex101.com.
Third subtask
^[^()\n]+ # match start of the line, anything not a newline character or ( or )
See another demo on regex101.com.
Fourth subtask
result = {state.group('state'): [city.group(0).rstrip() for city in cities_rx.finditer(state.group('cities'))] for state in states_rx.finditer(transformed)}
This is roughly equivalent to:
for state in states_rx.finditer(transformed):
# state is in state.group('state')
for city in cities_rx.finditer(state.group('cities')):
# city is in city.group(0), possibly with whitespaces
# hence the rstrip
Lastly, some timing issues:
import timeit
# NOTE(review): `findstatesandcities` is not defined in this snippet;
# presumably it wraps the regex code shown above in a function -- confirm.
print(timeit.timeit(findstatesandcities, number=10**5))
# 12.234304904000965
So running the above a 100.000 times took me round 12 seconds on my computer, so it should be reasonably fast.

You [c/sh]ould change
fhandle = open("university_towns.txt")
ftext = fhandle.read().split("\n")
# to
with open("university_towns.txt","r") as f:
d = f.readlines()
# file is autoclosed here, lines are autosplit by readlines()
No regex solution:
def save(state, city, dic):
    '''Make sure `state` has a list in `dic` and add `city` to it.

    Passing an empty `city` only creates the (possibly empty) entry for
    `state`. A non-empty `city` is appended even when the state is new;
    previously it was dropped in that case, and empty names were appended
    to existing states.
    '''
    cities = dic.setdefault(state, [])
    if city:
        cities.append(city)
dic = {}
state = ""
with open("university_towns.txt", "r") as f:
    for n in f:  # iterate all lines
        if "[edit]" in n:  # handles states
            state = n.replace("[edit]", "").strip()  # clean up state
            # needed in case 2 states w/o cities follow right after each other
            save(state, "", dic)  # create state in dic, no cities
        else:
            # splits at ( takes first and splits at [ takes first removes blanks
            # => get city name before ( or [
            city = n.split("(")[0].split("[")[0].strip()
            save(state, city, dic)  # adds city to state in dic
print(dic)
Yields (re-formatted):
{
'Alabama' : ['Auburn', 'Florence', 'Jacksonville', 'Livingston',
'Montevallo', 'Troy', 'Tuscaloosa', 'Tuskegee'],
'Alaska' : ['Fairbanks'],
'Arizona' : ['Flagstaff', 'Tempe', 'Tucson'],
'Arkansas': ['Arkadelphia', 'Conway', 'Fayetteville']
}

Let's solve your problem step by step :
First step:
Collect all the data. Whenever a state name appears, I first insert a marker word, 'pos_flag', in front of it; with the help of this marker we can later find where each state starts and split the list into chunks:
import re
# Matches the word directly in front of "[edit]", which marks a state line.
# Raw string: "\w" and "\[" are not valid escapes in a normal string literal.
pattern = r'\w+(?=\[edit\])'
track = []
with open('mon.txt', 'r') as f:
    for line in f:
        match = re.search(pattern, line)
        if match:
            # Marker in front of every state so the list can be chunked later.
            track.append('pos_flag')
            track.append(line.strip().split('[')[0])
        else:
            track.append(line.strip().split('(')[0])
it will give something like this output:
['pos_flag', 'Alabama', 'Auburn ', 'Florence ', 'Jacksonville ', 'Livingston ', 'Montevallo ', 'Troy ', 'Tuscaloosa ', 'Tuskegee ', 'pos_flag', 'Alaska', 'Fairbanks ', 'pos_flag', 'Arizona', 'Flagstaff ', 'Tempe ', 'Tucson ', 'pos_flag', 'Arkansas', 'Arkadelphia ', 'Conway ', 'Fayetteville ', 'Jonesboro ', 'Magnolia ', 'Monticello ', 'Russellville ', 'Searcy ', 'pos_flag',
As you can see before every state name there is a word 'pos_flag' now let's use this word and do some stuff:
Second step:
Track the index of all the 'pos_flag words' in list:
# Positions of every 'pos_flag' marker, i.e. where each state's chunk begins.
index_no = [index for index, value in enumerate(track) if value == 'pos_flag']
This will give output something like this :
[0, 10, 13, 18, 28, 55, 66, 75, 79, 93, 111, 114, 119, 131, 146, 161, 169, 182, 192, 203, 215, 236, 258, 274, 281, 292, 297, 306, 310, 319, 331, 338, 371, 391, 395, 419, 432, 444, 489, 493, 506, 512, 527, 551, 559, 567, 581, 588, 599, 614]
We now have the index numbers, and we can split the list into chunks at these positions:
Last step:
chunk the list with using index no and set first word as dict key and rest of as dict values:
city_dict = {}
# Pair every marker position with the next one; the last chunk runs to the
# end of track. Each chunk looks like ['pos_flag', state, city, city, ...].
for start, end in zip(index_no, index_no[1:] + [len(track)]):
    chunk = track[start:end]
    # Cities start at index 2 for every chunk, including the last one (the
    # old fallback sliced from 1 and so listed the state among its cities).
    city_dict[chunk[1]] = chunk[2:]
print(city_dict)
output:
since dict are not ordered in python 3.5 so order of output is different from input file :
{'Kentucky': ['Bowling Green ', 'Columbia ', 'Georgetown ', 'Highland Heights ', 'Lexington ', 'Louisville ', 'Morehead ', 'Murray ', 'Richmond ', 'Williamsburg ', 'Wilmore '], 'Mississippi': ['Cleveland ', 'Hattiesburg ', 'Itta Bena ', 'Oxford ', 'Starkville '], 'Wisconsin': ['Appleton ', 'Eau Claire ', 'Green Bay ', 'La Crosse ', 'Madison ', 'Menomonie ', 'Milwaukee ',
full_code:
import re
# Matches the word directly in front of "[edit]", which marks a state line.
# Raw string: "\w" and "\[" are not valid escapes in a normal string literal.
pattern = r'\w+(?=\[edit\])'

# Flat list of the form ['pos_flag', state, city, ..., 'pos_flag', state, ...].
track = []
with open('mon.txt', 'r') as f:
    for line in f:
        if re.search(pattern, line):
            track.append('pos_flag')
            track.append(line.strip().split('[')[0])
        else:
            track.append(line.strip().split('(')[0])

# Positions of every marker, i.e. where each state's chunk begins.
index_no = [index for index, value in enumerate(track) if value == 'pos_flag']

city_dict = {}
# Pair every marker position with the next one; the last chunk runs to the
# end of track.
for start, end in zip(index_no, index_no[1:] + [len(track)]):
    chunk = track[start:end]
    # Cities start at index 2 for every chunk, including the last one (the
    # old fallback sliced from 1 and so listed the state among its cities).
    city_dict[chunk[1]] = chunk[2:]
print(city_dict)
Second solution:
If you want to use regex then try this small solution :
import re
# One match per state: a header line "<name>[edit]" followed by everything up
# to the next header line. The header name may contain spaces, so it is
# matched as "anything but '[' or a newline" from the start of the line.
pattern = r'(^[^\[\n]+\[edit\](?:(?!^[^\[\n]+\[edit\]).)*)'
with open('file.txt', 'r') as f:
    prt = re.finditer(pattern, f.read(), re.DOTALL | re.MULTILINE)
    for line in prt:
        header, *rows = line.group(1).split('\n')
        # Skip blank rows instead of always dropping the last one, which lost
        # the final city when the text did not end with a newline.
        dict_p = {
            header.strip().split('[')[0]:
                [row.split('(')[0].strip() for row in rows if row.strip()]
        }
        print(dict_p)
it will give:
{'Alabama': ['Auburn', 'Florence', 'Jacksonville', 'Livingston', 'Montevallo', 'Troy', 'Tuscaloosa', 'Tuskegee']}
{'Alaska': ['Fairbanks']}
{'Arizona': ['Flagstaff', 'Tempe', 'Tucson']}
{'Arkansas': ['Arkadelphia', 'Conway', 'Fayetteville', 'Jonesboro', 'Magnolia', 'Monticello', 'Russellville', 'Searcy']}
{'California': ['Angwin', 'Arcata', 'Berkeley', 'Chico', 'Claremont', 'Cotati', 'Davis', 'Irvine', 'Isla Vista', 'University Park, Los Angeles', 'Merced', 'Orange', 'Palo Alto', 'Pomona', 'Redlands', 'Riverside', 'Sacramento', 'University District, San Bernardino', 'San Diego', 'San Luis Obispo', 'Santa Barbara', 'Santa Cruz', 'Turlock', 'Westwood, Los Angeles', 'Whittier']}
{'Colorado': ['Alamosa', 'Boulder', 'Durango', 'Fort Collins', 'Golden', 'Grand Junction', 'Greeley', 'Gunnison', 'Pueblo, Colorado']}
demo :

I tried to eliminate the need for more than one regex.
import re
def mkdict(data):
    """Build ``{state: [town, ...]}`` from the raw text of the towns list.

    A line containing ``[edit]`` starts a new state; its name is the text
    in front of the last ``[edit]`` on that line. Any other non-empty line
    contributes the text in front of its first ``(``, ``[`` or ``:`` as a
    town of the current state.

    Args:
        data: the whole list as one string, one entry per line.

    Returns:
        Dict mapping each state name to the list of its town names.

    Raises:
        KeyError: if a town line appears before the first state header.
    """
    # The name is captured without the "[edit]" suffix. Removing the suffix
    # with rstrip('[edit]') strips a *set of characters* and so also ate
    # trailing letters of names such as "Connecticut".
    rx = re.compile(r'^(?:(.+)\[edit\]|([^\(\[\n:]+))', re.M)
    state, result = None, {}
    for m in rx.finditer(data):
        header, town = m.groups()
        if header is not None:
            state = header
            result[state] = []
        else:
            result[state].append(town.rstrip())
    return result


if __name__ == '__main__':
    import functools
    import sys
    import timeit
    data = sys.stdin.read()
    print(timeit.Timer(functools.partial(mkdict, data)).timeit(10**3))
    print(mkdict(data))
Try it online.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python: Parse a list of strings into a dictionnary - python

Related

How to create a nested dictionary from a text file

Text Preprocessing Translation Error Python

Creating a vocabulary game

Make sentence from value of dictionary

Matching states and cities with possibly multiple words

Categories

Resources