Python Iterating New Data into nested dictionary - python

I have been working on a Python Role-Playing game and I have a function to import item data from a text file. The text file is structured as follows:
WEAPON 3 sword_of_eventual_obsolescence 6 10 2 0 10
WEAPON 4 dagger_of_bluntness 2 5 3 1 0
WEAPON 5 sword_of_extreme_flimsiness 3 8 3 7 0
The data importing goes like this:
def items_get():
import os
global items
items = {
"weapon":{},
"armour":{},
"potion":{},
"misc":{}
}
file_dir = ( os.getcwd() + '\Code\items.txt' )
file_in = open( file_dir, 'r')
for each_line in file_in:
line = file_in.readline()
line = line.split(' ')
if line[0] == "WEAPON":
weapon_id = line[1]
name = line[2]
attack_min = line[3]
attack_max = line[4]
range = line[5]
weight = line[6]
value = line[7]
weapon_data = {
"name": name.replace('_', ' '),
"atk_min": attack_min,
"atk_max": attack_max,
"rng": range,
"wt": weight,
"val": value,
}
items["weapon"][weapon_id] = {}
items["weapon"][weapon_id].update(weapon_data)
However, when I print items["weapon"], I get this:
{'4': {'wt': '1', 'atk_min': '2', 'atk_max': '5', 'val': '0', 'name': 'dagger of bluntness', 'rng': '3'}}
As you can see, there is only 1 item there. On other occasions I have had two even though I actually have 3 items listed. Why is this happening, and how do I get all 3 items in the dictionary?
Thanks!
:P
EDIT: Here is the data for the potions, in case you were wondering.
elif line.split()[0] == "POTION":
_, id, name, hp_bonus, atk_bonus, range_bonus, ac_bonus, str_bonus, con_bonus, dex_bonus, int_bonus, wis_bonus, cha_bonus, wt, val = line.split()
A healing potion looks like this in the file:
POTION 1 potion_of_healing 20 0 0 0 0 0 0 0 0 0 0.1 2

for each_line in file_in:
line = file_in.readline()
each_line already contains the next line, because iterating through a file-like object (say, with a for loop) causes it to go by lines.
On each iteration of the loop, the file pointer is advanced by one line (file-like objects, though rewindable, keep track of their last-accessed position), and then before anything is done it gets advanced once more by the readline(), so the only line that doesn't get skipped entirely is the middle one (4).
To fix this, use the loop variable (each_line) within the loop body directly and nix the file_in.readline().

#noname1014, I know you know this but I want to point out few of the problems with your code (that may occur in some special cases, e.g if you change your file name items.txt to new_items.txt, rare_fruits.txt etc.) and some suggestions.
Do not use \ as path separators in Windows. Use \\ otherwise you may get into problems. \Code\time_items.txt will be evaluated as \Code imeitems.txt because \t is TAB here.
Using \ only works in few cases if \ followed by any character A, p, n, t, ", ' etc. does not construct escape sequences like \n, \t, \f, \r, \b etc.
Have a look at the below example for clarification.
>>> import os
>>>
>>> print(os.getcwd() + '\Code\timeitems.txt')
E:\Users\Rishikesh\Python3\Practice\Code imeitems.txt
>>>
>>> print(os.getcwd() + '\Code\\timeitems.txt')
E:\Users\Rishikesh\Python3\Practice\Code\timeitems.txt
>>>
>>> print(os.getcwd() + '\Code\newitems.txt')
E:\Users\Rishikesh\Python3\Practice\Code
ewitems.txt
>>>
>>> print(os.getcwd() + '\\Code\\newitems.txt')
E:\Users\Rishikesh\Python3\Practice\Code\newitems.txt
>>>
>>> # Do not use it as it may work only in some cases if \ followed by any character does not construct escape sequences.
...
>>> os.getcwd() + '\Code\items.txt'
'E:\\Users\\Rishikesh\\Python3\\Practice\\Code\\items.txt'
>>>
>>> # Use \\ as path separators
...
>>> os.getcwd() + '\\Code\\items.txt'
'E:\\Users\\Rishikesh\\Python3\\Practice\\Code\\items.txt'
>>>
>>> print(os.getcwd() + '\Code\items.txt')
E:\Users\Rishikesh\Python3\Practice\Code\items.txt
>>>
>>> print(os.getcwd() + '\\Code\\items.txt')
E:\Users\Rishikesh\Python3\Practice\Code\items.txt
>>>
If your dictionay is huge and you are facing any issue while looking into its items, pretty it using json module, it has a function called dumps() which is used to pretty print list and dictionary objects.
It is ok to place import statements inside function but placing it on the top is a Pythonic way (https://www.python.org/dev/peps/pep-0008/#imports). It is good for large applications with multiple functions in the same module.
Use with statement for opening files, in this case you do not need to close files.
Your code is fine, I have just modified it as below.
import os
global items
import json
def items_get():
items = {
"weapon":{},
"armour":{},
"potion":{},
"misc":{}
}
# Do not use \ as path separators in Windows. Use \\ (\t, \n, \' have speacial meanings)
file_dir = ( os.getcwd() + '\\Code\\items.txt' )
with open( file_dir, 'r') as file_in:
lines = file_in.readlines();
# ['WEAPON 3 sword_of_eventual_obsolescence 6 10 2 0 10\n', 'WEAPON 4 dagger_of_bluntness 2 5 3 1 0\n', 'WEAPON 5 sword_of_extreme_flimsiness 3 8 3 7 0']
for each_line in lines:
# Use strip() to remove any leading/trailing whitespaces (\n, \t, spaces etc.)
line = each_line.strip().split(' ');
if line[0] == "WEAPON":
weapon_id = line[1]
name = line[2]
attack_min = line[3]
attack_max = line[4]
range = line[5]
weight = line[6]
value = line[7]
weapon_data = {
"name": name.replace('_', ' '),
"atk_min": attack_min,
"atk_max": attack_max,
"rng": range,
"wt": weight,
"val": value,
}
items["weapon"][weapon_id] = {}
items["weapon"][weapon_id].update(weapon_data)
return items
# Calling items_get() to get dictionary
items = items_get();
# Pretty printing dictionary using json.dumps()
print(json.dumps(items, indent=4))
» Output
{
"weapon": {
"3": {
"name": "sword of eventual obsolescence",
"atk_min": "6",
"atk_max": "10",
"rng": "2",
"wt": "0",
"val": "10"
},
"4": {
"name": "dagger of bluntness",
"atk_min": "2",
"atk_max": "5",
"rng": "3",
"wt": "1",
"val": "0"
},
"5": {
"name": "sword of extreme flimsiness",
"atk_min": "3",
"atk_max": "8",
"rng": "3",
"wt": "7",
"val": "0"
}
},
"armour": {},
"potion": {},
"misc": {}
}

Related

Print specific lines from a text file in python

I have a text file which I'm trying to print the lines that don't start with a number but the output is not readable.
This is a part of what my code returns:
{\rtf1\ansi\ansicpg1252\cocoartf1671\cocoasubrtf600
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;\red4\green4\blue4;\red247\green247\blue247;\red0\green0\blue0;
\red255\green255\blue255;\red77\green77\blue77;}
{\*\expandedcolortbl;;\cssrgb\c1176\c1176\c1176;\cssrgb\c97647\c97647\c97647;\cssrgb\c0\c0\c0;
\cssrgb\c100000\c100000\c100000;\cssrgb\c37647\c37647\c37647;}
\paperw11900\paperh16840\margl1440\margr1440\vieww10800\viewh8400\viewkind0
\deftab720
\pard\pardeftab720\partightenfactor0
\f0\fs26 \cf2 \cb3 Since the start of digital video in 1988, new video formats are developed every year\cf4 \cb5 \
\pard\pardeftab720\partightenfactor0
\cf6 \cb3 00:14\cb5 \
And this is my code:
numbers = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")
aFile = open("/Users/maira/Desktop/text.rtf")
lines = aFile.readlines()
for line in lines:
if not line.startswith((numbers)):
print(line)
aFile.close()
This is an example of the original text:
Since the start of digital video in 1988, new video formats are developed every year
00:14
in an attempt to provide improvements in quality, file size and video playback.
00:18
The popularity of video continues to grow rapidly, with 78% of people watching at least
00:24
one digital video on one of their devices every single day; However video formats and
00:29
how they work is still a subject of much confusion for most people.
I've seen some questions similar to mine but I can't get to a solution.
I appreciate any advices and if there's also a way of deleting the blank lines in between lines, I'd be very thankful.
Thank you.
I used a very complete function provided on this answer to strip all the rtf text, and after that i use a regex to remove the format numbers HH:MM. Maybe this will help you.
def striprtf(text):
pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
# control words which specify a "destionation".
destinations = frozenset((
'aftncn','aftnsep','aftnsepc','annotation','atnauthor','atndate','atnicn','atnid',
'atnparent','atnref','atntime','atrfend','atrfstart','author','background',
'bkmkend','bkmkstart','blipuid','buptim','category','colorschememapping',
'colortbl','comment','company','creatim','datafield','datastore','defchp','defpap',
'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt',
'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl',
'ffname','ffstattext','field','file','filetbl','fldinst','fldrslt','fldtype',
'fname','fontemb','fontfile','fonttbl','footer','footerf','footerl','footerr',
'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl',
'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc',
'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers',
'leveltext','lfolevel','linkval','list','listlevel','listname','listoverride',
'listoverridetable','listpicture','liststylename','listtable','listtext',
'lsdlockedexcept','macc','maccPr','mailmerge','maln','malnScr','manager','margPr',
'mbar','mbarPr','mbaseJc','mbegChr','mborderBox','mborderBoxPr','mbox','mboxPr',
'mchr','mcount','mctrlPr','md','mdeg','mdegHide','mden','mdiff','mdPr','me',
'mendChr','meqArr','meqArrPr','mf','mfName','mfPr','mfunc','mfuncPr','mgroupChr',
'mgroupChrPr','mgrow','mhideBot','mhideLeft','mhideRight','mhideTop','mhtmltag',
'mlim','mlimloc','mlimlow','mlimlowPr','mlimupp','mlimuppPr','mm','mmaddfieldname',
'mmath','mmathPict','mmathPr','mmaxdist','mmc','mmcJc','mmconnectstr',
'mmconnectstrdata','mmcPr','mmcs','mmdatasource','mmheadersource','mmmailsubject',
'mmodso','mmodsofilter','mmodsofldmpdata','mmodsomappedname','mmodsoname',
'mmodsorecipdata','mmodsosort','mmodsosrc','mmodsotable','mmodsoudl',
'mmodsoudldata','mmodsouniquetag','mmPr','mmquery','mmr','mnary','mnaryPr',
'mnoBreak','mnum','mobjDist','moMath','moMathPara','moMathParaPr','mopEmu',
'mphant','mphantPr','mplcHide','mpos','mr','mrad','mradPr','mrPr','msepChr',
'mshow','mshp','msPre','msPrePr','msSub','msSubPr','msSubSup','msSubSupPr','msSup',
'msSupPr','mstrikeBLTR','mstrikeH','mstrikeTLBR','mstrikeV','msub','msubHide',
'msup','msupHide','mtransp','mtype','mvertJc','mvfmf','mvfml','mvtof','mvtol',
'mzeroAsc','mzeroDesc','mzeroWid','nesttableprops','nextfile','nonesttables',
'objalias','objclass','objdata','object','objname','objsect','objtime','oldcprops',
'oldpprops','oldsprops','oldtprops','oleclsid','operator','panose','password',
'passwordhash','pgp','pgptbl','picprop','pict','pn','pnseclvl','pntext','pntxta',
'pntxtb','printim','private','propname','protend','protstart','protusertbl','pxe',
'result','revtbl','revtim','rsidtbl','rxe','shp','shpgrp','shpinst',
'shppict','shprslt','shptxt','sn','sp','staticval','stylesheet','subject','sv',
'svb','tc','template','themedata','title','txe','ud','upr','userprops',
'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform',
'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl',
'xmlopen',
))
# Translation of some special characters.
specialchars = {
'par': '\n',
'sect': '\n\n',
'page': '\n\n',
'line': '\n',
'tab': '\t',
'emdash': u'\u2014',
'endash': u'\u2013',
'emspace': u'\u2003',
'enspace': u'\u2002',
'qmspace': u'\u2005',
'bullet': u'\u2022',
'lquote': u'\u2018',
'rquote': u'\u2019',
'ldblquote': u'\201C',
'rdblquote': u'\u201D',
}
stack = []
ignorable = False # Whether this group (and all inside it) are "ignorable".
ucskip = 1 # Number of ASCII characters to skip after a unicode character.
curskip = 0 # Number of ASCII characters left to skip
out = [] # Output buffer.
for match in pattern.finditer(text):
word,arg,hex,char,brace,tchar = match.groups()
if brace:
curskip = 0
if brace == '{':
# Push state
stack.append((ucskip,ignorable))
elif brace == '}':
# Pop state
ucskip,ignorable = stack.pop()
elif char: # \x (not a letter)
curskip = 0
if char == '~':
if not ignorable:
out.append(u'\xA0')
elif char in '{}\\':
if not ignorable:
out.append(char)
elif char == '*':
ignorable = True
elif word: # \foo
curskip = 0
if word in destinations:
ignorable = True
elif ignorable:
pass
elif word in specialchars:
out.append(specialchars[word])
elif word == 'uc':
ucskip = int(arg)
elif word == 'u':
c = int(arg)
if c < 0: c += 0x10000
if c > 127: out.append(unichr(c))
else: out.append(chr(c))
curskip = ucskip
elif hex: # \'xx
if curskip > 0:
curskip -= 1
elif not ignorable:
c = int(hex,16)
if c > 127: out.append(unichr(c))
else: out.append(chr(c))
elif tchar:
if curskip > 0:
curskip -= 1
elif not ignorable:
out.append(tchar)
return ''.join(out)
with open('/Users/maira/Desktop/text.rtf', 'r') as file:
rtf = file.read()
text = striprtf(rtf)
text = re.sub('(0[0-9]|1[0-9]|2[0-3]):[0-5][0-9]', '', text)
print(text)
file.close()

parse a structured (structure of machine) text-file (config-file) into a structured table format

main goal is to get from a more or less readable config file into a table format which can be read from everyone witouth deeper understanding of the machine and their configuration standards.
i've got a config file:
******A MANO:111111 ,20190726,001,0914,06621242746
DXS*HAWA776A0A*VA*V0/6*1
ST*001*0001
ID1*HAW250755*VMI1-9900****250755*6*0
CB1*021545*DeBright*7.030.16*3.02*250755
PA1*0*100
PA1*1*60
PA2*2769*166140*210*12600*0*0*0*0
******E MANO:111111 ,20190726,001,0914,06621242746
******A MANO:222222 ,20190726,001,0914,06621242746
DXS*HAWA776A0A*VA*V0/6*1
ST*001*0001
ID1*HAW250755*VMI1-9900****250755*6*0
CB1*021545*DeBright*7.030.16*3.02*250755
PA1*0*100
PA1*1*60
PA2*2769*166140*210*12600*0*0*0*0
******E MANO:222222 ,20190726,001,0914,06621242746
There are several objects in the file always starting with 'A MANO:' and ending with 'E MANO:' followed by the object-number.
all the lines underneath are the attributes of the object (settings of the machine). Not all objects have the same amount of settings. There could be 55 Lines for one object and 199 for another.
what i tried so far:
from pyparsing import *
'''
grammar:
object_nr ::= Word(nums, exact=6)
num ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'
'''
path_input = r'\\...\...'
with open(path_input) as input_file:
line = input_file.readline()
cnt = 1
object_nr_parser = Word(nums, exact=6)
for match, start, stop in object_nr_parser.scanString(input_file):
print(match, start, stop)
which gives me the printout:
['201907'] 116 122
['019211'] 172 178
the number it founds and the start and ending points in the string. But this numbers are not what I'm looking for nor correct. i can't even find the second number in the config-file.
is it the right way to solve this with pyparsing or is there a more convenient way to do it? Where did i do the mistake?
At the end it would be astounding if i would have an object for every machine with attributes which would be all the lines between the A MANO: and the E MANO:
expected result would be something like this:
{"object": "111111",
"line1":"DXS*HAWA776A0A*VA*V0/6*1",
"line2":"ST*001*0001",
"line3":"ID1*HAW250755*VMI1-9900****250755*6*0",
"line4":"CB1*021545*DeBright*7.030.16*3.02*250755",
"line5":"PA1*0*100",
"line6":"PA1*1*60",
"line7":"PA2*2769*166140*210*12600*0*0*0*0"},
{"object": "222222",
"line1":"DXS*HAWA776A0A*VA*V0/6*1",
"line2":"ST*001*0001",
"line3":"ID1*HAW250755*VMI1-9900****250755*6*0",
"line4":"CB1*021545*DeBright*7.030.16*3.02*250755",
"line5":"PA1*0*100",
"line6":"PA1*1*60",
"line7":"PA2*2769*166140*210*12600*0*0*0*0",
"line8":"PA2*2769*166140*210*12600*0*0*0*0",
"line9":"PA2*2769*166140*210*12600*0*0*0*0",
"line10":"PA2*2769*166140*210*12600*0*0*0*0"}
Not sure if that is the best solution for the purpose but it's the one that came into mind at this point.
One of the dirtiest ways to get the thing done would be using regex and replace the MANO with line break and all the line breaks with ';'. I don't think that this would be a solution one should use
You can parse it line by line:
import re
with open('file.txt', 'r') as f:
lines = f.readlines()
lines = [x.strip() for x in lines]
result = []
name = ''
i = 1
for line in lines:
if 'A MANO' in line:
name = re.findall('A MANO:(\d+)', line)[0]
result.append({'object': name})
i = 1
elif 'E MANO' not in line:
result[-1][f'line{i}'] = line
i += 1
Output:
[{
'object': '111111',
'line1': 'DXS*HAWA776A0A*VA*V0/6*1',
'line2': 'ST*001*0001',
'line3': 'ID1*HAW250755*VMI1-9900****250755*6*0',
'line4': 'CB1*021545*DeBright*7.030.16*3.02*250755',
'line5': 'PA1*0*100',
'line6': 'PA1*1*60',
'line7': 'PA2*2769*166140*210*12600*0*0*0*0'
}, {
'object': '222222',
'line1': 'DXS*HAWA776A0A*VA*V0/6*1',
'line2': 'ST*001*0001',
'line3': 'ID1*HAW250755*VMI1-9900****250755*6*0',
'line4': 'CB1*021545*DeBright*7.030.16*3.02*250755',
'line5': 'PA1*0*100',
'line6': 'PA1*1*60',
'line7': 'PA2*2769*166140*210*12600*0*0*0*0'
}
]
But I suggest using more compact output format:
import re
with open('file.txt', 'r') as f:
lines = f.readlines()
lines = [x.strip() for x in lines]
result = {}
name = ''
for line in lines:
if 'A MANO' in line:
name = re.findall('A MANO:(\d+)', line)[0]
result[name] = []
elif 'E MANO' not in line:
result[name].append(line)
Output:
{
'111111': ['DXS*HAWA776A0A*VA*V0/6*1', 'ST*001*0001', 'ID1*HAW250755*VMI1-9900****250755*6*0', 'CB1*021545*DeBright*7.030.16*3.02*250755', 'PA1*0*100', 'PA1*1*60', 'PA2*2769*166140*210*12600*0*0*0*0'],
'222222': ['DXS*HAWA776A0A*VA*V0/6*1', 'ST*001*0001', 'ID1*HAW250755*VMI1-9900****250755*6*0', 'CB1*021545*DeBright*7.030.16*3.02*250755', 'PA1*0*100', 'PA1*1*60', 'PA2*2769*166140*210*12600*0*0*0*0']
}

Python 'key error' while building dictionary dynamically (On the fly)

I get the error onthis line of code -
result_dict['strat'][k]['name'] = current_comps[0].strip()
The error is : Keyerror: 'strat'
I have an input line
PERSON1 ## CAR1 # ENTRY : 0 | EXIT : 0 ## CAR2 # M1 : YES : 10/01/17 02:00 | M2 : NO : 10/02/16 03:00 | M3 : NO : 05/07/17 11:00 | M4 : YES : 01/01/16 03:00 ## TRUCK # M3 : NO : 03/01/17 03:45 | M23 : NO : 01/01/14 07:00 | M27 : YES : 02/006/18 23:00
I 'm looking to parse this input to generate the output detailed below. As part of this, I'm trying to build a dictionary inserting both keys & values dynamically. I'm having a lot of problems doing this.
Could I please request help on this?
Here is what I've tried so far -
# File read
f = open('input_data', 'r')
file_cont = f.read().splitlines()
f.close()
#json template
# Initialize dictionary
result_arr = []
result_dict = {}
k = 0
for item in file_cont:
strat = item.split('##')
result_dict['Person'] = strat[0].strip()
j = 1
while j < len(strat):
# Split various components of the main line
current_comps = strat[j].split('#')
# Name of strat being parsed
result_dict['strat'][k]['name'] = current_comps[0].strip()
# tfs across the various time frames
tfs = current_comps[1].split('|')
# First travel mode
if current_comps[0].strip() == 'CAR1':
temp_low_arr = tfs[0].split(':')
temp_high_arr = tfs[1].split(':')
result_dict['strat'][k]['Entry'] = temp_low_arr[1].strip()
result_dict['strat'][k]['Exit'] = temp_high_arr[1].strip()
# Second travel mode
elif current_comps[0].strip() == 'CAR2':
z = 0
while z < len(tfs):
# Split components of the sign
sign_comp_car_2 = tfs[z].split(':')
result_dict['strat'][k]['tf'][z]['path'] = sign_comp_ma_cross[0].strip()
result_dict['strat'][k]['tf'][z]['sign'] = sign_comp_ma_cross[1].strip()
result_dict['strat'][k]['tf'][z]['sign_time'] = sign_comp_ma_cross[2].strip()
z += 1
# Third travel mode
elif current_comps[0].strip() == 'CAR3':
b = 0
while b < len(tfs):
# Split components of the sign
sign_car_3 = tfs[z].split(':')
result_dict['strat'][k]['tf'][b]['path'] = sign_all_term[0].strip()
result_dict['strat'][k]['tf'][b]['sign'] = sign_all_term[1].strip()
result_dict['strat'][k]['tf'][b]['sign_time'] = sign_all_term[2].strip()
b += 1
j += 1
k += 1
Expected output
[{
"Person":"",
"Transport":[
{
"Name":"CAR1",
"Entry":"0",
"Exit":"0"
},
{
"name":"CAR2:",
"tf":[
{
"path":"M1",
"sign":"YES",
"sign_time":"10/01/17 02:00"
},
{
"path":"M2",
"sign":"NO",
"sign_time":"10/02/16 03:00"
},
{
"path":"M3",
"sign":"NO",
"sign_time":"05/07/17 11:00"
},
{
"path":"M4",
"sign":"YES",
"sign_time":"01/01/16 03:00"
}
]
},
{
"name":"CAR3",
"tf":[
{
"path":"M3",
"sign":"NO",
"sign_time":"03/01/17 03:45"
},
{
"path":"M23",
"sign":"NO",
"sign_time":"01/01/14 07:00"
},
{
"path":"M27",
"sign":"Yes",
"sign_time":"02/006/18 23:00"
}
]
}
]
}]
The issue is when you try to assign the ['name'] field in result_dict['strat'][k] when result_dict['strat'][k] hasn't been initialized yet. Before you run your for-loop, the dictionary has no key called strat.
Now you could have done something like result_dict['strat'] = dict() (assigning an object to that key in the dict), but when you further subscript it using result_dict['strat'][k], it will try to resolve that first, by accessing result_dict['strat'], expecting either a subscriptable collection or a dictionary in return. However, since that key doesn't exist yet, it throws you the error.
What you could do instead is initialize a default dictionary:
from collections import defaultdict
...
resultdict = defaultdict(dict)
...
Otherwise, in your existing code, you could initialize a dict within result_dict before entering the loop.

Python how convert single quotes to double quotes to format as json string

I have a file where on each line I have text like this (representing cast of a film):
[{'cast_id': 23, 'character': "Roger 'Verbal' Kint", 'credit_id': '52fe4260c3a36847f8019af7', 'gender': 2, 'id': 1979, 'name': 'Kevin Spacey', 'order': 5, 'profile_path': '/x7wF050iuCASefLLG75s2uDPFUu.jpg'}, {'cast_id': 27, 'character': 'Edie's Finneran', 'credit_id': '52fe4260c3a36847f8019b07', 'gender': 1, 'id': 2179, 'name': 'Suzy Amis', 'order': 6, 'profile_path': '/b1pjkncyLuBtMUmqD1MztD2SG80.jpg'}]
I need to convert it in a valid json string, thus converting only the necessary single quotes to double quotes (e.g. the single quotes around word Verbal must not be converted, eventual apostrophes in the text also should not be converted).
I am using python 3.x. I need to find a regular expression which will convert only the right single quotes to double quotes, thus the whole text resulting in a valid json string. Any idea?
First of all, the line you gave as example is not parsable! … 'Edie's Finneran' … contains a syntax error, not matter what.
Assuming that you have control over the input, you could simply use eval() to read in the file. (Although, in that case one would wonder why you can't produce valid JSON in the first place…)
>>> f = open('list.txt', 'r')
>>> s = f.read().strip()
>>> l = eval(s)
>>> import pprint
>>> pprint.pprint(l)
[{'cast_id': 23,
'character': "Roger 'Verbal' Kint",
...
'profile_path': '/b1pjkncyLuBtMUmqD1MztD2SG80.jpg'}]
>>> import json
>>> json.dumps(l)
'[{"cast_id": 23, "character": "Roger \'Verbal\' Kint", "credit_id": "52fe4260ca36847f8019af7", "gender": 2, "id": 1979, "name": "Kevin Spacey", "order": 5, "rofile_path": "/x7wF050iuCASefLLG75s2uDPFUu.jpg"}, {"cast_id": 27, "character":"Edie\'s Finneran", "credit_id": "52fe4260c3a36847f8019b07", "gender": 1, "id":2179, "name": "Suzy Amis", "order": 6, "profile_path": "/b1pjkncyLuBtMUmqD1MztDSG80.jpg"}]'
If you don't have control over the input, this is very dangerous, as it opens you up to code injection attacks.
I cannot emphasize enough that the best solution would be to produce valid JSON in the first place.
If you do not have control over the JSON data, do not eval() it!
I created a simple JSON correction mechanism, as that is more secure:
def correctSingleQuoteJSON(s):
rstr = ""
escaped = False
for c in s:
if c == "'" and not escaped:
c = '"' # replace single with double quote
elif c == "'" and escaped:
rstr = rstr[:-1] # remove escape character before single quotes
elif c == '"':
c = '\\' + c # escape existing double quotes
escaped = (c == "\\") # check for an escape character
rstr += c # append the correct json
return rstr
You can use the function in the following way:
import json
singleQuoteJson = "[{'cast_id': 23, 'character': 'Roger \\'Verbal\\' Kint', 'credit_id': '52fe4260c3a36847f8019af7', 'gender': 2, 'id': 1979, 'name': 'Kevin Spacey', 'order': 5, 'profile_path': '/x7wF050iuCASefLLG75s2uDPFUu.jpg'}, {'cast_id': 27, 'character': 'Edie\\'s Finneran', 'credit_id': '52fe4260c3a36847f8019b07', 'gender': 1, 'id': 2179, 'name': 'Suzy Amis', 'order': 6, 'profile_path': '/b1pjkncyLuBtMUmqD1MztD2SG80.jpg'}]"
correctJson = correctSingleQuoteJSON(singleQuoteJson)
print(json.loads(correctJson))
Here is the code to get desired output
import ast
def getJson(filepath):
fr = open(filepath, 'r')
lines = []
for line in fr.readlines():
line_split = line.split(",")
set_line_split = []
for i in line_split:
i_split = i.split(":")
i_set_split = []
for split_i in i_split:
set_split_i = ""
rev = ""
i = 0
for ch in split_i:
if ch in ['\"','\'']:
set_split_i += ch
i += 1
break
else:
set_split_i += ch
i += 1
i_rev = (split_i[i:])[::-1]
state = False
for ch in i_rev:
if ch in ['\"','\''] and state == False:
rev += ch
state = True
elif ch in ['\"','\''] and state == True:
rev += ch+"\\"
else:
rev += ch
i_rev = rev[::-1]
set_split_i += i_rev
i_set_split.append(set_split_i)
set_line_split.append(":".join(i_set_split))
line_modified = ",".join(set_line_split)
lines.append(ast.literal_eval(str(line_modified)))
return lines
lines = getJson('test.txt')
for i in lines:
print(i)
Apart from eval() (mentioned in user3850's answer), you can use ast.literal_eval
This has been discussed in the thread: Using python's eval() vs. ast.literal_eval()?
You can also look at the following discussion threads from Kaggle competition which has data similar to the one mentioned by OP:
https://www.kaggle.com/c/tmdb-box-office-prediction/discussion/89313#latest-517927
https://www.kaggle.com/c/tmdb-box-office-prediction/discussion/80045#latest-518338

load parameters from a file in Python

I am writing a Python class to model a process and I want to initialized the parameters from a file, say 'input.dat'. The format of the input file looks like this.
'input.dat' file:
Z0: 0 0
k: 0.1
g: 1
Delta: 20
t_end: 300
The code I wrote is the following. It works but appears redundant and inflexible. Is there a better way to do the job? Such as a loop to do readline() and then match the keyword?
def load(self,filename="input.dat"):
FILE = open(filename)
s = FILE.readline().split()
if len(s) is 3:
self.z0 = [float(s[1]),float(s[2])] # initial state
s = FILE.readline().split()
if len(s) is 2:
self.k = float(s[1]) # kappa
s = FILE.readline().split()
if len(s) is 2:
self.g = float(s[1])
s = FILE.readline().split()
if len(s) is 2:
self.D = float(s[1]) # Delta
s = FILE.readline().split()
if len(s) is 2:
self.T = float(s[1]) # end time
Assuming the params are coming from a safe place (made by you or users, not the internet), just make the parameters file a Python file, params.py:
Z0 = (0, 0)
k = 0.1
g = 1
Delta = 20
t_end = 300
Then in your code all you need is:
import params
fancy_calculation(10, k=params.k, delta=params.Delta)
The beauty of this is two-fold: 1) simplicity, and 2) you can use the power of Python in your parameter descriptions -- particularly useful here, for example:
k = 0.1
Delta = 20
g = 3 * k + Delta
Alternatively, you could use Python's built-in JSON or ConfigParser .INI parser modules.
If you are open to some other kind of file where you can keep your parameters, I would suggest you to use a YAML file.
The Python library is PyYAML. This is how you can easily use it with Python.
For a better introduction, look at this Wikipedia article: http://en.wikipedia.org/wiki/YAML.
The benefit is you can read the parameter values as lists or maps.
You would love it!
Try the following:
def load(self, filename="input.dat"):
d = {"Z0": "z0", "k": "k", "g": "g", "Delta": "D", "t_end": "T"}
FILE = open(filename)
for line in FILE:
name, value = line.split(":")
value = value.strip()
if " " in value:
value = map(float, value.split())
else:
value = float(value)
setattr(self, d[name], value)
Proof that it works:
>>> class A(object): pass
...
>>> a = A()
>>> load(a)
>>> a.__dict__
{'k': 0.10000000000000001, 'z0': [0.0, 0.0], 'D': 20.0, 'g': 1.0, 'T': 300.0}
As others have mentioned, in Python you can create object attributes dynamically "on the fly". That means you could do something like the following to create Params objects as they're read-in. I've tried to make the code as data-driven as possible, so relatively flexible.
# maps label to attribute name and types
label_attr_map = {
"Z0:": ["z0", float, float],
"k:": [ "k", float],
"g:": [ "g", float],
"Delta:": [ "D", float],
"t_end:": [ "T", float]
}
class Params(object):
def __init__(self, input_file_name):
with open(input_file_name, 'r') as input_file:
for line in input_file:
row = line.split()
label = row[0]
data = row[1:] # rest of row is data list
attr = label_attr_map[label][0]
datatypes = label_attr_map[label][1:]
values = [(datatypes[i](data[i])) for i in range(len(data))]
self.__dict__[attr] = values if len(values) > 1 else values[0]
params = Params('input.dat')
print 'params.z0:', params.z0
print 'params.k:', params.k
print 'params.g:', params.g
print 'params.D:', params.D
print 'params.T:', params.T
Output:
params.z0: [0.0, 0.0]
params.k: 0.1
params.g: 1.0
params.D: 20.0
params.T: 300.0
Perhaps this might give you what you need:
def load(self,filename='input.dat'):
with open(filename) as fh:
for line in fh:
s = line.split()
if len(s) == 2:
setattr(self,s[1],s[2])
elif len(s) == 3:
setattr(self,s[1],s[2:])
I also didn't include any error checking, but setattr is very handy.
Something like this:
def load(self,filename="input.dat"):
# maps names to number of fields they need
# only necessary for variables with more than 1 field
argmap = dict(Z0=2)
# maps config file names to their attribute names on the object
# if name is the same both places, no need
namemap = dict(Z0="z0", Delta="D", t_end="T")
with open(filename) as FILE:
for line in FILE:
s = line.split()
var = s[0].rstrip(":")
try:
val = [float(x) for x in s[1:]]
except ValueError:
continue
if len(val) == varmap.get(var, 1):
if len(val) == 1:
val = val[0]
setattr(self, namemap.get(var, var), val)
Python objects have a built-in __dict__ member. You can modify it, and then refer to properties as obj.key.
class Data(object):
def __init__(self, path='infile.dat'):
with open(path, 'r') as fo:
for line in fo.readlines():
if len(line) < 2: continue
parts = [s.strip(' :\n') for s in line.split(' ', 1)]
numbers = [float(s) for s in parts[1].split()]
# This is optional... do you want single values to be stored in lists?
if len(numbers) == 1: numbers = numbers[0]
self.__dict__[parts[0]] = numbers
# print parts -- debug
obj = Data('infile.dat')
print obj.g
print obj.Delta
print obj.Z0
At the end of this, we print out a few of the keys. Here's the output of those.
1.0
20.0
[0.0, 0.0]
For consistency, you can remove the line marked "optional" in my code, and have all objects in lists -- regardless of how many elements they have. That will make using them quite a bit easier, because you never have to worry about obj.g[0] returning an error.
Here's another one
def splitstrip(s):
return s.split(':')[1].strip()
with open('input.dat','r') as f:
a.z0 = [float(x) for x in splitstrip(f.readline()).split(' ')]
a.k, a.g, a.D, a.T = tuple([float(splitstrip(x)) for x in f.read().rstrip().split('\n')])
;)

Categories

Resources