Related
I have following example string:
'EXP DATE: 13.04.2022 PO: P101'
'LOCATION: 111 CONDITION: FN'
I need to split following strings to look like:
{"EXP DATE": "13.04.2022", "PO": "P101"}
{"LOCATION": "111", "CONDITION": "FN"}
To achieve this I created the following function:
def create_key_pair(lst):
    """Pair up alternating key/value tokens into a dict.

    Any ':' characters are stripped from the keys.  Raises IndexError
    when the token list has odd length (a key without a value).
    """
    pairs = {}
    for idx in range(0, len(lst), 2):
        key = lst[idx].replace(':', '')
        pairs[key] = lst[idx + 1]
    return pairs
so if i pass
str_ = 'LOCATION: 111 CONDITION: FN'
create_key_pair(str_.split(" "))
i got
{"LOCATION": "111", "CONDITION": "FN"}
but if i pass
str_ = 'EXP DATE: 13.04.2022 PO: P101'
create_key_pair(str_.split(" "))
i got
IndexError: list index out of range
since the key 'EXP DATE' contains a space and is itself split apart by the space-based split
If the values (the part after the colon) cannot contain spaces, the following will work. The approach uses a regular expression to match any number of characters up to a colon followed by a space and then any number of non-space characters. Then split each match on the colon.
import re
def to_dict(s):
    """Split a 'KEY: value KEY: value' string into a dict.

    Keys may contain spaces; each value is a single run of non-space
    characters following ': '.
    """
    matches = re.findall(r".+?: \S+", s)
    d = {}
    for m in matches:
        # Split on the first ':' only, so values that themselves
        # contain a colon (e.g. a time such as '13:04') stay intact.
        k, v = m.split(":", 1)
        d[k.strip()] = v.strip()
    return d
If the values can contain spaces you'll have to find some other way of separating a value from the next key. JonSG gave the example in the comments of "foo: bar bat hat: 10". With the above approach, you'll get {"foo": "bar", "bat hat": "10"}, but maybe you want {"foo": "bar bat", "hat": "10"}. From the information in the question the only pattern I can see is that the values do not contain spaces.
You can use a regex:
import re
def create_key_pair(s):
    """Extract 'KEY: value' pairs from *s* into a dict.

    Keys may contain spaces; each value is a single non-space token.
    """
    data = {}
    # Raw strings keep the \s escapes from triggering
    # invalid-escape-sequence warnings on modern Python.
    for kv in re.findall(r'([^:]+:\s*\S+)', s):
        # maxsplit=1 so a value containing ':' (e.g. '13:04') does not
        # break the two-element unpacking.
        k, v = re.split(r':\s*', kv, maxsplit=1)
        data[k.strip()] = v.strip()
    return data
Usage:
str_ = 'LOCATION: 111 CONDITION: FN'
create_key_pair(str_)
# Output
{'LOCATION': '111', 'CONDITION': 'FN'}
str_ = 'EXP DATE: 13.04.2022 PO: P101'
create_key_pair(str_)
# Output
{'EXP DATE': '13.04.2022', 'PO': 'P101'}
I got a badly managed log, and need to extract into a dictionary using Python.
# Pattern: (keys are not kw1, kw2 ,etc... no pattern in key)
"para1=a, kw2=b, (b, b=b), bb, kw3=c, t4=..."
# where
# - para1=a
# - kw2=b, (b, b=b), bb
# - kw3=c
# - and so on
# extract into a dict:
out = {"para1": "a", "kw2": "b, (b, b=b), bb", "kw3": "c", "t4": ...}
# Notes several important features
'''
1. all 'kw=value' are joined with certain spliter: ', '
2. value and kw themselves may contain spliter. i.e. 'f(x, y)=3, f(x=3, y=2, z=1)=g(x=1, t=2)'
3. all brackets must be in pair, (therefore we can identify spliters in kw or value).
4. all message must be part of kw or value.
'''
Q1: Is there a regex expression(or some Python code) that helps me get above key and value?
There's no pattern in key, kwN is just a reference to key
Q2 Update: Thanks to Laurent, I already know why Q2 doesn't work: Got unexpected result. ', (.*?)=' should give me the shortest matching between ',' and '=' right?
msg = 'a, a, b=b, c=c'
re.findall(', (.*?)=', msg)
>>> ['a, b', 'c']
# I was expecting ['b','c']
# shouldn't ', (.*?)=' give me the shortest matching between ',' and '='? which is 'b' instead of 'a, b'
(New) Q3: Since I'm working with huge loads of data, working efficiency is my first priority. I've worked out a Python function which achieves the goal, but it doesn't feel quick enough — could you help me make it better?
def my_not_efficient_solution(msg):
    """Parse a ', '-joined 'kw=value' log message into a dict.

    Notes:
        1. all 'kw=value' pairs are joined with the splitter ', '
        2. value and kw themselves may contain the splitter,
           e.g. 'f(x, y)=3, f(x=3, y=2, z=1)=g(x=1, t=2)'
        3. all brackets must be paired (this is how splitters inside
           a kw or value are identified)
        4. the whole message must be part of some kw or value

    Solution:
        1. split the message on the splitter -> candidate fragments
        2. check each fragment for balanced brackets and an equal sign
        3. each fragment is appended to the previous entry, held as the
           prefix of the next entry, or kept as an entry of its own
    """
    spliter = ', '
    eq_sign = ['=']
    first = False  # early-exit flag; always off here, kept for parity
    bracket_map = {'(': 1, ')': -1, '[': 1, ']': -1}
    # Balanced iff opening and closing brackets cancel out to zero.
    pair_chk_func = lambda s: not sum([bracket_map.get(i, 0) for i in s])
    # Truthy iff the fragment contains at least one equal sign.
    eq_chk_func = lambda s: sum([i in s for i in eq_sign])
    assert pair_chk_func(msg), 'msg pair check fail.'
    res = msg.split(spliter)
    # step1: assemble complete entries from the fragments
    entries = []
    do_pre = ''    # last entry is incomplete (unbalanced brackets)
    do_first = ''  # fragment without eq sign; may prefix the next entry
    while res:
        if first and len(entries) == 2:
            entries.pop(-1)
            break
        # NOTE: the original read `entries._len__()` (missing
        # underscore), an AttributeError whenever do_first was set.
        if do_first and len(entries) == 0:
            do = do_first + res.pop(0)
        else:
            do_first = ''
            do = res.pop(0)
        eq_chk = eq_chk_func(do_pre + do)
        pair_chk = pair_chk_func(do_pre + do)
        # case1: not a valid entry, no eq sign
        # case2: previous entry not complete
        # case3: current fragment invalid (no eq sign, would drop) and
        #        brackets incomplete (may be part of the next entry)
        if not eq_chk or do_pre:
            if len(entries) > 0:
                entries[-1] += spliter + do
                pair_chk = pair_chk_func(entries[-1])
                if pair_chk:
                    do_pre = ''
                else:
                    do_pre = entries[-1]
            elif not pair_chk:
                do_first = do
        # case4: current entry good to go
        elif eq_chk and pair_chk:
            entries.append(do)
            do_pre = ''
        # case5: current entry not complete (brackets unbalanced)
        else:
            entries.append(do)
            do_pre = do
    # step2: split each entry into a key/value pair
    output = {}
    split_mark = '|'.join(eq_sign)
    for entry in entries:
        splits = re.split(split_mark, entry)
        if len(splits) < 2:
            raise ValueError('split fail for message')
        kw = splits.pop(0)
        # Re-join pieces until the key's own brackets are balanced,
        # so keys like 'f(A=3, k=2)' survive the split on '='.
        while not pair_chk_func(kw):
            kw += '=' + splits.pop(0)
        output[kw] = '='.join(splits)
    return output
msg = 'B_=a, kw2=b, f(A=3, k=2)=g(t=3, v=5), mark[(blabla), f(xx tt)=33]'
my_not_efficient_solution(msg)
>>> {'B_': 'a',
'kw2': 'b',
'f(A=3, k=2)': 'g(t=3, v=5), mark[(blabla), f(xx tt)=33]'}
Answer to Q1:
Here is my suggestion:
import re
s = "kw1=a, kw2=b, (b, b=b), bb, kw3=c, kw4=..."
# Wrapping the whole pair in a lookahead lets successive matches
# overlap: each value extends up to the next ', kwN=' or to the end.
pattern = r'(?=(kw.)=(.*?)(?:, kw.=|$))'
result = dict(re.findall(pattern, s))
print(result)  # {'kw1': 'a', 'kw2': 'b, (b, b=b), bb', 'kw3': 'c', 'kw4': '...'}
To explain the regex:
the (?=...) is a lookahead assertion to let you find overlapping matches
the ? in (.*?) makes the quantifier * (asterisk) non-greedy
the ?: makes the group (?:, kw.=|$) non-capturing
the |$ at the end allows to take account of the last value in your string
Answer to Q2:
No, this is wrong. The quantifier *? is non-greedy, so it finds the first match. Moreover there is no search for overlapping matches, which could be done with (?=...). So your observed result is the expected one.
I may suggest you this simple solution:
msg = 'a, a, b=b, c=c'
# [^,]*? keeps the capture from spanning a comma, so only the token
# directly before each '=' is taken.
result = re.findall(', ([^,]*?)=', msg)
print(result)  # ['b', 'c']
Q1: Is there a regex expression that helps me get above key and value?
To get the key:value in a dictionary format you can use
Say your string is
"kw1=a, kw2=b, (b, b=b), bb, kw3=c, kw4=dd, kw10=jndn"
Using the following regex gives you key and values in a list
results = re.findall(r'(\bkw\d+)=(.*?)(?=,+\s*\bkw\d+=|$)', s)
[('kw1', 'a'), ('kw2', 'b, (b, b=b), bb'), ('kw3', 'c'), ('kw4', 'dd'), ('kw10', 'jndn')]
You can convert it to a dictionary as
dict(results)
Output :
{
'kw1': 'a',
'kw2': 'b, (b, b=b), bb',
'kw3': 'c',
'kw4': 'dd',
'kw10': 'jndn'
}
Explanation :
\b is used like a word boundary and will only match kw and not something like XYZkw
kw\d+= Match the word kw followed by 1+ digits and =
.*? (Lazy Match) Match as least chars as possible
(?= Positive lookahead, assert to the right
,+\s*\bkw\d+= Match 1+ commas, optional whitespace chars, then kw, 1+ digits and =
| Or
$ Assert the end of the string for the last part
) Close the lookahead
Given a single word (x); return the possible n-grams that can be found in that word.
You can modify the n-gram value according as you want;
it is in the curly braces in the pat variable.
The default n-gram value is 4.
For example; for the word (x):
x = 'abcdef'
The possible 4-gram are:
['abcd', 'bcde', 'cdef']
def ngram_finder(x, n=4):
    """Return all overlapping character n-grams found in *x*.

    The default n=4 keeps the original behavior.  The zero-width
    lookahead makes matches overlap so every starting position yields
    a gram; \\S excludes whitespace from a gram.
    """
    # {{ / }} produce literal regex braces inside the f-string.
    pat = fr'(?=(\S{{{n}}}))'
    return re.findall(pat, x)
The Question is:
How to combine the f-string with the r-string in the regex expression, using curly braces.
You can use this string to combine the n value into your regexp, using double curly brackets to create a single one in the output:
fr'(?=(\S{{{n}}}))'
The regex needs to have {} to make a quantifier (as you had in your original regex {4}). However f strings use {} to indicate an expression replacement so you need to "escape" the {} required by the regex in the f string. That is done by using {{ and }} which in the output create { and }. So {{{n}}} (where n=4) generates '{' + '4' + '}' = '{4}' as required.
Complete code:
import re
def ngram_finder(x, n):
    """Return the overlapping n-character grams of *x* (whitespace excluded)."""
    # {{ and }} in the f-string emit the literal braces the regex
    # quantifier needs, yielding e.g. (?=(\S{4})).
    pattern = fr'(?=(\S{{{n}}}))'
    found = re.findall(pattern, x)
    return found
x = 'abcdef'
print(ngram_finder(x, 4))
print(ngram_finder(x, 5))
Output:
['abcd', 'bcde', 'cdef']
['abcde', 'bcdef']
I have string below,and I want to get list,dict,var from this string.
How can I to split this string to specific format?
s = 'list_c=[1,2],a=3,b=1.3,c=abch,list_a=[1,2],dict_a={a:2,b:3}'
import re
# Non-greedy match of anything up to '=[...]' plus an optional comma.
m1 = re.findall(r'(?=.*,)(.*?=\[.+?\],?)', s)
for match in m1:
    print('m1:', match)
I only get result 1 correctly.
Does anyone know how to do?
m1: list_c=[1,2],
m1: a=3,b=1.3,c=abch,list_a=[1,2],
Use '=' to split instead, then you can work around with variable name and it's value.
You still need to handle the type casting for values (regex, split, try with casting may help).
Also, same as others' comment, using dict may be easier to handle
s = 'list_c=[1,2],a=3,b=1.3,c=abch,list_a=[1,2],dict_a={a:2,b:3}'
# Split on '=': every middle chunk ends with the NEXT variable name,
# preceded by the previous variable's value.
chunks = s.split('=')
names = [chunks[0]]
values = []
for chunk in chunks[1:-1]:
    *value_parts, next_name = chunk.split(',')
    names.append(next_name)
    values.append(','.join(value_parts))
values.append(chunks[-1])  # the final chunk is purely the last value
output = dict(zip(names, values))
print(output)
You may have better luck if you more or less explicitly describe the right-hand side expressions: numbers, lists, dictionaries, and identifiers:
# Describe the right-hand sides explicitly: numbers, lists, dicts, idents.
re.findall(
    r"([^=]+)="                  # LHS and assignment operator
    r"([+-]?\d+(?:\.\d+)?"       # Numbers
    r"|[+-]?\d+\."               # More numbers
    r"|\[[^]]+\]"                # Lists
    r"|{[^}]+}"                  # Dictionaries
    r"|[a-zA-Z_][a-zA-Z_\d]*)",  # Idents
    s)
# [('list_c', '[1,2]'), ('a', '3'), ('b', '1.3'), ('c', 'abch'),
#  ('list_a', '[1,2]'), ('dict_a', '{a:2,b:3}')]
The answer is like below
import re
from pprint import pprint

s = 'list_c=[1,2],a=3,b=1.3,c=abch,list_a=[1],Save,Record,dict_a={a:2,b:3}'
m1 = re.findall(
    r"([^=]+)="                  # LHS and assignment operator
    r"([+-]?\d+(?:\.\d+)?"       # Numbers
    r"|[+-]?\d+\."               # More numbers
    r"|\[[^]]+\]"                # Lists
    r"|{[^}]+}"                  # Dictionaries
    r"|[a-zA-Z_][a-zA-Z_\d]*)",  # Idents
    s)
temp_d = {}
for lhs, value in m1:
    # An LHS such as ',Save,Record,dict_a' carries bare flags before the
    # real key: each flag gets an empty value, the last name the value.
    *flags, key = lhs.strip(',').split(',')
    for flag in flags:
        temp_d[flag] = ''
    temp_d[key] = value
pprint(temp_d)
Output is like
{'Record': '',
'Save': '',
'a': '3',
'b': '1.3',
'c': 'abch',
'dict_a': '{a:2,b:3}',
'list_a': '[1]',
'list_c': '[1,2]'}
Instead of picking out the types, you can start by capturing the identifiers. Here's a regex that captures all the identifiers in the string (for lowercase only, but see note):
# Lowercase identifiers (plus '_') directly followed by '='.
regex = re.compile(r'([a-z]|_)+=')
# note: for all valid variable names use r'([a-z]|[A-Z]|[0-9]|_)+'
cases = []
for found in re.finditer(regex, s):
    cases.append(found.group())
This gives a list of all the identifiers in the string:
['list_c=', 'a=', 'b=', 'c=', 'list_a=', 'dict_a=']
We can now define a function to sequentially chop up s using the
above list to partition the string sequentially:
def chop(mystr, mylist):
    """Drop everything through the first identifier in *mylist* and the
    value after it, so the returned string starts at the next
    identifier.  Also returns the identifier list minus its head."""
    remainder = mystr.partition(mylist[0])[2]
    start = remainder.find(mylist[1])  # strip leading bits
    return remainder[start:], mylist[1:]
mystr = s[:]
temp = [mystr]
mylist = cases[:]
# chop() consumes one identifier per pass and needs two of them to
# find the next boundary, so stop once a single identifier remains.
# (The original read `while len() > 1:` — a TypeError; the intended
# argument is clearly mylist.)
while len(mylist) > 1:
    mystr, mylist = chop(mystr, mylist)
    temp.append(mystr)
This (convoluted) slicing operation gives this list of strings:
['list_c=[1,2],a=3,b=1.3,c=abch,list_a=[1,2],dict_a={a:2,b:3}',
'a=3,b=1.3,c=abch,list_a=[1,2],dict_a={a:2,b:3}',
'b=1.3,c=abch,list_a=[1,2],dict_a={a:2,b:3}',
'c=abch,list_a=[1,2],dict_a={a:2,b:3}',
'list_a=[1,2],dict_a={a:2,b:3}',
'dict_a={a:2,b:3}']
Now cut off the ends using each successive entry:
result = []
for current, following in zip(temp, temp[1:]):
    end = current.find(following) - 1  # -1 to remove commas
    result.append(current[:end])
result.append(temp.pop())  # get the last item
Now we have the full list:
['list_c=[1,2]', 'a=3', 'b=1.3', 'c=abch', 'list_a=[1,2]', 'dict_a={a:2,b:3}']
Each element is easily parsable into key:value pairs (and is also executable via exec).
There's a logfile with text in the form of space-separated key=value pairs, and each line was originally serialized from data in a Python dict, something like:
' '.join([f'{k}={v!r}' for k,v in d.items()])
The keys are always just strings. The values could be anything that ast.literal_eval can successfully parse, no more no less.
How to process this logfile and turn the lines back into Python dicts? Example:
>>> to_dict("key='hello world'")
{'key': 'hello world'}
>>> to_dict("k1='v1' k2='v2'")
{'k1': 'v1', 'k2': 'v2'}
>>> to_dict("s='1234' n=1234")
{'s': '1234', 'n': 1234}
>>> to_dict("""k4='k5="hello"' k5={'k6': ['potato']}""")
{'k4': 'k5="hello"', 'k5': {'k6': ['potato']}}
Here is some extra context about the data:
Keys are valid names
Input lines are well-formed (e.g. no dangling brackets)
The data is trusted (unsafe functions such as eval, exec, yaml.load are OK to use)
Order is not important. Performance is not important. Correctness is important.
Edit: As requested in the comments, here is an MCVE and an example code that didn't work correctly
>>> def to_dict(s):
... s = s.replace(' ', ', ')
... return eval(f"dict({s})")
...
...
>>> to_dict("k1='v1' k2='v2'")
{'k1': 'v1', 'k2': 'v2'} # OK
>>> to_dict("s='1234' n=1234")
{'s': '1234', 'n': 1234} # OK
>>> to_dict("key='hello world'")
{'key': 'hello, world'} # Incorrect, the value was corrupted
Your input can't be conveniently parsed by something like ast.literal_eval, but it can be tokenized as a series of Python tokens. This makes things a bit easier than they might otherwise be.
The only place = tokens can appear in your input is as key-value separators; at least for now, ast.literal_eval doesn't accept anything with = tokens in it. We can use the = tokens to determine where the key-value pairs start and end, and most of the rest of the work can be handled by ast.literal_eval. Using the tokenize module also avoids problems with = or backslash escapes in string literals.
import ast
import io
import tokenize
def todict(logstring):
    """Parse one "key=value key=value ..." log line back into a dict."""
    # tokenize.tokenize expects the readline method of a binary
    # file-like object, so wrap the encoded string in BytesIO.
    stream = io.BytesIO(logstring.encode('utf8'))
    toks = list(tokenize.tokenize(stream.readline))
    # '=' only ever appears as the key/value separator in this input.
    eq_positions = [idx for idx, tok in enumerate(toks) if tok[1] == '=']
    keys = [toks[idx - 1][1] for idx in eq_positions]
    # A value runs from just after its '=' up to (but not including)
    # the NAME token of the next key; the final value runs to the end.
    starts = [idx + 1 for idx in eq_positions]
    ends = [idx - 1 for idx in eq_positions[1:]]
    ends.append(len(toks))
    values = []
    for lo, hi in zip(starts, ends):
        # Dropping row/column info (tok[:2]) keeps untokenize from
        # inserting leading whitespace that ast.literal_eval rejects;
        # the documentation doesn't promise this, so strip() as well.
        text = tokenize.untokenize(t[:2] for t in toks[lo:hi]).strip()
        values.append(ast.literal_eval(text))
    return dict(zip(keys, values))
This behaves correctly on your example inputs, as well as on an example with backslashes:
>>> todict("key='hello world'")
{'key': 'hello world'}
>>> todict("k1='v1' k2='v2'")
{'k1': 'v1', 'k2': 'v2'}
>>> todict("s='1234' n=1234")
{'s': '1234', 'n': 1234}
>>> todict("""k4='k5="hello"' k5={'k6': ['potato']}""")
{'k4': 'k5="hello"', 'k5': {'k6': ['potato']}}
>>> s=input()
a='=' b='"\'' c=3
>>> todict(s)
{'a': '=', 'b': '"\'', 'c': 3}
Incidentally, we probably could look for token type NAME instead of = tokens, but that'll break if they ever add set() support to literal_eval. Looking for = could also break in the future, but it doesn't seem as likely to break as looking for NAME tokens.
Regex replacement functions to the rescue
I'm not rewriting an AST-like parser for you, but one trick that works pretty well is to use regular expressions to find the quoted strings and replace them by "variables" (I've chosen __token(number)__), a bit like you're obfuscating some code.
Make a note of the strings you're replacing (that should take care of the spaces), replace space by comma (protecting against symbols before like : allows to pass last test) and replace by strings again.
import re,itertools
def to_dict(s):
    """Rebuild a dict from a "k1=v1 k2=v2 ..." log line.

    Quoted string literals are first swapped out for placeholder
    identifiers so the spaces inside them survive, the remaining
    key=value separators become commas, the literals are swapped back,
    and the result is evaluated as dict(...) keyword arguments.
    Trusted input only — this uses eval.
    """
    rep_dict = {}
    cnt = itertools.count()

    def rep_func(m):
        # Each quoted literal becomes a unique __tokenN__ placeholder.
        rval = "__token{}__".format(next(cnt))
        rep_dict[rval] = m.group(0)
        return rval

    # Replace single/double-quoted strings by token variable-like
    # idents; the pattern tolerates escaped quotes inside the string
    # and a double escape at its end.  Raw strings silence
    # invalid-escape-sequence warnings on modern Python.
    s = re.sub(r"(['\"]).*?([^\\]|\\\\)\1", rep_func, s)
    # Replace spaces that follow a letter/digit/underscore by a comma.
    s = re.sub(r"(\w)\s+", r"\1,", s)
    # print("debug", s)  # uncomment to see the intermediate string
    # Put back the original strings.
    s = re.sub(r"__token\d+__", lambda m: rep_dict[m.group(0)], s)
    return eval("dict({s})".format(s=s))
print(to_dict("k1='v1' k2='v2'"))
print(to_dict("s='1234' n=1234"))
print(to_dict(r"key='hello world'"))
print(to_dict('key="hello world"'))
print(to_dict("""k4='k5="hello"' k5={'k6': ['potato']}"""))
# extreme string test
print(to_dict(r"key='hello \'world\\'"))
prints:
{'k2': 'v2', 'k1': 'v1'}
{'n': 1234, 's': '1234'}
{'key': 'hello world'}
{'key': 'hello world'}
{'k5': {'k6': ['potato']}, 'k4': 'k5="hello"'}
{'key': "hello 'world\\"}
The key is to extract the strings (quoted/double quoted) using non-greedy regex and replace them by non-strings (like if those were string variables not literals) in the expression. The regex has been tuned so it can accept escaped quotes and double escape at the end of string (custom solution)
The replacement function is an inner function so it can make use of the nonlocal dictionary & counter and track the replaced text, so it can be restored once the spaces have been taken care of.
When replacing the spaces by commas, you have to be careful not to do it after a colon (last test) or all things considered after a alphanum/underscore (hence the \w protection in the replacement regex for comma)
If we uncomment the debug print code just before the original strings are put back that prints:
debug k1=__token0__,k2=__token1__
debug s=__token0__,n=1234
debug key=__token0__
debug k4=__token0__,k5={__token1__: [__token2__]}
debug key=__token0__
The strings have been pwned, and the replacement of spaces has worked properly. With some more effort, it should probably be possible to quote the keys and replace k1= by "k1": so ast.literal_eval can be used instead of eval (more risky, and not required here)
I'm sure some super-complex expressions can break my code (I've even heard that there are very few json parsers able to parse 100% of the valid json files), but for the tests you submitted, it'll work (of course if some funny guy tries to put __tokenxx__ idents in the original strings, that'll fail, maybe it could be replaced by some otherwise invalid-as-variable placeholders). I have built an Ada lexer using this technique some time ago to be able to avoid spaces in strings and that worked pretty well.
You can find all the occurrences of = characters, and then find the maximum runs of characters which give a valid ast.literal_eval result. Those characters can then be parsed for the value, associated with a key found by a string slice between the last successful parse and the index of the current =:
import ast, typing
def is_valid(_str: str) -> bool:
    """Return True when *_str* parses as a Python literal."""
    try:
        ast.literal_eval(_str)
    except Exception:
        # literal_eval raises ValueError/SyntaxError/TypeError on bad
        # input; the original bare `except:` would also have swallowed
        # KeyboardInterrupt and SystemExit.
        return False
    return True
def parse_line(_d:str) -> typing.Generator[typing.Tuple, None, None]:
    """Yield (key, value) pairs parsed out of a 'k=v k=v ...' line.

    Strategy: at each '=', take the text since the previous successful
    parse as the key, then grow a candidate slice after the '=' until
    ast.literal_eval accepts it; finally extend that slice to the
    longest prefix that still parses, so multi-token literals such as
    lists and dicts survive intact.  Quadratic in the worst case —
    the author notes performance was not a priority.
    """
    # Positions of every '=' in the line; `last` marks where the
    # current key begins.
    _eq, last = [i for i, a in enumerate(_d) if a == '='], 0
    for _loc in _eq:
        # Skip '=' characters that sit inside an already-consumed value.
        if _loc >= last:
            _key = _d[last:_loc]
            # `seen` and `_worked` are written but never read (dead
            # variables kept from the original); _inner.._running
            # brackets the candidate value slice.
            _inner, seen, _running, _worked = _loc+1, '', _loc+2, []
            while True:
                try:
                    val = ast.literal_eval(_d[_inner:_running])
                except:
                    # Candidate does not parse yet: widen by one char.
                    _running += 1
                else:
                    # Found *a* parse; now take the longest extension
                    # that still parses (many literal_eval calls).
                    _max = max([i for i in range(len(_d[_inner:])) if is_valid(_d[_inner:_running+i])])
                    yield (_key, ast.literal_eval(_d[_inner:_running+_max]))
                    last = _running+_max
                    break
def to_dict(_d: str) -> dict:
    """Collect the (key, value) pairs yielded by parse_line into a dict."""
    pairs = parse_line(_d)
    return dict(pairs)
print([to_dict("key='hello world'"),
to_dict("k1='v1' k2='v2'"),
to_dict("s='1234' n=1234"),
to_dict("""k4='k5="hello"' k5={'k6': ['potato']}"""),
to_dict("val=['100', 100, 300]"),
to_dict("val=[{'t':{32:45}, 'stuff':100, 'extra':[]}, 100, 300]")
]
)
Output:
{'key': 'hello world'}
{'k1': 'v1', 'k2': 'v2'}
{'s': '1234', 'n': 1234}
{'k4': 'k5="hello"', 'k5': {'k6': ['potato']}}
{'val': ['100', 100, 300]}
{'val': [{'t': {32: 45}, 'stuff': 100, 'extra': []}, 100, 300]}
Disclaimer:
This solution is not as elegant as #Jean-FrançoisFabre's, and I am not sure if it can parse 100% of what is passed to to_dict, but it may give you inspiration for your own version.
Provide two helper functions.
popstr: split thing from start of string that looks like string
If it starts with a single or double quote mark, I'll look for the next one and split at that point.
def popstr(s):
    """Split a leading quoted literal off *s*.

    s[0] must be the quote character; returns (literal, remainder)."""
    # +2: one to skip past s[0] in the search slice, one to include
    # the closing quote itself.
    end = s[1:].find(s[0]) + 2
    return s[:end], s[end:]
poptrt: split thing from start of string that is surrounded by brackets ('[]', '()', '{}').
If it starts with a bracket, I'll start incrementing for every instance of the starting character and decrementing for every instance of it's complement. When I reach zero, I split.
def poptrt(s):
    """Split a leading bracketed group ('[]', '()', '{}') off *s*.

    Keeps a depth counter: +1 on the opening character, -1 on its
    complement; quoted sections are consumed via popstr so brackets
    inside strings are ignored.  Returns (group, remainder) or raises
    ValueError when the brackets never balance.
    """
    d = {'{': '}', '[': ']', '(': ')'}
    b = s[0]
    c = lambda x: {b: 1, d[b]: -1}.get(x, 0)
    parts = []
    t, i = 1, 1
    while t > 0 and s:
        if i > len(s) - 1:
            break
        elif s[i] in '\'"':
            # Fixed transcription bug: the original wrote
            # `s, s, s = ...` and `parts.extend([s, s])`, clobbering
            # both the scanned prefix and the popped literal (compare
            # the corrected copy of this helper later in the post).
            _s, s_, s = s[:i], *map(str.strip, popstr(s[i:]))
            parts.extend([_s, s_])
            i = 0
        else:
            t += c(s[i])
            i += 1
    if t == 0:
        return ''.join(parts + [s[:i]]), s[i:]
    else:
        raise ValueError('Your string has unbalanced brackets.')
Chew through string until there is no more string to chew
def to_dict(log):
    """Parse 'k=v k=v ...' into a dict, delegating quoted values to
    popstr and bracketed ones to poptrt; plain scalars end at the next
    whitespace.  Values are decoded with ast.literal_eval."""
    parsed = {}
    while log:
        key, log = map(str.strip, log.split('=', 1))
        if log.startswith(('"', "'")):
            value, log = map(str.strip, popstr(log))
        elif log.startswith((*'{[(',)):
            value, log = map(str.strip, poptrt(log))
        else:
            pieces = [p.strip() for p in log.split(None, 1)]
            value = pieces[0]
            log = ' '.join(pieces[1:])
        parsed[key] = ast.literal_eval(value)
    return parsed
All tests passed
assert to_dict("key='hello world'") == {'key': 'hello world'}
assert to_dict("k1='v1' k2='v2'") == {'k1': 'v1', 'k2': 'v2'}
assert to_dict("s='1234' n=1234") == {'s': '1234', 'n': 1234}
assert to_dict("""k4='k5="hello"' k5={'k6': ['potato']}""") == {'k4': 'k5="hello"', 'k5': {'k6': ['potato']}}
Deficiencies
Did not account for backslashes
Did not account for nested goofy formatting
All Together
import ast
def popstr(s):
    """Cut a leading quote-delimited literal from *s*; return (literal, rest)."""
    quote = s[0]
    end = s[1:].find(quote) + 2  # index just past the closing quote
    head, tail = s[:end], s[end:]
    return head, tail
def poptrt(s):
    """Cut a leading bracket-delimited group from *s*.

    Depth-counts the opening bracket against its complement, letting
    popstr consume quoted sections so brackets inside strings don't
    count.  Returns (group, remainder); raises ValueError when the
    brackets never balance."""
    closers = {'{': '}', '[': ']', '(': ')'}
    opener = s[0]
    weight = lambda ch: {opener: 1, closers[opener]: -1}.get(ch, 0)
    collected = []
    depth, pos = 1, 1
    while depth > 0 and s:
        if pos > len(s) - 1:
            break
        elif s[pos] in '\'"':
            prefix, literal, s = s[:pos], *map(str.strip, popstr(s[pos:]))
            collected.extend([prefix, literal])
            pos = 0
        else:
            depth += weight(s[pos])
            pos += 1
    if depth == 0:
        return ''.join(collected + [s[:pos]]), s[pos:]
    else:
        raise ValueError('Your string has unbalanced brackets.')
def to_dict(log):
    """Turn a 'key=value ...' log line into a dict.

    Quoted values are handed to popstr, bracketed ones to poptrt, and
    anything else runs to the next whitespace; every raw value string
    is finally decoded with ast.literal_eval."""
    result = {}
    while log:
        k, log = (part.strip() for part in log.split('=', 1))
        if log[:1] in ('"', "'"):
            v, log = (part.strip() for part in popstr(log))
        elif log[:1] in ('{', '[', '('):
            v, log = (part.strip() for part in poptrt(log))
        else:
            v, *rest = (part.strip() for part in log.split(None, 1))
            log = ' '.join(rest)
        result[k] = ast.literal_eval(v)
    return result
I have similar problem to convert 'key1="value1" key2="value2" ...' string into dict. I split string on spaces and create a list of ['key="value"'] pairs. Than in cycle through list again, split pairs on '=' and add pairs to dict.
Code:
str_attr = 'name="Attr1" type="Attr2" use="Attr3"'
list_attr = str_attr.split(' ')
dict_attr = {}
# Each entry has the shape key="value"; break it at '=' and record it.
for item in list_attr:
    parts = item.split('=')
    dict_attr[parts[0]] = parts[1]
print(dict_attr)
result:
{'name': '"Attr1"', 'type': '"Attr2"', 'use': '"Attr3"'}
Limitations:
keys and values must not contain spaces (' ') or equal signs ('=').
If you have different delimiters like spaces, commas, commas with spaces, semicolon et cetera, use regex to split string, specify delimiters by '|':
'\s+|,\s*|;\s*'
\s+ - one or more spaces
",\s*" - comma or comma with space(s)
";\s*" - semicolon or semicolon with space(s)
"+" means "one or more"
"*" means "none or more"
import re

str_attr = 'name="Attr1" type="Attr2", use="Attr3",new="yes";old="no"'
# Fixed quoting: the pattern must be one (raw) string literal; the
# original ''\s+|,\s*|;\s*'' (doubled quotes) was a syntax error.
list_attr = re.split(r'\s+|,\s*|;\s*', str_attr)
dict_attr = {}
for item in list_attr:
    if item:  # re.split can yield empty strings; skip them
        list_item = item.split('=')
        dict_attr.update({list_item[0]: list_item[1]})
print(dict_attr)
Result:
{'name': '"Attr1"', 'type': '"Attr2"', 'use': '"Attr3"', 'new': '"yes"', 'old': '"no"'}