Related
I'm facing quite a tricky problem in my python code. I looked around and was not able to find anyone with a similar problem.
I'd like to generate strings translating some characters into several, different ones.
Each original character that is meant to be translated should be replaced, in turn, by each of several alternative characters.
What I'm looking to do is something like this :
text = "hi there"
translations = {"i":["b", "c"], "r":["e","f"]}
result = magicfunctionHere(text,translations)
print(result)
> [
"hb there",
"hc there",
"hi theee",
"hi thefe",
"hb theee",
"hb thefe",
"hc theee",
"hc thefe"
]
The result contains every combination of the original text in which 'i' is kept or replaced by 'b' or 'c', and 'r' is kept or replaced by 'e' or 'f' (excluding the unchanged original).
I don't see how to do that, using itertools and functions like permutations, product etc...
I hope I'm clear enough, it is quite a specific problem !
Thank you for your help !
def magicfunction(ret, text, alphabet_location, translations):
    """Recursively expand ``text`` into every combination of replacements.

    Args:
        ret: List collecting the generated strings; it is appended to in
            place and also returned.
        text: The string built so far.
        alphabet_location: Indexes into ``text`` that still have to be
            substituted. The last index is processed first. The list is
            left untouched.
        translations: Maps the character found at a pending index to the
            replacement strings to try at that index.

    Returns:
        ``ret``, extended with one string per combination of replacements.

    Raises:
        KeyError: If the character at a pending index is not a key of
            ``translations``.
    """
    if not alphabet_location:
        ret.append(text)
        return ret
    # Slice instead of pop()/append(): the caller's list is never left
    # half-consumed when a translations lookup raises part-way through.
    index = alphabet_location[-1]
    remaining = alphabet_location[:-1]
    for w in translations[text[index]]:
        ret = magicfunction(ret, text[:index] + w + text[index + 1:], remaining, translations)
    return ret
def magicfunctionHere(text, translations):
    """Return every variant of ``text`` with translated keys substituted.

    Only the first occurrence of each key in ``text`` is substituted. Every
    key is either kept or replaced by one of its translations; the
    combination that keeps all keys (the unchanged ``text``) is left out.

    Args:
        text: The string to expand.
        translations: Maps a substring of ``text`` to a list of replacement
            strings. It is not modified.

    Returns:
        A list of strings. The first key of ``translations`` varies fastest
        and the last key slowest. The list is empty when no key occurs in
        ``text``.
    """
    from itertools import product

    # One slot per key present in the text: (index, key width, choices).
    # The key itself is the last choice, meaning "leave unchanged"; it is
    # added to a copy so the caller's lists are not altered.
    slots = []
    for key, replacements in translations.items():
        index = text.find(key) if key else -1
        if index == -1:
            continue  # key absent (or empty): nothing to substitute
        slots.append((index, len(key), list(replacements) + [key]))

    # product() varies its last factor fastest; reversing makes the first
    # key the fastest-varying one.
    slots.reverse()

    results = []
    for combo in product(*(choices for _, _, choices in slots)):
        pieces = list(text)
        for (index, width, _), replacement in zip(slots, combo):
            pieces[index] = replacement
            # Blank the rest of a multi-character key without shifting the
            # positions of the other slots.
            for covered in range(index + 1, index + width):
                pieces[covered] = ""
        results.append("".join(pieces))

    # The final combination keeps every key, i.e. it is ``text`` itself.
    results.pop()
    return results
# Demo: expand the sample sentence with the sample translation table and
# print the resulting list of variants.
text = "hi there"
translations = {"i":["b", "c"], "r":["e","f"]}
result = magicfunctionHere(text,translations)
print(result)
One crude way to go would be to use a nested-loop construct in two steps (two functions), as shown in the snippet below:
def rearrange_characters(str_text, dict_translations):
    """Return the variants of ``str_text`` that differ by one substitution.

    For every key of ``dict_translations`` that occurs in ``str_text``, the
    first occurrence is replaced by each of the key's replacement strings.

    Args:
        str_text: The string to vary.
        dict_translations: Maps a substring to an iterable of replacements.

    Returns:
        A list of distinct variants in the order they were first produced.
    """
    variants = []
    for needle, substitutes in dict_translations.items():
        if needle not in str_text:
            continue
        for substitute in substitutes:
            candidate = str_text.replace(needle, substitute, 1)
            if candidate in variants:
                continue
            variants.append(candidate)
    return variants
def get_rearranged_characters(str_text, dict_translations):
    """Return the variants of ``str_text`` reachable in one or two substitutions.

    ``rearrange_characters`` is applied once to ``str_text`` and then once
    more to each of the variants it produced.

    Args:
        str_text: The string to vary.
        dict_translations: Maps a substring to an iterable of replacements.

    Returns:
        A set of variant strings; empty when no key occurs in ``str_text``.
    """
    first_pass = rearrange_characters(str_text, dict_translations)
    # Collect straight into a set. Joining on "," and splitting again broke
    # texts that contain commas and injected '' whenever a pass was empty.
    results = set(first_pass)
    for str_part in first_pass:
        results.update(rearrange_characters(str_part, dict_translations))
    return results
# Demo: run the two-step helper on the sample sentence and print the
# resulting set of variants.
text = "hi there"
translations = {"i": ["b", "c"], "r":["e","f"]}
result = get_rearranged_characters(text, translations)
print(result)
## YIELDS: {
'hb theee',
'hc thefe',
'hc there',
'hi thefe',
'hb thefe',
'hi theee',
'hc theee',
'hb there'
}
See also: https://eval.in/960803
Another equally convoluted approach would be to use a single function with nested loops like so:
def process_char_replacement(str_text, dict_translations):
    """Return the variants of ``str_text`` reachable in one or two substitutions.

    A substitution replaces the first occurrence of a key of
    ``dict_translations`` by one of that key's replacement strings. The
    substitution step is applied to ``str_text`` and then once more to each
    resulting variant.

    Args:
        str_text: The string to vary.
        dict_translations: Maps a substring to an iterable of replacements.

    Returns:
        A set of variant strings; empty when no key occurs in ``str_text``.
    """
    def single_substitutions(source):
        # Every distinct string obtained from ``source`` by one substitution.
        found = []
        for key, value in dict_translations.items():
            if key in source:
                for replacer in value:
                    candidate = source.replace(key, replacer, 1)
                    if candidate not in found:
                        found.append(candidate)
        return found

    first_pass = single_substitutions(str_text)
    # Collect straight into a set. Joining on "," and splitting again broke
    # texts that contain commas and injected '' whenever a pass was empty.
    results = set(first_pass)
    for str_part in first_pass:
        results.update(single_substitutions(str_part))
    return results
# Demo: run the single-function variant on the sample sentence and print
# the resulting set of variants.
text = "hi there"
translations = {"i": ["b", "c"], "r":["e","f"]}
result = process_char_replacement(text, translations)
print(result)
## YIELDS: {
'hb theee',
'hc thefe',
'hc there',
'hi thefe',
'hb thefe',
'hi theee',
'hc theee',
'hb there'
}
Refer to: https://eval.in/961602
files = ['foo.0001.jpg', 'test2.0003.jpg', 'foo.0004.jpg', 'tmp.txt',
'foo.0003.jpg', 'test2.0002.jpg', 'test2.0004.jpg', 'test.0002.jpg',
'foo.0002.jpg', 'foo.0005.jpg', 'test.0001.jpg']
For each file sequence I want to print its pattern together with the minimum and maximum frame number:
foo.####.jpg with its min and max
test.####.jpg with its min and max
test2.####.jpg with its min and max
def get_frame_number(files):
    """Print and return the frame range of every numbered file sequence.

    A file belongs to a sequence when its name has the form
    ``<prefix>.<number>.<extension>``; any other name is skipped.

    Args:
        files: An iterable of file names, or a mapping whose values are
            iterables of file names.

    Returns:
        A dict mapping ``"<prefix>.####.<extension>"`` to a
        ``(min_frame, max_frame)`` tuple. One line per sequence is also
        printed, in sorted order, as ``"<pattern> <min>-<max>"``.
    """
    if hasattr(files, "values"):
        names = [name for group in files.values() for name in group]
    else:
        names = list(files)

    # Group frame numbers by (prefix, extension).
    frames = {}
    for name in names:
        parts = name.split(".")
        if len(parts) != 3:
            continue  # e.g. 'tmp.txt' has no frame field
        try:
            number = int(parts[1])
        except ValueError:
            continue  # middle field is not a frame number
        frames.setdefault((parts[0], parts[2]), []).append(number)

    ranges = {}
    for prefix, extension in sorted(frames):
        numbers = frames[(prefix, extension)]
        low, high = min(numbers), max(numbers)
        pattern = "{}.####.{}".format(prefix, extension)
        ranges[pattern] = (low, high)
        print(pattern + " " + str(low) + "-" + str(high))
    return ranges
I have started writing a function (above), but I couldn't figure out how to make it work.
You can use re to try to pull the number out of your file name. Then use this function as the key argument to max and min respectively.
import re
def get_frame_number(file):
    """Extract the frame number from a ``<name>.<digits>.jpg`` file name.

    Args:
        file: The file name to inspect.

    Returns:
        The frame number as an ``int``, or ``float('nan')`` when the name
        does not start with the expected pattern.
    """
    # NOTE: every comparison against NaN is False, so when this is used as
    # a ``key`` for min()/max() a non-matching name can win if it comes
    # first in the sequence.
    found = re.match(r'[\w\d]+\.(\d+)\.jpg', file)
    if not found:
        return float('nan')
    return int(found.group(1))
>>> max(files, key=get_frame_number)
'foo.0005.jpg'
>>> min(files, key=get_frame_number)
'foo.0001.jpg'
An option would be using key arg (with lambda function) of max() and min() built-in functions like this:
# For each sequence prefix, pick the file names with the highest and the
# lowest frame number (the second dot-separated field) and print them.
for fn in ('foo', 'test', 'test2'):
    prefix = '{}.'.format(fn)
    matching = [name for name in files if name.startswith(prefix)]
    fn_max = max(matching, key=lambda name: int(name.split('.')[1]))
    fn_min = min(matching, key=lambda name: int(name.split('.')[1]))
    print(fn, fn_max, fn_min)
Output:
('foo', 'foo.0005.jpg', 'foo.0001.jpg')
('test', 'test.0002.jpg', 'test.0001.jpg')
('test2', 'test2.0004.jpg', 'test2.0002.jpg')
import re
# Sorting the names lexicographically puts zero-padded frame numbers in
# ascending order, so the first and last matches are the min and max.
# The dot before "jpg" is escaped so that it only matches a literal dot.
foo = re.findall(r'(foo\.\d+\.jpg)', '|'.join(sorted(files)))
foo[0], foo[-1]
Output :
('foo.0001.jpg', 'foo.0005.jpg')
Similarly you can check for min, max of other files:
# Same approach for the other two sequences; the dot before "jpg" is
# escaped so that it only matches a literal dot.
test = re.findall(r'(test\.\d+\.jpg)', '|'.join(sorted(files)))
test[0], test[-1]
test2 = re.findall(r'(test2\.\d+\.jpg)', '|'.join(sorted(files)))
test2[0], test2[-1]
Putting all together in one liner:
# Join the sorted names once, then collect (first, last) per prefix.
# Raw strings avoid the invalid "\." escape, the dot before "jpg" is
# escaped, and re.escape() keeps each prefix literal.
joined = '|'.join(sorted(files))
[(i[0], i[-1]) for i in [re.findall(r'(' + re.escape(j) + r'\.\d+\.jpg)', joined) for j in ['foo', 'test', 'test2']]]
Output:
[('foo.0001.jpg', 'foo.0005.jpg'),
('test.0001.jpg', 'test.0002.jpg'),
('test2.0002.jpg', 'test2.0004.jpg')]
def get_frame_number(files, name):
    """Return the lowest and highest frame number of one file sequence.

    A file belongs to the sequence when the text before its first dot
    equals ``name`` and the next dot-separated field is an integer. Every
    other file is reported on stdout as ignored.

    Args:
        files: Iterable of file names such as ``'foo.0001.jpg'``.
        name: Sequence prefix to look for, e.g. ``'foo'``.

    Returns:
        A ``(min_frame, max_frame)`` tuple of ints.

    Raises:
        IndexError: If no file of the sequence carries a frame number.
    """
    nums = []
    for each in files:
        parts = each.strip().split('.')
        if parts[0] != name or len(parts) < 2:
            print("Ignoring", each)
            continue
        try:
            nums.append(int(parts[1]))
        except ValueError:
            # Prefix matches but there is no frame number, e.g. 'tmp.txt'.
            print("Ignoring", each)
    if not nums:
        raise IndexError("no numbered frames found for {!r}".format(name))
    return (min(nums), max(nums))
Try this with :
# Print the (min, max) frame pair of each sequence. The function is named
# get_frame_number (singular); the plural spelling raised a NameError.
print(get_frame_number(files,"test"))
print(get_frame_number(files,"test2"))
print(get_frame_number(files,"foo"))
I'm a newbie in pyparsing and hope somebody can help me. The type of text I am trying to parse has the following structure:
I have key=value pairs in a line that can have one or more pairs. The values can be of many types such as string, int, float, list, dictionary. The key is always a string. Example of a line with 4 pairs:
mode='clip' clipzeros=True field='1331+0444=3C286' clipminmax=[0,1.2]
So I have defined my grammar and parser to be:
import pyparsing as pp
# Grammar: a line is one or more key=value pairs. A key is made of letters
# and underscores; a value is any run of printable characters.
key = pp.Word(pp.alphas+"_")
separator = pp.Literal("=").suppress()
value = pp.Word(pp.printables)
pair = key+separator+value
line = pp.OneOrMore(pair)
mytest = "mode='clip' clipzeros=True field='1331+0444=3C286' clipminmax=[0,1.2]"
res = line.parseString(mytest)
# Call form of print works on both Python 2 and Python 3.
print(res)
It returns this:
['mode', "'clip'", 'clipzeros', 'True', 'field', "'1331+0444=3C286'", 'clipminmax', '[0,1.2]']
Two things I want to have as results:
I'd like to have the results as a dictionary such as:
{"mode":"clip", "clipzeros":True, "field":"1331+0444=3C286", "clipminmax":[0,1.2]}
I'd like to keep the types of the values in the resulting dictionary. For example:
the type of value clipzeros is a boolean. The type of value clipminmax is a list.
Is this at all possible in pyparsing?
Thanks a lot for any help.
Sandra
Try using eval() to get your type.
import pyparsing as pp
# Same grammar as in the question: one or more key=value pairs.
key = pp.Word(pp.alphas+"_")
separator = pp.Literal("=").suppress()
value = pp.Word(pp.printables)
pair = key+separator+value
line = pp.OneOrMore(pair)
mytest = "mode='clip' clipzeros=True field='1331+0444=3C286' clipminmax=[0,1.2]"
res = line.parseString(mytest)
# The tokens alternate key, value, key, value, ...: pair them up and turn
# each value's text into a Python object. The closing parenthesis that was
# missing from this statement has been restored.
# NOTE(security): eval() executes arbitrary expressions. Only use it on
# trusted input; ast.literal_eval() is the safe choice for plain literals.
mydict = dict(zip(res[::2],[eval(x) for x in res[1::2]]))
Yields:
{'field': '1331+0444=3C286', 'mode': 'clip', 'clipzeros': True, 'clipminmax': [0, 1.2]}
Another Example:
# Token list as a parser would return it: key, value, key, value, ...
res = ['mode', "'clip'", 'clipzeros', 'True', 'field', "'R RQT'", 'clipminmax', '[0,1.2]']
# NOTE(security): eval() executes arbitrary expressions. Only use it on
# trusted input; ast.literal_eval() is the safe choice for plain literals.
mydict = dict(zip(res[::2],[eval(x) for x in res[1::2]]))
# Call form of print works on both Python 2 and Python 3.
print(mydict)
Yields:
{'field': 'R RQT', 'mode': 'clip', 'clipzeros': True, 'clipminmax': [0, 1.2]}
An alternative that does not use pyparsing (I don't have that module installed):
class Parser():
    """Minimal parser for lines of ``key<sep>value`` pairs.

    The line is split on the primary divider. A fragment that does not
    contain the secondary divider is treated as the continuation of the
    previous pair, which lets values contain the primary divider.

    Limitation: a continuation fragment that itself contains the secondary
    divider (``field='A B=C'`` with dividers ``' '`` and ``'='``) is taken
    for a new pair.
    """

    def __init__(self, primarydivider, secondarydivider):
        """Store the pair separator and the key/value separator."""
        self.prime = primarydivider
        self.second = secondarydivider

    def parse(self, string):
        """Parse ``string`` into a dict of evaluated values.

        Args:
            string: Text such as ``"mode='clip' clipzeros=True"``.

        Returns:
            A dict mapping each key to the Python object obtained by
            evaluating the text of its value.

        Raises:
            IndexError: If the first fragment lacks the secondary divider.
        """
        res = self.initialsplit(string)
        new = []
        for entry in res:
            if self.second not in entry:
                # Continuation of the previous value: glue it back with the
                # divider that split it (previously a hard-coded space).
                new[-1] += self.prime + entry
            else:
                new.append(entry)
        parsed = {}
        for entry in new:
            # Split on the FIRST secondary divider only, so a value such as
            # '1331+0444=3C286' keeps its own '=' characters.
            key, _, raw_value = entry.partition(self.second)
            # NOTE(security): eval() executes arbitrary expressions. Only
            # use it on trusted input; ast.literal_eval() is the safe
            # choice for plain literals.
            parsed[key] = eval(raw_value)
        return parsed

    def initialsplit(self, string):
        """Split ``string`` on the primary divider."""
        return string.split(self.prime)
# Demo: the value of "field" contains a space, which the parser glues back.
mytest = "mode='clip' clipzeros=True field='AEF D' clipminmax=[0,1.2]"
myParser = Parser(' ', '=')
parsed = myParser.parse(mytest)
# Call form of print works on both Python 2 and Python 3.
print(parsed)
Yields
{'field': 'AEF D', 'mode': 'clip', 'clipzeros': True, 'clipminmax': [0, 1.2]}
OP's Comments Edit:
Python 2.7.5 (default, May 15 2013, 22:43:36) [MSC v.1500 32 bit (Intel)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> mytest = "mode='clip' clipzeros=True field='R RQT' clipminmax=[0,1.2]"
>>> print mytest
mode='clip' clipzeros=True field='R RQT' clipminmax=[0,1.2]
>>>
Rather than use something as generic as Word(printables), I'd suggest defining specific expressions for different types of value literals:
from pyparsing import *
# What are the different kinds of values you can get?
int_literal = Combine(Optional('-') + Word(nums))
# Accept an optional leading minus sign, consistent with int_literal;
# without it "-1.5" would be read as the integer -1 followed by ".5".
float_literal = Regex(r'-?\d+\.\d*')
string_literal = quotedString
bool_literal = Keyword("True") | Keyword("False")
none_literal = Keyword("None")
# Lists and dicts are captured as their original source text.
list_literal = originalTextFor(nestedExpr('[',']'))
dict_literal = originalTextFor(nestedExpr('{','}'))
# Define an overall value expression from the different types of values.
# float_literal comes before int_literal so that "1.2" is not cut at "1".
value = (float_literal | int_literal | bool_literal | none_literal |
         string_literal | list_literal | dict_literal)
key = Word(alphas + '_')
def evalValue(tokens):
    """Convert the text of the first token into the Python literal it spells.

    Args:
        tokens: Sequence whose first item is the source text of a literal.

    Returns:
        A one-element list holding the evaluated object.
    """
    # literal_eval only accepts literals, unlike the built-in eval().
    from ast import literal_eval
    first = tokens[0]
    return [literal_eval(first)]
# Each pair is wrapped in a Group with the results names "key" and "value";
# evalValue is attached to the value expression as its parse action.
pair = Group(key("key") + '=' + value.setParseAction(evalValue)("value"))
# A line is one or more such pairs.
line = OneOrMore(pair)
Now parse your sample:
sample = """mode='clip' clipzeros=True field='1331+0444=3C286' clipminmax=[0,1.2]"""
result = line.parseString(sample)
# Show each parsed pair: the full dump, then key and value via their
# results names. Call form of print works on Python 2 and Python 3.
for r in result:
    print(r.dump())
    print(r.key)
    print(r.value)
    print("")
Prints:
['mode', '=', 'clip']
- key: mode
- value: clip
mode
clip
['clipzeros', '=', True]
- key: clipzeros
- value: True
clipzeros
True
['field', '=', '1331+0444=3C286']
- key: field
- value: 1331+0444=3C286
field
1331+0444=3C286
['clipminmax', '=', [0, 1.2]]
- key: clipminmax
- value: [0, 1.2]
clipminmax
[0, 1.2]
I have a string as
(device
(vfb
(xxxxxxxx)
(xxxxxxxx)
(location 0.0.0.0:5900)
)
)
(device
(console
(xxxxxxxx)
(xxxxxxxx)
(location 80)
)
)
I need to read the location line from "vfb" portion of the string. I have tried to use regular expression like
import re
re.findall(r'device.*?\vfb.*?\(.*?(.*?).*(.*?\))
But it doesn't give me the required output.
It's better to use a parser for problems like this. Fortunately, a parser would be rather trivial in your case:
def parse(source):
    """Parse parenthesised ``(key child ...)`` expressions into nested dicts.

    Each ``(key ...)`` group becomes ``{key: {...}}``, where the inner dict
    is the merge of its children; a bare word inside a group becomes
    ``{'value': word}``. Children that share a key overwrite each other.

    Args:
        source: Text holding zero or more top-level expressions.

    Returns:
        A list with one dict per top-level expression.

    Raises:
        ValueError: If a group is empty or its parentheses are unbalanced.
    """
    def expr(stack):
        # Consume one expression from the top of the reversed token stack.
        token = stack.pop()
        if token != '(':
            return {'value': token}
        if not stack or stack[-1] in ('(', ')'):
            raise ValueError("expected a key after '('")
        key, val = stack.pop(), {}
        while stack and stack[-1] != ')':
            val.update(expr(stack))
        if not stack:
            raise ValueError("missing ')' for group %r" % key)
        stack.pop()  # discard the closing ')'
        return {key: val}

    # Reverse once so tokens are taken with O(1) pop() from the end rather
    # than O(n) pop(0) from the front.
    stack = re.findall(r'\(|\)|[^\s()]+', source)[::-1]
    lst = []
    while stack:
        lst.append(expr(stack))
    return lst
Given the above snippet, this creates a structure like:
[{'device': {'vfb': {'location': {'value': '0.0.0.0:5900'}, 'xxxxxxxx': {}}}},
{'device': {'console': {'location': {'value': '80'}, 'xxxxxxxx': {}}}}]
Now you can iterate it and fetch whatever you need:
# Walk the parsed top-level expressions and read the location of the "vfb"
# device; entries that lack one of the keys on that path are skipped.
for item in parse(source):
try:
location = item['device']['vfb']['location']['value']
except KeyError:
# This entry has no device/vfb/location/value path.
pass
With that intro from Martijn Pieters, here is a pyparsing approach:
inputdata = """(device
(vfb
(xxxxxxxx)
(xxxxxxxx)
(location 0.0.0.0:5900)
)
)
(device
(console
(xxxxxxxx)
(xxxxxxxx)
(location 80)
)
)"""
from pyparsing import OneOrMore, nestedExpr
# A nestedExpr defaults to reading space-separated words within nested
# parentheses; OneOrMore lets it consume every top-level group of the input.
data = OneOrMore(nestedExpr()).parseString(inputdata)
# Show the parsed structure as plain nested lists.
print (data.asList())
# recursive search to walk parsed data to find desired entry
def findPath(seq, path):
    """Recursively search nested token groups for the entry named by ``path``.

    Each group in ``seq`` is a sequence whose first item is its name and
    whose remaining items are its contents (nested groups or plain values).

    Args:
        seq: Sequence of groups to search.
        path: Sequence of names leading to the wanted group.

    Returns:
        The first item after the name of the first group matching the whole
        path, or ``None`` when the path is empty or nothing matches.
    """
    if not path:
        return None
    for s in seq:
        # Plain string tokens are values, not groups. Indexing them would
        # compare a name against a single character of the value.
        if isinstance(s, str) or len(s) == 0:
            continue
        if s[0] != path[0]:
            continue
        if len(path) == 1:
            if len(s) > 1:
                return s[1]
            continue  # matching group carries no payload
        ret = findPath(s[1:], path[1:])
        if ret is not None:
            return ret
    return None
# Look up the location of the "vfb" device; the call form of print works on
# both Python 2 and Python 3.
print(findPath(data, "device/vfb/location".split('/')))
prints:
[['device', ['vfb', ['xxxxxxxx'], ['xxxxxxxx'], ['location', '0.0.0.0:5900']]],
['device', ['console', ['xxxxxxxx'], ['xxxxxxxx'], ['location', '80']]]]
0.0.0.0:5900
Maybe this gets you started:
In [84]: data = '(device(vfb(xxxxxxxx)(xxxxxxxx)(location 0.0.0.0:5900)))'
In [85]: m = re.search(r"""
.....: vfb
.....: .*
.....: \(
.....: location
.....: \s+
.....: (
.....: [^\)]+
.....: )
.....: \)""", data, flags=re.X)
In [86]: m.group(1)
Out[86]: '0.0.0.0:5900'
I have a dump of a data structure which i'm trying to convert into an XML. the structure has a number of nested structures within it. So i'm kind of lost on how to start because all the regex expressions that i can think of will not work on nested expressions.
For example, let's say there is a structure dump like this:
abc = (
bcd = (efg = 0, ghr = 5, lmn = 10),
ghd = 5,
zde = (dfs = 10, fge =20, dfg = (sdf = 3, ert = 5), juh = 0))
and i want to come out with an output like this:
< abc >
< bcd >
< efg >0< /efg >
< ghr >5< /ghr >
< lmn >10< /lmn >
< /bcd >
.....
< /abc >
So what would be a good approach to this? tokenizing the expression, a clever regex or using a stack?
Use pyparsing.
$ cat parsing.py
from pyparsing import nestedExpr
abc = """(
bcd = (efg = 0, ghr = 5, lmn 10),
ghd = 5,
zde = (dfs = 10, fge =20, dfg = (sdf = 3, ert = 5), juh = 0))"""
# nestedExpr() splits on whitespace inside nested parentheses; the call
# form of print works on both Python 2 and Python 3.
print(nestedExpr().parseString(abc).asList())
$ python parsing.py
[['bcd', '=', ['efg', '=', '0,', 'ghr', '=', '5,', 'lmn', '10'], ',', 'ghd', '=', '5,', 'zde', '=', ['dfs', '=', '10,', 'fge', '=20,', 'dfg', '=', ['sdf', '=', '3,', 'ert', '=', '5'], ',', 'juh', '=', '0']]]
Here is an alternate answer that uses pyparsing more idiomatically. Because it provides a detailed grammar for what inputs may be seen and what results should be returned, parsed data is not "messy." Thus toXML() needn't work as hard nor do any real cleanup.
# Call form of print works on both Python 2 and Python 3.
print("\n----- ORIGINAL -----\n")
dump = """
abc = (
bcd = (efg = 0, ghr = 5, lmn 10),
ghd = 5,
zde = (dfs = 10, fge =20, dfg = (sdf = 3, ert = 5), juh = 0))
""".strip()
print(dump)
print("\n----- PARSED INTO LIST -----\n")
from pyparsing import Word, alphas, nums, Optional, Forward, delimitedList, Group, Suppress
def Syntax():
    """Build and return the parser for ``name = value`` dump expressions.

    A value is either a number or a parenthesised, comma-separated list of
    further expressions, which is returned as a nested group. The ``=``
    sign is optional and, like the parentheses, is suppressed from the
    results.
    """
    # Terminals.
    identifier = Word(alphas)
    integer = Word(nums)
    optional_equals = Optional(Suppress('='))
    open_paren = Suppress('(')
    close_paren = Suppress(')')
    # The expression refers to itself through the group, hence the Forward.
    assignment = Forward()
    group = Group(open_paren + delimitedList(assignment) + close_paren)
    assignment << identifier + optional_equals + (integer | group)
    return assignment
parsed = Syntax().parseString(dump)
# Call form of print works on both Python 2 and Python 3.
print(parsed)
print("\n----- SERIALIZED INTO XML ----\n")
def toXML(part, level=0):
    """Serialize alternating ``tag, payload`` items into nested XML text.

    Args:
        part: Sequence of the form ``[tag, payload, tag, payload, ...]``. A
            payload is either a string (element text) or a nested sequence
            of the same form. ``part`` is not modified.
        level: Nesting depth, used to indent the emitted lines.

    Returns:
        The XML text, one element per line, each line newline-terminated.

    Raises:
        IndexError: If a tag has no payload after it.
    """
    from collections import deque

    # Consume a copy. Popping from ``part`` itself emptied the caller's
    # parse results, and list.pop(0) is O(n) per call.
    pending = deque(part)
    chunks = []
    indent = " " * level
    while pending:
        tag = pending.popleft()
        payload = pending.popleft()
        if isinstance(payload, str):
            insides = payload
        else:
            insides = "\n" + toXML(payload, level+1) + indent
        chunks.append("{indent}<{tag}>{insides}</{tag}>\n".format(
            indent=indent, tag=tag, insides=insides))
    return "".join(chunks)
# Call form of print works on both Python 2 and Python 3.
print(toXML(parsed))
The input and XML output is the same as my other answer. The data returned by parseString() is the only real change:
----- PARSED INTO LIST -----
['abc', ['bcd', ['efg', '0', 'ghr', '5', 'lmn', '10'], 'ghd', '5', 'zde',
['dfs', '10', 'fge', '20', 'dfg', ['sdf', '3', 'ert', '5'], 'juh', '0']]]
I don't think regular expressions are the best approach here, but for those who are curious, it can be done like this:
def expr(m):
    """Turn one flat ``a = 1, b = 2`` group into XML elements.

    Args:
        m: Regex match whose group 1 is the text between the parentheses.

    Returns:
        One ``<name>value</name>`` element per comma-separated item, joined
        by newlines.

    Raises:
        ValueError: If an item does not contain exactly one ``=``.
    """
    fragments = []
    for item in m.group(1).split(','):
        a, b = [piece.strip() for piece in item.split('=')]
        fragments.append('<%s>%s</%s>' % (a, b, a))
    return '\n'.join(fragments)
# Matches a parenthesised group that contains no further parentheses, i.e.
# an innermost group; group 1 is the text between the parentheses.
rr = r'\(([^()]*)\)'
# Rewrite innermost groups until no parenthesised group is left.
while re.search(rr, data):
data = re.sub(rr, expr, data)
Basically, we repeatedly replace lowermost parenthesis (no parens here) with chunks of xml until there's no more parenthesis. For simplicity, I also included the main expression in parenthesis, if this is not the case, just do data='(%s)' % data before parsing.
I like Igor Chubin's "use pyparsing" answer, because in general, regexps handle nested structures very poorly (though thg435's iterative replacement solution is a clever workaround).
But once pyparsing's done its thing, you then need a routine to walk the list and emit XML. It needs to be intelligent about the imperfections of pyparsing's results. For example, fge =20, doesn't yield the ['fge', '=', '20'] you'd like, but ['fge', '=20,']. Commas are sometimes also added in places that are unhelpful. Here's how I did it:
from pyparsing import nestedExpr
dump = """
abc = (
bcd = (efg = 0, ghr = 5, lmn 10),
ghd = 5,
zde = (dfs = 10, fge =20, dfg = (sdf = 3, ert = 5), juh = 0))
"""
dump = dump.strip()
# Call form of print works on both Python 2 and Python 3.
print("\n----- ORIGINAL -----\n")
print(dump)
# nestedExpr() needs the whole input to be one parenthesised group.
wrapped = dump if dump.startswith('(') else "({})".format(dump)
parsed = nestedExpr().parseString(wrapped).asList()
print("\n----- PARSED INTO LIST -----\n")
print(parsed)
def toXML(part, level=0):
    """Serialize loosely tokenized ``tag = payload,`` items into XML text.

    The input is the nested list produced by splitting the dump on
    whitespace, so ``=`` and ``,`` may appear as separate tokens or stuck
    to a neighbouring token; both are stripped while reading.

    Args:
        part: Sequence of tokens; nested sequences are nested payloads.
            ``part`` is not modified.
        level: Nesting depth, used to indent the emitted lines.

    Returns:
        The XML text, one element per line, each line newline-terminated.

    Raises:
        IndexError: If the tokens run out before a tag or payload is found.
    """
    from collections import deque

    # Consume a copy. Popping from ``part`` itself emptied the caller's
    # list, and list.pop(0) is O(n) per call.
    pending = deque(part)

    def grab_tag():
        # Next token without leading commas; '' when it was only a ','.
        return pending.popleft().lstrip(",")

    def grab_payload():
        # Next token; strings lose a leading '=' and a trailing ','.
        payload = pending.popleft()
        if isinstance(payload, str):
            payload = payload.lstrip("=").rstrip(",")
        return payload

    chunks = []
    indent = " " * level
    while pending:
        # Grab twice if needed: a bare ',' or '=' token strips down to ''
        # and the real tag or payload is the token after it.
        tag = grab_tag() or grab_tag()
        payload = grab_payload() or grab_payload()
        if isinstance(payload, str):
            insides = payload
        else:
            insides = "\n" + toXML(payload, level+1) + indent
        chunks.append("{indent}<{tag}>{insides}</{tag}>\n".format(
            indent=indent, tag=tag, insides=insides))
    return "".join(chunks)
# Call form of print works on both Python 2 and Python 3.
print("\n----- SERIALIZED INTO XML ----\n")
print(toXML(parsed[0]))
Resulting in:
----- ORIGINAL -----
abc = (
bcd = (efg = 0, ghr = 5, lmn 10),
ghd = 5,
zde = (dfs = 10, fge =20, dfg = (sdf = 3, ert = 5), juh = 0))
----- PARSED INTO LIST -----
[['abc', '=', ['bcd', '=', ['efg', '=', '0,', 'ghr', '=', '5,', 'lmn', '10'], ',', 'ghd', '=', '5,', 'zde', '=', ['dfs', '=', '10,', 'fge', '=20,', 'dfg', '=', ['sdf', '=', '3,', 'ert', '=', '5'], ',', 'juh', '=', '0']]]]
----- SERIALIZED INTO XML ----
<abc>
<bcd>
<efg>0</efg>
<ghr>5</ghr>
<lmn>10</lmn>
</bcd>
<ghd>5</ghd>
<zde>
<dfs>10</dfs>
<fge>20</fge>
<dfg>
<sdf>3</sdf>
<ert>5</ert>
</dfg>
<juh>0</juh>
</zde>
</abc>
You can use re module to parse nested expressions (though it is not recommended):
import re
def repl_flat(m):
    """Render one flat ``a = 1, b = 2`` group as XML elements.

    Args:
        m: Regex match whose group 1 is the text between the parentheses.

    Returns:
        One ``<name>value</name>`` element per comma-separated item, joined
        by newlines. An item without ``=`` gets an empty value.
    """
    rendered = []
    for s in m.group(1).split(','):
        # Text before the first '=' is the name, text after it the value.
        name, _, content = s.partition('=')
        rendered.append("<{0}>{1}</{0}>".format(name.strip(), content.strip()))
    return "\n".join(rendered)
def eval_nested(expr):
    """Rewrite nested parenthesised groups into XML, innermost first.

    Args:
        expr: Text made of (possibly nested) parenthesised groups.

    Returns:
        The text once no non-empty parenthesis-free group is left to replace.
    """
    innermost = r"\(([^)(]+)\)"
    while True:
        # Each pass replaces every group that holds no further parentheses.
        expr, replaced = re.subn(innermost, repl_flat, expr)
        if replaced == 0:
            return expr
Example
# Wrap the dump in parentheses so the outermost expression is converted
# too; the call form of print works on both Python 2 and Python 3.
print(eval_nested("(%s)" % (data,)))
Output
<abc><bcd><efg>0</efg>
<ghr>5</ghr>
<lmn>10</lmn></bcd>
<ghd>5</ghd>
<zde><dfs>10</dfs>
<fge>20</fge>
<dfg><sdf>3</sdf>
<ert>5</ert></dfg>
<juh>0</juh></zde></abc>