Convert lvm.conf to python dict using pyparsing - python

I'm trying to convert lvm.conf to python (JSON like) object.
LVM (Logical Volume Management) configuration file looks like this:
# Configuration section config.
# How LVM configuration settings are handled.
config {
# Configuration option config/checks.
# If enabled, any LVM configuration mismatch is reported.
# This implies checking that the configuration key is understood by
# LVM and that the value of the key is the proper type. If disabled,
# any configuration mismatch is ignored and the default value is used
# without any warning (a message about the configuration key not being
# found is issued in verbose mode only).
checks = 1
# Configuration option config/abort_on_errors.
# Abort the LVM process if a configuration mismatch is found.
abort_on_errors = 0
# Configuration option config/profile_dir.
# Directory where LVM looks for configuration profiles.
profile_dir = "/etc/lvm/profile"
}
local {
}
log {
verbose=0
silent=0
syslog=1
overwrite=0
level=0
indent=1
command_names=0
prefix=" "
activation=0
debug_classes=["memory","devices","activation","allocation","lvmetad","metadata","cache","locking","lvmpolld","dbus"]
}
I'd like to get Python dict, like this:
{ "section_name"":
{"value1" : 1,
"value2" : "some_string",
"value3" : [list, of, strings]}... and so on.}
The parser function:
def parseLvmConfig2(path="/etc/lvm/lvm.conf"):
try:
EQ, LBRACE, RBRACE, LQ, RQ = map(pp.Suppress, "={}[]")
comment = pp.Suppress("#") + pp.Suppress(pp.restOfLine)
configSection = pp.Word(pp.alphas + "_") + LBRACE
sectionKey = pp.Word(pp.alphas + "_")
sectionValue = pp.Forward()
entry = pp.Group(sectionKey + EQ + sectionValue)
real = pp.Regex(r"[+-]?\d+\.\d*").setParseAction(lambda x: float(x[0]))
integer = pp.Regex(r"[+-]?\d+").setParseAction(lambda x: int(x[0]))
listval = pp.Regex(r'(?:\[)(.*)?(?:\])').setParseAction(lambda x: eval(x[0]))
pp.dblQuotedString.setParseAction(pp.removeQuotes)
struct = pp.Group(pp.ZeroOrMore(entry) + RBRACE)
sectionValue << (pp.dblQuotedString | real | integer | listval)
parser = pp.ZeroOrMore(configSection + pp.Dict(struct))
res = parser.parseFile(path)
print(res)
except (pp.ParseBaseException, ) as e:
print("lvm.conf bad format {0}".format(e))
The result is messy and the question is, how to make pyparsing do the job, without additional logic?
UPDATE(SOLVED):
For anyone who wants to understand pyparsing better, please check #PaulMcG explanation below. (Thanks for pyparsing, Paul! )
import pyparsing as pp
def parseLvmConf(conf="/etc/lvm/lvm.conf", res_type="dict"):
EQ, LBRACE, RBRACE, LQ, RQ = map(pp.Suppress, "={}[]")
comment = "#" + pp.restOfLine
integer = pp.nums
real = pp.Word(pp.nums + "." + pp.nums)
pp.dblQuotedString.setParseAction(pp.removeQuotes)
scalar_value = real | integer | pp.dblQuotedString
list_value = pp.Group(LQ + pp.delimitedList(scalar_value) + RQ)
key = pp.Word(pp.alphas + "_", pp.alphanums + '_')
key_value = pp.Group(key + EQ + (scalar_value | list_value))
struct = pp.Forward()
entry = key_value | pp.Group(key + struct)
struct <<= pp.Dict(LBRACE + pp.ZeroOrMore(entry) + RBRACE)
parser = pp.Dict(pp.ZeroOrMore(entry))
parser.ignore(comment)
try:
#return lvm.conf as dict
if res_type == "dict":
return parser.parseFile(conf).asDict()
# return lvm.conf as list
elif res_type == "list":
return parser.parseFile(conf).asList()
else:
#return lvm.conf as ParseResults
return parser.parseFile(conf)
except (pp.ParseBaseException,) as e:
print("lvm.conf bad format {0}".format(e))

Step 1 should always be to at least rough out a BNF for the format you are going to parse. This really helps organize your thoughts, and gets you thinking about the structure and data you are parsing, before starting to write actual code.
Here is a BNF that I came up with for this config (it looks like a Python string because that makes it easy to paste into your code for future reference - but pyparsing does not work with or require such strings, they are purely a design tool):
BNF = '''
key_struct ::= key struct
struct ::= '{' (key_value | key_struct)... '}'
key_value ::= key '=' (scalar_value | list_value)
key ::= word composed of alphas and '_'
list_value ::= '[' scalar_value [',' scalar_value]... ']'
scalar_value ::= real | integer | double-quoted-string
comment ::= '#' rest-of-line
'''
Notice that the opening and closing {}'s and []'s are at the same level, rather than having an opener in one expression and a closer in another.
This BNF also will allow for structs nested within structs, which is not strictly required in the sample text you posted, but since your code looked to be supporting that, I included it.
Translating to pyparsing is pretty straightforward from here, working bottom-up through the BNF:
EQ, LBRACE, RBRACE, LQ, RQ = map(pp.Suppress, "={}[]")
comment = "#" + pp.restOfLine
integer = ppc.integer #pp.Regex(r"[+-]?\d+").setParseAction(lambda x: int(x[0]))
real = ppc.real #pp.Regex(r"[+-]?\d+\.\d*").setParseAction(lambda x: float(x[0]))
pp.dblQuotedString.setParseAction(pp.removeQuotes)
scalar_value = real | integer | pp.dblQuotedString
# `delimitedList(expr)` is a shortcut for `expr + ZeroOrMore(',' + expr)`
list_value = pp.Group(LQ + pp.delimitedList(scalar_value) + RQ)
key = pp.Word(pp.alphas + "_", pp.alphanums + '_')
key_value = pp.Group(key + EQ + (scalar_value | list_value))
struct = pp.Forward()
entry = key_value | pp.Group(key + struct)
struct <<= (LBRACE + pp.ZeroOrMore(entry) + RBRACE)
parser = pp.ZeroOrMore(entry)
parser.ignore(comment)
Running this code:
try:
res = parser.parseString(lvm_source)
# print(res.dump())
res.pprint()
return res
except (pp.ParseBaseException, ) as e:
print("lvm.conf bad format {0}".format(e))
Gives this nested list:
[['config',
['checks', 1],
['abort_on_errors', 0],
['profile_dir', '/etc/lvm/profile']],
['local'],
['log',
['verbose', 0],
['silent', 0],
['syslog', 1],
['overwrite', 0],
['level', 0],
['indent', 1],
['command_names', 0],
['prefix', ' '],
['activation', 0],
['debug_classes',
['memory',
'devices',
'activation',
'allocation',
'lvmetad',
'metadata',
'cache',
'locking',
'lvmpolld',
'dbus']]]]
I think the format you would prefer is one where you can access the values as keys in a nested dict or in a hierarchical object. Pyparsing has a class called Dict that will do this at parse time, so that the results names are automatically assigned for nested subgroups. Change these two lines to have their sub-entries automatically dict-ified:
struct <<= pp.Dict(LBRACE + pp.ZeroOrMore(entry) + RBRACE)
parser = pp.Dict(pp.ZeroOrMore(entry))
Now if we call dump() instead of pprint(), we'll see the hierarchical naming:
[['config', ['checks', 1], ['abort_on_errors', 0], ['profile_dir', '/etc/lvm/profile']], ['local'], ['log', ['verbose', 0], ['silent', 0], ['syslog', 1], ['overwrite', 0], ['level', 0], ['indent', 1], ['command_names', 0], ['prefix', ' '], ['activation', 0], ['debug_classes', ['memory', 'devices', 'activation', 'allocation', 'lvmetad', 'metadata', 'cache', 'locking', 'lvmpolld', 'dbus']]]]
- config: [['checks', 1], ['abort_on_errors', 0], ['profile_dir', '/etc/lvm/profile']]
- abort_on_errors: 0
- checks: 1
- profile_dir: '/etc/lvm/profile'
- local: ''
- log: [['verbose', 0], ['silent', 0], ['syslog', 1], ['overwrite', 0], ['level', 0], ['indent', 1], ['command_names', 0], ['prefix', ' '], ['activation', 0], ['debug_classes', ['memory', 'devices', 'activation', 'allocation', 'lvmetad', 'metadata', 'cache', 'locking', 'lvmpolld', 'dbus']]]
- activation: 0
- command_names: 0
- debug_classes: ['memory', 'devices', 'activation', 'allocation', 'lvmetad', 'metadata', 'cache', 'locking', 'lvmpolld', 'dbus']
- indent: 1
- level: 0
- overwrite: 0
- prefix: ' '
- silent: 0
- syslog: 1
- verbose: 0
You can then access the fields as res['config']['checks'] or res.log.indent.

Related

PLY Lex: ID could be anything

I have a following simple format:
BLOCK ID {
SUBBLOCK ID {
SUBSUBBLOCK ID {
SOME STATEMENTS;
};
};
};
I configured ply to work with this format. But the issue is that ID could be any string including "BLOCK", "SUBBLOCK", etc.
In the lexer I define ID as:
#TOKEN(r'[a-zA-Z_][a-zA-Z_0-9]*')
def t_ID(self, t):
t.type = self.keyword_map.get(t.value, "ID")
return t
But it means that BLOCK word will not be allowed as a block name.
How I can overcome this issue?
The easiest solution is to create a non-terminal name to be used instead of ID in productions which need a name, such as block : BLOCK name braced_statements:
# Docstring is added later
def p_name(self, p):
p[0] = p[1]
Then you compute the productions for name and assign them to p_name's docstring by executing this before you generate the parser:
Parser.p_name.__doc__ = '\n| '.join(
['name : ID']
+ list(Lexer.keyword_map.values())
)
I probably dont understand your question
but i think something like the following would work (its been a long time since
I messed with PLY so keep in mind this is pseudocode
FUNCTION_ARGS = zeroOrMore(lazy('STATEMENT'),sep=',')
FUNCTION_CALL = t_ID + lparen + FUNCTION_ARGS + rparen
STATEMENT= FUNCTION_CALL | t_Literal | t_ID
SUBBLOCK = Literal('SUBBLOCK') + t_ID + lbrace + STATEMENT + rbrace
BLOCK = Literal('BLOCK') + lbrace + oneOrMore(SUBBLOCK) + rbrace

PyParsing: parse if not a keyword

I am trying to parse a file as follows:
testp.txt
title = Test Suite A;
timeout = 10000
exp_delay = 500;
log = TRUE;
sect
{
type = typeA;
name = "HelloWorld";
output_log = "c:\test\out.log";
};
sect
{
name = "GoodbyeAll";
type = typeB;
comm1_req = 0xDEADBEEF;
comm1_resp = (int, 1234366);
};
The file first contains a section with parameters and then some sects. I can parse a file containing just parameters and I can parse a file just containing sects but I can't parse both.
from pyparsing import *
from pathlib import Path
command_req = Word(alphanums)
command_resp = "(" + delimitedList(Word(alphanums)) + ")"
kW = Word(alphas+'_', alphanums+'_') | command_req | command_resp
keyName = ~Literal("sect") + Word(alphas+'_', alphanums+'_') + FollowedBy("=")
keyValue = dblQuotedString.setParseAction( removeQuotes ) | OneOrMore(kW,stopOn=LineEnd())
param = dictOf(keyName, Suppress("=")+keyValue+Optional(Suppress(";")))
node = Group(Literal("sect") + Literal("{") + OneOrMore(param) + Literal("};"))
final = OneOrMore(node) | OneOrMore(param)
param.setDebug()
p = Path(__file__).with_name("testp.txt")
with open(p) as f:
try:
x = final.parseFile(f, parseAll=True)
print(x)
print("...")
dx = x.asDict()
print(dx)
except ParseException as pe:
print(pe)
The issue I have is that param matches against sect so it expects a =. So I tried putting in ~Literal("sect") in keyName but that just leads to another error:
Exception raised:Found unwanted token, "sect", found '\n' (at char 188), (line:4, col:56)
Expected end of text, found 's' (at char 190), (line:6, col:1)
How do I get it use one parse method for sect and another (param) if not sect?
My final goal would be to have the whole lot in a Dict with the global params and sects included.
EDIT
Think I've figured it out:
This line...
final = OneOrMore(node) | OneOrMore(param)
...should be:
final = ZeroOrMore(param) + ZeroOrMore(node)
But I wonder if there is a more structured way (as I'd ultimately like a dict)?

Dealing with ZeroOrMore in pyparsing

I'm trying to parse pactl list with pyparsing: So far all parse is working correctly but I cannot make ZeroOrMore to work correctly.
I can find foo: or foo: bar and try to deal with that with ZeroOrMore but it doesn't work, I have to add special case "Argument:" to find results without value, but there're Argument: foo results (with value) so it will not work, and I expect any other property to exist without value.
With this definition, and a fixed pactl list output:
#!/usr/bin/env python
#
# parsing pactl list
#
from pyparsing import *
import os
from subprocess import check_output
import sys
data = '''
Module #6
Argument:
Name: module-alsa-card
Usage counter: 0
Properties:
module.author = "Lennart Poettering"
module.description = "ALSA Card"
module.version = "14.0-rebootstrapped"
'''
indentStack = [1]
stmt = Forward()
identifier = Word(alphanums+"-_.")
sect_def = Group(Group(identifier) + Suppress("#") + Group(Word(nums)))
inner_section = indentedBlock(stmt, indentStack)
section = (sect_def + inner_section)
value = Group(Group(Combine(OneOrMore(identifier|White(' ')))) + Suppress(":") + Group(Combine(ZeroOrMore(Word(alphanums+'-/=_".')|White(' ', max=1)))))
prop_name = Literal("Properties:")
prop_section = indentedBlock(stmt, indentStack)
prop_val = Group(Group(identifier) + Suppress("=") + Group(Combine(OneOrMore(Word(alphanums+'-"/.')|White(' \t')))))
prop = (prop_name + prop_section)
stmt << ( section | prop | ("Argument:") | value | prop_val )
syntax = OneOrMore(stmt)
parseTree = syntax.parseString(data)
parseTree.pprint()
This gets:
$ ./pactl.py
Module #6
Argument:
Name: module-alsa-card
Usage counter: 0
Properties:
module.author = "Lennart Poettering"
module.description = "ALSA Card"
module.version = "14.0-rebootstrapped"
[[['Module'], ['6']],
[['Argument:'],
[[['Name'], ['module-alsa-card']]],
[[['Usage counter'], ['0']]],
['Properties:',
[[[['module.author'], ['"Lennart Poettering"']]],
[[['module.description'], ['"ALSA Card"']]],
[[['module.version'], ['"14.0-rebootstrapped"']]]]]]]
So far so good, but removing special case for Argument: it gets into error, as ZeroOrMore doesn't behave as expected:
#!/usr/bin/env python
#
# parsing pactl list
#
from pyparsing import *
import os
from subprocess import check_output
import sys
data = '''
Module #6
Argument:
Name: module-alsa-card
Usage counter: 0
Properties:
module.author = "Lennart Poettering"
module.description = "ALSA Card"
module.version = "14.0-rebootstrapped"
'''
indentStack = [1]
stmt = Forward()
identifier = Word(alphanums+"-_.")
sect_def = Group(Group(identifier) + Suppress("#") + Group(Word(nums)))
inner_section = indentedBlock(stmt, indentStack)
section = (sect_def + inner_section)
value = Group(Group(Combine(OneOrMore(identifier|White(' ')))) + Suppress(":") + Group(Combine(ZeroOrMore(Word(alphanums+'-/=_".')|White(' ', max=1))))).setDebug()
prop_name = Literal("Properties:")
prop_section = indentedBlock(stmt, indentStack)
prop_val = Group(Group(identifier) + Suppress("=") + Group(Combine(OneOrMore(Word(alphanums+'-"/.')|White(' \t')))))
prop = (prop_name + prop_section)
stmt << ( section | prop | value | prop_val )
syntax = OneOrMore(stmt)
parseTree = syntax.parseString(data)
parseTree.pprint()
This results in:
$ ./pactl.py
Module #6
Argument:
Name: module-alsa-card
Usage counter: 0
Properties:
module.author = "Lennart Poettering"
module.description = "ALSA Card"
module.version = "14.0-rebootstrapped"
Match Group:({Group:(Combine:({{W:(ABCD...) | <SP>}}...)) Suppress:(":") Group:(Combine:([{W:(ABCD...) | <SP>}]...))}) at loc 19(3,9)
Matched Group:({Group:(Combine:({{W:(ABCD...) | <SP>}}...)) Suppress:(":") Group:(Combine:([{W:(ABCD...) | <SP>}]...))}) -> [[['Argument'], ['Name']]]
Match Group:({Group:(Combine:({{W:(ABCD...) | <SP>}}...)) Suppress:(":") Group:(Combine:([{W:(ABCD...) | <SP>}]...))}) at loc 1(2,1)
Exception raised:Expected ":", found '#' (at char 8), (line:2, col:8)
Traceback (most recent call last):
File "/home/alberto/projects/node/pacmd_list_json/./pactl.py", line 55, in <module>
parseTree = syntax.parseString(partial)
File "/usr/local/lib/python3.9/site-packages/pyparsing.py", line 1955, in parseString
raise exc
File "/usr/local/lib/python3.9/site-packages/pyparsing.py", line 6336, in checkUnindent
raise ParseException(s, l, "not an unindent")
pyparsing.ParseException: Expected {{Group:({Group:(W:(ABCD...)) Suppress:("#") Group:(W:(0123...))}) indented block} | {"Properties:" indented block} | Group:({Group:(Combine:({{W:(ABCD...) | <SP>}}...)) Suppress:(":") Group:(Combine:([{W:(ABCD...) | <SP>}]...))}) | Group:({Group:(W:(ABCD...)) Suppress:("=") Group:(Combine:({{W:(ABCD...) | <SP><TAB>}}...))})}, found ':' (at char 41), (line:4, col:13)
See from setDebug value grammar ZeroOrMore is getting the tokens from next line [[['Argument'], ['Name']]]
I tried LineEnd() and other tricks but none works.
Any idea on how to deal with ZeroOrMore to stop on LineEnd() or without special cases?
NOTE: Real output can be retrieved using:
env = os.environ.copy()
env['LANG'] = 'C'
data = check_output(
['pactl', 'list'], universal_newlines=True, env=env)
indentedBlock is not the easiest pyparsing element to work with. But there are a few things that you are doing that are getting in your way.
To debug this, I broke down some of your more complex expressions, use setName() to give them names, and then added .setDebug(). Like this:
identifier = Word(alphas, alphanums+"-_.").setName("identifier").setDebug()
This will tell pyparsing to output a message whenever this expression is about to be matched, if it matched successfully, or if not, the exception that was raised.
Match identifier at loc 1(2,1)
Matched identifier -> ['Module']
Match identifier at loc 15(3,5)
Matched identifier -> ['Argument']
Match identifier at loc 15(3,5)
Matched identifier -> ['Argument']
Match identifier at loc 23(3,13)
Exception raised:Expected identifier, found ':' (at char 23), (line:3, col:13)
It looks like these expressions are messing up the indentedBlock matching, by processing whitespace that should be indentation space:
Combine(OneOrMore(Word(alphanums+'-"/.')|White(' \t')))
The " character in the Word and the whitespace lead me to believe you are trying to match quoted strings. I replaced this expression with:
Combine(OneOrMore(Word(alphas, alphanums+'-/.') | quotedString))
You also need to take care not to read past the end of the line, or you'll also mess up the indentedBlock indentation tracking. I added this expression for a newline at the top:
NL = LineEnd()
and then used it as the stopOn argument to OneOrMore and ZeroOrMore:
prop_val_value = Combine(OneOrMore(Word(alphas, alphanums+'-/.') | quotedString(), stopOn=NL)).setName("prop_val_value")#.setDebug()
prop_val = Group(identifier + Suppress("=") + Group(prop_val_value)).setName("prop_val")#.setDebug()
Here is the parser I ended up with:
indentStack = [1]
stmt = Forward()
NL = LineEnd()
identifier = Word(alphas, alphanums+"-_.").setName("identifier").setDebug()
sect_def = Group(Group(identifier) + Suppress("#") + Group(Word(nums))).setName("sect_def")#.setDebug()
inner_section = indentedBlock(stmt, indentStack)
section = (sect_def + inner_section)
#~ value = Group(Group(Combine(OneOrMore(identifier|White(' ')))) + Suppress(":") + Group(Combine(ZeroOrMore(Word(alphanums+'-/=_".')|White(' ', max=1))))).setDebug()
value_label = originalTextFor(OneOrMore(identifier)).setName("value_label")#.setDebug()
value = Group(value_label
+ Suppress(":")
+ Optional(~NL + Group(Combine(ZeroOrMore(Word(alphanums+'-/=_.') | quotedString(), stopOn=NL))))).setName("value")#.setDebug()
prop_name = Literal("Properties:")
prop_section = indentedBlock(stmt, indentStack)
#~ prop_val = Group(Group(identifier) + Suppress("=") + Group(Combine(OneOrMore(Word(alphanums+'-"/.')|White(' \t')))))
prop_val_value = Combine(OneOrMore(Word(alphas, alphanums+'-/.') | quotedString(), stopOn=NL)).setName("prop_val_value")#.setDebug()
prop_val = Group(identifier + Suppress("=") + Group(prop_val_value)).setName("prop_val")#.setDebug()
prop = (prop_name + prop_section).setName("prop")#.setDebug()
stmt << ( section | prop | value | prop_val )
Which gives this:
[[['Module'], ['6']],
[[['Argument']],
[['Name', ['module-alsa-card']]],
[['Usage counter', ['0']]],
['Properties:',
[[['module.author', ['"Lennart Poettering"']]],
[['module.description', ['"ALSA Card"']]],
[['module.version', ['"14.0-rebootstrapped"']]]]]]]

pattern to dictionary of lists Python

I have a file like this
module1 instance1(.wire1 (connectionwire1), .wire2 (connectionwire2),.... ,wire100 (connectionwire100)) ; module 2 instance 2(.wire1 (newconnectionwire1), .wire2 (newconnectionwire2),.... ,wire99 (newconnectionwire99))
Ther wires are repeated along modules. There can be many modules.
I want to build a dictionary like this (not every wire in 2nd module is a duplicate).
[wire1:[(module1, instance1, connection1), (module2, instance2,newconnection1), wire2:[(module1 instance1 connection2),(module2, instance2,newconnection1)]... wire99:module2, instance2, connection99), ]
I am splitting the string on ; then splitting on , and then ( to get wire and connectionwire strings . I am not sure how to fill the data structure though so the wire is the key and module, instancename and connection are elements.
Goal- get this datastructure- [ wire: (module, instance, connectionwire) ]
filedata=file.read()
realindex=list(find_pos(filedata,';'))
tempindex=0
for l in realindex:
module=filedata[tempindex:l]
modulename=module.split()[0]
openbracketindex=module.find("(")
closebracketindex=module.strip("\n").find(");")
instancename=module[:openbracketindex].split()[1]
tempindex=l
tempwires=module[openbracketindex:l+1]
#got to split wires on commas
for tempw in tempwires.split(","):
wires=tempw
listofwires.append(wires)
Using the re module.
import re
from collections import defaultdict
s = "module1 instance1(.wire1 (connectionwire1), .wire2 (connectionwire2), .wire100 (connectionwire100)) ; module2 instance2(.wire1 (newconnectionwire1), .wire2 (newconnectionwire2), wire99 (newconnectionwire99))'
d = defaultdict(list)
module_pattern = r'(\w+)\s(\w+)\(([^;]+)'
mod_rex = re.compile(module_pattern)
wire_pattern = r'\.(\w+)\s\(([^\)]+)'
wire_rex = re.compile(wire_pattern)
for match in mod_rex.finditer(s):
#print '\n'.join(match.groups())
module, instance, wires = match.groups()
for match in wire_rex.finditer(wires):
wire, connection = match.groups()
#print '\t', wire, connection
d[wire].append((module, instance, connection))
for k, v in d.items():
print k, ':', v
Produces
wire1 : [('module1', 'instance1', 'connectionwire1'), ('module2', 'instance2', 'newconnectionwire1')]
wire2 : [('module1', 'instance1', 'connectionwire2'), ('module2', 'instance2', 'newconnectionwire2')]
wire100 : [('module1', 'instance1', 'connectionwire100')]
Answer provided by wwii using re is correct. I'm sharing an example of how you can solve your problem using pyparsing module which makes parsing human readable and easy to do.
from pyparsing import Word, alphanums, Optional, ZeroOrMore, Literal, Group, OneOrMore
from collections import defaultdict
s = 'module1 instance1(.wire1 (connectionwire1), .wire2 (connectionwire2), .wire100 (connectionwire100)) ; module2 instance2(.wire1 (newconnectionwire1), .wire2 (newconnectionwire 2), .wire99 (newconnectionwire99))'
connection = Word(alphanums)
wire = Word(alphanums)
module = Word(alphanums)
instance = Word(alphanums)
dot = Literal(".").suppress()
comma = Literal(",").suppress()
lparen = Literal("(").suppress()
rparen = Literal(")").suppress()
semicolon = Literal(";").suppress()
wire_connection = Group(dot + wire("wire") + lparen + connection("connection") + rparen + Optional(comma))
wire_connections = Group(OneOrMore(wire_connection))
module_instance = Group(module("module") + instance("instance") + lparen + ZeroOrMore(wire_connections("wire_connections")) + rparen + Optional(semicolon))
module_instances = OneOrMore(module_instance)
results = module_instances.parseString(s)
# create a dict
d = defaultdict(list)
for r in results:
m = r['module']
i = r['instance']
for wc in r['wire_connections']:
w = wc['wire']
c = wc['connection']
d[w].append((m, i, c))
print d
Output:
defaultdict(<type 'list'>, {'wire1': [('module1', 'instance1', 'connectionwire1'), ('module2', 'instance2', 'newconnectionwire1')], 'wire2': [('module1', 'instance1', 'connectionwire2'), ('module2', 'instance2', 'newconnectionwire2')], 'wire100': [('module1', 'instance1', 'connectionwire100')], 'wire99': [('module2', 'instance2', 'newconnectionwire99')]})

Python: load text as python object [duplicate]

This question already has answers here:
How to convert raw javascript object to a dictionary?
(6 answers)
Closed 9 months ago.
I have a such text to load: https://sites.google.com/site/iminside1/paste
I'd prefer to create a python dictionary from it, but any object is OK. I tried pickle, json and eval, but didn't succeeded. Can you help me with this?
Thanks!
The results:
a = open("the_file", "r").read()
json.loads(a)
ValueError: Expecting property name: line 1 column 1 (char 1)
pickle.loads(a)
KeyError: '{'
eval(a)
File "<string>", line 19
from: {code: 'DME', airport: "Домодедово", city: 'Москва', country: 'Россия', terminal: ''},
^
SyntaxError: invalid syntax
Lifted almost straight from the pyparsing examples page:
# read text from web page
import urllib
page = urllib.urlopen("https://sites.google.com/site/iminside1/paste")
html = page.read()
page.close()
start = html.index("<pre>")+len("<pre>")+3 #skip over 3-byte header
end = html.index("</pre>")
text = html[start:end]
print text
# parse dict-like syntax
from pyparsing import (Suppress, Regex, quotedString, Word, alphas,
alphanums, oneOf, Forward, Optional, dictOf, delimitedList, Group, removeQuotes)
LBRACK,RBRACK,LBRACE,RBRACE,COLON,COMMA = map(Suppress,"[]{}:,")
integer = Regex(r"[+-]?\d+").setParseAction(lambda t:int(t[0]))
real = Regex(r"[+-]?\d+\.\d*").setParseAction(lambda t:float(t[0]))
string_ = Word(alphas,alphanums+"_") | quotedString.setParseAction(removeQuotes)
bool_ = oneOf("true false").setParseAction(lambda t: t[0]=="true")
item = Forward()
key = string_
dict_ = LBRACE - Optional(dictOf(key+COLON, item+Optional(COMMA))) + RBRACE
list_ = LBRACK - Optional(delimitedList(item)) + RBRACK
item << (real | integer | string_ | bool_ | Group(list_ | dict_ ))
result = item.parseString(text,parseAll=True)[0]
print result.data[0].dump()
print result.data[0].segments[0].dump(indent=" ")
print result.data[0].segments[0].flights[0].dump(indent=" - ")
print result.data[0].segments[0].flights[0].flightLegs[0].dump(indent=" - - ")
for seg in result.data[6].segments:
for flt in seg.flights:
fltleg = flt.flightLegs[0]
print "%(airline)s %(airlineCode)s %(flightNo)s" % fltleg,
print "%s -> %s" % (fltleg["from"].code, fltleg["to"].code)
Prints:
[['index', 0], ['serviceClass', '??????'], ['prices', [3504, ...
- eTicketing: true
- index: 0
- prices: [3504, 114.15000000000001, 89.769999999999996]
- segments: [[['indexSegment', 0], ['stopsCount', 0], ['flights', ...
- serviceClass: ??????
[['indexSegment', 0], ['stopsCount', 0], ['flights', [[['index', 0], ...
- flights: [[['index', 0], ['time', 'PT2H45M'], ['minAvailSeats', 9], ...
- indexSegment: 0
- stopsCount: 0
- [['index', 0], ['time', 'PT2H45M'], ['minAvailSeats', 9], ['flight...
- - flightLegs: [[['flightNo', '309'], ['eTicketing', 'true'], ['air...
- - index: 0
- - minAvailSeats: 9
- - stops: []
- - time: PT2H45M
- - [['flightNo', '309'], ['eTicketing', 'true'], ['airplane', 'Boe...
- - - airline: ?????????
- - - airlineCode: UN
- - - airplane: Boeing 737-500
- - - availSeats: 9
- - - classCode: I
- - - eTicketing: true
- - - fareBasis: IPROW
- - - flightClass: ECONOMY
- - - flightNo: 309
- - - from: - - [['code', 'DME'], ['airport', '??????????'], ...
- - - airport: ??????????
- - - city: ??????
- - - code: DME
- - - country: ??????
- - - terminal:
- - - fromDate: 2010-10-15
- - - fromTime: 10:40:00
- - - time:
- - - to: - - [['code', 'TXL'], ['airport', 'Berlin-Tegel'], ...
- - - airport: Berlin-Tegel
- - - city: ??????
- - - code: TXL
- - - country: ????????
- - - terminal:
- - - toDate: 2010-10-15
- - - toTime: 11:25:00
airBaltic BT 425 SVO -> RIX
airBaltic BT 425 SVO -> RIX
airBaltic BT 423 SVO -> RIX
airBaltic BT 423 SVO -> RIX
EDIT: fixed grouping and expanded output dump to show how to access individual key fields of results, either by index (within list) or as attribute (within dict).
If you really have to load the bulls... this data is (see my comment), you's propably best of with a regex adding missing quotes. Something like r"([a-zA-Z_][a-zA-Z_0-9]*)\s*\:" to find things to quote and r"\'\1\'\:" as replacement (off the top of my head, I have to test it first).
Edit: After some troulbe with backward-references in Python 3.1, I finally got it working with these:
>>> pattern = r"([a-zA-Z_][a-zA-Z_0-9]*)\s*\:"
>>> test = '{"foo": {bar: 1}}'
>>> repl = lambda match: '"{}":'.format(match.group(1))
>>> eval(re.sub(pattern, repl, test))
{'foo': {'bar': 1}}
Till now with help of delnan and a little investigation I can load it into dict with eval:
pattern = r"\b(?P<word>\w+):"
x = re.sub(pattern, '"\g<word>":',open("the_file", "r").read())
y = x.replace("true", '"true"')
d = eval(y)
Still looking for more efficient and maybe simpler solution.. I don't like to use "eval" for some reasons.
Extension of the DominiCane's version:
import re
quote_keys_regex = re.compile(r'([\{\s,])(\w+)(:)')
def js_variable_to_python(js_variable):
"""Convert a javascript variable into JSON and then load the value"""
# when in_string is not None, it contains the character that has opened the string
# either simple quote or double quote
in_string = None
# cut the string:
# r"""{ a:"f\"irst", c:'sec"ond'}"""
# becomes
# ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
l = re.split(r'(["\'])', js_variable)
# previous part (to check the escape character antislash)
previous_p = ""
for i, p in enumerate(l):
# parse characters inside a ECMA string
if in_string:
# we are in a JS string: replace the colon by a temporary character
# so quote_keys_regex doesn't have to deal with colon inside the JS strings
l[i] = l[i].replace(':', chr(1))
if in_string == "'":
# the JS string is delimited by simple quote.
# This is not supported by JSON.
# simple quote delimited string are converted to double quote delimited string
# here, inside a JS string, we escape the double quote
l[i] = l[i].replace('"', r'\"')
# deal with delimieters and escape character
if not in_string and p in ('"', "'"):
# we are not in string
# but p is double or simple quote
# that's the start of a new string
# replace simple quote by double quote
# (JSON doesn't support simple quote)
l[i] = '"'
in_string = p
continue
if p == in_string:
# we are in a string and the current part MAY close the string
if len(previous_p) > 0 and previous_p[-1] == '\\':
# there is an antislash just before: the JS string continue
continue
# the current p close the string
# replace simple quote by double quote
l[i] = '"'
in_string = None
# update previous_p
previous_p = p
# join the string
s = ''.join(l)
# add quote arround the key
# { a: 12 }
# becomes
# { "a": 12 }
s = quote_keys_regex.sub(r'\1"\2"\3', s)
# replace the surogate character by colon
s = s.replace(chr(1), ':')
# load the JSON and return the result
return json.loads(s)
It deals only with int, null and string. I don't know about float.
Note that the usage chr(1): the code doesn't work if this character in js_variable.

Categories

Resources