I have a text file which I want to convert to a nested JSON structure. The text file is:
Report_for Reconciliation
Execution_of application_1673496470638_0001
Spark_version 2.4.7-amzn-0
Java_version 1.8.0_352 (Amazon.com Inc.)
Start_time 2023-01-12 09:45:13.360000
Spark Properties:
Job_ID 0
Submission_time 2023-01-12 09:47:20.148000
Run_time 73957ms
Result JobSucceeded
Number_of_stages 1
Stage_ID 0
Number_of_tasks 16907
Number_of_executed_tasks 16907
Completion_time 73207ms
Stage_executed parquet at RawDataPublisher.scala:53
Job_ID 1
Submission_time 2023-01-12 09:48:34.177000
Run_time 11525ms
Result JobSucceeded
Number_of_stages 2
Stage_ID 1
Number_of_tasks 16907
Number_of_executed_tasks 0
Completion_time 0ms
Stage_executed parquet at RawDataPublisher.scala:53
Stage_ID 2
Number_of_tasks 300
Number_of_executed_tasks 300
Completion_time 11520ms
Stage_executed parquet at RawDataPublisher.scala:53
Job_ID 2
Submission_time 2023-01-12 09:48:46.908000
Run_time 218358ms
Result JobSucceeded
Number_of_stages 1
Stage_ID 3
Number_of_tasks 1135
Number_of_executed_tasks 1135
Completion_time 218299ms
Stage_executed parquet at RawDataPublisher.scala:53
I want the output to be:
{
    "Report_for": "Reconciliation",
    "Execution_of": "application_1673496470638_0001",
    "Spark_version": "2.4.7-amzn-0",
    "Java_version": "1.8.0_352 (Amazon.com Inc.)",
    "Start_time": "2023-01-12 09:45:13.360000",
    "Job_ID 0": {
        "Submission_time": "2023-01-12 09:47:20.148000",
        "Run_time": "73957ms",
        "Result": "JobSucceeded",
        "Number_of_stages": "1",
        "Stage_ID 0": {
            "Number_of_tasks": "16907",
            "Number_of_executed_tasks": "16907",
            "Completion_time": "73207ms",
            "Stage_executed": "parquet at RawDataPublisher.scala:53"
        }
    }
}
I tried the defaultdict method, but it generated JSON whose values were lists, which I could not build a table on. Here's what I did:
import json
from collections import defaultdict

INPUT = 'demofile.txt'
dict1 = defaultdict(list)

def convert():
    with open(INPUT) as f:
        for line in f:
            command, description = line.strip().split(None, 1)
            dict1[command].append(description.strip())

OUTPUT = open("demo1file.json", "w")
json.dump(dict1, OUTPUT, indent=4, sort_keys=False)
and was getting this:
"Report_for": [ "Reconciliation" ],
"Execution_of": [ "application_1673496470638_0001" ],
"Spark_version": [ "2.4.7-amzn-0" ],
"Java_version": [ "1.8.0_352 (Amazon.com Inc.)" ],
"Start_time": [ "2023-01-12 09:45:13.360000" ],
"Job_ID": [
"0",
"1",
"2", ....
]]]
I just want to convert my text to the above json format so that I can build a table on top of it.
There's no way Python or one of its libraries can figure out your nesting requirements when a flat text is given as input. How should it know that Stages are inside Jobs, for example? You will have to programmatically tell your application how it works.
I hacked together an example which should work; you can go from there (assuming input_str is what you posted as your file content):
# define your nesting structure
nesting = {'Job_ID': {'Stage_ID': {}}}
upper_nestings = []
upper_nesting_keys = []

# your resulting dictionary
result_dict = {}

# your "working" dictionaries
current_nesting = nesting
working_dict = result_dict

# parse each line of the input string
for line_str in input_str.split('\n'):
    # key is the first word, value are all consecutive words
    line = line_str.split(' ')
    # if key is in nesting, create a new sub-dict; all consecutive entries are part of the sub-dict
    if line[0] in current_nesting.keys():
        current_nesting = current_nesting[line[0]]
        upper_nestings.append(line[0])
        upper_nesting_keys.append(line[1])
        working_dict[line_str] = {}
        working_dict = working_dict[line_str]
    else:
        # if a new "parallel" or "upper" nesting is detected, reset your nesting structure
        if line[0] in upper_nestings:
            nests = upper_nestings[:upper_nestings.index(line[0])]
            keys = upper_nesting_keys[:upper_nestings.index(line[0])]
            working_dict = result_dict
            for nest in nests:
                working_dict = working_dict[' '.join([nest, keys[nests.index(nest)]])]
            upper_nestings = upper_nestings[:upper_nestings.index(line[0]) + 1]
            upper_nesting_keys = upper_nesting_keys[:upper_nestings.index(line[0])]
            upper_nesting_keys.append(line[1])
            current_nesting = nesting
            for nest in upper_nestings:
                current_nesting = current_nesting[nest]
            working_dict[line_str] = {}
            working_dict = working_dict[line_str]
            continue
        working_dict[line[0]] = ' '.join(line[1:])

print(result_dict)
Results in:
{
'Report_for': 'Reconciliation',
'Execution_of': 'application_1673496470638_0001',
'Spark_version': '2.4.7-amzn-0',
'Java_version': '1.8.0_352 (Amazon.com Inc.)',
'Start_time': '2023-01-12 09:45:13.360000',
'Spark': 'Properties: ',
'Job_ID 0': {
'Submission_time': '2023-01-12 09:47:20.148000',
'Run_time': '73957ms',
'Result': 'JobSucceeded',
'Number_of_stages': '1',
'Stage_ID 0': {
'Number_of_tasks': '16907',
'Number_of_executed_tasks': '16907',
'Completion_time': '73207ms',
'Stage_executed': 'parquet at RawDataPublisher.scala:53'
}
},
'Job_ID 1': {
'Submission_time': '2023-01-12 09:48:34.177000',
'Run_time': '11525ms',
'Result': 'JobSucceeded',
'Number_of_stages': '2',
'Stage_ID 1': {
'Number_of_tasks': '16907',
'Number_of_executed_tasks': '0',
'Completion_time': '0ms',
'Stage_executed': 'parquet at RawDataPublisher.scala:53'
},
'Stage_ID 2': {
'Number_of_tasks': '300',
'Number_of_executed_tasks': '300',
'Completion_time': '11520ms',
'Stage_executed': 'parquet at RawDataPublisher.scala:53'
}
},
'Job_ID 2': {
'Submission_time': '2023-01-12 09:48:46.908000',
'Run_time': '218358ms',
'Result': 'JobSucceeded',
'Number_of_stages': '1',
'Stage_ID 3': {
'Number_of_tasks': '1135',
'Number_of_executed_tasks': '1135',
'Completion_time': '218299ms',
'Stage_executed': 'parquet at RawDataPublisher.scala:53'
}
}
}
and should pretty much be generically usable for all kinds of nesting definitions from a flat input. Let me know if it works for you!
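If you also want the file-to-file round trip, here is a minimal sketch reusing the OP's file names (demofile.txt in, demo1file.json out):
import json

with open('demofile.txt') as f:
    input_str = f.read().rstrip()  # rstrip avoids a trailing empty line

# ... run the parsing loop above to fill result_dict ...

with open('demo1file.json', 'w') as out:
    json.dump(result_dict, out, indent=4)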
a = ['{"type": "book",',
'"title": "sometitle",',
'"author": [{"name": "somename"}],',
'"year": "2000",',
'"identifier": [{"type": "ISBN", "id": "1234567890"}],',
'"publisher": "somepublisher"}', '',
'{"type": "book",', '
'"title": "sometitle2",',
'"author": [{"name": "somename2"}],',
'"year": "2001",',
'"identifier": [{"type": "ISBN", "id": "1234567890"}],',
'"publisher": "somepublisher"}', '']
I have this convoluted SList and I would like to ultimately get it into a tidy pandas dataframe.
I have tried a number of things, for example:
i = iter(a)
b = dict(zip(i, i))
Unfortunately, this creates a dictionary that looks even worse:
{'{"type": "book",':
...
Where I had an SList of dictionaries, I now have a dictionary of dictionaries.
I also tried
pd.json_normalize(a)
but this throws an error message AttributeError: 'str' object has no attribute 'values'
I also tried
r = json.dumps(a.l)
loaded_r = json.loads(r)
print(loaded_r)
but this yields a list
['{"type": "book",',
...
Again, in the end I'd like to have a pandas dataframe like this
type title author year ...
book sometitle somename 2000 ...
book sometitle2 somename2 2001
Obviously, I haven't really gotten to the point where I can feed the data to a pandas function. Every time I did that, the functions screamed at me...
import json
import pandas as pd

a = ['{"type": "book",',
'"title": "sometitle",',
'"author": [{"name": "somename"}],',
'"year": "2000",',
'"identifier": [{"type": "ISBN", "id": "1234567890"}],',
'"publisher": "somepublisher"}', '',
'{"type": "book",',
'"title": "sometitle2",',
'"author": [{"name": "somename2"}],',
'"year": "2001",',
'"identifier": [{"type": "ISBN", "id": "1234567890"}],',
'"publisher": "somepublisher"}', '']
b = "[%s]" % ''.join([',' if i == '' else i for i in a ]).strip(',')
data = json.loads(b)
df = pd.DataFrame(data)
print(df)
type title author year \
0 book sometitle [{'name': 'somename'}] 2000
1 book sometitle2 [{'name': 'somename2'}] 2001
identifier publisher
0 [{'type': 'ISBN', 'id': '1234567890'}] somepublisher
1 [{'type': 'ISBN', 'id': '1234567890'}] somepublisher
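If you also want author and identifier as plain columns (as in the desired table), you can pull the fields out of the single-element lists; a sketch, assuming every row has exactly one author and one identifier:
df['author'] = df['author'].apply(lambda lst: lst[0]['name'])
df['identifier'] = df['identifier'].apply(lambda lst: lst[0]['id'])
print(df[['type', 'title', 'author', 'year']])
#    type       title     author  year
# 0  book   sometitle   somename  2000
# 1  book  sometitle2  somename2  2001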
I am implementing an emotion analysis using an LSTM, and I have already finished my training model as well as the prediction part, but my prediction appears in a single column. I will show you below.
Here is my code:
with open('output1.json', 'w') as f:
    json.dump(new_data, f)

selection1 = new_data['selection1']

# creating empty lists to be able to create a dataframe
names = []
dates = []
commentss = []
labels = []
hotelname = []

for item in selection1:
    name = item['name']
    hotelname.append(name)
    Date = item['reviews']
    for d in Date:
        names.append(name)
        # convert date from 'january 12, 2020' to 2020-01-12
        date = pd.to_datetime(d['date']).strftime("%Y-%m-%d")
        # adding date to the empty list dates[]
        dates.append(date)
    CommentID = item['reviews']
    for com in CommentID:
        comment = com['review']
        lcomment = comment.lower()  # converting all to lowercase
        result = re.sub(r'\d+', '', lcomment)  # remove numbers
        results = (result.translate(
            str.maketrans('', '', string.punctuation))).strip()  # remove punctuation and whitespace
        comments = remove_stopwords(results)
        commentss.append(comment)
        # add the words in comments that are already present in the keys of the dictionary
        encoded_samples = [[word2id[word] for word in comments if word in word2id.keys()]]
        # padding
        encoded_samples = keras.preprocessing.sequence.pad_sequences(encoded_samples, maxlen=max_words)
        # make predictions
        label_probs, attentions = model_with_attentions.predict(encoded_samples)
        label_probs = {id2label[_id]: prob for (label, _id), prob in zip(label2id.items(), label_probs[0])}
        labels.append(label_probs)

# creating the dataframe
dataframe = {'name': names, 'date': dates, 'comment': commentss, 'classification': labels}
table = pd.DataFrame(dataframe, columns=['name', 'date', 'comment', 'classification'])
table.to_json('hotel.json', orient='records')  # note: don't assign the result to a name that shadows the json module
Here are the results I obtain:
[
{
"name": "Radisson Blu Azuri Resort & Spa",
"date": "February 02, 2020",
"comment": [
"enjoy",
"daily",
"package",
"start",
"welcoming",
"end",
"recommend",
"hotel"
],
"label": {
"joy": 0.0791392997,
"surprise": 0.0002606699,
"love": 0.4324670732,
"sadness": 0.2866959572,
"fear": 0.0002588668,
"anger": 0.2011781186
}
},
you can find the complete output on this link: https://jsonblob.com/a9b4035c-5576-11ea-afe8-1d95b3a2e3fd
Is it possible to break the label field into separate fields like below?
[
{
"name": "Radisson Blu Azuri Resort & Spa",
"date": "February 02, 2020",
"comment": [
"enjoy",
"daily",
"package",
"start",
"welcoming",
"end",
"recommend",
"hotel"
],
"joy": 0.0791392997,
"surprise": 0.0002606699,
"love": 0.4324670732,
"sadness": 0.2866959572,
"fear": 0.0002588668,
"anger": 0.2011781186
},
Can someone please explain how I need to modify my code to make this possible?
If you can't do it before you produce the result, you can easily manipulate that dictionary like so:
def move_labels_to_dict_root(result):
    # note: the output above stores the scores under "label"; adjust the key name to match your data
    labels = result["labels"]
    meta_data = result
    del meta_data["labels"]
    result = {**meta_data, **labels}
    return result
and then call move_labels_to_dict_root in a list comprehension like [move_labels_to_dict_root(result) for result in results].
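For example, with a stripped-down record in the same shape as the output above (assuming the scores are stored under "labels"):
results = [{
    "name": "Radisson Blu Azuri Resort & Spa",
    "date": "February 02, 2020",
    "labels": {"joy": 0.0791392997, "anger": 0.2011781186},
}]
flat = [move_labels_to_dict_root(result) for result in results]
# [{'name': 'Radisson Blu Azuri Resort & Spa', 'date': 'February 02, 2020',
#   'joy': 0.0791392997, 'anger': 0.2011781186}]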
However, I would ask why you want to do this?
I use openpyxl to read data from Excel files in order to produce a JSON file at the end. The problem is that I cannot figure out an algorithm to build the hierarchical organisation of the JSON (or Python dictionary).
The data form is like the following:
The output should be like this:
{
'id' : '1',
'name' : 'first',
'value' : 10,
'children': [ {
'id' : '1.1',
'name' : 'ab',
'value': 25,
'children' : [
{
'id' : '1.1.1',
'name' : 'abc' ,
'value': 16,
'children' : []
}
]
},
{
'id' : '1.2',
...
]
}
Here is what I have come up with, but I can't go beyond '1.1', because '1.1.1' and '1.1.1.1' and so on end up at the same level as '1.1'.
from openpyxl import load_workbook
import re
from json import dumps

wb = load_workbook('resources.xlsx')
sheet = wb.get_sheet_by_name(wb.get_sheet_names()[0])
resources = {}
prev_dict = {}
list_rows = [row for row in sheet.rows]
for nrow in range(len(list_rows)):
    id = str(list_rows[nrow][0].value)
    val = {
        'id': id,
        'name': list_rows[nrow][1].value,
        'value': list_rows[nrow][2].value,
        'children': []
    }
    if id[:-2] == str(list_rows[nrow - 1][0].value):
        prev_dict['children'].append(val)
    else:
        resources[nrow] = val
        prev_dict = resources[nrow]
print dumps(resources)
You need to access your data by ID, so first step is to create a dictionary where the IDs are the keys. For easier data manipulation, string "1.2.3" is converted to ("1","2","3") tuple. (Lists are not allowed as dict keys). This makes the computation of a parent key very easy (key[:-1]).
With this preparation, we could simply populate the children list of each item's parent. But before doing that a special ROOT element needs to be added. It is the parent of top-level items.
That's all. The code is below.
Note #1: It expects that every item has a parent. That's why 1.2.2 was added to the test data. If it is not the case, handle the KeyError where noted.
Note #2: The result is a list.
import json
testdata="""
1 first 20
1.1 ab 25
1.1.1 abc 16
1.2 cb 18
1.2.1 cbd 16
1.2.1.1 xyz 19
1.2.2 NEW -1
1.2.2.1 poz 40
1.2.2.2 pos 98
2 second 90
2.1 ezr 99
"""
datalist = [line.split() for line in testdata.split('\n') if line]

datadict = {tuple(item[0].split('.')): {
                'id': item[0],
                'name': item[1],
                'value': item[2],
                'children': []}
            for item in datalist}

ROOT = ()
datadict[ROOT] = {'children': []}
for key, value in datadict.items():
    if key != ROOT:
        datadict[key[:-1]]['children'].append(value)  # KeyError = parent does not exist

result = datadict[ROOT]['children']
print(json.dumps(result, indent=4))
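Regarding Note #1: if an item can lack a parent after all, one possible policy (a sketch, not the only option) is to attach such orphans directly to ROOT instead of letting the KeyError propagate:
for key, value in datadict.items():
    if key != ROOT:
        parent = datadict.get(key[:-1])
        if parent is None:
            parent = datadict[ROOT]  # orphan: fall back to the top level
        parent['children'].append(value)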
I have a custom data file formatted like this:
{
data = {
friends = {
max = 0 0,
min = 0 0,
},
family = {
cars = {
van = "honda",
car = "ford",
bike = "trek",
},
presets = {
location = "italy",
size = 10,
travelers = False,
},
version = 1,
},
},
}
I want to collect the blocks of data, meaning the strings between each set of {}, while maintaining the hierarchy. This data is not in a typical JSON format, so that is not a possible solution.
My idea was to create a class object like so
class Block:
    def __init__(self, header, children):
        self.header = header
        self.children = children
where I would then loop through the data line by line, 'somehow' collecting the necessary data so my resulting output would look something like this...
Block("data = {}", [
Block("friends = {max = 0 0,\n min = 0 0,}", []),
Block("family = {version = 1}", [...])
])
In short, I'm looking for help on ways I can serialize this into useful data I can then easily manipulate. My approach is to break it into objects using the {} as dividers.
If anyone has suggestions on ways to better approach this, I'm open to ideas. Thank you again.
So far I've just implemented these basic snippets of code:
class Block:
    def __init__(self, content, children):
        self.content = content
        self.children = children

def GetBlock(strArr=[]):
    print len(strArr)
    # blocks = []
    blockStart = "{"
    blockEnd = "}"

with open(filepath, 'r') as file:
    data = file.readlines()
    blocks = GetBlock(strArr=data)
You can create a to_block function that takes the lines from your file as an iterator and recursively creates a nested dictionary from those. (Of course you could also use a custom Block class, but I don't really see the benefit in doing so.)
def to_block(lines):
    block = {}
    for line in lines:
        if line.strip().endswith(("}", "},")):
            break
        key, value = map(str.strip, line.split(" = "))
        if value.endswith("{"):
            value = to_block(lines)
        block[key] = value
    return block
When calling it, you have to strip the first line, though. Also, evaluating the "leaves" to e.g. numbers or strings is left as an exercise to the reader.
>>> to_block(iter(data.splitlines()[1:]))
{'data': {'family': {'version': '1,',
'cars': {'bike': '"trek",', 'car': '"ford",', 'van': '"honda",'},
'presets': {'travelers': 'False,', 'size': '10,', 'location': '"italy",'}},
'friends': {'max': '0 0,', 'min': '0 0,'}}}
Or when reading from a file:
with open("data.txt") as f:
next(f) # skip first line
res = to_block(f)
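For the leaf evaluation left as an exercise, here is a minimal sketch that strips the trailing commas and quotes visible in the output above and guesses at booleans and integers:
def parse_leaf(raw):
    raw = raw.rstrip(",").strip('"')
    if raw in ("True", "False"):
        return raw == "True"
    try:
        return int(raw)
    except ValueError:
        return raw  # e.g. "0 0" stays a string

def parse_tree(node):
    # walk the nested dict and convert every leaf
    if isinstance(node, dict):
        return {key: parse_tree(value) for key, value in node.items()}
    return parse_leaf(node)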
Alternatively, you can do some preprocessing to transform that string into a JSON(-ish) string and then use json.loads. However, I would not go all the way here, but instead just wrap the values in "" (and replace the original " with ' before that); otherwise there is too much risk of accidentally turning a string with spaces into a list or similar. You can sort those out once you've created the JSON data.
>>> data = data.replace('"', "'")
>>> data = re.sub(r'= (.+),$', r'= "\1",', data, flags=re.M)
>>> data = re.sub(r'^\s*(\w+) = ', r'"\1": ', data, flags=re.M)
>>> data = re.sub(r',$\s*}', r'}', data, flags=re.M)
>>> json.loads(data)
{'data': {'family': {'version': '1',
'presets': {'size': '10', 'travelers': 'False', 'location': "'italy'"},
'cars': {'bike': "'trek'", 'van': "'honda'", 'car': "'ford'"}},
'friends': {'max': '0 0', 'min': '0 0'}}}
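To sort those stringly-typed values out afterwards, one best-effort option (a sketch) is ast.literal_eval with a fallback:
import ast

def revive(node):
    if isinstance(node, dict):
        return {k: revive(v) for k, v in node.items()}
    try:
        return ast.literal_eval(node)  # "10" -> 10, "'italy'" -> 'italy', "False" -> False
    except (ValueError, SyntaxError):
        return node  # e.g. "0 0" stays a string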
You can also do it with ast or json, with the help of regex substitutions.
import re
a = """{
data = {
friends = {
max = 0 0,
min = 0 0,
},
family = {
cars = {
van = "honda",
car = "ford",
bike = "trek",
},
presets = {
location = "italy",
size = 10,
travelers = False,
},
version = 1,
},
},
}"""
#with ast
a = re.sub("(\w+)\s*=\s*", '"\\1":', a)
a = re.sub(":\s*((?:\d+)(?: \d+)+)", lambda x:':[' + x.group(1).replace(" ", ",") + "]", a)
import ast
print ast.literal_eval(a)
#{'data': {'friends': {'max': [0, 0], 'min': [0, 0]}, 'family': {'cars': {'car': 'ford', 'bike': 'trek', 'van': 'honda'}, 'presets': {'travelers': False, 'location': 'italy', 'size': 10}, 'version': 1}}}
#with json
import json
a = re.sub(",(\s*\})", "\\1", a)
a = a.replace(":True", ":true").replace(":False", ":false").replace(":None", ":null")
print json.loads(a)
#{u'data': {u'friends': {u'max': [0, 0], u'min': [0, 0]}, u'family': {u'cars': {u'car': u'ford', u'bike': u'trek', u'van': u'honda'}, u'presets': {u'travelers': False, u'location': u'italy', u'size': 10}, u'version': 1}}}
I am writing a script that adds/removes multipath "objects" from the standard multipath.conf configuration file, example below:
# This is a basic configuration file with some examples, for device mapper
# multipath.
## Use user friendly names, instead of using WWIDs as names.
defaults {
user_friendly_names yes
}
##
devices {
device {
vendor "SolidFir"
product "SSD SAN"
path_grouping_policy multibus
getuid_callout "/lib/udev/scsi_id --whitelisted --device=/dev/%n"
path_selector "service-time 0"
path_checker tur
hardware_handler "0"
failback immediate
rr_weight uniform
rr_min_io 1000
rr_min_io_rq 1
features "0"
no_path_retry 24
prio const
}
}
multipaths {
multipath {
wwid 36f47acc1000000006167347a00000041
alias dwqa-ora-fs
}
multipath {
wwid 36f47acc1000000006167347a00000043
alias dwqa-ora-grid
}
multipath {
wwid 36f47acc1000000006167347a00000044
alias dwqa-ora-dwqa1
}
multipath {
wwid 36f47acc1000000006167347a000000ae
alias dwqa-ora-dwh2d10-1
}
multipath {
wwid 36f47acc1000000006167347a000000f9
alias dwqa-ora-testdg-1
}
}
So what I'm trying to do is read this file in and store it in a nested Python dictionary (or a list of nested dictionaries). We can ignore the comment lines (starting with #) for now. I have not come up with a clear/concise solution for this.
Here is my partial solution (doesn't give me the expected output yet, but it's close)
def nonblank_lines(f):
    for l in f:
        line = l.rstrip()
        if line:
            yield line

def __parse_conf__(self):
    conf = []
    with open(self.conf_file_path) as f:
        for line in nonblank_lines(f):
            if line.strip().endswith("{"):  # opening bracket, start of a new list of dictionaries
                current_dictionary_key = line.split()[0]
                current_dictionary = {current_dictionary_key: None}
                conf.append(current_dictionary)
            elif line.strip().endswith("}"):  # closing bracket, end of the dictionary
                pass  # do nothing...
            elif not line.strip().startswith("#"):
                if current_dictionary.values() == [None]:
                    # new dictionary... we should be appending to this one
                    current_dictionary[current_dictionary_key] = [{}]
                    current_dictionary = current_dictionary[current_dictionary_key][0]
                key = line.strip().split()[0]
                val = " ".join(line.strip().split()[1:])
                current_dictionary[key] = val
And this is the resulting dictionary (the list 'conf'):
[{'defaults': [{'user_friendly_names': 'yes'}]},
{'devices': None},
{'device': [{'failback': 'immediate',
'features': '"0"',
'getuid_callout': '"/lib/udev/scsi_id --whitelisted --device=/dev/%n"',
'hardware_handler': '"0"',
'no_path_retry': '24',
'path_checker': 'tur',
'path_grouping_policy': 'multibus',
'path_selector': '"service-time 0"',
'prio': 'const',
'product': '"SSD SAN"',
'rr_min_io': '1000',
'rr_min_io_rq': '1',
'rr_weight': 'uniform',
'vendor': '"SolidFir"'}]},
{'multipaths': None},
{'multipath': [{'alias': 'dwqa-ora-fs',
'wwid': '36f47acc1000000006167347a00000041'}]},
{'multipath': [{'alias': 'dwqa-ora-grid',
'wwid': '36f47acc1000000006167347a00000043'}]},
{'multipath': [{'alias': 'dwqa-ora-dwqa1',
'wwid': '36f47acc1000000006167347a00000044'}]},
{'multipath': [{'alias': 'dwqa-ora-dwh2d10-1',
'wwid': '36f47acc1000000006167347a000000ae'}]},
{'multipath': [{'alias': 'dwqa-ora-testdg-1',
'wwid': '36f47acc1000000006167347a000000f9'}]},
{'multipath': [{'alias': 'dwqa-ora-testdp10-1',
'wwid': '"SSolidFirSSD SAN 6167347a00000123f47acc0100000000"'}]}]
Obviously the "None"s should be replaced with nested dictionary below it, but I can't get this part to work.
Any suggestions? Or better ways to parse this file and store it in a python data structure?
Try something like this:
def parse_conf(conf_lines):
    config = []
    # iterate on config lines
    for line in conf_lines:
        # remove left and right spaces
        line = line.rstrip().strip()
        if line.startswith('#'):
            # skip comment lines
            continue
        elif line.endswith('{'):
            # new dict (notice the recursion here)
            config.append({line.split()[0]: parse_conf(conf_lines)})
        else:
            # inside a dict
            if line.endswith('}'):
                # end of current dict
                break
            else:
                # parameter line
                line = line.split()
                if len(line) > 1:
                    config.append({line[0]: " ".join(line[1:])})
    return config
The function descends into the nested levels of the configuration file (thanks to recursion and the fact that the conf_lines object is an iterator) and builds a list of dictionaries that contain other dictionaries. Unfortunately, every nested dictionary has to go inside a list, because your example file shows that multipath can repeat, and in a Python dictionary each key must be unique. So you make a list.
You can test it with your example configuration file, like this:
with open('multipath.conf', 'r') as conf_file:
    config = parse_conf(conf_file)

# show multipath config lines as an example
for item in config:
    if 'multipaths' in item:
        for multipath in item['multipaths']:
            print multipath
            # or do something more useful
And the output would be:
{'multipath': [{'wwid': '36f47acc1000000006167347a00000041'}, {'alias': 'dwqa-ora-fs'}]}
{'multipath': [{'wwid': '36f47acc1000000006167347a00000043'}, {'alias': 'dwqa-ora-grid'}]}
{'multipath': [{'wwid': '36f47acc1000000006167347a00000044'}, {'alias': 'dwqa-ora-dwqa1'}]}
{'multipath': [{'wwid': '36f47acc1000000006167347a000000ae'}, {'alias': 'dwqa-ora-dwh2d10-1'}]}
{'multipath': [{'wwid': '36f47acc1000000006167347a000000f9'}, {'alias': 'dwqa-ora-testdg-1'}]}
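As a "something more useful" example, you could collapse each multipath's list of single-key dicts into a wwid-to-alias mapping; a sketch against the structure shown above:
wwid_to_alias = {}
for item in config:
    if 'multipaths' in item:
        for mp in item['multipaths']:
            entry = {}
            for pair in mp['multipath']:  # e.g. [{'wwid': ...}, {'alias': ...}]
                entry.update(pair)
            wwid_to_alias[entry['wwid']] = entry['alias']
# {'36f47acc1000000006167347a00000041': 'dwqa-ora-fs', ...}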
If you don't use recursion, you will need some way of keeping track of your level. But even then it is difficult to have references to parents or siblings in order to add data (I failed). Here's another take based on Daniele Barresi's mention of recursion on the iterable input:
Data:
inp = """
# This is a basic configuration file with some examples, for device mapper
# multipath.
## Use user friendly names, instead of using WWIDs as names.
defaults {
user_friendly_names yes
}
##
devices {
device {
vendor "SolidFir"
product "SSD SAN"
path_grouping_policy multibus
getuid_callout "/lib/udev/scsi_id --whitelisted --device=/dev/%n"
path_selector "service-time 0"
path_checker tur
hardware_handler "0"
failback immediate
rr_weight uniform
rr_min_io 1000
rr_min_io_rq 1
features "0"
no_path_retry 24
prio const
}
}
multipaths {
multipath {
wwid 36f47acc1000000006167347a00000041
alias dwqa-ora-fs
}
multipath {
wwid 36f47acc1000000006167347a00000043
alias dwqa-ora-grid
}
multipath {
wwid 36f47acc1000000006167347a00000044
alias dwqa-ora-dwqa1
}
multipath {
wwid 36f47acc1000000006167347a000000ae
alias dwqa-ora-dwh2d10-1
}
multipath {
wwid 36f47acc1000000006167347a000000f9
alias dwqa-ora-testdg-1
}
}
"""
Code:
import re

level = 0

def recurse(data):
    """Recursively consume lines from the iterator, one call per nesting level."""
    global level
    out = []
    level += 1
    for line in data:
        l = line.strip()
        if l and not l.startswith('#'):
            match = re.search(r"\s*(\w+)\s*(?:{|(?:\"?\s*([^\"]+)\"?)?)", l)
            if not match:
                if l == '}':
                    level -= 1
                    return out  # recursion, up one level
            else:
                key, value = match.groups()
                if not value:
                    print(" " * level, level, key)
                    value = recurse(data)  # recursion, down one level
                else:
                    print(" " * level, level, key, value)
                out.append([key, value])
    return out  # once

result = recurse(iter(inp.split('\n')))

import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(result)
Resulting list with nested ["key", value] pairs:
[ ['defaults', [['user_friendly_names', 'yes']]],
[ 'devices',
[ [ 'device',
[ ['vendor', 'SolidFir'],
['product', 'SSD SAN'],
['path_grouping_policy', 'multibus'],
[ 'getuid_callout',
'/lib/udev/scsi_id --whitelisted --device=/dev/%n'],
['path_selector', 'service-time 0'],
['path_checker', 'tur'],
['hardware_handler', '0'],
['failback', 'immediate'],
['rr_weight', 'uniform'],
['rr_min_io', '1000'],
['rr_min_io_rq', '1'],
['features', '0'],
['no_path_retry', '24'],
['prio', 'const']]]]],
[ 'multipaths',
[ [ 'multipath',
[ ['wwid', '36f47acc1000000006167347a00000041'],
['alias', 'dwqa-ora-fs']]],
[ 'multipath',
[ ['wwid', '36f47acc1000000006167347a00000043'],
['alias', 'dwqa-ora-grid']]],
[ 'multipath',
[ ['wwid', '36f47acc1000000006167347a00000044'],
['alias', 'dwqa-ora-dwqa1']]],
[ 'multipath',
[ ['wwid', '36f47acc1000000006167347a000000ae'],
['alias', 'dwqa-ora-dwh2d10-1']]],
[ 'multipath',
[ ['wwid', '36f47acc1000000006167347a000000f9'],
['alias', 'dwqa-ora-testdg-1']]]]]]
Multipath conf is a bit of a pig to parse. This is what I use (originally based on the answer from daniele-barresi); the output is easier to work with than in the other examples.
def get_multipath_conf():
    def parse_conf(conf_lines, parent=None):
        config = {}
        for line in conf_lines:
            line = line.split('#', 1)[0].strip()
            if line.endswith('{'):
                key = line.split('{', 1)[0].strip()
                value = parse_conf(conf_lines, parent=key)
                if key + 's' == parent:
                    # repeated sections (device inside devices, multipath inside multipaths) become a list
                    if type(config) is dict:
                        config = []
                    config.append(value)
                else:
                    config[key] = value
            else:
                # inside a dict
                if line.endswith('}'):
                    # end of current dict
                    break
                else:
                    # parameter line
                    line = line.split(' ', 1)
                    if len(line) > 1:
                        key = line[0]
                        value = line[1].strip().strip("'").strip('"')
                        config[key] = value
        return config

    return parse_conf(open('/etc/multipath.conf', 'r'))
This is the output:
{'blacklist': {'devnode': '^(ram|raw|loop|fd|md|dm-|sr|scd|st|sda|sdb)[0-9]*$'},
'defaults': {'find_multipaths': 'yes',
'max_polling_interval': '4',
'polling_interval': '2',
'reservation_key': '0x1'},
'devices': [{'detect_checker': 'no',
'hardware_handler': '1 alua',
'no_path_retry': '5',
'path_checker': 'tur',
'prio': 'alua',
'product': 'iSCSI Volume',
'user_friendly_names': 'yes',
'vendor': 'StorMagic'}]}
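From there, dumping the parsed structure back out as JSON is a one-liner (a small usage sketch):
import json

config = get_multipath_conf()
print(json.dumps(config, indent=4, sort_keys=True))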