I have a custom data file formatted like this:
{
data = {
friends = {
max = 0 0,
min = 0 0,
},
family = {
cars = {
van = "honda",
car = "ford",
bike = "trek",
},
presets = {
location = "italy",
size = 10,
travelers = False,
},
version = 1,
},
},
}
I want to collect the blocks of data, meaning string between each set of {} while maintaining a hierarhcy. This data is not a typical json format so that is not a possible solution.
My idea was to create a class object like so
class Block:
def __init__(self, header, children):
self.header = header
self.children = children
Where i would then loop through the data line by line 'somehow' collecting the necessary data so my resulting output would like something like this...
Block("data = {}", [
Block("friends = {max = 0 0,\n min = 0 0,}", []),
Block("family = {version = 1}", [...])
])
In short I'm looking for help on ways I can serialize this into useful data I can then easily manipulate. So my approach is to break into objects by using the {} as dividers.
If anyone has suggestions on ways to better approach this I'm all up for ideas. Thank you again.
So far I've just implemented the basic snippets of code
class Block:
def __init__(self, content, children):
self.content = content
self.children = children
def GetBlock(strArr=[]):
print len(strArr)
# blocks = []
blockStart = "{"
blockEnd = "}"
with open(filepath, 'r') as file:
data = file.readlines()
blocks = GetBlock(strArr=data)
You can create a to_block function that takes the lines from your file as an iterator and recursively creates a nested dictionary from those. (Of course you could also use a custom Block class, but I don't really see the benefit in doing so.)
def to_block(lines):
block = {}
for line in lines:
if line.strip().endswith(("}", "},")):
break
key, value = map(str.strip, line.split(" = "))
if value.endswith("{"):
value = to_block(lines)
block[key] = value
return block
When calling it, you have to strip the first line, though. Also, evaluating the "leafs" to e.g. numbers or strings is left as an excercise to the reader.
>>> to_block(iter(data.splitlines()[1:]))
{'data': {'family': {'version': '1,',
'cars': {'bike': '"trek",', 'car': '"ford",', 'van': '"honda",'},
'presets': {'travelers': 'False,', 'size': '10,', 'location': '"italy",'}},
'friends': {'max': '0 0,', 'min': '0 0,'}}}
Or when reading from a file:
with open("data.txt") as f:
next(f) # skip first line
res = to_block(f)
Alternatively, you can do some preprocessing to transform that string into a JSON(-ish) string and then use json.loads. However, I would not go all the way here but instead just wrap the values into "" (and replace the original " with ' before that), otherwise there is too much risk to accidentally turning a string with spaces into a list or similar. You can sort those out once you've created the JSON data.
>>> data = data.replace('"', "'")
>>> data = re.sub(r'= (.+),$', r'= "\1",', data, flags=re.M)
>>> data = re.sub(r'^\s*(\w+) = ', r'"\1": ', data, flags=re.M)
>>> data = re.sub(r',$\s*}', r'}', data, flags=re.M)
>>> json.loads(data)
{'data': {'family': {'version': '1',
'presets': {'size': '10', 'travelers': 'False', 'location': "'italy'"},
'cars': {'bike': "'trek'", 'van': "'honda'", 'car': "'ford'"}},
'friends': {'max': '0 0', 'min': '0 0'}}}
You can also do with ast or json with the help of regex substitutions.
import re
a = """{
data = {
friends = {
max = 0 0,
min = 0 0,
},
family = {
cars = {
van = "honda",
car = "ford",
bike = "trek",
},
presets = {
location = "italy",
size = 10,
travelers = False,
},
version = 1,
},
},
}"""
#with ast
a = re.sub("(\w+)\s*=\s*", '"\\1":', a)
a = re.sub(":\s*((?:\d+)(?: \d+)+)", lambda x:':[' + x.group(1).replace(" ", ",") + "]", a)
import ast
print ast.literal_eval(a)
#{'data': {'friends': {'max': [0, 0], 'min': [0, 0]}, 'family': {'cars': {'car': 'ford', 'bike': 'trek', 'van': 'honda'}, 'presets': {'travelers': False, 'location': 'italy', 'size': 10}, 'version': 1}}}
#with json
import json
a = re.sub(",(\s*\})", "\\1", a)
a = a.replace(":True", ":true").replace(":False", ":false").replace(":None", ":null")
print json.loads(a)
#{u'data': {u'friends': {u'max': [0, 0], u'min': [0, 0]}, u'family': {u'cars': {u'car': u'ford', u'bike': u'trek', u'van': u'honda'}, u'presets': {u'travelers': False, u'location': u'italy', u'size': 10}, u'version': 1}}}
Related
I have a text file which I want to convert to a nested json structure. The text file is :
Report_for Reconciliation
Execution_of application_1673496470638_0001
Spark_version 2.4.7-amzn-0
Java_version 1.8.0_352 (Amazon.com Inc.)
Start_time 2023-01-12 09:45:13.360000
Spark Properties:
Job_ID 0
Submission_time 2023-01-12 09:47:20.148000
Run_time 73957ms
Result JobSucceeded
Number_of_stages 1
Stage_ID 0
Number_of_tasks 16907
Number_of_executed_tasks 16907
Completion_time 73207ms
Stage_executed parquet at RawDataPublisher.scala:53
Job_ID 1
Submission_time 2023-01-12 09:48:34.177000
Run_time 11525ms
Result JobSucceeded
Number_of_stages 2
Stage_ID 1
Number_of_tasks 16907
Number_of_executed_tasks 0
Completion_time 0ms
Stage_executed parquet at RawDataPublisher.scala:53
Stage_ID 2
Number_of_tasks 300
Number_of_executed_tasks 300
Completion_time 11520ms
Stage_executed parquet at RawDataPublisher.scala:53
Job_ID 2
Submission_time 2023-01-12 09:48:46.908000
Run_time 218358ms
Result JobSucceeded
Number_of_stages 1
Stage_ID 3
Number_of_tasks 1135
Number_of_executed_tasks 1135
Completion_time 218299ms
Stage_executed parquet at RawDataPublisher.scala:53
I want the output to be :
{
"Report_for": "Reconciliation",
"Execution_of": "application_1673496470638_0001",
"Spark_version": "2.4.7-amzn-0",
"Java_version": "1.8.0_352 (Amazon.com Inc.)",
"Start_time": "2023-01-12 09:45:13.360000",
"Job_ID 0": {
"Submission_time": "2023-01-12 09:47:20.148000",
"Run_time": "73957ms",
"Result": "JobSucceeded",
"Number_of_stages": "1",
"Stage_ID 0”: {
"Number_of_tasks": "16907",
"Number_of_executed_tasks": "16907",
"Completion_time": "73207ms",
"Stage_executed": "parquet at RawDataPublisher.scala:53"
"Stage": "parquet at RawDataPublisher.scala:53",
},
},
}
I tried defaultdict method but it was generating a json with values as list which was not acceptable to make a table on it. Here's what I did:
import json
from collections import defaultdict
INPUT = 'demofile.txt'
dict1 = defaultdict(list)
def convert():
with open(INPUT) as f:
for line in f:
command, description = line.strip().split(None, 1)
dict1[command].append(description.strip())
OUTPUT = open("demo1file.json", "w")
json.dump(dict1, OUTPUT, indent = 4, sort_keys = False)
and was getting this:
"Report_for": [ "Reconciliation" ],
"Execution_of": [ "application_1673496470638_0001" ],
"Spark_version": [ "2.4.7-amzn-0" ],
"Java_version": [ "1.8.0_352 (Amazon.com Inc.)" ],
"Start_time": [ "2023-01-12 09:45:13.360000" ],
"Job_ID": [
"0",
"1",
"2", ....
]]]
I just want to convert my text to the above json format so that I can build a table on top of it.
There's no way, python or one of it's libraries can figure out your nesting requirements, if a flat text is being given as an input. How should it know Stages are inside Jobs...for example.
You will have to programmatically tell your application how it works.
I hacked an example which should work, you can go from there (assuming input_str is what you posted as your file content):
# define your nesting structure
nesting = {'Job_ID': {'Stage_ID': {}}}
upper_nestings = []
upper_nesting_keys = []
# your resulting dictionary
result_dict = {}
# your "working" dictionaries
current_nesting = nesting
working_dict = result_dict
# parse each line of the input string
for line_str in input_str.split('\n'):
# key is the first word, value are all consecutive words
line = line_str.split(' ')
# if key is in nesting, create new sub-dict, all consecutive entries are part of the sub-dict
if line[0] in current_nesting.keys():
current_nesting = current_nesting[line[0]]
upper_nestings.append(line[0])
upper_nesting_keys.append(line[1])
working_dict[line_str] = {}
working_dict = working_dict[line_str]
else:
# if a new "parallel" or "upper" nesting is detected, reset your nesting structure
if line[0] in upper_nestings:
nests = upper_nestings[:upper_nestings.index(line[0])]
keys = upper_nesting_keys[:upper_nestings.index(line[0])]
working_dict = result_dict
for nest in nests:
working_dict = working_dict[' '.join([nest, keys[nests.index(nest)]])]
upper_nestings = upper_nestings[:upper_nestings.index(line[0])+1]
upper_nesting_keys = upper_nesting_keys[:upper_nestings.index(line[0])]
upper_nesting_keys.append(line[1])
current_nesting = nesting
for nest in upper_nestings:
current_nesting = current_nesting[nest]
working_dict[line_str] = {}
working_dict = working_dict[line_str]
continue
working_dict[line[0]] = ' '.join(line[1:])
print(result_dict)
Results in:
{
'Report_for': 'Reconciliation',
'Execution_of': 'application_1673496470638_0001',
'Spark_version': '2.4.7-amzn-0',
'Java_version': '1.8.0_352 (Amazon.com Inc.)',
'Start_time': '2023-01-12 09:45:13.360000',
'Spark': 'Properties: ',
'Job_ID 0': {
'Submission_time': '2023-01-12 09:47:20.148000',
'Run_time': '73957ms',
'Result': 'JobSucceeded',
'Number_of_stages': '1',
'Stage_ID 0': {
'Number_of_tasks': '16907',
'Number_of_executed_tasks': '16907',
'Completion_time': '73207ms',
'Stage_executed': 'parquet at RawDataPublisher.scala:53'
}
},
'Job_ID 1': {
'Submission_time': '2023-01-12 09:48:34.177000',
'Run_time': '11525ms',
'Result': 'JobSucceeded',
'Number_of_stages': '2',
'Stage_ID 1': {
'Number_of_tasks': '16907',
'Number_of_executed_tasks': '0',
'Completion_time': '0ms',
'Stage_executed': 'parquet at RawDataPublisher.scala:53'
},
'Stage_ID 2': {
'Number_of_tasks': '300',
'Number_of_executed_tasks': '300',
'Completion_time': '11520ms',
'Stage_executed': 'parquet at RawDataPublisher.scala:53'
}
},
'Job_ID 2': {
'Submission_time':
'2023-01-12 09:48:46.908000',
'Run_time': '218358ms',
'Result': 'JobSucceeded',
'Number_of_stages': '1',
'Stage_ID 3': {
'Number_of_tasks': '1135',
'Number_of_executed_tasks': '1135',
'Completion_time': '218299ms',
'Stage_executed': 'parquet at RawDataPublisher.scala:53'
}
}
}
and should pretty much be generically usable for all kinds of nesting definitions from a flat input. Let me know if it works for you!
I am working on something which should manage multiple water dispensers. I need to get some data from a json file and then load it into objects after that, append the objects to a list. For some reason list.append changes other object's parameters(more specific, location). Here is my code:
WaterDispenser.py
class WaterDispenser():
def __init__(self, id: int = -1, status: bool = False, location: list=[-1, -1]) -> None:
self.id = id
self.status = status
self.location = location
def Dump(self) -> dict:
"""Dumps the propoerties in a json dictionary
Returns
-------
dict
A dictionary with a collection of propoerties and their names
"""
return {"id": self.id, "status": self.status, "location":[self.location[0], self.location[1]]}
def Load(self, object: dict) -> None:
"""Loads the json dictoinary in memory
Parameters
----------
object : dict, required
The json file with the properties of the dispenser
Returns
-------
None
"""
self.id = object["id"]
self.status = object["status"]
self.location[0] = object["location"][0]
self.location[1] = object["location"][1]
return None
main.py
import json
from WaterDispenser import WaterDispenser
dispensers = []
def LoadDispensers(path: str = "dispensers.json") -> int:
"""Loads the json file in memory.
Parameters
---------
path : str, optional
The path of the file to be loaded. Defaults to "dispensers.json".
Returns
-------
int
Count of dispensers data loaded
"""
global dispensers
dispensers = []
data = json.load(open(path, "r"))
for d in data:
x = WaterDispenser()
x.Load(d)
dispensers.append(x)
return len(dispensers)
if __name__ == '__main__':
print(LoadDispensers())
print([o.Dump() for o in dispensers])
dispensers.json
[
{"id": 0, "status": true, "location": [0, 0]},
{"id": 1, "status": true, "location": [0, 1]},
{"id": 2, "status": false, "location": [1, 1]}
]
Output:
3
[{'id': 0, 'status': True, 'location': [1, 1]}, {'id': 1, 'status': True, 'location': [1, 1]}, {'id': 2, 'status': False, 'location': [1, 1]}]
The functional answer:
Change the init of WaterDispenser to
from typing import Optional
class WaterDispenser():
def __init__(self, id: int = -1, status: bool = False, location: Optional[list] = None) -> None:
self.id = id
self.status = status
self.location = location or [-1, -1]
This should result in the expected response of
3
[{'id': 0, 'status': True, 'location': [0, 0]}, {'id': 1, 'status': True, 'location': [0, 1]}, {'id': 2, 'status': False, 'location': [1, 1]}]
The why:
Generally you want to avoid using mutable values as kwarg values because they're pre-computed (so your default location argument was technically the same object in memory across your WaterDispenser instances). append wasn't the culprit here and you can read more about this all via this SO discussion or read a succinct explanation via this answer to a similar question.
Design note:
It's worth noting that the way you are using Load in the above example could just be folded into WaterDispenser.__init__, so something like
from typing import Dict, Any
class WaterDispenser():
def __init__(self, data: Dict[Any]) -> None:
self.id = data.get("id", -1)
self.status = data.get("status", False)
self.location = data.get("location", [-1, -1])
or if you want to avoid typing
class WaterDispenser():
def __init__(self, data: dict) -> None:
self.id = data.get("id", -1)
self.status = data.get("status", False)
self.location = data.get("location", [-1, -1])
That example still includes your default values but if you removed the secondary arguments from those get calls you could protect against missing data at runtime without having to check to see if you had, say, an impossible location data point like [-1, -1].
Description of the problem
The problem is a classical Bill of Materials (BoM) problem;
Suppose we have the class BomEntry(object) defined as:
class BomEntry:
def __init__(self, part, quantity=0, unit="", children=[]):
self.part = part
self.quantity = quantity
self.unit = unit
self.children = children
part is a django model, and quantity and unit are two of its members.
The Django model has a method make_bom(self) which returns an instance of BomEntry(a class which doesn't use django) . Asm is the django model keeping track of BoM data in the database
def make_bom(self, depth=1):
if not self.is_asm:
return BomEntry(self, 1, "", [])
else:
children = list(Asm.objects.filter(parent=self))
children_bom = [BomEntry(obj.child, obj.quantity, obj.unit, []) for obj in children]
bom = BomEntry(self, 1, "", children=children_bom)
return bom
I'm currently including a parameter to decide the depth of the BoM, but I can't wrap my head around how I would use it.
I want to be able to traverse the nested objects, ending up with an output similar to this:
{
'part': <PartAsm: 100-1822-R1-A>,
'quantity': 1,
'unit': '',
'children':
[
{
'part': <PartAsm: 100-1823-R1-A>,
'quantity': 1,
'unit': '',
'children':
[]
},
{
'part':
<PartAsm: 100-1824-R1-A>,
'quantity': 1,
'unit': '',
'children':
[
{
'part': <PartAsm: 100-1825-R1-A>,
'quantity': Decimal('1.00'),
'unit': 'g',
'children':
[]
},
{
'part': <PartAsm: 100-1826-R1-A>,
'quantity': Decimal('1.00'),
'unit': 'g',
'children':
[]
}
]
}
]
}
The output above was acquired using the console, I would appreciate any advice on looping this or making it recursive. I hope I provided sufficient and clear information
When depth is more than 1, make_bom() should recurse, decrementing depth in the call.
def make_bom(self, depth=1):
if not self.is_asm:
return BomEntry(self, 1, "", [])
else:
if depth > 1:
children = list(Asm.objects.filter(parent=self))
children_bom = [make_bom(BomEntry(obj.child, obj.quantity, obj.unit, []), depth-1) for obj in children]
else:
children_bom = []
bom = BomEntry(self, 1, "", children=children_bom)
return bom
I finally got it working using #Barmar 's valuable input. The make_bom function now looks like:
def make_bom(self, quantity=1, unit="def_unit", depth=1):
if not self.is_asm:
return BomEntry(self, quantity, unit, [])
else:
if depth <= 1:
children = list(Asm.objects.filter(parent=self))
children_bom = [BomEntry(obj.child, quantity*obj.quantity, obj.unit, []) for obj in children]
bom = BomEntry(self, quantity, unit, children=children_bom)
else:
bom = BomEntry(self, quantity, "def_unit",
children=[a.child.make_bom(quantity*a.quantity, a.unit, depth - 1) for a in
list(Asm.objects.filter(parent=self))])
return bom
I added in the quantity and unit params for completeness.
Thanks a million #Barmar
I have a network I want to output to a json file. However, when I output it, node targets become converted to numbers and do not match the node ids which are strings.
For example:
G = nx.DiGraph(data)
G.edges()
results in:
[(22, 'str1'),
(22, 'str2'),
(22, 'str3')]
in python. This is correct.
But in the output, when I write out the data like so...
json.dump(json_graph.node_link_data(G), f,
indent = 4, sort_keys = True, separators=(',',':'))
while the ids for the three target nodes 'str1', 'str2', and 'str3'...
{
"id":"str1"
},
{
"id":"str2"
},
{
"id":"str3"
}
The targets of node 22 have been turned into numbers
{
"source":22,
"target":972
},
{
"source":22,
"target":1261
},
{
"source":22,
"target":1259
}
This happens for all nodes that have string ids
Why is this, and how can I prevent it?
The desired result is that either "target" fields should keep the string ids, or that the string ids become numeric in a way that they match the targets.
Why is this
It's a feature. Not all graph libraries accept strings as identifiers, but all that I know of accept integers.
how can I prevent it?
Replace the ids by node names using the nodes map:
>>> import networkx as nx
>>> import pprint
>>> g = nx.DiGraph()
>>> g.add_edge(1, 'foo')
>>> g.add_edge(2, 'bar')
>>> g.add_edge('foo', 'bar')
>>> res = nx.node_link_data(g)
>>> pprint.pprint(res)
{'directed': True,
'graph': {},
'links': [{'source': 0, 'target': 3},
{'source': 1, 'target': 2},
{'source': 3, 'target': 2}],
'multigraph': False,
'nodes': [{'name': 1}, {'name': 2}, {'name': 'bar'}, {'name': 'foo'}]}
>>> res['links'] = [
{
'source': res['nodes'][link['source']]['name'],
'target': res['nodes'][link['target']]['name']
}
for link in res['links']]
>>> pprint.pprint(res)
{'directed': True,
'graph': {},
'links': [{'source': 1, 'target': 'foo'},
{'source': 2, 'target': 'bar'},
{'source': 'foo', 'target': 'bar'}],
'multigraph': False,
'nodes': [{'name': 1}, {'name': 2}, {'name': 'bar'}, {'name': 'foo'}]}
To make the output conform to the d3 template that is linked in the node_link_data documentation, you can make a couple simple changes to the node_link_data function. Just run the below function and use it instead. All I changed was to trim some of the unnecessary outputs for the template, and to store the graph label instead of an index. The index the original function used for target and destination was created in the function, so it isn't something you can extract from the graph itself, so if you want to be certain that your node labels correspond to your links, it's safest to modify node_link_data.
The D3 Template this creates data for is here
Note that if you use the below data without adding a node or link attribute, you will need to delete the following lines from the d3 template:
.attr("stroke-width", function(d) { return Math.sqrt(d.value); })
and
.attr("fill", function(d) { return color(d.group); })
Modified function:
from itertools import chain, count
import json
import networkx as nx
from networkx.utils import make_str
__author__ = """Aric Hagberg <hagberg#lanl.gov>"""
_attrs = dict(id='id', source='source', target='target', key='key')
def node_link_data(G, attrs=_attrs):
"""Return data in node-link format that is suitable for JSON serialization
and use in Javascript documents.
"""
multigraph = G.is_multigraph()
id_ = attrs['id']
source = attrs['source']
target = attrs['target']
# Allow 'key' to be omitted from attrs if the graph is not a multigraph.
key = None if not multigraph else attrs['key']
if len(set([source, target, key])) < 3:
raise nx.NetworkXError('Attribute names are not unique.')
mapping = dict(zip(G, count()))
data = {}
data['nodes'] = [dict(chain(G.node[n].items(), [(id_, n)])) for n in G]
if multigraph:
data['links'] = [
dict(chain(d.items(),
[(source, u), (target,v), (key, k)]))
for u, v, k, d in G.edges_iter(keys=True, data=True)]
else:
data['links'] = [
dict(chain(d.items(),
[(source, u), (target, v)]))
for u, v, d in G.edges_iter(data=True)]
return data
I'm wondering if anyone has a sort of hacky / cool solution to this problem . I have a text file like so:
NAME:name
ID:id
PERSON:person
LOCATION:location
NAME:name
morenamestuff
ID:id
PERSON:person
LOCATION:location
JUNK
So I have some blocks that all contain lines that can be split into a dict, and some that cannot. How can I take lines without the : character and join them to the previous line? Here's what I'm currently doing
# loop through chunk
# the first element of dat is a Title, so skip that
key_map = dict(x.split(':') for x in dat[1:])
But I of course get an error because the second chunk has a line without the : character. So I wanted my dict to look something like this after correctly splitting it:
# there will be a key_map for each chunk of data
key_map['NAME'] == 'name morenamestuff' # 3rd line appended to previous
key_map['ID'] == 'id'
key_map['PERSON'] = 'person'
key_map['LOCATION'] = 'location
Solution
EDIT: Here's my final solution on github, and the full code here:
parseScript.py
import re
import string
bad_chars = '(){}"<>[] ' # characers we want to strip from the string
key_map = []
# parse file
with open("dat.txt") as f:
data = f.read()
data = data.strip('\n')
data = re.split('}|\[{', data)
# format file
with open("format.dat") as f:
formatData = [x.strip('\n') for x in f.readlines()]
data = filter(len, data)
# strip and split each station
for dat in data[1:-1]:
# perform black magic, don't even try to understand this
dat = dat.translate(string.maketrans("", "", ), bad_chars).split(',')
key_map.append(dict(x.split(':') for x in dat if ':' in x ))
if ':' not in dat[1]:key_map['NAME']+=dat[k][2]
for station in range(0, len(key_map)):
for opt in formatData:
print opt,":",key_map[station][opt]
print ""
dat.txt
View raw here
format.dat
NAME
STID
LONGITUDE
LATITUDE
ELEVATION
STATE
ID
out.dat
View raw here
When in doubt, write your own generator.
Add in itertools.groupby to chunk by groups of text delimited by whitespace breaks.
def chunker(s):
it = iter(s)
out = [next(it)]
for line in it:
if ':' in line or not line:
yield ' '.join(out)
out = []
out.append(line)
if out:
yield ' '.join(out)
usage:
from itertools import groupby
[dict(x.split(':') for x in g) for k,g in groupby(chunker(lines), bool) if k]
Out[65]:
[{'ID': 'id', 'LOCATION': 'location', 'NAME': 'name', 'PERSON': 'person'},
{'ID': 'id',
'LOCATION': 'location',
'NAME': 'name morenamestuff',
'PERSON': 'person'}]
(if those fields are always the same, I'd go with something like creating some namedtuples instead of a bunch of dicts)
from collections import namedtuple
Thing = namedtuple('Thing', 'ID LOCATION NAME PERSON')
[Thing(**dict(x.split(':') for x in g)) for k,g in groupby(chunker(lines), bool) if k]
Out[76]:
[Thing(ID='id', LOCATION='location', NAME='name', PERSON='person'),
Thing(ID='id', LOCATION='location', NAME='name morenamestuff', PERSON='person')]
Here is something that addresses all your requirements. It handles joining of multiple lines, ignoring blank lines, and ignoring junk lines that do not appear within a block. It is implemented as a generator that yields each dictionary as it is completed.
def parser(data):
d = {}
for line in data:
line = line.strip()
if not line:
if d:
yield d
d = {}
else:
if ':' in line:
key, value = line.split(':')
d[key] = value
else:
if d:
d[key] = '{} {}'.format(d[key], line)
if d:
yield d
When run with this data:
ignore me
NAME:name1
ID:id1
PERSON:person1
LOCATION:location1
NAME:name2
morenamestuff
ID:id2
PERSON:person2
LOCATION:location2
junk
and
other
stuff
NAME:name3
morenamestuff
and more
ID:id3
PERSON:person3
more person stuff
LOCATION:location3
JUNK
MORE JUNK
>>> for d in parser(open('data')):
... print d
{'PERSON': 'person1', 'LOCATION': 'location1', 'NAME': 'name1', 'ID': 'id1'}
{'PERSON': 'person2', 'LOCATION': 'location2', 'NAME': 'name2 morenamestuff', 'ID': 'id2'}
{'PERSON': 'person3 more person stuff', 'LOCATION': 'location3', 'NAME': 'name3 morenamestuff and more', 'ID': 'id3'}
You can grab the lot as a list:
>>> results = list(parser(open('data')))
>>> results
[{'PERSON': 'person1', 'LOCATION': 'location1', 'NAME': 'name1', 'ID': 'id1'}, {'PERSON': 'person2', 'LOCATION': 'location2', 'NAME': 'name2 morenamestuff', 'ID': 'id2'}, {'PERSON': 'person3 more person stuff', 'LOCATION': 'location3', 'NAME': 'name3 morenamestuff and more', 'ID': 'id3'}]
I don't find itertools or regex particularly nice to work with, here's a pure-python solution
separator = ':'
output = []
chunk = None
with open('/tmp/stuff.txt') as f:
for line in (x.strip() for x in f):
if not line:
# we are between 'chunks'
chunk, key = None, None
continue
if chunk is None:
# we are at the beginning of a new 'chunk'
chunk, key = {}, None
output.append(chunk)
if separator in line:
key, val = line.split(separator)
chunk[key] = val
else:
chunk[key] += line
not as elegant, as you requested, but this works
dat=[['NAME:name',
'ID:id',
'PERSON:person',
'LOCATION:location'],
['NAME:name',
'morenamestuff',
'ID:id',
'PERSON:person',
'LOCATION:location']]
k=1
key_map = dict(x.split(':') for x in dat[k] if ':' in x )
if ':' not in dat[k][1]:key_map['NAME']+=dat[k][1]
key_map>>
{'ID': 'id',
'LOCATION': 'location',
'NAME': 'namemorenamestuff',
'PERSON': 'person'}
Just add something to lines with no ":".
if line.find(':') == -1:
line=line+':None'
Then you won't get an error.