Accessing python variables in a list - python

In the code below, how do I retrieve the value of id? id has multiple values in it. How do I access the values of id and add them to result1?
# Question snippet (its indentation was lost in extraction). For each child of
# the document's first node it builds a fresh dict `result1` holding the
# firstname, lastname, address and id values.
def parse_results ():
try:
# NOTE(review): the parsed document is bound to `xml`, but the loop below
# iterates `xmldoc`; `xmldoc` is not defined in this snippet — confirm which
# name is intended.
xml = minidom.parseString(new_results)
for xmlchild in xmldoc.childNodes[0].childNodes :
result1 = {}
# Text of the first <firstname> / <lastname> element under this child.
result1.update ({'firstname': xmlchild.getElementsByTagName("firstname")[0].childNodes[0].nodeValue})
result1.update ({'lastname': xmlchild.getElementsByTagName("lastname")[0].childNodes[0].nodeValue})
# NOTE(review): `address` is not defined anywhere in this snippet.
result1.update ({'address': address})
# Log the lower-cased text of the first <id> element when it is non-empty.
if xmlchild.getElementsByTagName("id")[0].childNodes[0].nodeValue:
logging.debug(xmlchild.getElementsByTagName("id")[0].childNodes[0].nodeValue.lower())
# NOTE(review): `id` is never assigned here, so unless it is bound elsewhere
# this stores the builtin `id` function rather than the element's text.
result1.update ({'id': id})
# NOTE(review): no `except`/`finally` clause for the `try` above is visible in
# this snippet, and `result1` is neither returned nor collected.
Edit:
xmlchild.getElementsByTagName("id")[0].childNodes[0].nodeValue -this statement gives an exception
Adding XML:
<info><firstname>firstname</firstname><lastname>lastname</lastname><id>2</id></info>
<info><firstname>firstname</firstname><lastname>lastname</lastname><id>3</id></info>
<info><firstname>firstname</firstname><lastname>lastname</lastname><id>4</id></info>

Why are you using minidom? It is really boring to use.
I suggest you move to element tree:
import xml.etree.ElementTree as et

# Sample document: one <info> record per person, wrapped in a single <doc>
# root element so that the string is well-formed XML.
d = et.fromstring('''
<doc>
<info><firstname>firstname</firstname><lastname>lastname</lastname><id>2</id></info>
<info><firstname>firstname</firstname><lastname>lastname</lastname><id>3</id></info>
<info><firstname>firstname</firstname><lastname>lastname</lastname><id>4</id></info>
</doc>
''')

# One dict per <info> element, mapping each child's tag name to its text.
result = [{el.tag: el.text for el in info} for info in d.findall('info')]

# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(result)
That prints:
[{'firstname': 'firstname', 'id': '2', 'lastname': 'lastname'},
{'firstname': 'firstname', 'id': '3', 'lastname': 'lastname'},
{'firstname': 'firstname', 'id': '4', 'lastname': 'lastname'}]

Related

How to best iterate through a list within a dictionary?

I have a dictionary as follows:
a = {'name': 'Test', 'lastName': 'Test', 'scores': ['1', '2'], 'subjects': ['te','re'] }
I have tried nested loops, but I'm not sure if that's the best approach.
As an output I need a list of dictionaries for each score and subject :
result1 = { 'name':'Test', 'lastName': 'Test', 'score': '1', 'subjects': 'te'}
result2 = { 'name':'Test', 'lastName': 'Test', 'score': '2', 'subjects': 're'}
How to best iterate through the lists and create such dictionary? The number of scores and subjects will always match.
Any help would be appreciated.
Here is a function which unzips your dictionary.
We first use next to find some list value in the dictionary, its length is the expected output size.
Note that this will fail if the dictionary contains no list at all.
def unzip_dict(d):
    """Split a dict holding parallel lists into one dict per list position.

    Every value of ``d`` that is a ``list`` contributes its i-th item to the
    i-th output dict; every other value is copied unchanged into each output
    dict.

    Args:
        d: Mapping whose list values all have the same length.

    Returns:
        A list of dicts, one per index of the list values.

    Raises:
        ValueError: If ``d`` contains no list value, so the output size
            cannot be determined.
        IndexError: If some list is shorter than the first list found.
    """
    # The first list found decides how many output dicts are produced.
    lengths = [len(value) for value in d.values() if isinstance(value, list)]
    if not lengths:
        # Fail with an explicit error instead of leaking StopIteration from
        # next(), which is easy to mistake for normal iterator exhaustion.
        raise ValueError("unzip_dict() requires at least one list value")
    return [
        {k: v[i] if isinstance(v, list) else v for k, v in d.items()}
        for i in range(lengths[0])
    ]
a = {'name': 'Test', 'lastName': 'Test', 'scores': ['1', '2'], 'subjects': ['te', 're']}
print(unzip_dict(a))
Output
[{'lastName': 'Test', 'name': 'Test', 'scores': '1', 'subjects': 'te'},
{'lastName': 'Test', 'name': 'Test', 'scores': '2', 'subjects': 're'}]
Try this:
# setup data
a = {'name': 'Test', 'lastName': 'Test', 'scores': ['1', '2'], 'subjects': ['te','re'] }
# create list of dictionary
out_list = []
for value in a.get('scores'):
for item in a.get('subjects'):
out_list.append({'name': 'Test', 'lastName': 'Test', 'scores':value, 'subjects':item})
Output:
{'name': 'Test', 'lastName': 'Test', 'scores': '1', 'subjects': 'te'}
{'name': 'Test', 'lastName': 'Test', 'scores': '1', 'subjects': 're'}
{'name': 'Test', 'lastName': 'Test', 'scores': '2', 'subjects': 'te'}
{'name': 'Test', 'lastName': 'Test', 'scores': '2', 'subjects': 're'}
You don't need nested for loops, a single for loop is sufficient:
def foo(a):
    """Return one record per (score, subject) pair of ``a``.

    Args:
        a: Dict with scalar ``'name'`` and ``'lastName'`` entries and two
            parallel sequences under ``'scores'`` and ``'subjects'``.

    Returns:
        A list of dicts with keys ``'name'``, ``'lastName'``, ``'score'`` and
        ``'subject'``, in the order of the input sequences.
    """
    # zip() walks both sequences in lockstep, so no index bookkeeping is
    # needed; it stops at the shorter sequence if the lengths ever differ.
    return [
        {
            'name': a['name'],
            'lastName': a['lastName'],
            'score': score,
            'subject': subject,
        }
        for score, subject in zip(a['scores'], a['subjects'])
    ]
Output:
[{'name': 'Test', 'lastName': 'Test', 'score': '1', 'subject': 'te'},
{'name': 'Test', 'lastName': 'Test', 'score': '2', 'subject': 're'}]
you can try this:
res1 = {}
res2= {}
for k,v in a.items():
if(k == "scores"):
res1[k] = v[0]
res2[k] = v[1]
elif(k=="subjects"):
res1[k] = v[0]
res2[k] = v[1]
else:
res1[k] = v
res2[k] = v
print(res1)
print(res2)
You can also take a look at defaultdict; I think it would help you with your task.
You can utilize zip to attach scores and subjects to each other and then add it to a new list.
d = {'name': 'Test', 'lastName': 'Test', 'scores': ['1', '2'], 'subjects': ['te','re'] }

# Fields shared by every output record.
template = {'name': d['name'], 'lastName': d['lastName']}

res = []
for subject, score in zip(d['subjects'], d['scores']):
    # Build a NEW dict for each pair. Updating `template` in place and
    # appending it would put the same object in `res` every time, so every
    # entry would end up showing the last subject/score.
    res.append(dict(template, subjects=subject, scores=score))
print(res)

Merge two dictionaries based on similarity excluding a key

I have the following three dictionaries in an array:
items = [
{
'FirstName': 'David',
'LastName': 'Smith',
'Language': set(['en'])
},
{
'FirstName': 'David',
'LastName': 'Smith',
'Language': set(['fr'])
},
{
'FirstName': 'Bob',
'LastName': 'Jones',
'Language': set(['en'])
} ]
I want to merge these dictionaries whenever two of them are identical apart from a specified key, combining the values of that key. For example, merging on the "Language" key would turn the array into the following:
[ {
'FirstName': 'David',
'LastName': 'Smith',
'Language': set(['en','fr'])
},{
'FirstName': 'Bob',
'LastName': 'Jones',
'Language': set(['en'])
} ]
Here is what I'm currently doing:
from copy import deepcopy
def _merge_items_on_field(items, field):
'''Given an array of dicts, merge the
dicts together if they are the same except for the 'field'.
If merging dicts, add the unique values of that field together.'''
items = deepcopy(items)
items_merged_on_field = []
for num, item in enumerate(items):
# Remove that key/value from the dict
field_value = item.pop(field)
# Get an array of items *without* that field to compare against
items_without_field = deepcopy(items_merged_on_field)
map(lambda d: d.pop(field), items_without_field)
# If the dict item is found ("else"), add the fields together
# If not ("except"), then add in the dict item to the array
try:
index = items_without_field.index(item)
except ValueError:
item[field] = field_value
items_merged_on_field.append(item)
else:
items_merged_on_field[index][field] = items_merged_on_field[index][field].union(field_value)
return items_merged_on_field
>>> items = [{'LastName': 'Smith', 'Language': set(['en']), 'FirstName': 'David'}, {'LastName': 'Smith', 'Language': set(['fr']), 'FirstName': 'David'}, {'LastName': 'Jones', 'Language': set(['en']), 'FirstName': 'Bob'}]
>>> _merge_items_on_field(items, 'Language')
[{'LastName': 'Smith', 'Language': set(['fr', 'en']), 'FirstName': 'David'}, {'LastName': 'Jones', 'Language': set(['en']), 'FirstName': 'Bob'}]
This seems a bit complicated -- is there a better way to do this?
There are a couple of ways of doing this. The most painless method to my knowledge utilises the pandas library—in particular, a groupby + apply.
import pandas as pd
merged = (
pd.DataFrame(items)
.groupby(['FirstName', 'LastName'], sort=False)
.Language
.apply(lambda x: set.union(*x))
.reset_index()
.to_dict(orient='records')
)
print(merged)
[
{'FirstName': 'David', 'LastName': 'Smith', 'Language': {'en', 'fr'}},
{'FirstName': 'Bob', 'LastName': 'Jones', 'Language': {'en'}}
]
The other method (that I mentioned) uses itertools.groupby, but seeing as you have 30 columns to group on, I'd just recommend sticking to pandas.
If you want to turn this into a function,
def merge(items, field):
    """Collapse records that agree on every column except ``field``.

    Builds a DataFrame from ``items``, groups it on all columns other than
    ``field`` (keeping first-seen group order), unions the sets stored under
    ``field`` within each group, and returns the result as a list of record
    dicts.
    """
    frame = pd.DataFrame(items)
    group_columns = frame.columns.difference([field]).tolist()
    grouped_field = frame.groupby(group_columns, sort=False)[field]
    unioned = grouped_field.apply(lambda sets: set.union(*sets))
    return unioned.reset_index().to_dict(orient='records')
merged = merge(items, 'Language')
print(merged)
[
{'FirstName': 'David', 'LastName': 'Smith', 'Language': {'en', 'fr'}},
{'FirstName': 'Bob', 'LastName': 'Jones', 'Language': {'en'}}
]
You can use itertools.groupby:
import itertools
d = [{'FirstName': 'David', 'LastName': 'Smith', 'Language': {'en'}}, {'FirstName': 'David', 'LastName': 'Smith', 'Language': {'fr'}}, {'FirstName': 'Bob', 'LastName': 'Jones', 'Language': {'en'}}]
v = [[a, list(b)] for a, b in itertools.groupby(sorted(d, key=lambda x:x['FirstName']), key=lambda x:x['FirstName'])]
final_dict = [{**{'FirstName':a}, **{'LastName':(lambda x:[list(set(x)), x[0]][len(set(x)) == 1])([i['LastName'] for i in b])}, **{'Language':set([list(i['Language'])[0] for i in b])}} for a, b in v]
Output:
[{'FirstName': 'Bob', 'LastName': 'Jones', 'Language': {'en'}}, {'FirstName': 'David', 'LastName': 'Smith', 'Language': {'en', 'fr'}}]
If pandas is not an option:
from itertools import groupby
from functools import reduce
arr = [
{'FirstName': 'David', 'LastName': 'Smith', 'Language': set(['en'])},
{'FirstName': 'David', 'LastName': 'Smith', 'Language': set(['fr'])},
{'FirstName': 'David', 'LastName': 'Jones', 'Language': set(['sp'])}
]
def reduce_field(items, field, op=set.union, sort=False):
    """Merge consecutive dicts that are equal apart from ``field``.

    Args:
        items: Iterable of dicts that all contain ``field``.
        field: Key whose values are combined rather than compared.
        op: Two-argument function used to fold the ``field`` values of a
            group together (default: ``set.union``).
        sort: If true, sort ``items`` by their non-``field`` content first so
            that equal dicts become adjacent. ``groupby`` only merges runs of
            neighbours, so unsorted input merges adjacent matches only.

    Returns:
        A list with one dict per group: the shared key/value pairs plus
        ``field`` mapped to the folded value.
    """
    def _key(d):
        # Order the pairs by key name so that two dicts with the same content
        # but a different insertion order yield the same grouping key. Only
        # the key names are compared while sorting, never the values.
        pairs = ((k, v) for k, v in d.items() if k != field)
        return tuple(sorted(pairs, key=lambda pair: pair[0]))

    if sort:
        items = sorted(items, key=_key)
    res = []
    for k, g in groupby(items, key=_key):
        d = dict(k)
        d[field] = reduce(op, (el[field] for el in g))
        res.append(d)
    return res
reduce_field(arr, 'Language')
You can try it manually :
# Maps (FirstName, LastName) -> merged record for that person.
new_dict = {}

d = [{'FirstName': 'David', 'LastName': 'Smith', 'Language': {'en'}},
     {'FirstName': 'David', 'LastName': 'Smith', 'Language': {'fr'}},
     {'FirstName': 'Bob', 'LastName': 'Jones', 'Language': {'en'}}]

for i in d:
    key = (i['FirstName'], i['LastName'])
    if key not in new_dict:
        # Store a copy (with its own set) so that merging later records never
        # mutates the dicts in `d`.
        new_dict[key] = dict(i, Language=set(i['Language']))
    else:
        new_dict[key]['Language'].update(i['Language'])
print(new_dict.values())
output:
# dict_values([{'FirstName': 'Bob',
# 'LastName': 'Jones',
# 'Language': {'en'}},
# {'FirstName': 'David',
# 'LastName': 'Smith',
# 'Language': {'fr', 'en'}}])

Create a list of dictionaries from a list of keys and multiple lists of values

My solution
keys = ['FirstName', 'LastName', 'ID']
name1 = ['Michael', 'Jordan', '224567']
name2 = ['Kyle', 'Hynes', '294007']
name3 = ['Josef', 'Jones', '391107']
dictList = []
dictList.append(dict(zip(keys, name1)))
dictList.append(dict(zip(keys, name2)))
dictList.append(dict(zip(keys, name3)))
Works fine, but is there any other solution, because I will have at least 20000 names, so I am looking how to improve this.
Place all your "name" sublists into the parent list names. Then you can easily use list comprehension:
keys = ['FirstName', 'LastName', 'ID']
names = [
    ['Michael', 'Jordan', '224567'],
    ['Kyle', 'Hynes', '294007'],
    ['Josef', 'Jones', '391107'],
]

# zip() pairs each key with the value at the same position and dict() consumes
# those pairs directly, so no inner comprehension is needed.
dictList = [dict(zip(keys, n)) for n in names]
print(dictList)
The output:
[{'FirstName': 'Michael', 'LastName': 'Jordan', 'ID': '224567'}, {'FirstName': 'Kyle', 'LastName': 'Hynes', 'ID': '294007'}, {'FirstName': 'Josef', 'LastName': 'Jones', 'ID': '391107'}]
Do you really need a dictionary? Why not just use a namedtuple:
>>> from collections import namedtuple
>>> Employee = namedtuple('Employee', 'FirstName, LastName, ID')
>>> names_list = [['Michael', 'Jordan', '224567'], ['Kyle', 'Hynes', '294007'], ['Josef', 'Jones', '391107']]
>>> employee_list = map(Employee._make, names_list)
>>> employee_list[0].FirstName
'Michael'
>>> pprint(employee_list)
[Employee(FirstName='Michael', LastName='Jordan', ID='224567'),
Employee(FirstName='Kyle', LastName='Hynes', ID='294007'),
Employee(FirstName='Josef', LastName='Jones', ID='391107')]
pandas makes this too easy.
import pandas as pd
keys = ['FirstName', 'LastName', 'ID']
name1 = ['Michael', 'Jordan', '224567']
name2 = ['Kyle', 'Hynes', '294007']
name3 = ['Josef', 'Jones', '391107']
doc_list = [name1,name2,name3]
df = pd.DataFrame(doc_list,columns = keys)
So you'll have a DataFrame like this:
FirstName LastName ID
0 Michael Jordan 224567
1 Kyle Hynes 294007
2 Josef Jones 391107
If your names are already in a file, read_csv would be better.
# `names=` supplies the column labels for a file that has no header row
# (`header=` expects row number(s), not labels). Drop `names=keys` if the CSV
# already contains a header row.
pd.read_csv("file_name.csv", names=keys)
You should append your dictionaries to the list inside a loop, like this:
In [1152]: names = [name1, name2, name3]
In [1153]: d = []
In [1154]: for name in names:
...: d.append(dict(zip(keys, name)))
...:
In [1155]: d
Out[1155]:
[{'FirstName': 'Michael', 'ID': '224567', 'LastName': 'Jordan'},
{'FirstName': 'Kyle', 'ID': '294007', 'LastName': 'Hynes'},
{'FirstName': 'Josef', 'ID': '391107', 'LastName': 'Jones'}]
Or, if you prefer, a list comprehension:
In [1160]: d = [dict(zip(keys, name)) for name in names]
In [1161]: d
Out[1161]:
[{'FirstName': 'Michael', 'ID': '224567', 'LastName': 'Jordan'},
{'FirstName': 'Kyle', 'ID': '294007', 'LastName': 'Hynes'},
{'FirstName': 'Josef', 'ID': '391107', 'LastName': 'Jones'}]

Split dictionary field

I've managed to figure out how to run a SQL query to display information. I need to keep the data in the same form as the db tables, so I think I should be using a dictionary. So far, my fields are ID and Name, my print looks like this:
[{'ID': '123', 'Name': 'ROBERTSON*ROBERT'}, {'ID': '456', 'Name': 'MICHAELS*MIKE'}, {'ID': '789', 'Name': 'KRISTENSEN*KRISTEN'}, ...]
First, am I appropriately using dictionary?
Next, I need to split the Name field based on the * delimiter. For example:
Before:
{'ID': '789', 'Name': 'KRISTENSEN*KRISTEN'}
After:
{'ID': '789', 'LastName': 'KRISTENSEN', 'FirstName': 'KRISTEN'}
I've tested out a few things of code I've found but keep hitting roadblocks. I've used this to create my dictionary, I'm wondering if I include a split in this line to reduce a step?
query = [dict(zip(['ID', 'Name'],row)) for row in cursor.fetchall()]
Like so maybe:
# 'Name' is stored as LASTNAME*FIRSTNAME, so the split parts map to LastName
# then FirstName. [row[0]] (a list) is used rather than row[:1] so the
# concatenation also works when the cursor returns rows as tuples.
query = [dict(zip(['ID', 'LastName', 'FirstName'], [row[0]] + row[1].split('*'))) for row in cursor.fetchall()]
db_dict = {'ID': '789', 'Name': 'KRISTENSEN*KRISTEN'}
name = db_dict['Name']


def split_name(name):
    """Split a ``LAST*FIRST`` string into its last-name and first-name parts.

    The split happens at the last ``*`` in ``name``. When ``name`` contains no
    ``*`` at all, the whole string is treated as the last name and the first
    name is empty.

    Returns:
        A dict with the keys ``'LastName'`` and ``'FirstName'``.
    """
    last_name, separator, first_name = name.rpartition('*')
    if not separator:
        # No delimiter present: rpartition() put the whole string in the
        # third slot, so map it to the last name explicitly.
        return {'LastName': name, 'FirstName': ''}
    return {'LastName': last_name, 'FirstName': first_name}


# Carry the ID over by name; indexing keys()/values() by position depends on
# dict ordering and does not work on Python 3 dict views.
new_db_dict = {'ID': db_dict['ID']}
new_db_dict.update(split_name(name))
print(new_db_dict)
First, while your use of dictionaries is valid I recommend using namedtuples for representing fixed structures with named fields
from collections import namedtuple
# structure class factory
Person = namedtuple("Person", ("id", "name"))
people = [ Person('123', 'ROBERTSON*ROBERT'), Person('456','MICHAELS*MIKE'), Person('789', 'KRISTENSEN*KRISTEN')]
# different structure
# The typename matches the variable name so that repr() and error messages
# show "PersonName" rather than the unrelated "Person".
PersonName = namedtuple("PersonName", ("id", "first", "last"))


# structure transformation
def person_to_personname(person):
    """Transform Person -> PersonName.

    ``person.name`` is expected in ``LAST*FIRST`` form. A name without ``*``
    is treated as a last name with an empty first name; any parts after the
    second are ignored.

    Args:
        person: Object with ``id`` and ``name`` attributes.

    Returns:
        A ``PersonName(id, first, last)`` tuple.
    """
    names = person.name.split('*')
    if len(names) < 2:  # depends on your defaults
        last = names[0]
        first = ''
    else:  # assumes first field is last name
        last, first = names[:2]  # even if other names present, takes first two
    return PersonName(person.id, first, last)
people_names = [person_to_personname(person) for person in people]
If all entries have a name split by an asterisk
A solution in two steps. Once you've retrieved your current results :
a = [{'ID': '123', 'Name': 'ROBERTSON*ROBERT'}, {'ID': '456', 'Name': 'MICHAELS*MIKE'}, {'ID': '789', 'Name': 'KRISTENSEN*KRISTEN'}]
result = [{'ID' : entry['ID'], 'LastName' : entry['Name'].split('*')[0], 'FirstName' : entry['Name'].split('*')[1]} for entry in a]
now if you print result :
[{'FirstName': 'ROBERT', 'ID': '123', 'LastName': 'ROBERTSON'},
{'FirstName': 'MIKE', 'ID': '456', 'LastName': 'MICHAELS'},
{'FirstName': 'KRISTEN', 'ID': '789', 'LastName': 'KRISTENSEN'}]
Otherwise (assuming that the field 'Name' is at least populated)
results = []
for entry in a:
name = entry['Name'].split('*')
result = dict(ID = entry['ID'], LastName = name[0])
if len(name) > 1:
result['FirstName'] = name[1]
results.append(result)

lxml memory usage when parsing huge xml in python

I am a Python newbie. I am trying to parse a huge XML file in my Python module using lxml. In spite of clearing the elements at the end of each loop, my memory shoots up and crashes the application. I am sure I am missing something here. Please help me figure out what that is.
Following are main functions I am using -
from lxml import etree
# Generator (question snippet; indentation lost in extraction). For each
# (event, element) pair from `context` it creates a fresh fieldMap and rowList,
# fills them via readAttribs/readAllChildren, yields every collected row, and
# calls element.clear().
def parseXml(context,attribList):
for _, element in context:
fieldMap={}
rowList=[]
readAttribs(element,fieldMap,attribList)
# NOTE(review): readAllChildren is declared with four parameters
# (element, fieldMap, attribList, rowList) but is called with three here, so
# rowList is never passed in.
readAllChildren(element,fieldMap,attribList)
for row in rowList:
yield row
# NOTE(review): this follows the yield loop, so it presumably only runs once
# the consumer resumes the generator — confirm the intended nesting.
element.clear()
# Copy each attribute named in attribList from `element` into fieldMap, using
# '' when the element lacks that attribute.
def readAttribs(element,fieldMap,attribList):
# NOTE(review): the loop variable is spelled `atrrib`, but the body reads
# `attrib`, which is not defined in this snippet.
for atrrib in attribList:
fieldMap[attrib]=element.get(attrib,'')
# Walk the children of `element`: read each child's attributes into fieldMap,
# recurse into children that have children of their own, append a copy of
# fieldMap to the row list, and clear the child.
def readAllChildren(element,fieldMap,attribList,rowList):
for childElem in element:
# NOTE(review): `childEleme` (extra trailing "e") is not the loop variable
# `childElem`.
readAttribs(childEleme,fieldMap,attribList)
if len(childElem) > 0:
# NOTE(review): the recursive call omits the rowList argument.
readAllChildren(childElem,fieldMap,attribList)
# NOTE(review): `rowlist` (lower-case "l") differs from the parameter
# `rowList`.
rowlist.append(fieldMap.copy())
childElem.clear()
# Entry point of the question snippet: iterparse the file on "start" events and
# print every row produced by parseXml.
def main():
attribList=['name','age','id']
# NOTE(review): `fullFilePath` is not defined in this snippet.
context=etree.iterparse(fullFilePath, events=("start",))
# NOTE(review): the `for` statement below is missing its trailing colon.
for row in parseXml(context,attribList)
# Python 2 print statement.
print row
Thanks!!
Example xml and the nested dictionary -
<root xmlns='NS'>
<Employee Name="Mr.ZZ" Age="30">
<Experience TotalYears="10" StartDate="2000-01-01" EndDate="2010-12-12">
<Employment id = "1" EndTime="ABC" StartDate="2000-01-01" EndDate="2002-12-12">
<Project Name="ABC_1" Team="4">
</Project>
</Employment>
<Employment id = "2" EndTime="XYZ" StartDate="2003-01-01" EndDate="2010-12-12">
<PromotionStatus>Manager</PromotionStatus>
<Project Name="XYZ_1" Team="7">
<Award>Star Team Member</Award>
</Project>
</Employment>
</Experience>
</Employee>
</root>
ELEMENT_NAME='element_name'
ELEMENTS='elements'
ATTRIBUTES='attributes'
TEXT='text'
xmlDef={ 'namespace' : 'NS',
'content' :
{ ELEMENT_NAME: 'Employee',
ELEMENTS: [{ELEMENT_NAME: 'Experience',
ELEMENTS: [{ELEMENT_NAME: 'Employment',
ELEMENTS: [{
ELEMENT_NAME: 'PromotionStatus',
ELEMENTS: [],
ATTRIBUTES:[],
TEXT:['PromotionStatus']
},
{
ELEMENT_NAME: 'Project',
ELEMENTS: [{
ELEMENT_NAME: 'Award',
ELEMENTS: {},
ATTRIBUTES:[],
TEXT:['Award']
}],
ATTRIBUTES:['Name','Team'],
TEXT:[]
}],
ATTRIBUTES: ['TotalYears','StartDate','EndDate'],
TEXT:[]
}],
ATTRIBUTES: ['TotalYears','StartDate','EndDate'],
TEXT:[]
}],
ATTRIBUTES: ['Name','Age'],
TEXT:[]
}
}
Welcome to Python and Stack Overflow!
It looks like you've followed some good advice looking at lxml and especially etree.iterparse(..), but I think your implementation is approaching the problem from the wrong angle. The idea of iterparse(..) is to get away from collecting and storing data, and instead processing tags as they get read in. Your readAllChildren(..) function is saving everything to rowList, which grows and grows to cover the whole document tree. I made a few changes to show what's going on:
from lxml import etree
def parseXml(context,attribList):
for event, element in context:
print "%s element %s:" % (event, element)
fieldMap = {}
rowList = []
readAttribs(element, fieldMap, attribList)
readAllChildren(element, fieldMap, attribList, rowList)
for row in rowList:
yield row
element.clear()
def readAttribs(element, fieldMap, attribList):
for attrib in attribList:
fieldMap[attrib] = element.get(attrib,'')
print "fieldMap:", fieldMap
def readAllChildren(element, fieldMap, attribList, rowList):
for childElem in element:
print "Found child:", childElem
readAttribs(childElem, fieldMap, attribList)
if len(childElem) > 0:
readAllChildren(childElem, fieldMap, attribList, rowList)
rowList.append(fieldMap.copy())
print "len(rowList) =", len(rowList)
childElem.clear()
def process_xml_original(xml_file):
attribList=['name','age','id']
context=etree.iterparse(xml_file, events=("start",))
for row in parseXml(context,attribList):
print "Row:", row
Running with some dummy data:
>>> from cStringIO import StringIO
>>> test_xml = """\
... <family>
... <person name="somebody" id="5" />
... <person age="45" />
... <person name="Grandma" age="62">
... <child age="35" id="10" name="Mom">
... <grandchild age="7 and 3/4" />
... <grandchild id="12345" />
... </child>
... </person>
... <something-completely-different />
... </family>
... """
>>> process_xml_original(StringIO(test_xml))
start element: <Element family at 0x105ca58>
fieldMap: {'age': '', 'name': '', 'id': ''}
Found child: <Element person at 0x105ca80>
fieldMap: {'age': '', 'name': 'somebody', 'id': '5'}
len(rowList) = 1
Found child: <Element person at 0x105c468>
fieldMap: {'age': '45', 'name': '', 'id': ''}
len(rowList) = 2
Found child: <Element person at 0x105c7b0>
fieldMap: {'age': '62', 'name': 'Grandma', 'id': ''}
Found child: <Element child at 0x106e468>
fieldMap: {'age': '35', 'name': 'Mom', 'id': '10'}
Found child: <Element grandchild at 0x106e148>
fieldMap: {'age': '7 and 3/4', 'name': '', 'id': ''}
len(rowList) = 3
Found child: <Element grandchild at 0x106e490>
fieldMap: {'age': '', 'name': '', 'id': '12345'}
len(rowList) = 4
len(rowList) = 5
len(rowList) = 6
Found child: <Element something-completely-different at 0x106e4b8>
fieldMap: {'age': '', 'name': '', 'id': ''}
len(rowList) = 7
Row: {'age': '', 'name': 'somebody', 'id': '5'}
Row: {'age': '45', 'name': '', 'id': ''}
Row: {'age': '7 and 3/4', 'name': '', 'id': ''}
Row: {'age': '', 'name': '', 'id': '12345'}
Row: {'age': '', 'name': '', 'id': '12345'}
Row: {'age': '', 'name': '', 'id': '12345'}
Row: {'age': '', 'name': '', 'id': ''}
start element: <Element person at 0x105ca80>
fieldMap: {'age': '', 'name': '', 'id': ''}
start element: <Element person at 0x105c468>
fieldMap: {'age': '', 'name': '', 'id': ''}
start element: <Element person at 0x105c7b0>
fieldMap: {'age': '', 'name': '', 'id': ''}
start element: <Element child at 0x106e468>
fieldMap: {'age': '', 'name': '', 'id': ''}
start element: <Element grandchild at 0x106e148>
fieldMap: {'age': '', 'name': '', 'id': ''}
start element: <Element grandchild at 0x106e490>
fieldMap: {'age': '', 'name': '', 'id': ''}
start element: <Element something-completely-different at 0x106e4b8>
fieldMap: {'age': '', 'name': '', 'id': ''}
It's a little hard to read but you can see it's climbing the whole tree down from the root tag on the first pass, building up rowList for every element in the entire document. You'll also notice it's not even stopping there: since the element.clear() call comes after the yield statement in parseXml(..), it doesn't get executed until the second iteration (i.e. the next element in the tree).
Incremental processing FTW
A simple fix is to let iterparse(..) do its job: parse iteratively! The following will pull the same information and process it incrementally instead:
def do_something_with_data(data):
"""This just prints it out. Yours will probably be more interesting."""
print "Got data: ", data
def process_xml_iterative(xml_file):
# by using the default 'end' event, you start at the _bottom_ of the tree
ATTRS = ('name', 'age', 'id')
for event, element in etree.iterparse(xml_file):
print "%s element: %s" % (event, element)
data = {}
for attr in ATTRS:
data[attr] = element.get(attr, u"")
do_something_with_data(data)
element.clear()
del element # for extra insurance
Running on the same dummy XML:
>>> print test_xml
<family>
<person name="somebody" id="5" />
<person age="45" />
<person name="Grandma" age="62">
<child age="35" id="10" name="Mom">
<grandchild age="7 and 3/4" />
<grandchild id="12345" />
</child>
</person>
<something-completely-different />
</family>
>>> process_xml_iterative(StringIO(test_xml))
end element: <Element person at 0x105cc10>
Got data: {'age': u'', 'name': 'somebody', 'id': '5'}
end element: <Element person at 0x106e468>
Got data: {'age': '45', 'name': u'', 'id': u''}
end element: <Element grandchild at 0x106e148>
Got data: {'age': '7 and 3/4', 'name': u'', 'id': u''}
end element: <Element grandchild at 0x106e490>
Got data: {'age': u'', 'name': u'', 'id': '12345'}
end element: <Element child at 0x106e508>
Got data: {'age': '35', 'name': 'Mom', 'id': '10'}
end element: <Element person at 0x106e530>
Got data: {'age': '62', 'name': 'Grandma', 'id': u''}
end element: <Element something-completely-different at 0x106e558>
Got data: {'age': u'', 'name': u'', 'id': u''}
end element: <Element family at 0x105c6e8>
Got data: {'age': u'', 'name': u'', 'id': u''}
This should greatly improve both the speed and memory performance of your script. Also, by hooking the 'end' event, you're free to clear and delete elements as you go, rather than waiting until all children have been processed.
Depending on your dataset, it might be a good idea to only process certain types of elements. The root element, for one, probably isn't very meaningful, and other nested elements may also fill your dataset with a lot of {'age': u'', 'id': u'', 'name': u''}.
Or, with SAX
As an aside, when I read "XML" and "low-memory" my mind always jumps straight to SAX, which is another way you could attack this problem. Using the builtin xml.sax module:
import xml.sax
class AttributeGrabber(xml.sax.handler.ContentHandler):
"""SAX Handler which will store selected attribute values."""
def __init__(self, target_attrs=()):
self.target_attrs = target_attrs
def startElement(self, name, attrs):
print "Found element: ", name
data = {}
for target_attr in self.target_attrs:
data[target_attr] = attrs.get(target_attr, u"")
# (no xml trees or elements created at all)
do_something_with_data(data)
def process_xml_sax(xml_file):
grabber = AttributeGrabber(target_attrs=('name', 'age', 'id'))
xml.sax.parse(xml_file, grabber)
You'll have to evaluate both options based on what works best in your situation (and maybe run a couple benchmarks, if this is something you'll be doing often).
Be sure to follow up with how things work out!
Edit based on follow-up comments
Implementing either of the above solutions may require some changes to the overall structure of your code, but anything you have should still be doable. For instance, processing "rows" in batches, you could have:
def process_xml_batch(xml_file, batch_size=10):
    """Parse ``xml_file`` incrementally and hand attribute rows over in batches.

    For every element whose end tag is reached, the ``name``, ``age`` and
    ``id`` attributes (empty string when absent) are collected into a dict.
    Dicts are accumulated and passed to ``do_something_with_batch`` each time
    ``batch_size`` of them are available; a final, smaller batch is passed for
    any leftovers.

    Args:
        xml_file: Filename or file-like object accepted by ``etree.iterparse``.
        batch_size: Number of rows per batch.
    """
    ATTRS = ('name', 'age', 'id')
    batch = []
    for event, element in etree.iterparse(xml_file):
        batch.append({attr: element.get(attr, u"") for attr in ATTRS})
        # clear() empties the element, but the emptied node stays attached to
        # its parent. On a huge document those leftover siblings still add up,
        # so also detach everything that precedes this element.
        element.clear()
        parent = element.getparent()
        if parent is not None:  # the root element has no parent to prune
            while element.getprevious() is not None:
                del parent[0]
        del element
        if len(batch) == batch_size:
            do_something_with_batch(batch)
            # Or, if you want this to be a generator:
            # yield batch
            batch = []
    if batch:
        # there are leftover items
        do_something_with_batch(batch)  # Or, yield batch

Categories

Resources