I'm currently building large XML files with xml.dom.minidom and then writing them out to disk via toprettyxml. Is there a way to stream the XML to a file instead? I'm hitting memory errors.
def run(self):
    while True:
        domain = self.queue.get()
        try:
            conn = boto.connect_sdb(awsa, awss)
            sdbdomain = conn.get_domain(domain)
            s3conn = boto.connect_s3(awsa, awss)
            archbucket = s3conn.get_bucket("simpledbbu")
            doc = None
            doc = Document()
            root = doc.createElement("items")
            doc.appendChild(root)
            countermax = 0
            counter = 0
            for item in sdbdomain:
                node = doc.createElement("item")
                node.setAttribute("itemName", item.name)
                for k,v in item.items():
                    if not isinstance(v, basestring):
                        i = 0
                        for val in v:
                            node.setAttribute("{0}::{1}".format(k,i),val)
                            i += 1
                    else:
                        node.setAttribute(k,v)
                root.appendChild(node)
            k = Key(archbucket)
            k.key = "{0}/{1}.xml".format(datetime.date.today().strftime("%Y%m%d"),sdbdomain.name)
            #x = doc.toprettyxml(indent=" ")
            f = open(domain + ".xml", "w")
            f.truncate()
            f.write(doc.toprettyxml(indent=" "))
            f.close()
            #k.content_type.encode('ascii')
            k.set_contents_from_filename(f.name)
            os.remove(os.path.join(os.getcwd(),f.name))
        except:
            print "failed to load domain: {0}".format(domain)
            print formatExceptionInfo()
        finally:
            self.queue.task_done()
You say you are building large XML files with xml.dom.minidom and then writing them out via toprettyxml.
If you run out of memory, you should probably stop doing that.
You can build the XML with simple string manipulation.
with open(domain + ".xml", "w") as f:
    f.write( "<?xml..." )
    f.write( "<items>" )
    for item in sdbdomain:
        buffer= []
        for k,v in item.items():
            if not isinstance(v, basestring):
                for i, val in enumerate(v):
                    txt= '{0}::{1}="{2}"'.format(k,i,val)
                    buffer.append( txt )
            else:
                txt= '{0}="{1}"'.format(k,v)
                buffer.append( txt )
        f.write( " <item {0}/>\n".format( " ".join(buffer) ))
    f.write( "</items>" )
k= ................
k.set_contents_from_filename(f.name)
Something like that ought to allow you to write the XML to a temporary file without making a large DOM object in memory.
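One caveat with plain string building: attribute values coming out of SimpleDB may contain characters such as quotes or ampersands that must be escaped. A minimal sketch of the same loop using xml.sax.saxutils.quoteattr (sdbdomain, domain and item come from the code above; everything else here is an assumption, not your method):
from xml.sax.saxutils import quoteattr

with open(domain + ".xml", "w") as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write("<items>\n")
    for item in sdbdomain:
        # quoteattr returns the value already wrapped in quotes, with escaping applied
        attrs = ['itemName=%s' % quoteattr(item.name)]
        for k, v in item.items():
            if not isinstance(v, basestring):
                for i, val in enumerate(v):
                    attrs.append('%s::%s=%s' % (k, i, quoteattr(val)))
            else:
                attrs.append('%s=%s' % (k, quoteattr(v)))
        f.write("  <item %s/>\n" % " ".join(attrs))
    f.write("</items>\n")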
Can anybody advise what could be wrong with my code?
I am trying to make a method that removes the single-line comments from the content.
Also, the method should return the single-line comments that start with '#'.
import os
def deleteComments(file):
    try:
        my_file = open(file, 'r')
        data = my_file.read()
        clean = ""
        comment= 0
        if i[0] == "#":
            comment += 1
        else:
            pass
        with open("clean-", "w") as f:
            f.write(clean)
            f.close()
        my_file.close()
    except:
        print("An error occurred with accessing the files")
        return file
This should make it work.
import os
def deleteComments(file):
    try:
        my_file = open(file, 'r')
        data = my_file.read()
        clean = ""
        comments_count = 0
        for i in data.split('\n'):
            if i.startswith("#"):  # startswith avoids an IndexError on empty lines
                clean += i
                clean += '\n'
                comments_count += 1
            else:
                pass
        name = os.path.basename(file)
        with open("clean-" + name, "w") as f:
            f.write(clean)
        my_file.close()
        return comments_count
    except:
        print("An error occurred with accessing the files")
        return file
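Hypothetical usage (the filename is just an example): the call writes the collected comment lines to a clean-<name> file next to the script and returns how many it found.
count = deleteComments("example.py")
print("Found {0} comment lines".format(count))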
I've only learnt the basics of Python, so please forgive me, but I was not able to determine the fix from the other posts. I open my JSON files with 'r' and I think I'm writing to them in 'r', but it doesn't like that, and changing it to 'r' doesn't help :(
For the following section:
if isinstance(to_write, list):
    self.log_file.write(''.join(to_write) + "<r/>")
else:
    self.log_file.write(str(to_write) + "<r/>")
self.log_file.flush()
The error I get is: a bytes-like object is required, not 'str'
import math
import time
from random import randint
import json
from instagram.client import InstagramAPI

class Bot:
    def __init__(self, config_file, tags_file):
        # Loading the configuration file, it has the access_token, user_id and others configs
        self.config = json.load(config_file)
        # Loading the tags file, it will be keep up to date while the script is running
        self.tags = json.load(tags_file)
        # Log file to output to html the debugging info about the script
        self.filename = self.config["path"] + self.config["prefix_name"] + time.strftime("%d%m%Y") + ".html"
        self.log_file = open(self.filename, "wb")
        # Initializing the Instagram API with our access token
        self.api = InstagramAPI(access_token=self.config["access_token"], client_secret=self.config['client_secret'])
        # Likes per tag rate
        self.likes_per_tag = math.trunc(min(self.config["follows_per_hour"],
                                            self.config["likes_per_hour"]) / len(self.tags["tags"]))

    def save_tags(self):
        j = json.dumps(self.tags, indent=4)
        f = open('tags.json', 'w')
        print >> f, j
        f.close()

    def insta_write(self, to_write):
        if self.filename != self.config["path"] + self.config["prefix_name"] + time.strftime("%d%m%Y") + ".html":
            self.log_file.close()
            self.filename = self.config["path"] + self.config["prefix_name"] + time.strftime("%d%m%Y") + ".html"
            self.log_file = open(self.filename, "wb")
        if isinstance(to_write, list):
            self.log_file.write(''.join(to_write) + "<r/>")
        else:
            self.log_file.write(str(to_write) + "<r/>")
        self.log_file.flush()

    def going_sleep(self, timer):
        sleep = randint(timer, 2 * timer)
        self.insta_write("SLEEP " + str(sleep))
        time.sleep(sleep)

    def like_and_follow(self, media, likes_for_this_tag):
        try:
            var = self.api.user_relationship(user_id=media.user.id)
            if self.config["my_user_id"] != media.user.id:
                self.insta_write("--------------")
                self.insta_write(var)
                if var.outgoing_status == 'none':
                    self.insta_write("LIKE RESULT:")
                    self.insta_write(self.api.like_media(media_id=media.id))
                    self.insta_write("FOLLOW RESULT:")
                    self.insta_write(self.api.follow_user(user_id=media.user.id))
                    likes_for_this_tag -= 1
                    self.going_sleep(self.config["sleep_timer"])
                else:
                    self.going_sleep(self.config["sleep_timer"] / 2)
        except Exception as e:
            self.insta_write(str(e))
            self.insta_write("GOING SLEEP 30 min")
            time.sleep(1800)
            self.like_and_follow(media, likes_for_this_tag)
        return likes_for_this_tag

    def run(self):
        while True:
            for tag in self.tags["tags"].keys():
                tag = str(tag)
                self.insta_write("--------------------")
                self.insta_write("TAG: " + tag)
                self.insta_write("--------------------")
                self.insta_write("--------------------")
                self.insta_write("DICTIONARY STATUS:")
                for keys, values in self.tags["tags"].items():
                    self.insta_write(keys)
                    if values is not None:
                        self.insta_write(values)
                likes_for_this_tag = self.likes_per_tag
                while likes_for_this_tag > 0 and self.tags["tags"][tag] != 0:
                    if self.tags["tags"][tag] is None:
                        media_tag, self.tags["tags"][tag] = self.api.tag_recent_media(tag_name=tag,
                                                                                      count=likes_for_this_tag)
                    else:
                        media_tag, self.tags["tags"][tag] = self.api.tag_recent_media(tag_name=tag,
                                                                                      count=likes_for_this_tag,
                                                                                      max_tag_id=self.tags["tags"][tag])
                    self.insta_write("API CALL DONE")
                    if len(media_tag) == 0 or self.tags["tags"][tag] is None:
                        self.tags["tags"][tag] = 0
                        likes_for_this_tag = 0
                    else:
                        self.insta_write(self.tags["tags"][tag])
                        self.tags["tags"][tag] = self.tags["tags"][tag].split("&")[-1:][0].split("=")[1]
                        self.save_tags()
                        for m in media_tag:
                            likes_for_this_tag = self.like_and_follow(m, likes_for_this_tag)
            if reduce(lambda r, h: r and h[1] == 0, self.tags["tags"].items(), True):
                self.insta_write("END")
                exit(1)

if __name__ == '__main__':
    bot = Bot(open("config_bot.json", "r"), open("tags.json", "r"))
    bot.run()
You opened the file as binary:
self.log_file = open(self.filename, "wb")
but are writing str Unicode strings to it. Either open the file in text mode (with an encoding set) or encode each string separately.
Opening the file in text mode is easiest:
self.log_file = open(self.filename, "w", encoding="utf8")
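If you prefer to keep the file in binary mode, the other option is to encode each string before writing. A sketch based on the insta_write branches above:
# Same branches as insta_write, but encoding explicitly for a file opened with "wb".
if isinstance(to_write, list):
    self.log_file.write((''.join(to_write) + "<r/>").encode("utf8"))
else:
    self.log_file.write((str(to_write) + "<r/>").encode("utf8"))
self.log_file.flush()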
In my case, the reason for the error was a conflict between the json.load function and a function named load from another module. Explicitly specifying which load function to use, i.e. json.load, solved the problem.
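For illustration, a minimal sketch of that kind of name clash (the shadowing import is hypothetical and commented out):
import json

# from some_other_module import load   # a bare "load" like this would shadow json's load

with open("config_bot.json") as fh:
    config = json.load(fh)              # fully qualified, so there is no ambiguity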
Simple question: is it possible to make configobj not put a space before and after the '=' in a configuration entry?
I'm using configobj to read and write a file that is later processed by a bash script, so an entry like:
VARIABLE = "value"
breaks the bash script; it always needs to be:
VARIABLE="value"
Or, if someone has another suggestion about how to read and write a file with this kind of entries (and restrictions), that's fine too.
Thanks
I was looking into the same thing and modified configobj.py by changing line 1980 in:
def _write_line(self, indent_string, entry, this_entry, comment)
from:
self._a_to_u(' = ')
to:
self._a_to_u('=')
After the change the output is without the space before and after equal sign.
Configobj is for reading and writing ini-style config files. You are apparently trying to use it to write bash scripts. That's not something that is likely to work.
Just write the bash script the way you want it to be, perhaps using a template or something instead.
Making configobj not write the spaces around the = probably requires that you subclass it. I would guess that you have to modify the write method, but only reading the code can help there. :-)
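A minimal sketch of the template idea, using Python's built-in string.Template (the variable names and output path are only examples):
from string import Template

# Hypothetical settings to dump into a bash-style file.
settings = {"VARIABLE": "value", "OTHER": "two words"}

line = Template('$name="$value"\n')
with open("settings.sh", "w") as out:
    for name, value in settings.items():
        out.write(line.substitute(name=name, value=value))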
Well, as suggested, I ended up writing my own parser for this, which can be used in exactly the same way as ConfigObj:
config = MyConfigParser("configuration_file")
print config["CONFIG_OPTION_1"]
config["CONFIG_OPTION_1"] = "Value 1"
print config["CONFIG_OPTION_1"]
config.write()
This is the code, if someone is interested or wants to give suggestions (I started coding in Python not so long ago, so there is probably lots of room for improvement). It respects the comments and the order of the options in the file, and correctly escapes and adds double quotes where needed:
import os
import sys

class MyConfigParser:

    name = 'MyConfigParser'
    debug = False
    fileName = None
    fileContents = None
    configOptions = dict()

    def __init__(self, fileName, debug=False):
        self.fileName = fileName
        self.debug = debug
        self._open()

    def _open(self):
        try:
            with open(self.fileName, 'r') as file:
                for line in file:
                    #If it isn't a comment get the variable and value and put it on a dict
                    if not line.startswith("#") and len(line) > 1:
                        (key, val) = line.rstrip('\n').split('=')
                        val = val.strip()
                        val = val.strip('\"')
                        val = val.strip('\'')
                        self.configOptions[key.strip()] = val
        except:
            print "ERROR: File " + self.fileName + " Not Found\n"

    def write(self):
        try:
            #Write the file contents
            with open(self.fileName, 'r+') as file:
                lines = file.readlines()
                #Truncate file so we don't need to close it and open it again
                #for writing
                file.seek(0)
                file.truncate()
                i = 0
                #Loop through the file to change with new values in dict
                for line in lines:
                    if not line.startswith("#") and len(line) > 1:
                        (key, val) = line.rstrip('\n').split('=')
                        try:
                            if key in line:
                                newVal = self.configOptions[key]
                                #Only update if the variable value has changed
                                if val != newVal:
                                    newLine = key + "=\"" + newVal + "\"\n"
                                    line = newLine
                        except:
                            continue
                    i += 1
                    file.write(line)
        except IOError as e:
            print "ERROR opening file " + self.fileName + ": " + e.strerror + "\n"

    #Redefinition of __getitem__ and __setitem__
    def __getitem__(self, key):
        try:
            return self.configOptions.__getitem__(key)
        except KeyError as e:
            if isinstance(key, int):
                keys = self.configOptions.keys()
                return self.configOptions[keys[key]]
            else:
                raise KeyError("Key " + key + " doesn't exist")

    def __setitem__(self, key, value):
        self.configOptions[key] = value
As suggested above, it is possible to remove the spaces on either side of the equals sign by making a small change to the _write_line method. This can be done conveniently by subclassing ConfigObj and overriding _write_line as follows:
from configobj import ConfigObj

class MyConfigObj(ConfigObj):

    def __init__(self, *args, **kwargs):
        ConfigObj.__init__(self, *args, **kwargs)

    def _write_line(self, indent_string, entry, this_entry, comment):
        """Write an individual line, for the write method"""
        # NOTE: the calls to self._quote here handles non-StringType values.
        if not self.unrepr:
            val = self._decode_element(self._quote(this_entry))
        else:
            val = repr(this_entry)
        return '%s%s%s%s%s' % (indent_string,
                               self._decode_element(self._quote(entry, multiline=False)),
                               self._a_to_u('='),
                               val,
                               self._decode_element(comment))
Then just use MyConfigObj in place of ConfigObj, and all the functionality of ConfigObj is maintained.
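For example, a hypothetical round trip with the subclass (the file name is made up):
# Read an existing file, change a value, and write it back without spaces around '='.
config = MyConfigObj("settings.cfg")
config["VARIABLE"] = "value"
config.write()   # emits VARIABLE=value, quoted only when the value needs it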
As Lennart suggests, configobj is probably not the right tool for the job: how about:
>>> import pipes
>>> def dict2bash(d):
...     for k, v in d.iteritems():
...         print "%s=%s" % (k, pipes.quote(v))
...
>>> dict2bash({'foo': "bar baz quux"})
foo='bar baz quux'
Since configobj returns something that looks a lot like a dict, you could probably still use it to read the data you are trying to process.
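A sketch of that combination (the input and output file names are assumptions):
from configobj import ConfigObj
import pipes

# Read the existing ini-style file with configobj, then emit bash-friendly lines.
cfg = ConfigObj("settings.cfg")
with open("settings.sh", "w") as out:
    for k, v in cfg.items():
        out.write("%s=%s\n" % (k, pipes.quote(str(v))))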
First of all, thanks Juancho, that's what I was looking for. But I edited the ConfigParser a little bit. Now it can handle bash-script arrays of the form:
# Network interfaces to be configured
ifaces=( "eth0" "eth1" "eth2" "eth3" )
If you set a value, it just checks whether the value is a list and, if so, sets the quotes correctly. So you can still set values the same way, even if the value is a list:
ifaces = ['eth0', 'eth1', 'eth2', 'eth3']
conf['ifaces'] = ifaces
Here's the code:
import os
import sys

class MyConfigParser:

    name = 'MyConfigParser'
    debug = False
    fileName = None
    fileContents = None
    configOptions = dict()
    qouteOptions = dict()

    def __init__(self, fileName, debug=False):
        self.fileName = fileName
        self.debug = debug
        self._open()

    def _open(self):
        try:
            with open(self.fileName, 'r') as file:
                for line in file:
                    #If it isn't a comment get the variable and value and put it on a dict
                    if not line.startswith("#") and len(line) > 1:
                        (key, val) = line.rstrip('\n').split('=')
                        val = val.strip()
                        val = val.strip('\"')
                        val = val.strip('\'')
                        self.configOptions[key.strip()] = val
                        if val.startswith("("):
                            self.qouteOptions[key.strip()] = ''
                        else:
                            self.qouteOptions[key.strip()] = '\"'
        except:
            print "ERROR: File " + self.fileName + " Not Found\n"

    def write(self):
        try:
            #Write the file contents
            with open(self.fileName, 'r+') as file:
                lines = file.readlines()
                #Truncate file so we don't need to close it and open it again
                #for writing
                file.seek(0)
                file.truncate()
                #Loop through the file to change with new values in dict
                for line in lines:
                    if not line.startswith("#") and len(line) > 1:
                        (key, val) = line.rstrip('\n').split('=')
                        try:
                            if key in line:
                                quotes = self.qouteOptions[key]
                                newVal = quotes + self.configOptions[key] + quotes
                                #Only update if the variable value has changed
                                if val != newVal:
                                    newLine = key + "=" + newVal + "\n"
                                    line = newLine
                        except:
                            continue
                    file.write(line)
        except IOError as e:
            print "ERROR opening file " + self.fileName + ": " + e.strerror + "\n"

    #Redefinition of __getitem__ and __setitem__
    def __getitem__(self, key):
        try:
            return self.configOptions.__getitem__(key)
        except KeyError as e:
            if isinstance(key, int):
                keys = self.configOptions.keys()
                return self.configOptions[keys[key]]
            else:
                raise KeyError("Key " + key + " doesn't exist")

    def __setitem__(self, key, value):
        if isinstance(value, list):
            self.qouteOptions[key] = ''
            value_list = '('
            for item in value:
                value_list += ' \"' + item + '\"'
            value_list += ' )'
            self.configOptions[key] = value_list
        else:
            self.qouteOptions[key] = '\"'
            self.configOptions[key] = value
I have a plain text file with the following data:
id=1
name=Scott
occupation=Truck driver
age=23
id=2
name=Dave
occupation=Waiter
age=16
id=3
name=Susan
occupation=Computer programmer
age=29
I'm trying to work out the best way to get to any point in the file given an id string, then grab the rows underneath to extract the data for use in my program. I can do something like:
def get_person_by_id(id):
    file = open('rooms', 'r')
    for line in file:
        if ("id=" + id) in line:
            print(id + " found")
But I'm not sure how I can now go through the next bunch of lines and do line.split("=") or similar to extract the info (put into a list or dict or whatever) that I can use my program. Any pointers?
One option would be to load the entire thing into memory, which would save you from reading the file every time:
with open('rooms') as f:
    chunks = f.read().split('\n\n')

people_by_id = {}
for chunk in chunks:
    data = dict(row.split('=', 1) for row in chunk.split('\n'))
    people_by_id[data['id']] = data
    del data['id']

def get_person_by_id(id):
    return people_by_id.get(id)
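Assuming the sample file above, a lookup would then look something like this ('2' is the id from the sample data; note the 'id' key itself was removed from each record):
person = get_person_by_id('2')
print(person)   # e.g. {'name': 'Dave', 'occupation': 'Waiter', 'age': '16'}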
How about exiting from a for loop after finding the correct line:
def get_person_by_id(id):
    file = open('rooms', 'r')
    for line in file:
        if ("id=" + id) in line:
            print(id + " found")
            break
    #now you can continue processing your file:
    next_line = file.readline()
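To finish the idea, one could keep reading the following lines until a blank line and split each attribute on '='. A rough sketch along those lines (not the answerer's exact code):
def get_person_by_id(id):
    person = {}
    with open('rooms') as f:
        for line in f:
            if ("id=" + id) in line:
                person['id'] = id
                # collect the following attribute lines until a blank line
                for next_line in f:
                    next_line = next_line.strip()
                    if not next_line:
                        break
                    attr, value = next_line.split('=', 1)
                    person[attr] = value
                break
    return person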
Maybe:
d = dict()
with open(filename) as f:
    for line in f:
        k, v = line.split('=')
        if 'id=' in line:
            d[v] = {}
        d[d.keys()[-1]][k] = v
And here is an iterative solution.
objects = []
current_object = None

with open("info.txt", "rb") as f:
    for line in f:
        line = line.strip("\r\n")
        if not line:
            current_object = None
            continue
        if current_object is None:
            current_object = {}
            objects.append(current_object)
        key, _, value = line.partition('=')
        current_object[key] = value

print objects
Another example of an iterative parser:
from itertools import takewhile

def entries(f):
    e = {}
    def read_one():
        one = {}
        for line in takewhile(lambda x: '=' in x, f):
            key, val = line.strip().split('=')
            one[key] = val
        return one
    while True:
        one = read_one()
        if not one:
            break
        else:
            e[one.pop('id')] = one
    return e
Example:
>>> with open('data.txt') as f:
..:     print entries(f)['2']
{'age': '16', 'occupation': 'Waiter', 'name': 'Dave'}
Get all the person's attributes and values (i.e. id, name, occupation, age, etc.) until you find an empty line.
def get_person_by_id(id):
    person = {}
    found = False
    file = open('rooms', 'r')
    for line in file:
        if found == True:
            if line.strip():
                attr, value = line.split("=")
                person[attr] = value
            else:
                return person
        elif ("id=" + id) in line:
            print(id + " found")
            found = True
            attr, value = line.split("=")
            person[attr] = value
    return person
This solution is a bit more forgiving of empty lines within records.
def read_persons(it):
    person = dict()
    for l in it:
        try:
            k, v = l.strip('\n').split('=', 1)
        except ValueError:
            pass
        else:
            if k == 'id':  # New record
                if person:
                    yield person
                person = dict()
            person[k] = v
    if person:
        yield person
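A possible way to use the generator (the index-by-id step is my addition, not part of the answer):
with open('rooms') as f:
    people_by_id = dict((p['id'], p) for p in read_persons(f))

print people_by_id.get('2')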
I would like to parse a config file containing lists of filenames, divided into sections:
[section1]
path11/file11
path12/file12
...
[section2]
path21/file21
..
I tried ConfigParser, but it requires name-value pairs. How can I parse such a file?
You will likely have to implement the parser on your own.
Blueprint:
key = None
current = list()
for line in file(...):
    if line.startswith('['):
        if key:
            print key, current
        key = line[1:-1]
        current = list()
    else:
        current.append(line)
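Note the blueprint never emits the last section. A slightly fleshed-out version that collects everything into a dict might look like this (the file name is only an example):
sections = {}
key = None
current = list()
with open('files.cfg') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        if line.startswith('['):
            if key:
                sections[key] = current
            key = line[1:-1]
            current = list()
        else:
            current.append(line)
if key:
    sections[key] = current   # don't forget the final section
print sections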
Here is an iterator/generator solution:
data = """\
[section1]
path11/file11
path12/file12
...
[section2]
path21/file21
...""".splitlines()
def sections(it):
nextkey = next(it)
fin = False
while not fin:
key = nextkey
body = ['']
try:
while not body[-1].startswith('['):
body.append(next(it))
except StopIteration:
fin = True
else:
nextkey = body.pop(-1)
yield key, body[1:]
print dict(sections(iter(data)))
# if reading from a file, do: dict(sections(file('filename.dat')))