Change Element text in xml-file - python

In some metadata.xml files I have to modify some xml element texts. I am using a small python script for that. The first part did work without problems:
# Parse the metadata file and cut everything from "-NAC" onwards off the identifier.
# NOTE(review): this first binding of `ns` is replaced two lines below by the root's nsmap.
ns = etree.FunctionNamespace(None)
tree = etree.parse(os.path.join(root,*metadata.xml*))
# Use the namespace map of the root element for the XPath query.
ns = tree.getroot().nsmap
try:
# The expression ends in text(), so [0] is the first matching text result, not an element.
id_xml = tree.xpath("/re:EarthObservation/gml:metaDataProperty/re:EarthObservationMetaData/hma:identifier/text()", namespaces = ns)[0]
# Keep only the part of the identifier that precedes "-NAC".
id = id_xml.split("-NAC")[0]
id_xml.text = id
except:
# Bare except: any error raised in the try body is silently discarded.
pass
Now the second part won't work, even though I use the same routine, just on another XML file:
# Same routine as before, applied to the RpcMetadata file: rewrite the two parent file names.
# NOTE(review): this first binding of `ns` is replaced two lines below by the root's nsmap.
ns = etree.FunctionNamespace(None)
tree = etree.parse(os.path.join(root, file))
ns = tree.getroot().nsmap
try:
# Both expressions end in text(), so id_if and id_meta are text results, not elements.
id_if = tree.xpath("/re:RpcMetadata/re:parentImageFile/text()", namespaces = ns)[0]
id_meta = tree.xpath("/re:RpcMetadata/re:parentMetadataFile/text()", namespaces = ns)[0]
# Keep the part before "-NAC" and append the new suffix.
new_if = id_if.split("-NAC")[0]
new_meta = id_meta.split("-NAC")[0]
new_if = "{}_some_text".format(new_if)
new_meta = "{}_some_text".format(new_meta)
# Attribute assignment on the text results obtained above.
id_if.text = new_if
id_meta.text = new_meta
except Exception as e:
# Unlike the first snippet, the error is printed here instead of being discarded.
print(e)
Does anyone have an idea what I'm doing wrong?
The exception says " 'lxml.etree._ElementUnicodeResult' object has no attribute 'text' ".

Related

Python null check import xls

I have Python code that reads fields from an .xls file.
The script works, but there are problems when the file contains empty fields.
When the file has an empty field, the script does not read that field.
My code is below; in this example the NORD field is empty:
from msexcel8com import *
# Read the first worksheet of the workbook at dsIn["PATH_TO_XLS"] through Excel
# automation and store the rows as <CLIENT> elements of <MSG> documents in
# dsOut["XML_OUT"].
# NOTE(review): the indentation of this listing was lost, so the exact nesting
# of the statements below cannot be confirmed from the text alone.
def convert(dsIn, dsOut):
import sys
sys.setdefaultencoding("utf-8")
import msexcel8com
xlsApp = msexcel8com.Application()
xlsApp.Workbooks.Open(unicode(dsIn["PATH_TO_XLS"]))
xlsWorkbook = xlsApp.Workbooks.Item(1)
xlsWorksheet = xlsWorkbook.Worksheets.Item(1)
# Activate a special cell and take its row as the row count
# (11 is presumably xlCellTypeLastCell - TODO confirm).
xlsWorksheet.Cells.SpecialCells(11, None).Activate()
rowsCount = xlsApp.ActiveCell.Row
import msxml2
dsOut.clear()
# Start the first output document: <MSG FORMAT="IMPORT_LN">.
outXML = msxml2.DOMDocument()
RootNode = outXML.createElement("MSG")
RootNode.setAttribute("FORMAT", "IMPORT_LN")
ChildNodes = outXML.appendChild(RootNode)
# i: current row index, k: counter compared against the threshold c.
i, k, c = 1, 1, 2
while i < rowsCount:
# i is incremented before it is used, so the first row read is row 2.
i = i + 1
if k > c:
# Threshold exceeded: store the current document in dsOut and start a new one.
k = 0
dsOut.append()
dsOut["XML_OUT"] = unicode.encode(outXML.xml, "utf-8")
outXML = msxml2.DOMDocument()
RootNode = outXML.createElement("MSG")
RootNode.setAttribute("FORMAT", "IMPORT_LN")
ChildNodes = outXML.appendChild(RootNode)
try:
# One <CLIENT> element per row: column 1 -> NCODE, column 2 -> NORD.
TMPNode = outXML.createElement("CLIENT")
TMPNode.setAttribute("NCODE", xlsWorksheet.Cells.Item(i, 1).Value)
TMPNode.setAttribute("NORD", xlsWorksheet.Cells.Item(i, 2).Value)
ChildNodes.appendChild(TMPNode)
k = k + 1
except Exception as e:
# A failure while building the element is only printed; the element is then
# never appended because appendChild comes after the setAttribute calls.
print(e)
# Store the document that is still open.
dsOut.append()
dsOut["XML_OUT"] = unicode.encode(outXML.xml, "utf-8")
# Best-effort shutdown of Excel: errors are printed, not raised.
try:
xlsApp.Workbooks.Close()
except Exception as e:
print(e)
try:
xlsApp.Quit()
except Exception as e:
print(e)
How can I make sure that an empty field is returned as null while the rest of the values are still read?
I couldn't resist the temptation of writing this without all that Excel automation.
Assuming an Excel file that looks something like this called so59715137.xls:
import xlrd # assuming it's an .xls, not .xlsx
import xml.etree.ElementTree as et
def read_rows(xls_filename, column_labels):
    """Yield one dict per row of the first sheet of an .xls workbook.

    Each dict pairs the entries of ``column_labels`` with the cell values
    of the row, in order.
    """
    workbook = xlrd.open_workbook(xls_filename)
    sheet = workbook.sheet_by_index(0)
    for row_index in range(sheet.nrows):
        cell_values = sheet.row_values(row_index)
        yield dict(zip(column_labels, cell_values))
def convert(xls_filename):
    """Return the IMPORT_LN message for an .xls file as a string.

    Every row is printed for debugging. Only rows in which both NCODE and
    NORD are truthy become <CLIENT> elements of the <MSG> root.
    """
    message = et.Element("MSG", {"FORMAT": "IMPORT_LN"})
    labels = ("NCODE", "NORD")
    for record in read_rows(xls_filename, labels):
        print(record)  # for debugging
        if not (record.get("NCODE") and record.get("NORD")):
            continue  # a value is missing: leave this row out
        # The row dict doubles as the attribute mapping of the element.
        et.SubElement(message, "CLIENT", attrib=record)
    return et.tostring(message, encoding="unicode")
xml_content = convert("so59715137.xls")
print("-------------------------------")
print(xml_content)
# TODO: write to file
outputs (with debugging output included, so you see it reads but elides the rows that are missing data)
{'NCODE': 'foo', 'NORD': 'faa'}
{'NCODE': 'blep', 'NORD': 'blop'}
{'NCODE': 'missing', 'NORD': ''}
{'NCODE': '', 'NORD': 'other-missing'}
-------------------------------
<MSG FORMAT="IMPORT_LN"><CLIENT NCODE="foo" NORD="faa" /><CLIENT NCODE="blep" NORD="blop" /></MSG>
From there on out, it's easy to read/write your dsIn/dsOut structures.

Python or expression on exception

I have this code:
# Try to load all five saved models. Because a single try covers all five
# loads, the bare except below rebuilds every model through lrn as soon as any
# statement in the try body raises.
try:
info_model = Doc2Vec.load('models/info_model')
salary_model = Doc2Vec.load('models/salary_model')
education_model = Doc2Vec.load('models/education_model')
experience_model = Doc2Vec.load('models/experience_model')
skills_model = Doc2Vec.load('models/skills_model')
except:
# Fallback: create each model with the matching lrn function.
info_model = lrn.info_model()
salary_model = lrn.salary_model()
education_model = lrn.education_model()
experience_model = lrn.experience_model()
skills_model = lrn.skills_model()
Basically, it checks whether the files exist and creates the models if not. But for this to work correctly I would like to check each of these variables one by one. For that I would need a separate try/except for each one.
I came up with something like this:
experience_model = Doc2Vec.load('models/experience_model') or lrn.experience_model()
But this line still gives me a FileNotFound exception. Is there a workaround, or should I write a try/except statement for each variable?
You could define a helper like this:
def load_or_default(filename, default):
    """Load a Doc2Vec model from ``filename``, or build one with ``default``.

    ``default`` is a zero-argument callable; it is invoked only when the
    load raises FileNotFoundError.
    """
    try:
        model = Doc2Vec.load(filename)
    except FileNotFoundError:
        model = default()
    return model
info_model = load_or_default('models/info_model', lrn.info_model)
salary_model = load_or_default('models/salary_model', lrn.salary_model)
education_model = load_or_default('models/education_model', lrn.education_model)
experience_model = load_or_default('models/experience_model', lrn.experience_model)
skills_model = load_or_default('models/skills_model', lrn.skills_model)
It's worth noting that the default is passed as a callable and is only called inside the function, when loading the file fails.

cookie_str = match.group(1).AttributeError: 'NoneType' object has no attribute 'group'

I am working on a stock prediction project. I want to download historical data from Yahoo Finance and save it in CSV format.
Since I am a beginner in Python, I am unable to correct the error.
My code is as follows:
import re
import urllib2
import calendar
import datetime
import getopt
import sys
import time
crumble_link = 'https://finance.yahoo.com/quote/{0}/history?p={0}'
crumble_regex = r'CrumbStore":{"crumb":"(.*?)"}'
cookie_regex = r'Set-Cookie: (.*?); '
quote_link = 'https://query1.finance.yahoo.com/v7/finance/download/{}?period1={}&period2={}&interval=1d&events=history&crumb={}'
# Fetch the Yahoo Finance history page for `symbol` and return the pair
# (crumble_str, cookie_str) extracted from the page body and the response headers.
def get_crumble_and_cookie(symbol):
link = crumble_link.format(symbol)
response = urllib2.urlopen(link)
# re.search returns None when cookie_regex does not match the headers;
# .group(1) on the next line then raises AttributeError.
match = re.search(cookie_regex, str(response.info()))
cookie_str = match.group(1)
text = response.read()
# Same search-then-group pattern for the crumb, this time on the page body.
match = re.search(crumble_regex, text)
crumble_str = match.group(1)
return crumble_str, cookie_str
# Download the historical quotes of `symbol` between date_from and date_to
# (both "%Y-%m-%d" strings). Returns the response body of a successful request,
# or "" at the end of the function.
# NOTE(review): the indentation of this listing was lost; the nesting of the
# retry bookkeeping (attempts += 1 / sleep) is therefore not certain.
def download_quote(symbol, date_from, date_to):
# Convert both dates to epoch seconds; calendar.timegm treats the time tuple as UTC.
time_stamp_from = calendar.timegm(datetime.datetime.strptime(date_from, "%Y-%m-%d").timetuple())
time_stamp_to = calendar.timegm(datetime.datetime.strptime(date_to, "%Y-%m-%d").timetuple())
attempts = 0
while attempts < 5:
# Obtain a crumb and cookie, then build the download URL from them.
crumble_str, cookie_str = get_crumble_and_cookie(symbol)
link = quote_link.format(symbol, time_stamp_from, time_stamp_to, crumble_str)
#print link
# The cookie is sent back as a request header.
r = urllib2.Request(link, headers={'Cookie': cookie_str})
try:
response = urllib2.urlopen(r)
text = response.read()
print "{} downloaded".format(symbol)
return text
except urllib2.URLError:
print "{} failed at attempt # {}".format(symbol, attempts)
attempts += 1
# The wait grows with the attempt count: 2, 4, 6, ... seconds.
time.sleep(2*attempts)
return ""
# Script entry point: parse --from, --to, --symbol and -o, download the quotes
# and write them to the output file.
# NOTE(review): the indentation of this listing was lost; the nesting shown by
# the statement order is assumed, not confirmed.
if __name__ == '__main__':
# This call runs before any option is parsed.
print get_crumble_and_cookie('KO')
from_arg = "from"
to_arg = "to"
symbol_arg = "symbol"
output_arg = "o"
# Long options take a value ("from=", "to=", "symbol="); the short option is "o:".
opt_list = (from_arg+"=", to_arg+"=", symbol_arg+"=")
try:
options, args = getopt.getopt(sys.argv[1:],output_arg+":",opt_list)
except getopt.GetoptError as err:
# The error is only printed; `options` is not assigned in this branch.
print err
for opt, value in options:
# opt[2:] strips the leading "--" of a long option, opt[1:] the "-" of the short one.
if opt[2:] == from_arg:
from_val = value
elif opt[2:] == to_arg:
to_val = value
elif opt[2:] == symbol_arg:
symbol_val = value
elif opt[1:] == output_arg:
output_val = value
# from_val, to_val, symbol_val and output_val are bound only if the matching
# option was passed on the command line.
print "downloading {}".format(symbol_val)
text = download_quote(symbol_val, from_val, to_val)
with open(output_val, 'wb') as f:
f.write(text)
print "{} written to {}".format(symbol_val, output_val)
And the Error message that I am getting is :
File "C:/Users/Murali/PycharmProjects/generate/venv/tcl/generate2.py", line
49, in <module>
print get_crumble_and_cookie('KO')
File "C:/Users/Murali/PycharmProjects/generate/venv/tcl/generate2.py", line
19, in get_crumble_and_cookie
cookie_str = match.group(1)
AttributeError: 'NoneType' object has no attribute 'group'
So how can we resolve this problem that has popped up?
Look at these two commands:
match = re.search(cookie_regex, str(response.info()))
cookie_str = match.group(1)
The first one takes the string response.info() and does a regular expression search on it to match cookie_regex. Then match.group(1) is supposed to take the match from it. The problem, however, is that if you do a print match in between these commands, you'll see that re.search() returned nothing (None). This means match.group() has nothing to "group", which is why it errors out.
If you take a closer look at response.info() (you could just add a print response.info() command in your script to see it), you'll see that there's a line in response code that starts with "set-cookie:", the code after which you're trying to capture. However, you have your cookie_regex string set to look for a line with "Set-Cookie:". Note the capital letters. When I change that string to all lower-case, the error goes away:
cookie_regex = r'set-cookie: (.*?); '
I did run into another error after that, where print "downloading {}".format(symbol_val) stops because symbol_val hasn't been defined. It seems that this variable is only declared and assigned when opt[2:] == symbol_arg:. So you may want to rewrite that part to cover all cases.

python lxml etree namespaces on creation

I am using lxml etree to create XML for a REST call. I have a problem with namespaces: if they are not formulated correctly, I get a syntax error from the server.
As you can see in the following 2 examples I should be getting eg ns1, ns2, ns4, ns5 but the xml goes over with ns15, ns16 but at the end it has the e.g "" or " " - I know this explains it but for the nature of my REST call I need it as the example is.
How can I prevent that?
I have to get the following xml
<ns5:prenosPodatkovRazporedaZahtevaSporocilo xmlns="http://xxx.yyy/sheme/pdr/skupno/v1" xmlns:ns2="http://xxx.yyy/sheme/pdr/v1" xmlns:ns3="http://xxx.yyy/sheme/kis/skupno/v2" xmlns:ns4="http://xxx.yyy/sheme/kis/v2" xmlns:ns5="http://xxx.yyy/sheme/pdr/sporocila/v1">
<ns5:podatkiRazporeda>
<ns2:podatkiRazporeda>
<ns2:delitvenaEnota>
<sifra>80</sifra>
</ns2:delitvenaEnota>
<ns2:vrstaRazporeda>
<sifra>4</sifra>
</ns2:vrstaRazporeda>
<ns2:tipRazporeda>
<sifra>D</sifra>
</ns2:tipRazporeda>
<ns2:obdobje>
<ns2:mesec>12</ns2:mesec>
<ns2:leto>2017</ns2:leto>
</ns2:obdobje>
<ns2:skupina>0</ns2:skupina>
<ns2:izvor>P_738</ns2:izvor>
<ns2:oznakeDelaZaDneve>
<ns2:oznakaDelaZaDan>
<ns2:dan>1</ns2:dan>
<ns2:oznakaDela>D4</ns2:oznakaDela>
</ns2:oznakaDelaZaDan>
....
</ns2:oznakeDelaZaDneve>
<ns2:organizacijskaEnota>
<sifra>738</sifra>
</ns2:organizacijskaEnota>
<ns2:zaposlenec>
<ns4:osebnaStevilka>10357</ns4:osebnaStevilka>
</ns2:zaposlenec>
</ns2:podatkiRazporeda>
</ns5:podatkiRazporeda>
Whereas I am getting this XML.
Mind the namespace marks.
<ns0:prenosPodatkovRazporedaOdgovorSporocilo xmlns:ns="http://rccirc.si/sheme/pdr/skupno/v1" xmlns:ns2="http://rccirc.si/sheme/pdr/v1" xmlns:ns3="http://rccirc.si/sheme/kis/skupno/v2" xmlns:ns4="http://rccirc.si/sheme/kis/v2" xmlns:ns5="http://rccirc.si/sheme/pdr/sporocila/v1" xmlns:ns0="ns5">
<ns0:podatkiRazporeda>
<ns1:podatkiRazporeda xmlns:ns1="ns2">
<ns1:vrstaRazporeda>
<sifra>647</sifra>
</ns1:vrstaRazporeda>
<ns1:tipRazporeda>
<sifra>D</sifra>
</ns1:tipRazporeda>
<ns1:obdobje>
<ns1:mesec>1</ns1:mesec>
<ns1:leto>2018</ns1:leto>
</ns1:obdobje>
<ns1:skupina>0</ns1:skupina>
<ns1:izvor>0</ns1:izvor>
<ns1:organizacijskaEnota>
<sifra>250</sifra>
</ns1:organizacijskaEnota>
<ns6:delitvenaenota xmlns:ns6="ns3">
<sifra>80</sifra>
</ns6:delitvenaenota>
<ns1:oznakeDelaZaDneve>
<oznakeDelaZaDneve>
<ns1:dan>29</ns1:dan>
<ns1:oznakaDela>1930-0730</ns1:oznakaDela>
</oznakeDelaZaDneve>
</ns1:oznakeDelaZaDneve>
<ns1:zaposlenec>
<ns7:osebnaStevilka xmlns:ns7="ns4">Z1</ns7:osebnaStevilka>
</ns1:zaposlenec>
</ns1:podatkiRazporeda>
.......
<ns11:podatkiRazporeda xmlns:ns11="ns2">
<ns11:vrstaRazporeda>
<sifra>647</sifra>
</ns11:vrstaRazporeda>
<ns11:tipRazporeda>
<sifra>D</sifra>
</ns11:tipRazporeda>
<ns11:obdobje>
<ns11:mesec>1</ns11:mesec>
<ns11:leto>2018</ns11:leto>
</ns11:obdobje>
<ns11:skupina>0</ns11:skupina>
<ns11:izvor>0</ns11:izvor>
<ns11:organizacijskaEnota>
<sifra>250</sifra>
</ns11:organizacijskaEnota>
<ns12:delitvenaenota xmlns:ns12="ns3">
<sifra>80</sifra>
</ns12:delitvenaenota>
<ns11:oznakeDelaZaDneve>
<oznakeDelaZaDneve>
<ns11:dan>3</ns11:dan>
<ns11:oznakaDela>0730-1530</ns11:oznakaDela>
</oznakeDelaZaDneve>
.....
</ns11:oznakeDelaZaDneve>
<ns11:zaposlenec>
<ns13:osebnaStevilka xmlns:ns13="ns4">Z1</ns13:osebnaStevilka>
</ns11:zaposlenec>
</ns11:podatkiRazporeda>
</ns0:podatkiRazporeda>
</ns0:prenosPodatkovRazporedaOdgovorSporocilo>
Here is my code.
# Build the schedule message with lxml and write it to <request.folder>/private/647.xml.
# NOTE(review): as printed, the nsmap literal on the next line has unbalanced
# quotes before ns3, ns4 and ns5 and a missing comma after the ns2 entry, so it
# does not parse as shown.
# The text inside the braces of each tag name is the bare prefix string
# ("ns5", "ns2"), not one of the URLs listed in nsmap.
root = etree.Element('{ns5}prenosPodatkovRazporedaOdgovorSporocilo', nsmap = {'ns': "http://xxx.yyy/sheme/pdr/skupno/v1",'ns2':"http://xxx.yyy/sheme/pdr/v1" ns3':"http://xxx.yyy/sheme/kis/skupno/v2",ns4': "http://xxx.yyy/sheme/kis/v2",ns5': "http://xxx.yyy/sheme/pdr/sporocila/v1"})
podatkiRazporedaMain = etree.SubElement(root, '{ns5}podatkiRazporeda')
#follwed by creating sub elements etc.
# One podatkiRazporeda element per entry of grouped_workers.
for rec in grouped_workers:
podatkiRazporeda = etree.SubElement(podatkiRazporedaMain, '{ns2}podatkiRazporeda')
vrstaRazporeda= etree.SubElement(podatkiRazporeda, '{ns2}vrstaRazporeda')
vrstaRazporedaSifra = etree.SubElement(vrstaRazporeda, 'sifra')
vrstaRazporedaSifra.text = "647"
tipRazporeda= etree.SubElement(podatkiRazporeda, '{ns2}tipRazporeda')
tipRazporedaSifra = etree.SubElement(tipRazporeda, 'sifra')
tipRazporedaSifra.text = 'D'
# One day entry per record in rec["data"].
for rr in rec["data"]:
# NOTE(review): oznakeDelaZaDneve is not created anywhere in this listing.
oznakaDelaZaDan = etree.SubElement(oznakeDelaZaDneve, 'oznakeDelaZaDneve')
dan= etree.SubElement(oznakaDelaZaDan, '{ns2}dan')
dan.text = str(rr["rw_date"].day)
oznakaDela = etree.SubElement(oznakaDelaZaDan, '{ns2}oznakaDela')
oznakaDela.text = str(rr["rw_shift"])
#print etree.tostring(root, pretty_print=True, xml_declaration=False, encoding='UTF-8')
fle = os.path.join(request.folder, 'private', str(647) + '.xml')
# The file is opened in binary mode for the UTF-8 encoded output of tostring.
with open(fle, 'wb') as f:
f.write(etree.tostring(root, pretty_print=True, xml_declaration=False, encoding='UTF-8'))#,inclusive_ns_prefixes=None))
#etree..write(fle, pretty_print=True, xml_declaration=False, encoding='UTF-8')
print "Done"
So why are the ns prefixes incremented?
I hope I was clear.
Thank you
So as it turns out when you are creating tags you should not write
vrstaRazporeda= etree.SubElement(podatkiRazporeda, '{ns2}vrstaRazporeda')
vrstaRazporedaSifra = etree.SubElement(vrstaRazporeda, 'sifra').text = "647"
But
vrstaRazporeda= etree.SubElement(podatkiRazporeda, '{http://xxx.yyy/sheme/pdr/v1}vrstaRazporeda')
vrstaRazporedaSifra = etree.SubElement(vrstaRazporeda, 'sifra').text = "647"
That is, put the whole namespace URL inside the braces rather than the prefix - this seemed to solve the issue.

Cherrypy and Parsing XML Data from multiple files

So this is sort of a piggy-back post of another question I had. I've successfully pulled data from multiple xml files and am able to get the data to display within the terminal using the print function, but when I try to use the return function to show the data in the browser, I only get the data from the first file. Any ideas on why I only get data from the first file rather than all of them? Thanks!
from xml.dom.minidom import parse, parseString
import os, glob, re
import cherrypy
# CherryPy application with two exposed handlers: index (order summary built
# from the XML files) and exit (stops the process).
# NOTE(review): the indentation of this listing was lost, so whether the
# return statement sits inside or after the for loop cannot be confirmed here.
class Root(object):
def index(self):
path = 'C:\Vestigo\XML'
# Number of *.xml files in the folder; printed to the terminal only.
TOTALXML = len(glob.glob(os.path.join(path, '*.xml')))
print TOTALXML
i = 0
for XMLFile in glob.glob(os.path.join(path, '*.xml')):
xmldoc = parse(XMLFile)
# For each field, take the text of the first element with that tag name.
order_number = xmldoc.getElementsByTagName('Extrinsic')[0].firstChild.data
order_name = xmldoc.getElementsByTagName('DeliverTo')[0].firstChild.data
street1 = xmldoc.getElementsByTagName('Street1')[0].firstChild.data
state = xmldoc.getElementsByTagName('State')[0].firstChild.data
zip_code = xmldoc.getElementsByTagName('PostalCode')[0].firstChild.data
# OUTPUTi is one fixed name that is rebound for every file; the counter i
# is incremented but never becomes part of the name.
OUTPUTi = order_number+' '+order_name+' '+street1+' '+state+' '+zip_code
i += 1
print OUTPUTi
# Only the value OUTPUTi holds at this point is sent to the browser.
return (OUTPUTi, """<br><br>Quit""")
index.exposed = True
def exit(self):
raise SystemExit(0)
exit.exposed = True
def start():
    """Mount Root at '/', start the CherryPy engine and block until it stops.

    The engine is started with a callback that opens
    http://localhost:8080/ in the web browser.
    """
    import webbrowser

    cherrypy.tree.mount(Root(), '/')
    browser_args = ('http://localhost:8080/',)
    cherrypy.engine.start_with_callback(webbrowser.open, browser_args)
    cherrypy.engine.block()
if __name__=='__main__':
start()
You are not collecting the data anywhere; you store everything in a variable named OUTPUTi, then only return the last iteration of that variable. Python does not magically make that variable use the i counter.
Use a list to collect the strings:
# Replacement for the body of the index handler: collect one string per XML
# file in a list instead of rebinding a single variable.
TOTALXML = len(glob.glob(os.path.join(path, '*.xml')))
print TOTALXML
OUTPUT = []
for XMLFile in glob.glob(os.path.join(path, '*.xml')):
xmldoc = parse(XMLFile)
order_number = xmldoc.getElementsByTagName('Extrinsic')[0].firstChild.data
order_name = xmldoc.getElementsByTagName('DeliverTo')[0].firstChild.data
street1 = xmldoc.getElementsByTagName('Street1')[0].firstChild.data
state = xmldoc.getElementsByTagName('State')[0].firstChild.data
zip_code = xmldoc.getElementsByTagName('PostalCode')[0].firstChild.data
OUTPUT.append(order_number+' '+order_name+' '+street1+' '+state+' '+zip_code)
# Echo the entry that was just added.
print OUTPUT[-1]
# ''.join concatenates the collected entries with no separator between them.
OUTPUT = ''.join(OUTPUT)
return (OUTPUT, """<br><br>Quit""")

Categories

Resources