I am in the process of stripping a couple million XMLs of sensitive data. How can I add a try/except to get around this error, which seems to have occurred because of a couple of malformed XMLs in the bunch?
xml.parsers.expat.ExpatError: mismatched tag: line 1, column 28691
#!/usr/bin/python
import sys
from xml.dom import minidom

def getCleanString(word):
    str = ""
    dummy = 0
    for character in word:
        try:
            character = character.encode('utf-8')
            str = str + character
        except:
            dummy += 1
    return str

def parsedelete(content):
    dom = minidom.parseString(content)
    for element in dom.getElementsByTagName('RI_RI51_ChPtIncAcctNumber'):
        parentNode = element.parentNode
        parentNode.removeChild(element)
    return dom.toxml()

for line in sys.stdin:
    if line > 1:
        line = line.strip()
        line = line.split(',', 2)
        if len(line) > 2:
            partition = line[0]
            id = line[1]
            xml = line[2]
            xml = getCleanString(xml)
            xml = parsedelete(xml)
            strng = '%s\t%s\t%s' % (partition, id, xml)
            sys.stdout.write(strng + '\n')
Catching exceptions is straightforward. Add import xml.parsers.expat to your import statements and wrap the problem code in a try/except handler.
def parsedelete(content):
    try:
        dom = minidom.parseString(content)
    except xml.parsers.expat.ExpatError as e:
        # not sure how you want to handle the error... so just passing it back as a string
        return str(e)
    for element in dom.getElementsByTagName('RI_RI51_ChPtIncAcctNumber'):
        parentNode = element.parentNode
        parentNode.removeChild(element)
    return dom.toxml()
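If you would rather drop malformed records than emit the error text in their place, a small variation (a sketch, assuming bad rows can simply be skipped) is to return None and let the main loop decide:

import sys
import xml.parsers.expat
from xml.dom import minidom

def parsedelete(content):
    try:
        dom = minidom.parseString(content)
    except xml.parsers.expat.ExpatError as e:
        # log to stderr so the cleaned output stream stays intact
        sys.stderr.write('skipping malformed XML: %s\n' % e)
        return None
    for element in dom.getElementsByTagName('RI_RI51_ChPtIncAcctNumber'):
        element.parentNode.removeChild(element)
    return dom.toxml()

# in the main loop:
#     xml = parsedelete(xml)
#     if xml is None:
#         continue  # drop this record and move on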
I am trying to extract IPv4 addresses from a text file and save them as a list to a new file. However, I cannot use regex to parse the file; instead, I have to check the characters individually. I'm not really sure where to start with that, since everything I find seems to have import re as the first line.
So far this is what I have,
# Opens and prints wireShark txt file
fileObject = open("wireShark.txt", "r")
data = fileObject.read()
print(data)

# Save IP addresses to new file
with open('wireShark.txt') as fin, open('IPAdressess.txt', 'wt') as fout:
    list(fout.write(line) for line in fin if line.rstrip())

# Opens and prints IPAdressess txt file
fileObject = open("IPAdressess.txt", "r")
data = fileObject.read()
print(data)

# Close files
fin.close()
fout.close()
So I open the file, and I have created the file that I will put the extracted IPs in; I just don't know how to pull them without using regex.
Thanks for the help.
Here is a possible solution.
The function find_first_digit positions the index at the next digit in the text, if any, and returns True; otherwise it returns False.
The functions get_num and get_dot read a number/dot, leave the index at the position just after it, and return the number/dot as a str. If either function fails to read its number/dot, it raises a MissMatch exception.
In the main loop: find a digit, save the index, and then try to get an IP.
On success, write it to the output file.
If any of the called functions raises a MissMatch exception, set the current index to the saved index plus one and start over.
class MissMatch(Exception): pass

INPUT_FILE_NAME = 'text'
OUTPUT_FILE_NAME = 'ip_list'

def find_first_digit():
    while True:
        c = input_file.read(1)
        if not c:  # EOF found!
            return False
        elif c.isdigit():
            input_file.seek(input_file.tell() - 1)
            return True

def get_num():
    num = input_file.read(1)  # 1st digit
    if not num.isdigit():
        raise MissMatch
    if num != '0':
        for i in range(2):  # 2nd, 3rd digits
            c = input_file.read(1)
            if c.isdigit():
                num += c
            else:
                input_file.seek(input_file.tell() - 1)
                break
    return num

def get_dot():
    if input_file.read(1) == '.':
        return '.'
    else:
        raise MissMatch

with open(INPUT_FILE_NAME) as input_file, open(OUTPUT_FILE_NAME, 'w') as output_file:
    while True:
        ip = ''
        if not find_first_digit():
            break
        saved_position = input_file.tell()
        try:
            ip = get_num() + get_dot() \
                 + get_num() + get_dot() \
                 + get_num() + get_dot() \
                 + get_num()
        except MissMatch:
            input_file.seek(saved_position + 1)
        else:
            output_file.write(ip + '\n')
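Note that this accepts any one-to-three-digit groups, so something like 999.1.1.1 would pass. If you want strict IPv4 octets (0-255), here is an alternative sketch under that assumption: it still checks characters one at a time rather than using regex, but scans the text in memory and validates each group:

def extract_ips(text):
    """Collect runs of digits/dots and keep those that form a valid IPv4."""
    ips, token = [], ''
    for ch in text + '\n':  # trailing sentinel flushes the last token
        if ch.isdigit() or ch == '.':
            token += ch
        else:
            parts = token.split('.')
            if len(parts) == 4 and all(p.isdigit() and int(p) <= 255 for p in parts):
                ips.append(token)
            token = ''
    return ips

with open('wireShark.txt') as fin, open('IPAdressess.txt', 'w') as fout:
    for ip in extract_ips(fin.read()):
        fout.write(ip + '\n')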
I'm trying to get the values inside an XML, but with this code I only get the value of the name label; I want the value that goes with each name.
XML Text:
<root><label_params><label_param><name>BranchName</name><value></value></label_param><label_param><name>CustomerCode</name><value></value></label_param><label_param><name>SealNumber</name><value>0110000000420</value></label_param><label_param><name>CustomerName</name><value>PUNTO EDUCATIVO LTDA</value></label_param><label_param><name>LpnTypeCode</name><value>LPN</value></label_param><label_param><name>OutboundNumber</name><value>1685147.1</value></label_param><label_param><name>ReferenceNumber</name><value>18072019_pall_cerr</value></label_param><label_param><name>DeliveryAddress1</name><value>Sin Direccion</value></label_param><label_param><name>NroCita</name><value></value></label_param><label_param><name>FechaEnt</name><value>19/07/2019</value></label_param><label_param><name>Porder</name><value>18072019_pall_cerr</value></label_param><label_param><name>Factura</name><value></value></label_param><label_param><name>IdLpnCode</name><value>C0000000015</value></label_param><label_param><name>TotalBultos</name><value></value></label_param><label_param><name>ANDENWMS</name><value>ANDEN15</value></label_param><label_param><name>LpnPadre</name><value>C0000000015</value></label_param><label_param><name>Cerrados</name><value>4</value></label_param><label_param><name>NoCerrados</name><value>2</value></label_param><label_param><name>TOTALPALLET</name><value></value></label_param></label_params></root>
Python Code
from xml.dom.minidom import parse
doc = parse("DataXML.xml")
my_node_list = doc.getElementsByTagName("name")
my_n_node = my_node_list[0]
my_child = my_n_node.firstChild
my_text = my_child.data
print(my_text)
Here you go:
from xml.dom.minidom import parse

doc = parse("../data/DataXML.xml")
my_node_list = doc.getElementsByTagName("label_param")
for node in my_node_list:
    name_node = node.getElementsByTagName("name")
    value_node = node.getElementsByTagName("value")
    print("Name: " + name_node[0].firstChild.data)
    if value_node[0].firstChild is not None:
        print("Value: " + value_node[0].firstChild.data)
    else:
        print("Value: Empty")
I have this method:
lines = rec.split("\n")
rec = ''
size = len(lines)
i = 0
for line in lines:
    try:
        self.on_data(json.load(line))
    except:
        logging.warning('warning, could not parse line:', line)
        if i == size - 1:
            # if it is the last element, we can keep it, since it might not be complete
            rec += line
    finally:
        i += 1
I am getting this error:
Message: 'warning, could not parse line:'
Arguments: ('{"readersCount":0,"uuid":"17f5fe87-5140-4f34-ac32-d325beb6b2a1","key":"bar","lockRequestCount":0,"type":"lock","acquired":true}',)
It looks like I need to read the first element of a tuple or something? The JSON looks OK?
As stated in the comments already, you need self.on_data(json.loads(line)):
lines = rec.split("\n")
rec = ''
size = len(lines)
i = 0
for line in lines:
    try:
        self.on_data(json.loads(line))  # loads, not load: the input is a string
    except Exception:
        # a %s placeholder lets logging interpolate the offending line
        logging.warning('warning, could not parse line: %s', line)
        if i == size - 1:
            # if it is the last element, we can keep it, since it might not be complete
            rec += line
    finally:
        i += 1
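For reference, the difference between the two calls: json.load parses from a file object, while json.loads parses from a string. A minimal illustration (data.json is a hypothetical file):

import json

s = '{"key": "bar", "type": "lock", "acquired": true}'
obj = json.loads(s)   # loads: parse a JSON *string*
print(obj["key"])     # -> bar

with open("data.json") as f:   # hypothetical file containing JSON
    obj = json.load(f)         # load: parse from a *file object*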
I am trying to parse wikicfp.v1.2008.xml, wikicfp.v1.2009.xml and wikicfp.v1.2010.xml, the three versions available at the link below.
https://github.com/creswick/wikicfp-parser/tree/master/data
I tried with xml.etree.ElementTree and with BeautifulSoup, but I got a lot of encoding errors like
not well-formed (invalid token): line 949, column 40
UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 63563: character maps to <undefined>
I couldn't make progress because of the errors. I aim to parse each row and save it to a SQL script file or CSV file for later use.
from bs4 import BeautifulSoup

out_file = open("final.sql", "w")
out_file.write("--DROP TABLE event1;\n")
out_file.write("CREATE TABLE event1 (eventid int, fullname TEXT, location TEXT, begindate TINYTEXT , finishdate TINYTEXT , weblink TEXT, info TEXT, PRIMARY KEY (eventid));\n")
out_file.close()

infile = open("wikicfp.v1.2009.xml", encoding='utf-8-sig')
contents = infile.read()
soup = BeautifulSoup(contents)
rows = soup.find_all('row')

c = 0
for count in rows:
    tempsoup = rows[c]
    try:
        ei = tempsoup.findAll("field", {"name": "eventid"})
        if not ei[0].contents[0].strip():
            ei = "No info"
        eventid = ei[0].contents[0].strip()
    except Exception:
        eventid = 0
    try:
        fn = tempsoup.findAll("field", {"name": "fullname"})
        s = fn[0].contents[0].strip()
        fullname = s.decode('utf-8')
        fullname = fullname.replace("'", "_")
    except Exception:
        fullname = "No info"
    try:
        l = tempsoup.findAll("field", {"name": "location"})
        s = l[0].contents[0].strip()
        location = s.decode('utf-8')
        location = location.replace("'", "_")
    except Exception:
        location = "No info"
    try:
        bd = tempsoup.findAll("field", {"name": "begindate"})
        s = bd[0].contents[0].strip()
        begindate = s.decode('utf-8')
    except Exception:
        begindate = "No info"
    try:
        fd = tempsoup.findAll("field", {"name": "finishdate"})
        s = fd[0].contents[0].strip()
        finishdate = s.decode('utf-8')
    except Exception:
        finishdate = "No info"
    try:
        wl = tempsoup.findAll("field", {"name": "weblink"})
        s = wl[0].contents[0].strip()
        weblink = s.decode('utf-8')
    except Exception:
        weblink = "No info"
    try:
        i = tempsoup.findAll("field", {"name": "info"})
        s = i[0].contents[0].strip()
        info = s.decode('utf-8')
        info = info.replace("'", "_")
    except Exception:
        info = "No info"
    with open("final.sql", "a") as out_file:
        out_file.write("INSERT INTO event VALUES (")
        out_file.write(eventid)
        out_file.write(", '")
        out_file.write(fullname)
        out_file.write("', '")
        out_file.write(location)
        out_file.write("','")
        out_file.write(begindate)
        out_file.write("','")
        out_file.write(finishdate)
        out_file.write("','")
        out_file.write(weblink)
        out_file.write("','")
        out_file.write(info)
        out_file.write("');\n")
    c = c + 1

out_file.close()
infile.close()
Another attempt:
from bs4 import BeautifulSoup

with open("wikicfp.v1.2009.xml") as fp:
    soup = BeautifulSoup(fp, 'xml')
    rows = soup.find_all('row')
And another:
import xml.etree.ElementTree as ET
tree = ET.parse('wikicfp.v1.2009.xml')
root = tree.getroot()
It looks like your XML files contain invalid characters. I tried several different text editors (Notepad++, Brackets, Notepad, ...) and all of them ran into several positions they couldn't decode properly (e.g. in the 2008 XML at the end of line 56964), so the XML parser fails right there. You could use lxml and its parser's recover option to ignore those characters:
import lxml.etree as ET

tree = ET.parse('wikicfp.v1.2008.xml',
                ET.XMLParser(encoding='ISO-8859-1', ns_clean=True, recover=True))
root = tree.getroot()
rows = root.findall('row')
for row in rows:
    fields = row.findall('field')
    for field in fields:
        print(field)
You can get lxml by simply typing pip install lxml in your shell.
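Since you mentioned saving each row for later use, here is a sketch that dumps the recovered rows to CSV instead of hand-building SQL strings. It assumes the <row>/<field name="..."> layout implied by your snippets, and the column names are taken from your CREATE TABLE statement:

import csv
import lxml.etree as ET

parser = ET.XMLParser(encoding='ISO-8859-1', ns_clean=True, recover=True)
root = ET.parse('wikicfp.v1.2008.xml', parser).getroot()

columns = ['eventid', 'fullname', 'location', 'begindate',
           'finishdate', 'weblink', 'info']
with open('events.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(columns)
    for row in root.findall('row'):
        # map each field's name attribute to its (stripped) text content
        fields = {field.get('name'): (field.text or '').strip()
                  for field in row.findall('field')}
        writer.writerow([fields.get(col, 'No info') for col in columns])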
So I'm having this trouble with decoding. I found in other threads how to do it for simple strings, with u'string'.encode, but I can't find a way to make it work with files.
Any help would be appreciated!
Here's the code.
text = file.read()
text.replace(txt.encode('utf-8'), novo_txt.encode('utf-8'))
file.seek(0) # rewind
file.write(text.encode('utf-8'))
And here's the whole code, in case it helps.
#!/usr/bin/env python
# coding: utf-8
"""
Script that helps translate some code's methods from
Portuguese to English.
"""
from multiprocessing import Pool
from mock import MagicMock
from goslate import Goslate
import fnmatch
import logging
import os
import re
import urllib2

_MAX_PEERS = 1

try:
    os.remove('traducoes.log')
except OSError:
    pass

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.FileHandler('traducoes.log')
logger.addHandler(handler)

def fileWalker(ext, dirname, names):
    """
    Find the files with the correct extension
    """
    pat = "*" + ext[0]
    for f in names:
        if fnmatch.fnmatch(f, pat):
            ext[1].append(os.path.join(dirname, f))

def encontre_text(file):
    """
    Find in the string the words which have '_' in them
    """
    text = file.read().decode('utf-8')
    return re.findall(r"\w+(?<=_)\w+", text)
    #return re.findall(r"\"\w+\"", text)

def traduza_palavra(txt):
    """
    Translate the word/phrase to English
    """
    try:
        # try to connect to google
        response = urllib2.urlopen('http://google.com', timeout=2)
    except urllib2.URLError as err:
        print "No network connection"
        exit(-1)
    if txt[0] != '_':
        txt = txt.replace('_', ' ')
    txt = txt.replace('media'.decode('utf-8'), 'média'.decode('utf-8'))
    gs = Goslate()
    #txt = gs.translate(txt, 'en', gs.detect(txt))
    txt = gs.translate(txt, 'en', 'pt-br')  # forcing Brazilian Portuguese as the source language
    txt = txt.replace(' en ', ' br ')
    return txt.replace(' ', '_')  # .lower()

def subistitua(file, txt, novo_txt):
    """
    should rewrite the file with the new text in the future
    """
    text = file.read()
    text.replace(txt.encode('utf-8'), novo_txt.encode('utf-8'))
    file.seek(0)  # rewind
    file.write(text.encode('utf-8'))

def magica(File):
    """
    Thread pool. Every single thread should play around here with
    one element from the list of files
    """
    global _DONE
    if _MAX_PEERS == 1:  # not viable with multithreading
        logger.info('\n---- File %s' % File)
    with open(File, "r+") as file:
        list_txt = encontre_text(file)
        for txt in list_txt:
            novo_txt = traduza_palavra(txt)
            if txt != novo_txt:
                logger.info('%s -> %s [%s]' % (txt, novo_txt, File))
                subistitua(file, txt, novo_txt)
        file.close()
    print File.ljust(70) + '[OK]'.rjust(5)

if __name__ == '__main__':
    try:
        response = urllib2.urlopen('http://www.google.com.br', timeout=1)
    except urllib2.URLError as err:
        print "No network connection"
        exit(-1)

    root = './app'
    ex = ".py"
    files = []
    os.path.walk(root, fileWalker, [ex, files])
    print '%d files found to be translated' % len(files)

    try:
        if _MAX_PEERS > 1:
            _pool = Pool(processes=_MAX_PEERS)
            result = _pool.map_async(magica, files)
            result.wait()
        else:
            result = MagicMock()
            result.successful.return_value = False
            for f in files:
                magica(f)
            result.successful.return_value = True
    except AssertionError as e:
        print e
    else:
        pass
    finally:
        if result.successful():
            print 'Translated all files'
        else:
            print 'Some files were not translated'
Thank you all for the help!
In Python 2, reading from files produces regular (byte) string objects, not unicode objects. There is no need to call .encode() on these; in fact, that'll only trigger an automatic decode to Unicode first, which can fail.
Rule of thumb: use a unicode sandwich. Whenever you read data, you decode to unicode at that stage. Use unicode values throughout your code. Whenever you write data, encode at that point. You can use io.open() to open file objects that encode and decode automatically for you.
That also means you can use unicode literals everywhere; for your regular expressions, for your string literals. So use:
def encontre_text(file):
    text = file.read()  # assume `io.open()` was used
    return re.findall(ur"\w+(?<=_)\w+", text)  # use a unicode pattern
and
def subistitua(file, txt, novo_txt):
    text = file.read()  # assume `io.open()` was used
    text = text.replace(txt, novo_txt)
    file.seek(0)  # rewind
    file.write(text)
as all string values in the program are already unicode, and
txt = txt.replace(u'media', u'média')
as u'..' unicode string literals don't need decoding anymore.
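Putting it together, a minimal sketch of the sandwich using io.open (assuming the files are UTF-8 encoded; exemplo.py is just a placeholder name):

# -*- coding: utf-8 -*-
import io

# io.open decodes on read and encodes on write, so everything
# in between is plain unicode
with io.open('exemplo.py', 'r+', encoding='utf-8') as f:
    text = f.read()                        # unicode in
    text = text.replace(u'media', u'média')
    f.seek(0)
    f.write(text)                          # unicode out, encoded to UTF-8
    f.truncate()                           # drop any leftover tail if the text shrank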