Getting text length from lxml.html nodes and truncating - Python

In parse_html() I am trying to parse the text of each element and get the len() of that text. I want the script to add up the length of the text in each element and, when the cumulative text length reaches a set size parameter, truncate the rest of the text in the document. My problem is with child.text/tag1.text/tag2.text/tag3.text: len() doesn't seem to work on these. Is there a way I can get the numerical length of these text strings?
import sys
import os
import imaplib
import getpass
import email
import email.header
import datetime
import chardet
from bs4 import BeautifulSoup
import re
from lxml import etree, html
from io import StringIO, BytesIO
from lxml.html.clean import clean_html, Cleaner, word_break
from lxml.etree import HTML
from lxml.html import HtmlMixin
EMAIL_ACCOUNT = "sample#gmai.com"
EMAIL_FOLDER = "INBOX"
def process_mailbox(M):
    """
    Do something with email messages in the folder.
    For the sake of this example, print some headers.
    """
    rv, data = M.search(None, "ALL")
    if rv != 'OK':
        print "No messages found!"
        return
    for num in data[0].split():
        rv, data = M.fetch(num, '(RFC822)')
        if rv != 'OK':
            print "ERROR getting message", num
            return
        msg = email.message_from_string(data[0][1])
        decode = email.header.decode_header(msg['Subject'])[0]
        subject = unicode(decode[0])
        body = msg.get_payload(decode=True)
        print 'Message %s: %s' % (num, subject)
        print 'Raw Date:', msg['Date']
        print 'Body:', body
        if msg.is_multipart():
            html = None
            print "Checking for html or text"
            for part in msg.get_payload():
                if part.get_content_charset() is None:
                    # guess the charset when the part does not declare one
                    charset = chardet.detect(str(part))['encoding']
                else:
                    charset = part.get_content_charset()
                if part.get_content_type() == 'text/plain':
                    text = unicode(part.get_payload(decode=True), str(charset), "ignore").encode('utf8', 'replace')
                    f = open('email.txt', 'w')
                    f.write(text)
                    f.close()
                if part.get_content_type() == 'text/html':
                    html = unicode(part.get_payload(decode=True), str(charset), "ignore").encode('utf8', 'replace')
                    f = open('email.html', 'w')
                    f.write(html)
                    f.close()
                if part.get('Content-Disposition') is None:
                    continue
                filename = part.get_filename()
                if not os.path.isfile(filename):
                    fp = open(filename, 'wb')
                    fp.write(part.get_payload(decode=True))
                    fp.close()
                return 0
            if html is None:
                return text.strip()
            else:
                return html.strip()
        # Now convert to local date-time
        date_tuple = email.utils.parsedate_tz(msg['Date'])
        if date_tuple:
            local_date = datetime.datetime.fromtimestamp(
                email.utils.mktime_tz(date_tuple))
            print "Local Date:", \
                local_date.strftime("%a, %d %b %Y %H:%M:%S")
def parse_html():
    #htmldoc = open('email.html', 'r+')
    #doc = htmldoc.read()
    VALID_TAGS = ['iframe', 'video', 'o>', 'li', 'sub', 'sup', 'source', 'br', 'h3', 'h4', 'h6', 'hr', 'q', 'mark', 'wbr', 'audio', 'strong', 'em', 'p', 'ul', 'li', 'br', 'blockquote', 'pre', 'del', 'h3', 'body', 'header', 'html', 'title', 'div', 'img', 'a']
    parser = etree.HTMLParser()
    tree = etree.parse("email.html", parser)
    #results = etree.tostring(tree.getroot(), pretty_print=True, method="html")
    page = html.tostring(tree)
    cleaner = Cleaner(page_structure=False, add_nofollow=True, style=True, links=True, safe_attrs_only=True)
    clean_page = cleaner.clean_html(page)
    root = tree.getroot()
    child = root[0]
    print len(root)
    children = list(root)
    for child in root:
        print child.tag
        print child.attrib
        print child.text
        for tag1 in child:
            print tag1.tag
            print tag1.attrib
            print tag1.text
            for tag2 in tag1:
                print tag2.tag
                print tag2.attrib
                print tag2.text
                for tag3 in tag2:
                    print tag3.tag
                    print tag3.attrib
                    print tag3.text
M = imaplib.IMAP4_SSL('imap.gmail.com')

try:
    rv, data = M.login(EMAIL_ACCOUNT, getpass.getpass())
except imaplib.IMAP4.error:
    print "LOGIN FAILED!!! "
    sys.exit(1)

print rv, data

rv, mailboxes = M.list()
if rv == 'OK':
    print "Mailboxes:"
    print mailboxes

rv, data = M.select(EMAIL_FOLDER)
if rv == 'OK':
    print "Processing mailbox...\n"
    process_mailbox(M)
    parse_html()
    M.close()
else:
    print "ERROR: Unable to open mailbox ", rv

M.logout()
This is the error I get when I try to use len():
TypeError: object of type 'NoneType' has no len()
Also, if you know anything about how to do that truncating with lxml.html, I'd appreciate being pointed in the right direction.
Thanks.

There may be a more efficient way to do this, but I was able to get functioning results. I had to turn each child into a string and then strip the HTML tags from each child string. I used the functions stringify_children() and strip_tags(), found at these links: Get all text inside a tag in lxml and
Strip HTML from strings in Python
def stringify_children(node):
    from lxml.etree import tostring
    from itertools import chain
    parts = ([node.text] +
             list(chain(*([c.text, tostring(c), c.tail] for c in node.getchildren()))) +
             [node.tail])
    # filter removes possible Nones in texts and tails
    return ''.join(filter(None, parts))
from HTMLParser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()
# docstring holds the HTML to parse (e.g. the contents of email.html)
tree = etree.fromstring(docstring)
walkAll = tree.iterchildren()
for elt in walkAll:
    child = stringify_children(elt)
    childtext = strip_tags(child)
    print len(childtext)
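On the original len() error: an element's .text (and .tail) is None whenever it holds no text of its own, which is exactly what raises "TypeError: object of type 'NoneType' has no len()". For the truncation part, here is a rough sketch (assuming a max_len size parameter and that everything past the cutoff should simply be dropped; neither is from the answer above) that walks the text nodes in document order and blanks out whatever follows the limit:

from lxml import html

def truncate_html(path, max_len=500):
    tree = html.parse(path)
    root = tree.getroot()
    total = 0
    for text in root.xpath('//text()'):      # text nodes in document order
        parent = text.getparent()
        attr = 'text' if text.is_text else 'tail'
        if total >= max_len:
            setattr(parent, attr, '')         # past the limit: drop the rest
        elif total + len(text) > max_len:
            setattr(parent, attr, text[:max_len - total])  # keep a partial chunk
            total = max_len
        else:
            total += len(text)
    return html.tostring(root)

The smart strings returned by xpath('//text()') know their parent element and whether they came from .text or .tail, so there is no need to stringify and re-strip each child just to measure it.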

Related

Removing Special Characters / Punctuation from the End of a Python List of URLs

I am writing Python code to extract all the URLs from an input file containing content or text from Twitter (tweets). However, while doing so I realized that several URLs extracted into the Python list had special characters or punctuation at the end, because of which I could not parse them further to get the base URL. My question is: how do I identify and remove special characters from the end of every URL in my list?
Current Output:
['https://twitter.com/GVNyqWEu5u', "https://twitter.com/GVNyqWEu5u'", 'https://twitter.com/GVNyqWEu5u##', 'https://twitter.com/GVNyqWEu5u"']
Desired Output:
['https://twitter.com/GVNyqWEu5u', 'https://twitter.com/GVNyqWEu5u', 'https://twitter.com/GVNyqWEu5u', 'https://twitter.com/GVNyqWEu5u']
Note that not all elements in the 'Current Output' list have special characters / punctuation at the end. The task is to identify and remove characters / punctuation only from the list elements that have them.
I am using the following regex to extract Twitter URLs from the tweet text: lst = re.findall('(http.?://[^\s]+)', text)
Can I remove the special characters / punctuation from the end of the URL in this step itself?
Full Code:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
from socket import timeout
import ssl
import re
import csv
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
count = 0
file = "Test.CSV"
with open(file, 'r', encoding='utf-8') as f, open('output_themes_1.csv', 'w', newline='', encoding='utf-8') as ofile:
    next(f)  # skip the header row
    reader = csv.reader(f)
    writer = csv.writer(ofile)
    fir = 'S.No.', 'Article_Id', 'Validity', 'Content', 'Geography', 'URL'
    writer.writerow(fir)
    for line in reader:
        count = count + 1
        text = line[5]
        lst = re.findall('(http.?://[^\s]+)', text)
        if not lst:
            x = count, line[0], 'Empty List', text, line[8], line[6]
            print(x)
            writer.writerow(x)
        else:
            try:
                for url in lst:
                    try:
                        html = urllib.request.urlopen(url, context=ctx, timeout=60).read()
                        #html = urllib.request.urlopen(urllib.parse.quote(url, errors='ignore'), context=ctx).read()
                        soup = BeautifulSoup(html, 'html.parser')
                        title = soup.title.string
                        str_title = str(title)
                        if 'Twitter' in str_title:
                            if len(lst) > 1: break
                            else: continue
                        else:
                            y = count, line[0], 'Parsed', str_title, line[8], url
                            print(y)
                            writer.writerow(y)
                    except UnicodeEncodeError as e:
                        b_url = url.encode('ascii', errors='ignore')
                        n_url = b_url.decode("utf-8")
                        try:
                            html = urllib.request.urlopen(n_url, context=ctx, timeout=90).read()
                            soup = BeautifulSoup(html, 'html.parser')
                            title = soup.title.string
                            str_title = str(title)
                            if 'Twitter' in str_title:
                                if len(lst) > 1: break
                                else: continue
                            else:
                                z = count, line[0], 'Parsed_2', str_title, line[8], url
                                print(z)
                                writer.writerow(z)
                        except Exception as e:
                            a = count, line[0], str(e), text, line[8], url
                            print(a)
                            writer.writerow(a)
            except Exception as e:
                b = count, line[0], str(e), text, line[8], url
                print(b)
                writer.writerow(b)
print ('Total Rows Analyzed:', count)
Assuming the special characters occur at the end of the string, you may use:
mydata = ['https://twitter.com/GVNyqWEu5u', "https://twitter.com/GVNyqWEu5u'", 'https://twitter.com/GVNyqWEu5u##', 'https://twitter.com/GVNyqWEu5u"']
mydata = [re.sub('[^a-zA-Z0-9]+$','',item) for item in mydata]
print(mydata)
Prints:
['https://twitter.com/GVNyqWEu5u', 'https://twitter.com/GVNyqWEu5u', 'https://twitter.com/GVNyqWEu5u', 'https://twitter.com/GVNyqWEu5u']
Assuming your list is called urls:
def remove_special_chars(url, char_list=None):
    if char_list is None:
        # Build your own default list here
        char_list = ['#', '%']
    for character in char_list:
        if url.endswith(character):
            return remove_special_chars(url[:-1], char_list)
    return url

urls = [remove_special_chars(url) for url in urls]
If you want to get rid of a particular set of characters, just change either the default value or pass a proper list as an argument.
You could try this -
lst = [re.sub('[=" ]$', '', i) for i in re.findall('(http.?://[^\s]+)', text)]
You can add more characters that you want to strip to the sub() pattern, according to your requirements.
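If the goal is simply to drop any trailing punctuation, one more option (a sketch, not from the answers above; which characters count as junk is an assumption) is str.rstrip with string.punctuation, applied right after the findall step:

import re
import string

text = "check https://twitter.com/GVNyqWEu5u' and https://twitter.com/GVNyqWEu5u##"

# extract the candidate URLs, then strip any punctuation characters from the end
lst = [u.rstrip(string.punctuation) for u in re.findall(r'http.?://[^\s]+', text)]
print(lst)  # ['https://twitter.com/GVNyqWEu5u', 'https://twitter.com/GVNyqWEu5u']

Note that this also removes a legitimate trailing '/' or ')' if a URL genuinely ends with one.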

Convert structure large XML file To CSV without using tag names

I am trying to convert a large XML file to CSV format. Below are my code and a sample file. Sample file (part of the XML file):
<PostalAddress>
  <Id>5464443597076195439</Id>
  <AddUserId>SYSTEM_USER</AddUserId>
  <AddDate>2013-01-05T18:08:42-06:00</AddDate>
  <LastPersistenceTransactionUserId>SYSTEM_USER</LastPersistenceTransactionUserId>
  <LastPersistenceTransactionDate>2013-07-11T08:21:34-05:00</LastPersistenceTransactionDate>
  <LastPersistenceTransactionType tc="2"/>
  <ExternalReferenceId>3200723</ExternalReferenceId>
  <SchemaVersion>2</SchemaVersion>
  <Type tc="1"/>
  <Usage tc="2"/>
  <Valid>true</Valid>
  <Overridable>true</Overridable>
  <Preferred>false</Preferred>
  <Line1>4849 RONSON CT</Line1>
  <Line2>STE 208</Line2>
  <City>SAN DIEGO</City>
  <State tc="6"/>
  <PostalCode>92111</PostalCode>
  <Country tc="1"/>
</PostalAddress>
Below is my code:
import codecs
import xml.etree.ElementTree as et
import sys

class gokul:
    def __init__(self, input_file, output_file, encoding='utf-8'):
        self.output_buffer = []
        self.output = None
        self.context = et.iterparse(input_file, events=("start", "end"))
        try:
            self.output = codecs.open(output_file, "w", encoding=encoding)
        except:
            print("Failed to open the output file")
            raise

    def convert(self, tag="item", delimiter=",", ignore=[], noheader=False,
                limit=-1, buffer_size=1000):
        event, root = next(self.context)
        items = []
        header_line = []
        field_name = ''
        tagged = False
        started = False
        n = 0
        for event, elem in self.context:
            should_write = elem.tag != tag and started and elem.tag not in ignore
            should_tag = not tagged and should_write and not noheader
            i = 0
            if event == 'start':
                if elem.tag == tag and not started:
                    started = True
                elif should_tag:
                    field_name = '_'.join((field_name, elem.tag)) if field_name else elem.tag
            else:
                if should_write:
                    if should_tag:
                        header_line.append(field_name)
                        field_name = field_name.rpartition('_' + elem.tag)[0]
                    items.append('' if elem.text is None else elem.text.strip().replace('"', r'""'))
                elif elem.tag == tag and len(items) > 0:
                    if header_line and not tagged:
                        self.output.write(delimiter.join(header_line) + '\n')
                    tagged = True
                    self.output_buffer.append(r'"' + (r'"' + delimiter + r'"').join(items) + r'"')
                    items = []
                    n += 1
                    if n == limit:
                        break
                    if len(self.output_buffer) > buffer_size:
                        self._write_buffer()
                elem.clear()
        self._write_buffer()
        self.output.close()
        return n

    def _write_buffer(self):
        """Write records from buffer to the output file"""
        self.output.write('\n'.join(self.output_buffer) + '\n')
        self.output_buffer = []

converter = gokul(sys.argv[1], sys.argv[2], encoding="utf-8")
converter.convert(tag=sys.argv[3])
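For the sample above, the record tag to pass as the third argument would be PostalAddress; a minimal sketch of driving the class directly instead of via sys.argv (the file names here are hypothetical placeholders):

# equivalent to: python gokul_xml2csv.py addresses.xml addresses.csv PostalAddress
converter = gokul("addresses.xml", "addresses.csv", encoding="utf-8")
converter.convert(tag="PostalAddress")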

python CDATA in subelement

I'm trying to add CDATA to a SubElement of an XML document, but the serializer keeps converting the < character to &lt;.
I've read some posts here that do something with CDATA, but I can't believe it's this hard to make it work. Besides that, I can't get those examples working.
Below is a simplified part of my code, using Python 3.4.
import xml.etree.cElementTree as ET
from xml.dom import minidom

class MyClass():
    def __init__(self):
        self.xml = None

    def generateXML(self):
        self.xml = ET.Element("AVXML")
        row = ET.SubElement(self.xml, "ROW")
        ET.SubElement(row, "DATA").text = "<![CDATA[ <ART_HDR.COMMENT1>.PDF ]]>"

    def saveXML(self):
        rough_string = ET.tostring(self.xml, 'Windows-1252')
        reparsed = minidom.parseString(rough_string)
        prettyxml = reparsed.toprettyxml(indent=" ", encoding="Windows-1252")
        print(prettyxml)
        f = open("filetosave.xml", "wb")
        f.write(prettyxml)
        f.close()

m = MyClass()
m.generateXML()
m.saveXML()
This, however, generates:
<DATA>&lt;![CDATA[ &lt;ART_HDR.COMMENT1&gt;.PDF ]]&gt;</DATA>
instead of
<DATA><![CDATA[ <ART_HDR.COMMENT1>.PDF ]]></DATA>
OK, I used this comment and got it working.
import xml.etree.cElementTree as ET
from xml.dom import minidom

class MyClass():
    def __init__(self):
        self.xml = None

    def generateXML(self):
        self.xml = ET.Element("AVXML")
        row = ET.SubElement(self.xml, "ROW")
        data = " <ART_HDR.COMMENT1>.PDF "
        cdata = ET.SubElement(row, "DATA")
        cdata.append(ET.Comment(' --><![CDATA[' + data.replace(']]>', ']]]]><![CDATA[>') + ']]><!-- '))

    def saveXML(self):
        rough_string = ET.tostring(self.xml, 'Windows-1252')
        reparsed = minidom.parseString(rough_string)
        prettyxml = reparsed.toprettyxml(indent=" ", encoding="Windows-1252")
        f = open("filetosave.xml", "wb")
        f.write(prettyxml)
        f.close()

m = MyClass()
m.generateXML()
m.saveXML()
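If switching from xml.etree to lxml is an option, a simpler sketch is possible: lxml.etree has first-class CDATA support, so no comment trick is needed (the element names below just mirror the example above):

from lxml import etree

root = etree.Element("AVXML")
row = etree.SubElement(root, "ROW")
data = etree.SubElement(row, "DATA")
data.text = etree.CDATA(" <ART_HDR.COMMENT1>.PDF ")  # serialized as a real CDATA section

print(etree.tostring(root, pretty_print=True, encoding="unicode"))
# <AVXML>
#   <ROW>
#     <DATA><![CDATA[ <ART_HDR.COMMENT1>.PDF ]]></DATA>
#   </ROW>
# </AVXML>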

Text file parser outputs blank files

Long time listener, first time caller! So, I have this Python script for parsing a Google Base feed text file. It takes out particular pieces of data and creates a formatted file I can upload to Bing Shopping. After finally getting it to run, I've discovered that it just outputs blank files instead of the cleaned-up data I wanted. What am I missing here? I really appreciate any help! Fair warning, I'm a pretty big Python newb, and I've had a lot of help writing this already.
import sys, os
import pandas as pd
import datetime

def remove_quotes(data):
    lines = data.split('\n')
    for i, line in enumerate(lines):
        lines[i] = lines[i].replace('"', '')
        print lines[i]
    return data

def tab_error(index, line, output):
    count = len(line.split('\t'))
    if count != 19:
        err = 'Tab issue at line {linenum} : {numtabs} extra tabs'.\
            format(linenum=index, numtabs=(count - 19))
        print err
        output.write(err + '\n')
        return True
    return False

def html_error(index, line, output):
    htmltags = ['&fract12', '&39', '&', '&qt;', '<', '&rt;', '"', '>', 'quot', '’']
    for tag in htmltags:
        if line.find(tag) > 0:
            err = 'HTML issue at line {linenum}'.\
                format(linenum=index)
            print err
            output.write(err + '\n')
            return True
    return False

def read_data(filename):
    with open(filename, 'r') as infile:
        data = infile.read()
    return data

def tabs_check(data, output, filename):
    with open(filename, 'w') as cleanfile:
        header = ''
        for x in xrange(19):
            header += 'x' + str(x + 1) + '\t'
        cleanfile.write(header)
        # for each line in the file
        for i, line in enumerate(data.split('\r')[1:]):
            # check line for tabs error
            data_error = tab_error(i, line, output)
            newline = line.replace('"', '')
            newline = newline.strip()
            if not data_error:
                cleanfile.write('\n' + newline)

def html_check(data, output, filename):
    with open(filename, 'w') as cleanfile:
        # for each line in the file
        lines = data.split('\n')
        cleanfile.write(lines[0])
        for i, line in enumerate(lines[1:]):
            # check line for HTML errors
            data_error = html_error(i, line, output)
            newline = line.replace('"', '')
            newline = newline.strip()
            if not data_error and newline:
                cleanfile.write('\n' + newline)

if __name__ == '__main__':
    # Clean tabs
    filename = sys.argv[1]
    ts = datetime.datetime.now().isoformat()
    print ts
    with open('bing_errors.txt', 'w') as output:
        # print 'Removing quotes within .. product description and ...'
        # data = remove_quotes(data)
        print 'Removing lines with more than 19 tabs...'
        data = read_data(filename)
        tabs_check(data, output, 'clean19.txt')
        # Delete and reorder columns
        print 'Deleting and reordering columns...'
        df = pd.read_table('clean19.txt')
        tmp = df[['x8', 'x2', 'x3', 'x4', 'x6', 'x1', 'x5']]
        tmp.columns = ['MPID',
                       'Brand (BrandorManufacturer)',
                       'Title',
                       'Item Description',
                       'Price',
                       'ProductURL',
                       'ImageURL']
        tmp.to_csv('tmp.txt', index=False, sep='\t')
        os.remove('clean19.txt')
        # HTML errors
        print 'Checking for HTML errors...'
        data = read_data('tmp.txt')
        html_check(data, output, 'BT1.txt')
        os.remove('tmp.txt')
        # row = tmp[tmp['MPID'] == 8724]
        # print row
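One thing worth checking (a guess based only on the code above, not a confirmed diagnosis): tabs_check() splits the raw data on '\r', so if the feed file actually uses '\n' or '\r\n' line endings, data.split('\r')[1:] can come back empty, which would leave clean19.txt with little more than the header and every downstream file blank. A quick check of that assumption:

data = read_data(filename)
# if the first number is 1 while the others are large, the '\r' split is not
# matching the file's real line endings
print len(data.split('\r')), len(data.split('\n')), len(data.splitlines())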

Python: 'ascii' codec can't encode character u'\u2026'

I am trying to use the Bing API in Python with the following code:
#!/usr/bin/python
from bingapi import bingapi
import re
import json
import urllib
import cgi
import cgitb
from HTMLParser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def strip_tags2(data):
    p = re.compile(r'<[^<]*?>')
    q = re.compile(r'[&;!##$%^*()]*')
    data = p.sub('', data)
    return q.sub('', data)

def getUrl(item):
    return item['Url']

def getContent(item):
    return item['Description']

def getTitle(item):
    return item['Title']

def getInfo(qry, siteStr):
    qryStr = qry + "+" + siteStr
    #qryStr = u"%s" % qryStr.encode('UTF-8')
    query = urllib.urlencode({'q' : qryStr})
    url = 'http://api.bing.net/json.aspx?Appid=<myappid>&Version=2.2&Market=en-US&Query=%s&Sources=web&Web.Count=10&JsonType=raw' % (query)
    search_results = urllib.urlopen(url)
    j = json.loads(search_results.read())
    results = j['SearchResponse']['Web']['Results']
    return results

def updateRecent(qry):
    f = open("recent.txt", "r")
    lines = f.readlines()
    f.close()
    lines = lines[1:]
    if len(qry) > 50:  # truncate if string too long
        qry = (qry[:50] + '...')
    qry = strip_tags2(qry)  # strip out the html if injection try
    lines.append("\n%s" % qry)
    f = open("recent.txt", "w")
    f.writelines(lines)
    f.close()

if __name__ == '__main__':
    form = cgi.FieldStorage()
    qry = form["qry"].value
    qry = r'%s' % qry
    updateRecent(qry)
    siteStr = "(site:answers.yahoo.com OR site:chacha.com OR site:blurtit.com OR site:answers.com OR site:question.com OR site:answerbag.com OR site:stackexchange.com)"
    print "Content-type: text/html"
    print
    header = open("header.html", "r")
    contents = header.readlines()
    header.close()
    for item in contents:
        print item
    print """
<div id="results">
<center><h1>Results:</h1></center>
"""
    for item in getInfo(siteStr, qry):
        print "<h3>%s</h3>" % getTitle(item)
        print "<br />"
        print "%s" % getUrl(item)
        print "<br />"
        print "<p style=\"color:gray\">%s</p>" % getContent(item)
        print "<br />"
    print "</div>"
    footer = open("footer.html", "r")
    contents = footer.readlines()
    footer.close()
    for thing in contents:
        print thing
It prints a few results, and then gives me the following error:
UnicodeEncodeError: 'ascii' codec can't encode character u'\u2026' in position 72: ordinal not in range(128)
Can someone explain why this is happening? It clearly has something to do with how the url is getting encoded, but what is exactly is wrong? Thanks in advance!
That particular Unicode character is "HORIZONTAL ELLIPSIS". One or more of your getXXXXX() functions are returning Unicode strings, one of which contains a non-ASCII character. I suggest declaring the encoding of your output, for example:
Content-Type: text/html; charset=utf-8
and explicitly encoding your output in that encoding.
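A minimal sketch of what that could look like here, assuming UTF-8 output (only the charset in the header and the .encode() calls are additions to the question's print loop):

print "Content-type: text/html; charset=utf-8"
print

for item in getInfo(siteStr, qry):
    # encode each piece explicitly so u'\u2026' (the ellipsis) no longer trips the ascii codec
    print ("<h3>%s</h3>" % getTitle(item)).encode('utf-8')
    print ("%s" % getUrl(item)).encode('utf-8')
    print ("<p style=\"color:gray\">%s</p>" % getContent(item)).encode('utf-8')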
We need to know the line number where the exception was thrown; it will be in the traceback. Anyway, the problem is that you are reading Unicode from the files/URLs and then implicitly converting it to US-ASCII, probably in one of the concatenation operations. You should prefix all constant strings with u to indicate that they are Unicode strings, as in
u"\n%s" % qry
