How to write/overwrite a file with HTMLParser

How to write/overwrite a file with HTMLParser - python

I need to create some files from a template. I'm using psycopg2 to fetch from a database. Then I loop through. Now I need to write to file.
Thanks!
import sys
from HTMLParser import HTMLParser
from xml.etree import cElementTree as etree
import psycopg2
import psycopg2.extras
class LinksParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.tb = etree.TreeBuilder()
def handle_starttag(self, tag, attributes):
self.tb.start(tag, dict(attributes))
def handle_endtag(self, tag):
self.tb.end(tag)
def handle_data(self, data):
self.tb.data(data)
def close(self):
HTMLParser.close(self)
return self.tb.close()
conn = psycopg2.connect(dbname="**", user="**", password="**", host="/tmp/", port="**")
cur.execute("SELECT * FROM landingpagedata;")
rows = cur.fetchall()
template = 'template.html'
parser = LinksParser()
# parser.feed(open('landingIndex.html').read()) #for testing
# root = parser.close()
for row in rows:
parser.feed(open(template).read())
root = parser.close()
#title
title = root.find(".//title")
title.text = str(row['title'])
f = open(row['page_name'], 'w')
root.write(f)
parser = LinksParser()
The error is:
Traceback (most recent call last):
File "/Users/Morgan13/Programming/LandingPageBuilder/landingPages/landingBuilderTest.py", line 87, in <module>
main()
File "/Users/Morgan13/Programming/LandingPageBuilder/landingPages/landingBuilderTest.py", line 75, in main
root.write('page_name')
AttributeError: write
Oh and I'm using open('page', 'w') because these pages exist already?

I think you want f.write(root), not root.write(f). (Assuming that str(root) gives you the HTML you want to write out.)

Related

Pickling error when using pathos with XML

I'm trying to read the multistream Wikipedia dump into a database. This is my attempt at loading smaller chunks in parallel. Here's the script:
#!/usr/bin/python3
import xml.sax
from bz2 import BZ2File
import mwparserfromhell
import psycopg2
import pathos
import os
import dill
class XmlHandler(xml.sax.handler.ContentHandler):
def __init__(self):
xml.sax.handler.ContentHandler.__init__(self)
self._buffer = None
self._values = {}
self._current_tag = None
self._pages = []
def get_pages(self):
return self._pages
def get_page_count(self):
return len(self._pages)
def get_values(self):
return self._values
def characters(self, content):
if self._current_tag:
self._buffer.append(content)
def startElement(self, name, attrs):
if name in ('title', 'text', 'infobox'):
self._current_tag = name
self._buffer = []
def endElement(self, name):
if name == self._current_tag:
self._values[name] = ' '.join(self._buffer)
if name == 'page':
self.process_article()
def process_article(self):
wikicode = mwparserfromhell.parse(self._values['text'])
infobox_array = wikicode.filter_templates(matches="infobox .*")
infobox = str(infobox_array[0]) if len(infobox_array) > 0 else ""
self._pages.append((self._values['title'], self._values['text'], infobox))
def load_xml(filename):
wiki_handler = XmlHandler()
wiki_parser = xml.sax.make_parser()
wiki_parser.setContentHandler(wiki_handler)
file = os.path.join("chunks", filename)
print("I'm a worker process")
cursor = conn.cursor()
with BZ2File(file, 'r') as f:
for line in f:
wiki_parser.feed(line)
pages = wiki_handler.get_pages()
for page in pages:
cursor.execute("INSERT INTO pages (title, text, infobox) VALUES (%s, %s, %s) ON CONFLICT DO NOTHING", page)
cursor.close()
print("all done")
if __name__ == "__main__":
conn = psycopg2.connect(dbname="wikipedia",
user="postgres",
password="postgres",
host="localhost",
port=5432)
file_list = [f for f in os.listdir("chunks") if os.path.isfile(os.path.join("chunks", f))]
pool = pathos.multiprocessing.ProcessingPool(processes=pathos.multiprocessing.cpu_count())
pool.map(load_xml, file_list)
And the traceback:
Traceback (most recent call last):
File "./loader_parallel.py", line 114, in <module>
pool.map(load_xml, file_list)
File "/home/smh/.local/lib/python3.7/site-packages/multiprocess/pool.py", line 268, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "/home/smh/.local/lib/python3.7/site-packages/multiprocess/pool.py", line 657, in get
raise self._value
multiprocess.pool.MaybeEncodingError: Error sending result:
'<multiprocess.pool.ExceptionWithTraceback object at 0x7f87ac0f4470>'.
Reason: 'TypeError("can't pickle pyexpat.xmlparser objects")'
Why can't the pyexpat.xmlparser object be pickled? How can I fix it? I tried testing it by running dill.copy(XmlHandler()) and it did so without error.
I installed pathos via pip3, running Python 3.7, on Debian 10. Pretty new to this, any help is appreciated, thanks in advance!

Python HTMLParser: AttributeError

I'm using HTMLParser (python 2.7)to parse pages I pull down with urllib2,and am coming across AttributeError exceptions when I want to store my data into a list in feed method. But if comment out the __init__ method, the exception was gone
main.py
# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
import urllib2
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
class MyHTMLParser(HTMLParser):
def __init__(self):
self.terms = []
self.definitions = []
def handle_starttag(self, tag, attrs):
# retrive the terms
if tag == 'div':
for attribute, value in attrs:
if value == 'word':
self.terms.append(attrs[1][1])
# retrive the definitions
if value == 'desc':
if attrs[1][1]:
self.definitions.append(attrs[1][1])
else:
self.definitions.append(None)
parser = MyHTMLParser()
# open page and retrive source page
response = urllib2.urlopen('http://localhost/')
html = response.read().decode('utf-8')
response.close()
# extract the terms and definitions
parser.feed(html)
Output
Traceback (most recent call last):
File "/Users/megachweng/Project/Anki-Youdao/combined.py", line 35, in <module>
parser.feed(html)
File "/usr/local/Cellar/python/2.7.13/Frameworks/Python.framework/Versions/2.7/lib/python2.7/HTMLParser.py", line 116, in feed
self.rawdata = self.rawdata + data
AttributeError: MyHTMLParser instance has no attribute 'rawdata'

I think that you don't initialize HTMLParser properly. Maybe you don't need to initialize it at all. This works for me:
# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
import urllib2
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
class MyHTMLParser(HTMLParser):
def handle_starttag(self, tag, attrs):
print "Encountered a start tag:", tag
# retrive the terms
if tag == 'div':
for attribute, value in attrs:
if value == 'word':
self.terms.append(attrs[1][1])
# retrive the definitions
if value == 'desc':
if attrs[1][1]:
self.definitions.append(attrs[1][1])
else:
self.definitions.append(None)
parser = MyHTMLParser()
# open page and retrive source page
response = urllib2.urlopen('http://localhost/')
html = response.read().decode('utf-8')
response.close()
# extract the terms and definitions
parser.feed(html)
UPDATE
# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
import urllib2
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
class MyHTMLParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.terms = []
self.definitions = []
def handle_starttag(self, tag, attrs):
# retrive the terms
for attribute in attrs:
if attribute[0] == 'align':
self.terms.append(attribute[1])
self.definitions.append(attribute[1])
parser = MyHTMLParser()
html = "<table align='center'><tr><td align='left'><p>ciao</p></td></tr></table>"
# extract the terms and definitions
parser.feed(html)
print parser.terms
print parser.definitions
Output:
['center', 'left']
['center', 'left']

OK I got the solution,super().__init__ cannot work, must hard code the name
def __init__(self):
HTMLParser.__init__(self)

How do I mock an open(...).write() without getting a 'No such file or directory' error?

I've based my solution on:
How do I mock an open used in a with statement (using the Mock framework in Python)?,
AttributeError: <module '__main__' from [..] does not have the attribute 'open',
http://www.voidspace.org.uk/python/mock/helpers.html#mock.mock_open
I have a class, which I can instantiate, which writes to a file. I'm trying to test it, but I'm having problems mocking open(). I'm using the following as the smallest piece of code, which can
import os
import unittest
from unittest.mock import mock_open, patch
__author__ = 'drews'
class MockPathExists(object):
def __init__(self, return_value):
self.received_args = None
self.return_value = return_value
def __call__(self, *args, **kwargs):
self.received_args = args
return self.return_value
class WriteData:
def __init__(self, dir, name='World'):
self.name = name
self.dir = dir
def dump(self):
if os.path.exists(self.dir):
with open('{0}/output.text'.format(self.dir), 'w+') as fp:
fp.write('Hello, {0}!'.format(self.name))
class TestListWindowsPasswords(unittest.TestCase):
def setUp(self):
self._orig_pathexists = os.path.exists
os.path.exists = MockPathExists(True)
def test_dump(self):
m = mock_open()
with patch.object(WriteData, 'open', m, create=True):
data_writer = WriteData(
dir='/my/path/not/exists',
name='Foo'
)
data_writer.dump()
self.assertEqual(os.path.exists.received_args[0], '/my/path/not/exists/output.text')
m.assert_called_once_with('/my/path/not/exists/output.text', 'w+')
handle = m()
handle.write.assert_called_once_with('Hello, Foo!')
def tearDown(self):
os.path.exists = self._orig_pathexists
When I run this, I get the following error:
Error
Traceback (most recent call last):
File "/Users/drews/Development/tool/tests/test_mockopen.py", line 41, in test_dump
data_writer.dump()
File "/Users/drews/Development/tool/tests/test_mockopen.py", line 25, in dump
with open('{0}/output.text'.format(self.dir), 'w+') as fp:
FileNotFoundError: [Errno 2] No such file or directory: '/my/path/not/exists/output.text'
How can I mock open(), so that it just returns a file_pointer, and doesn't try to interact with the file system at all?

Mock builtins.open (or module.open, module = the module name that contains WriteData) with the mock_open:
import builtins
class TestListWindowsPasswords(unittest.TestCase):
def setUp(self):
self._orig_pathexists = os.path.exists
os.path.exists = MockPathExists(True)
def test_dump(self):
with patch('builtins.open', unittest.mock.mock_open()) as m:
data_writer = WriteData(
dir='/my/path/not/exists',
name='Foo'
)
data_writer.dump()
self.assertEqual(os.path.exists.received_args[0], '/my/path/not/exists') # fixed
m.assert_called_once_with('/my/path/not/exists/output.text', 'w+')
handle = m()
handle.write.assert_called_once_with('Hello, Foo!')

You can use the __enter__ magic method to simulate which:
from unittest.mock import patch, MagicMock, call, mock_open
#patch('os')
#patch('builtins.open', new_callable=mock_open())
def test_dump(self, mock_open_file, mock_os):
data_writer = WriteData(dir='/my/path/not/exists', name='Foo')
mock_os.path.exists.assert_called_once_with('/my/path/not/exists')
mock_open_file.assert_called_once_with('/my/path/not/exists/output.text', 'w+')
mock_open_file.return_value.__enter__().write.assert_called_once_with('Hello, Foo!')
Hope this helps!

how to return a collection from mongodb using pymongo

I'm trying to create a Collection Class in Python to access the various collections in my db. Here's what I've got:
import sys
import os
import pymongo
from pymongo import MongoClient
class Collection():
client = MongoClient()
def __init__(self, db, collection_name):
self.db = db
self.collection_name = collection_name
# self.data_base = getattr(self.client, db)
# self.collObject = getattr(self.data_base, self.collection_name)
def getCollection(self):
data_base = getattr(self.client, self.db)
collObject = getattr(data_base, self.collection_name)
return collObject
def getCollectionKeys(self, collection):
"""Get a set of keys from a collection"""
keys_list = []
collection_list = collection.find()
for document in collection_list:
for field in document.keys():
keys_list.append(field)
keys_set = set(keys_list)
return keys_set
if __name__ == '__main__':
print"Begin Main"
agents = Collection('hkpr_restore','agents')
print "agents is" , agents
agents_collection = agents.getCollection
print agents_collection
print agents.getCollectionKeys(agents_collection)
I get the following output:
Begin Main
agents is <__main__.Collection instance at 0x10ff33e60>
<bound method Collection.getCollection of <__main__.Collection instance at 0x10ff33e60>>
Traceback (most recent call last):
File "collection.py", line 52, in <module>
print agents.getCollectionKeys(agents_collection)
File "collection.py", line 35, in getCollectionKeys
collection_list = collection.find()
AttributeError: 'function' object has no attribute 'find'
The function getCollectionKeys works fine outside of a class. What am I doing wrong?

This line:
agents_collection = agents.getCollection
Should be:
agents_collection = agents.getCollection()
Also, you don't need to use getattr the way you are. Your getCollection method can be:
def getCollection(self):
return self.client[self.db][self.collection_name]

buildbot doesn't accept my MailNotifier's IEMailLookup object

A number of people in my organization have different email names from perforce names, so I need to create an IEmailLookup derivation that overrides getAddress to do my evil bidding:
(From my master.cfg)
class MyIEmailLookup:
from buildbot import interfaces
__implements__ = interfaces.IEmailLookup
def getAddresses(user):
address_dict = {"user1", "user_one#our_domain.com"}
try:
address = address_dict[user]
except KeyError:
address = user + "#our_domain.com"
return address
maillookup = MyIEmailLookup()
from buildbot.status import mail
c['status'].append(mail.MailNotifier(....
....
lookup=maillookup
))
I've tried any number of permutations, but I either get:
Traceback (most recent call last):
File "/Library/Python/2.6/site-packages/buildbot-0.8.3p1-py2.6.egg/buildbot/scripts/runner.py", line 1071, in doCheckConfig
ConfigLoader(configFileName=configFileName)
File "/Library/Python/2.6/site-packages/buildbot-0.8.3p1-py2.6.egg/buildbot/scripts/checkconfig.py", line 46, in __init__
self.loadConfig(configFile, check_synchronously_only=True)
File "/Library/Python/2.6/site-packages/buildbot-0.8.3p1-py2.6.egg/buildbot/master.py", line 727, in loadConfig
exec f in localDict
File "/Users/playbuilder/buildbot/master.cfg", line 207, in <module>
lookup=maillookup
File "/Library/Python/2.6/site-packages/buildbot-0.8.3p1-py2.6.egg/buildbot/status/mail.py", line 293, in __init__
assert interfaces.IEmailLookup.providedBy(lookup)
AssertionError
...or any other number of issues, dependant upon how I try to implement the IEmailLookup interface.
I'm using buildbot 0.8.3p1 and python 2.6.1.
I see precious few examples of how to do this, and every one of them fails in my context. What am I missing here?

I just solved this problem myself.
First you need to add (somewhere at the top of the file)
from zope.interface import implements
and then change
__implements__ = interfaces.IEmailLookup
to
if implements:
implements( interfaces.IEmailLookup )
else:
__implements__ = interfaces.IEmailLookup

If you want to fetch email from perforce user, you can use this class:
# .-----------------------.
# | Perforce Email Lookup |
# `-----------------------'
from twisted.internet import defer, utils
from buildbot.util import ComparableMixin
from buildbot.interfaces import IEmailLookup
from zope.interface import implements
import os
import re
class PerforceEmailLookup(ComparableMixin):
implements(IEmailLookup)
compare_attrs = ["p4port", "p4user", "p4passwd", "p4bin"]
env_vars = ["P4CLIENT", "P4PORT", "P4PASSWD", "P4USER",
"P4CHARSET"]
def __init__(self,
p4port = None,
p4user = None,
p4passwd = None,
p4bin = 'p4'):
self.p4port = p4port
self.p4user = p4user
self.p4passwd = p4passwd
self.p4bin = p4bin
self.email_re = re.compile(r"Email:\s+(?P<email>\S+#\S+)\s*$")
def _get_process_output(self, args):
env = dict([(e, os.environ.get(e)) for e in self.env_vars if os.environ.get(e)])
d = utils.getProcessOutput(self.p4bin, args, env)
return d
#defer.deferredGenerator
def getAddress(self, name):
if '#' in name:
yield name
return
args = []
if self.p4port:
args.extend(['-p', self.p4port])
if self.p4user:
args.extend(['-u', self.p4user])
if self.p4passwd:
args.extend(['-P', self.p4passwd])
args.extend(['user', '-o', name])
wfd = defer.waitForDeferred(self._get_process_output(args))
yield wfd
result = wfd.getResult()
for line in result.split('\n'):
line = line.strip()
if not line: continue
m = self.email_re.match(line)
if m:
yield m.group('email')
return
yield name
usage would look like:
c['status'].append(
MailNotifier(
sendToInterestedUsers = True,
mode = 'failing',
lookup = PerforceEmailLookup(
p4port = "perforce:1666",
p4user = "buildbot",
p4passwd = "buildbot")))

Try this:
from buildbot.interfaces import IEmailLookup
from buildbot.util import ComparableMixin
from zope.interface import implements
class lookup_example_email(ComparableMixin):
implements(IEmailLookup)
def getAddress(self,user):
return "%s#example.com"%(user)
...
mn = MailNotifier(..., lookup=lookup_example_email(), extraRecipients=m)

Here's the piece of code I use which works with buildbot 2.3.1 in python3.6.
from buildbot.interfaces import IEmailLookup
from buildbot.util import ComparableMixin
from zope.interface import implementer
#implementer(IEmailLookup)
class EmailMap(ComparableMixin):
def getAddress(self, name):
return f'{name}#xxxxx'

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to write/overwrite a file with HTMLParser - python

I think you want f.write(root), not root.write(f). (Assuming that str(root) gives you the HTML you want to write out.)

Related

Pickling error when using pathos with XML

Python HTMLParser: AttributeError

How do I mock an open(...).write() without getting a 'No such file or directory' error?

how to return a collection from mongodb using pymongo

buildbot doesn't accept my MailNotifier's IEMailLookup object

Categories

Resources