Python Web Importer - load submodules - python

I have slightly edited the WebImporter class from this site.
import imp
import sys
import http.client
from urllib.parse import urlparse


def register_domain(name, url):
    WebImporter.registered_domains[name] = url


class WebImporter:
    registered_domains = {}

    def find_module(self, fullname, path=None):
        print('FIND', fullname, path)
        if fullname in self.registered_domains:
            return self
        if fullname.rsplit('.')[0] not in self.registered_domains:
            return None
        try:
            r = self._do_request(fullname, method="HEAD")
        except ValueError:
            return None
        else:
            r.close()
            if r.status == 200:
                return self
        return None

    def load_module(self, fullname):
        if fullname in sys.modules:
            return sys.modules[fullname]
        mod = imp.new_module(fullname)
        mod.__loader__ = self
        mod.__name__ = fullname
        mod.__package__ = ['/'.join(fullname.split('.'))]
        mod.__path__ = '/'.join(fullname.split('.'))
        sys.modules[fullname] = mod
        if fullname not in self.registered_domains:
            url = self._get_host_and_path(fullname).geturl()
            mod.__file__ = url
            r = self._do_request(fullname)
            code = r.read()
            assert r.status == 200
            exec(code, mod.__dict__)
        else:
            mod.__file__ = "[fake module %r]" % fullname
            mod.__path__ = []
        return mod

    def _do_request(self, fullname, method="GET"):
        url = self._get_host_and_path(fullname)
        c = http.client.HTTPConnection(url.netloc)
        c.request(method, url.path)
        return c.getresponse()

    def _get_host_and_path(self, fullname):
        tld, rest = fullname.rsplit('.', 1)
        path = "/%s.py" % rest.replace('.', '/')
        url = self.registered_domains[fullname.split('.')[0]] + path
        url = urlparse(url)
        return url


sys.meta_path.append(WebImporter())
I have two modules on my web server on a local network:
foo.py (file at: http://192.168.122.2:80/foo.py):
class foo:
    def __init__(self):
        print('FOO')
bar.py (file at: http://192.168.122.2:80/bar.py):
from foo import foo

class bar(foo):
    def __init__(self):
        foo.__init__(self)
        print('BAR')
When I run:
import webimport
webimport.register_domain('dev', 'http://192.168.122.2:80')
from dev.foo import foo
if __name__ == '__main__':
    foo()
it imports and runs the code.
But when I try to import and run bar:
import webimport
webimport.register_domain('dev', 'http://192.168.122.2:80')
from dev.bar import bar
if __name__ == '__main__':
    bar()
the output is:
FIND foo None
Traceback (most recent call last):
  File "test.py", line 4, in <module>
    from dev.bar import bar
  File "/home/user/tests/webimport/webimport.py", line 44, in load_module
    exec(code, mod.__dict__)
  File "<string>", line 1, in <module>
ImportError: No module named 'foo'
Where is the problem?

You need to add the path of the foo module to PYTHONPATH. If you're using bash (on a Mac or a GNU/Linux distro), add this to your ~/.bashrc:
export PYTHONPATH="${PYTHONPATH}:/my/other/path"
EDIT:
Try changing, in bar.py,
from foo import foo
to
from dev.foo import foo
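Why this helps (a sketch based on the importer code above, not new functionality): find_module only claims module names whose first component is a registered domain. The bare from foo import foo executed inside the downloaded bar.py is therefore declined by WebImporter and falls through to the normal import machinery, which has no local foo.py, hence the ImportError. Prefixing the registered domain routes the lookup back through the importer:

# inside bar.py as served from the registered 'dev' domain (illustrative sketch only)

from dev.foo import foo    # first component 'dev' is registered -> WebImporter claims the
                           # import and fetches http://192.168.122.2:80/foo.py

# from foo import foo      # 'foo' is not a registered domain -> WebImporter returns None
                           # and the standard path-based import fails: No module named 'foo'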

Related

Pickling error when using pathos with XML

I'm trying to read the multistream Wikipedia dump into a database. This is my attempt at loading smaller chunks in parallel. Here's the script:
#!/usr/bin/python3
import xml.sax
from bz2 import BZ2File
import mwparserfromhell
import psycopg2
import pathos
import os
import dill


class XmlHandler(xml.sax.handler.ContentHandler):
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []

    def get_pages(self):
        return self._pages

    def get_page_count(self):
        return len(self._pages)

    def get_values(self):
        return self._values

    def characters(self, content):
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        if name in ('title', 'text', 'infobox'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)
        if name == 'page':
            self.process_article()

    def process_article(self):
        wikicode = mwparserfromhell.parse(self._values['text'])
        infobox_array = wikicode.filter_templates(matches="infobox .*")
        infobox = str(infobox_array[0]) if len(infobox_array) > 0 else ""
        self._pages.append((self._values['title'], self._values['text'], infobox))


def load_xml(filename):
    wiki_handler = XmlHandler()
    wiki_parser = xml.sax.make_parser()
    wiki_parser.setContentHandler(wiki_handler)
    file = os.path.join("chunks", filename)
    print("I'm a worker process")
    cursor = conn.cursor()
    with BZ2File(file, 'r') as f:
        for line in f:
            wiki_parser.feed(line)
    pages = wiki_handler.get_pages()
    for page in pages:
        cursor.execute("INSERT INTO pages (title, text, infobox) VALUES (%s, %s, %s) ON CONFLICT DO NOTHING", page)
    cursor.close()
    print("all done")


if __name__ == "__main__":
    conn = psycopg2.connect(dbname="wikipedia",
                            user="postgres",
                            password="postgres",
                            host="localhost",
                            port=5432)
    file_list = [f for f in os.listdir("chunks") if os.path.isfile(os.path.join("chunks", f))]
    pool = pathos.multiprocessing.ProcessingPool(processes=pathos.multiprocessing.cpu_count())
    pool.map(load_xml, file_list)
And the traceback:
Traceback (most recent call last):
  File "./loader_parallel.py", line 114, in <module>
    pool.map(load_xml, file_list)
  File "/home/smh/.local/lib/python3.7/site-packages/multiprocess/pool.py", line 268, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "/home/smh/.local/lib/python3.7/site-packages/multiprocess/pool.py", line 657, in get
    raise self._value
multiprocess.pool.MaybeEncodingError: Error sending result:
'<multiprocess.pool.ExceptionWithTraceback object at 0x7f87ac0f4470>'.
Reason: 'TypeError("can't pickle pyexpat.xmlparser objects")'
Why can't the pyexpat.xmlparser object be pickled, and how can I fix it? I tried testing this by running dill.copy(XmlHandler()), which succeeded without error.
I installed pathos via pip3 and am running Python 3.7 on Debian 10. I'm pretty new to this, so any help is appreciated. Thanks in advance!
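For what it's worth, one common way to avoid sending unpicklable objects across the pool boundary (a sketch under that assumption, not a verified fix for this exact traceback) is to keep the SAX parser, the handler, and the database connection strictly local to the worker and return only plain data:

def load_xml(filename):
    # everything unpicklable (parser, handler, DB connection) stays inside this process
    wiki_handler = XmlHandler()
    wiki_parser = xml.sax.make_parser()
    wiki_parser.setContentHandler(wiki_handler)

    conn = psycopg2.connect(dbname="wikipedia", user="postgres",
                            password="postgres", host="localhost", port=5432)
    cursor = conn.cursor()

    with BZ2File(os.path.join("chunks", filename), 'r') as f:
        for line in f:
            wiki_parser.feed(line)

    for page in wiki_handler.get_pages():
        cursor.execute("INSERT INTO pages (title, text, infobox) VALUES (%s, %s, %s) ON CONFLICT DO NOTHING", page)

    conn.commit()
    cursor.close()
    conn.close()
    return wiki_handler.get_page_count()   # a plain int is trivially picklable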

TypeError: object() takes no parameters - Python 3.6.3

When I run $ python -m weatherterm -a USNY1192:1:US -p WeatherComParser -td
to output the parsed data in the terminal, I get the following error:
Traceback (most recent call last):
  File "C:\Python36\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Python36\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\Reza\weather-app\weatherterm\weatherterm\__main__.py", line 81, in <module>
    parser = cls()
  File "C:\Users\Reza\weather-app\weatherterm\weatherterm\parsers\weather_com_parser.py", line 25, in __init__
    self._request = Request(self._base_url)
TypeError: object() takes no parameters
I'd appreciate it if you could advise on what went wrong here:
from weatherterm.core import ForecastType
import re
from weatherterm.core import Forecast
from weatherterm.core import Request
from weatherterm.core import Unit
from weatherterm.core import UnitConverter
from bs4 import BeautifulSoup


class WeatherComParser:
    def __init__(self):
        self._forecast = {
            ForecastType.TODAY: self._today_forecast,
            ForecastType.FIVEDAYS: self._five_and_ten_days_forecast,
            ForecastType.TENDAYS: self._five_and_ten_days_forecast,
            ForecastType.WEEKEND: self._weekend_forecast,
        }
        self._base_url = 'http://weather.com/weather/{forecast}/l/{area}'
        self._request = Request(self._base_url)
        self._temp_regex = re.compile('([0-9]+)\D{,2}([0-9]+)')
        self._only_digits_regex = re.compile('[0-9]+')
        self._unit_converter = UnitConverter(Unit.FAHRENHEIT)

    def _get_additional_info(self, content):
        data = tuple(item.td.span.get_text()
                     for item in content.table.tbody.children)
        return data[:2]

    def _clear_str_number(self, str_number):
        result = self._only_digits_regex.match(str_number)
        return '--' if result is None else result.group()

    def _parse(self, container, criteria):
        results = [self._get_data(item, criteria)
                   for item in container.children]
        return [result for result in results if result]

    def _get_data(self, container, search_items):
        scraped_data = {}
        for key, value in search_items.items():
            result = container.find(value, class_=key)
            data = None if result is None else result.get_text()
            if data is not None:
                scraped_data[key] = data
        return scraped_data

    def _today_forecast(self, args):
        # raise NotImplementedError()
        criteria = {
            'today_nowcard-temp': 'div',
            'today_nowcard-phrase': 'div',
            'today_nowcard-hilo': 'div',
        }
        content = self._request.fetch_data(args.forecast_option.value,
                                           args.area_code)
        bs = BeautifulSoup(content, 'html.parser')
        container = bs.find('section', class_='today_nowcard-container')
        weather_conditions = self._parse(container, criteria)
        if len(weather_conditions) < 1:
            raise Exception('Could not parse weather forecast for today.')
        weatherinfo = weather_conditions[0]
        temp_regex = re.compile(('H\s+([0-9]+|\-{,2}).+'
                                 'L\s+([0-9]+|\-{,2})'))
        temp_info = temp_regex.search(weatherinfo['today_nowcard-hilo'])
        high_temp, low_temp = temp_info.groups()
        side = container.find('div', class_='today_nowcard-sidecar')
        wind, humidity = self._get_additional_info(side)
        curr_temp = self._clear_str_number(weatherinfo['today_nowcard-temp'])
        self._unit_converter.dest_unit = args.unit
        td_forecast = Forecast(self._unit_converter.convert(curr_temp),
                               humidity,
                               wind,
                               high_temp=self._unit_converter.convert(high_temp),
                               low_temp=self._unit_converter.convert(low_temp),
                               description=weatherinfo['today_nowcard-phrase'])
        return [td_forecast]

    def _five_and_ten_days_forecast(self, args):
        raise NotImplementedError()

    def _weekend_forecast(self, args):
        raise NotImplementedError()

    def run(self, args):
        self._forecast_type = args.forecast_option
        forecast_function = self._forecast[args.forecast_option]
        return forecast_function(args)
Looking at the call stack, it seems that something is wrong in the Request class in weatherterm/core/request.py. The initializer of the Request class should look like this:
class Request:
    def __init__(self, base_url):
        ...
Make sure that the initializer actually accepts a base_url parameter.
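For reference, here is a minimal sketch of what such a Request class could look like, inferred only from how the parser above calls Request(self._base_url) and self._request.fetch_data(...); the real weatherterm.core.Request may differ:

# hypothetical weatherterm/core/request.py - illustrative sketch only
from urllib.request import Request as HTTPRequest, urlopen


class Request:
    def __init__(self, base_url):
        self._base_url = base_url

    def fetch_data(self, forecast, area):
        # fill the {forecast} and {area} placeholders of the base URL and fetch the page
        url = self._base_url.format(forecast=forecast, area=area)
        req = HTTPRequest(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urlopen(req) as response:
            return response.read()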

How do I mock an open(...).write() without getting a 'No such file or directory' error?

I've based my solution on:
How do I mock an open used in a with statement (using the Mock framework in Python)?,
AttributeError: <module '__main__' from [..] does not have the attribute 'open',
http://www.voidspace.org.uk/python/mock/helpers.html#mock.mock_open
I have a class (which I can instantiate) that writes to a file. I'm trying to test it, but I'm having problems mocking open(). I'm using the following as the smallest piece of code that can reproduce the problem:
import os
import unittest
from unittest.mock import mock_open, patch

__author__ = 'drews'


class MockPathExists(object):
    def __init__(self, return_value):
        self.received_args = None
        self.return_value = return_value

    def __call__(self, *args, **kwargs):
        self.received_args = args
        return self.return_value


class WriteData:
    def __init__(self, dir, name='World'):
        self.name = name
        self.dir = dir

    def dump(self):
        if os.path.exists(self.dir):
            with open('{0}/output.text'.format(self.dir), 'w+') as fp:
                fp.write('Hello, {0}!'.format(self.name))


class TestListWindowsPasswords(unittest.TestCase):
    def setUp(self):
        self._orig_pathexists = os.path.exists
        os.path.exists = MockPathExists(True)

    def test_dump(self):
        m = mock_open()
        with patch.object(WriteData, 'open', m, create=True):
            data_writer = WriteData(
                dir='/my/path/not/exists',
                name='Foo'
            )
            data_writer.dump()

        self.assertEqual(os.path.exists.received_args[0], '/my/path/not/exists/output.text')
        m.assert_called_once_with('/my/path/not/exists/output.text', 'w+')
        handle = m()
        handle.write.assert_called_once_with('Hello, Foo!')

    def tearDown(self):
        os.path.exists = self._orig_pathexists
When I run this, I get the following error:
Error
Traceback (most recent call last):
  File "/Users/drews/Development/tool/tests/test_mockopen.py", line 41, in test_dump
    data_writer.dump()
  File "/Users/drews/Development/tool/tests/test_mockopen.py", line 25, in dump
    with open('{0}/output.text'.format(self.dir), 'w+') as fp:
FileNotFoundError: [Errno 2] No such file or directory: '/my/path/not/exists/output.text'
How can I mock open(), so that it just returns a file_pointer, and doesn't try to interact with the file system at all?
Mock builtins.open (or module.open, where module is the module that contains WriteData) with mock_open:
import builtins


class TestListWindowsPasswords(unittest.TestCase):
    def setUp(self):
        self._orig_pathexists = os.path.exists
        os.path.exists = MockPathExists(True)

    def test_dump(self):
        with patch('builtins.open', unittest.mock.mock_open()) as m:
            data_writer = WriteData(
                dir='/my/path/not/exists',
                name='Foo'
            )
            data_writer.dump()

        self.assertEqual(os.path.exists.received_args[0], '/my/path/not/exists')  # fixed
        m.assert_called_once_with('/my/path/not/exists/output.text', 'w+')
        handle = m()
        handle.write.assert_called_once_with('Hello, Foo!')
You can use the __enter__ magic method to simulate the with statement:
from unittest.mock import patch, MagicMock, call, mock_open

@patch('os')
@patch('builtins.open', new_callable=mock_open())
def test_dump(self, mock_open_file, mock_os):
    data_writer = WriteData(dir='/my/path/not/exists', name='Foo')
    data_writer.dump()

    mock_os.path.exists.assert_called_once_with('/my/path/not/exists')
    mock_open_file.assert_called_once_with('/my/path/not/exists/output.text', 'w+')
    mock_open_file.return_value.__enter__().write.assert_called_once_with('Hello, Foo!')
Hope this helps!
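As a self-contained reference (a minimal sketch, independent of the test class above), patching builtins.open with mock_open keeps the write entirely in memory and never touches the file system:

from unittest.mock import mock_open, patch


def write_greeting(path, name):
    # same shape as WriteData.dump: open a file with 'w+' and write a greeting
    with open(path, 'w+') as fp:
        fp.write('Hello, {0}!'.format(name))


with patch('builtins.open', mock_open()) as m:
    write_greeting('/my/path/not/exists/output.text', 'Foo')

m.assert_called_once_with('/my/path/not/exists/output.text', 'w+')
m().write.assert_called_once_with('Hello, Foo!')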

Django import settings error

I have the following script for scraping a webform:
#!/usr/bin/env python
"""
Python script for searching firms on http://www.adviserinfo.sec.gov/IAPD/Content/Search/iapd_Search.aspx
Assumes that the input Excel file is in the same directory as this script.
"""
import os
import re
import sys
import django
import mechanize
from bs4 import BeautifulSoup
from xlrd import open_workbook

XLSX_FILE = 'Legal Names.xlsx'
IAPD_URL = 'http://www.adviserinfo.sec.gov/IAPD/Content/Search/iapd_Search.aspx'

# ------------------------------------------------------------------------------------------
sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), 'scraper/')))
sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), 'scraper/scraper/')))

os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'
django.setup()

from django.core.exceptions import ObjectDoesNotExist
from custom_scraper.models import *
# ------------------------------------------------------------------------------------------


def legal_names(file=XLSX_FILE):
    '''
    Read in legal names from the excel spreadsheet with the assumption
    that names to be searched for are in column 1. Skip row 1 header
    '''
    wb = open_workbook(file)
    s = wb.sheets()[0]
    col = 1
    for row in range(s.nrows)[1:]:
        name = s.cell(row, col).value
        if type(name) == int:
            continue
        yield name


class IapdScraper(object):
    def __init__(self):
        self.br = mechanize.Browser()
        self.br.addheaders = [('User-agent',
                               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.7')]

    def scrape(self, q):
        '''
        Search for a Firm in the IAPD database by name
        '''
        def select_form(form):
            return form.attrs.get('id', None) == 'aspnetForm'

        try:
            firm = IapdFirm.objects.get(query_name=q)
        except ObjectDoesNotExist:
            firm = IapdFirm(query_name=q)
            firm.save()
        else:
            if firm.checked:
                return

        self.br.open(IAPD_URL)
        self.br.select_form(predicate=select_form)
        self.br.form['ctl00$cphMainContent$ucUnifiedSearch$rdoSearchBy'] = ['rdoOrg']
        self.br.submit()

        self.br.select_form(predicate=select_form)
        self.br.form['ctl00$cphMainContent$ucUnifiedSearch$txtFirm'] = q.encode('utf8')
        self.br.submit()

        s = BeautifulSoup(self.br.response().read())
        r = re.compile(r'^ctl\d+_cphMainContent_grOrgResults$')
        t = s.find('table', id=r)
        if not t:  # Not found
            print 'Not Found'
            firm.checked = True
            firm.save()
            return

        tr = t.findAll('tr', recursive=False)
        tr = tr[2:-1]  # Skip records-per-page header/footer and title header
        for row in tr:
            td = row.findAll('td', recursive=False)
            firm.legal_name = td[0].b.text.strip()
            firm.other_name = td[1].text.strip()
            firm.sec_number = td[2].text.strip()
            firm.address = td[3].text.strip()
            firm.checked = True
            firm.save()


if __name__ == '__main__':
    scraper = IapdScraper()
    for name in legal_names():
        print '\nq=%s' % name
        scraper.scrape(q=name)
When I run the script in my venv, however, I get the following error:
c:\Users\bal2155\venv\Scripts>scraper.py
Traceback (most recent call last):
  File "C:\Users\bal2155\venv\Scripts\scraper.py", line 26, in <module>
    django.setup()
  File "C:\Python27\lib\site-packages\django\__init__.py", line 20, in setup
    configure_logging(settings.LOGGING_CONFIG, settings.LOGGING)
  File "C:\Python27\lib\site-packages\django\conf\__init__.py", line 46, in __getattr__
    self._setup(name)
  File "C:\Python27\lib\site-packages\django\conf\__init__.py", line 42, in _setup
    self._wrapped = Settings(settings_module)
  File "C:\Python27\lib\site-packages\django\conf\__init__.py", line 98, in __init__
    % (self.SETTINGS_MODULE, e)
ImportError: Could not import settings 'settings' (Is it on sys.path? Is there an import error in the settings file?): No module named settings
I have an inkling that this has to do either with how I've set up the venv or with how I installed django. I've checked the documentation for the latter, and I'm not sure what the myproject.settings notation means. Any help here would be greatly appreciated.
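For reference, DJANGO_SETTINGS_MODULE must be a dotted, importable Python module path: myproject.settings simply means "the settings module inside the myproject package". A minimal sketch, assuming the package that actually holds settings.py is named scraper (that name is a guess based on the paths above):

import os
import sys

import django

# put the directory that CONTAINS the settings package on sys.path
sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), 'scraper/')))

# dotted module path: <package>.settings  (the package name 'scraper' is an assumption)
os.environ['DJANGO_SETTINGS_MODULE'] = 'scraper.settings'
django.setup()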

buildbot doesn't accept my MailNotifier's IEMailLookup object

A number of people in my organization have email names that differ from their Perforce usernames, so I need to create an IEmailLookup derivation that overrides getAddress to do my evil bidding:
(From my master.cfg)
class MyIEmailLookup:
    from buildbot import interfaces
    __implements__ = interfaces.IEmailLookup

    def getAddresses(self, user):
        address_dict = {"user1": "user_one@our_domain.com"}
        try:
            address = address_dict[user]
        except KeyError:
            address = user + "@our_domain.com"
        return address

maillookup = MyIEmailLookup()

from buildbot.status import mail
c['status'].append(mail.MailNotifier(....
                                      ....
                                      lookup=maillookup
                                      ))
I've tried any number of permutations, but I either get:
Traceback (most recent call last):
  File "/Library/Python/2.6/site-packages/buildbot-0.8.3p1-py2.6.egg/buildbot/scripts/runner.py", line 1071, in doCheckConfig
    ConfigLoader(configFileName=configFileName)
  File "/Library/Python/2.6/site-packages/buildbot-0.8.3p1-py2.6.egg/buildbot/scripts/checkconfig.py", line 46, in __init__
    self.loadConfig(configFile, check_synchronously_only=True)
  File "/Library/Python/2.6/site-packages/buildbot-0.8.3p1-py2.6.egg/buildbot/master.py", line 727, in loadConfig
    exec f in localDict
  File "/Users/playbuilder/buildbot/master.cfg", line 207, in <module>
    lookup=maillookup
  File "/Library/Python/2.6/site-packages/buildbot-0.8.3p1-py2.6.egg/buildbot/status/mail.py", line 293, in __init__
    assert interfaces.IEmailLookup.providedBy(lookup)
AssertionError
...or any number of other issues, depending on how I try to implement the IEmailLookup interface.
I'm using buildbot 0.8.3p1 and python 2.6.1.
I see precious few examples of how to do this, and every one of them fails in my context. What am I missing here?
I just solved this problem myself.
First you need to add (somewhere at the top of the file)
from zope.interface import implements
and then change
__implements__ = interfaces.IEmailLookup
to
if implements:
    implements(interfaces.IEmailLookup)
else:
    __implements__ = interfaces.IEmailLookup
If you want to fetch the email address from the Perforce user record, you can use this class:
# .-----------------------.
# | Perforce Email Lookup |
# `-----------------------'

from twisted.internet import defer, utils
from buildbot.util import ComparableMixin
from buildbot.interfaces import IEmailLookup
from zope.interface import implements
import os
import re


class PerforceEmailLookup(ComparableMixin):
    implements(IEmailLookup)

    compare_attrs = ["p4port", "p4user", "p4passwd", "p4bin"]

    env_vars = ["P4CLIENT", "P4PORT", "P4PASSWD", "P4USER",
                "P4CHARSET"]

    def __init__(self,
                 p4port=None,
                 p4user=None,
                 p4passwd=None,
                 p4bin='p4'):
        self.p4port = p4port
        self.p4user = p4user
        self.p4passwd = p4passwd
        self.p4bin = p4bin
        self.email_re = re.compile(r"Email:\s+(?P<email>\S+@\S+)\s*$")

    def _get_process_output(self, args):
        env = dict([(e, os.environ.get(e)) for e in self.env_vars if os.environ.get(e)])
        d = utils.getProcessOutput(self.p4bin, args, env)
        return d

    @defer.deferredGenerator
    def getAddress(self, name):
        if '@' in name:
            yield name
            return

        args = []
        if self.p4port:
            args.extend(['-p', self.p4port])
        if self.p4user:
            args.extend(['-u', self.p4user])
        if self.p4passwd:
            args.extend(['-P', self.p4passwd])
        args.extend(['user', '-o', name])

        wfd = defer.waitForDeferred(self._get_process_output(args))
        yield wfd
        result = wfd.getResult()

        for line in result.split('\n'):
            line = line.strip()
            if not line:
                continue
            m = self.email_re.match(line)
            if m:
                yield m.group('email')
                return

        yield name
Usage would look like:
c['status'].append(
    MailNotifier(
        sendToInterestedUsers=True,
        mode='failing',
        lookup=PerforceEmailLookup(
            p4port="perforce:1666",
            p4user="buildbot",
            p4passwd="buildbot")))
Try this:
from buildbot.interfaces import IEmailLookup
from buildbot.util import ComparableMixin
from zope.interface import implements

class lookup_example_email(ComparableMixin):
    implements(IEmailLookup)

    def getAddress(self, user):
        return "%s@example.com" % (user)
...
mn = MailNotifier(..., lookup=lookup_example_email(), extraRecipients=m)
Here's the piece of code I use, which works with buildbot 2.3.1 on Python 3.6.
from buildbot.interfaces import IEmailLookup
from buildbot.util import ComparableMixin
from zope.interface import implementer

@implementer(IEmailLookup)
class EmailMap(ComparableMixin):
    def getAddress(self, name):
        return f'{name}@xxxxx'
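A usage sketch for this lookup, assuming the buildbot 2.x reporters API (the from-address below is a placeholder):

from buildbot.plugins import reporters

c['services'].append(
    reporters.MailNotifier(
        fromaddr="buildbot@example.com",   # placeholder address
        sendToInterestedUsers=True,
        lookup=EmailMap()))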
