I have the following script for scraping a webform:
#!/usr/bin/env python
"""
Python script for searching firms on http://www.adviserinfo.sec.gov/IAPD/Content/Search/iapd_Search.aspx
Assumes that the input Excel file is in the same directory as this script.
"""
import os
import re
import sys
import django
import mechanize
from bs4 import BeautifulSoup
from xlrd import open_workbook
XLSX_FILE = 'Legal Names.xlsx'
IAPD_URL = 'http://www.adviserinfo.sec.gov/IAPD/Content/Search/iapd_Search.aspx'
#------------------------------------------------------------------------------------------
sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), 'scraper/')))
sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), 'scraper/scraper/')))
os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'
django.setup()
from django.core.exceptions import ObjectDoesNotExist
from custom_scraper.models import *
#------------------------------------------------------------------------------------------
def legal_names(file=XLSX_FILE):
    '''
    Yield firm legal names from column 1 of the first sheet of *file*.

    Row 0 is assumed to be a header and is skipped, as is any numeric
    (non-text) cell.
    '''
    wb = open_workbook(file)
    s = wb.sheets()[0]
    col = 1
    for row in range(1, s.nrows):  # start at 1: row 0 is the header
        name = s.cell(row, col).value
        # xlrd returns every numeric cell as a float (never int), so the
        # original `type(name) == int` test could never match; skip any
        # numeric value so only text names are yielded.
        if isinstance(name, (int, float)):
            continue
        yield name
class IapdScraper(object):
    """Scrapes firm search results from the SEC IAPD web form via mechanize,
    persisting them in IapdFirm Django model records."""

    def __init__(self):
        # Browser with a desktop User-Agent header; sent with every request.
        self.br = mechanize.Browser()
        self.br.addheaders = [('User-agent',
                               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.7')]

    def scrape(self, q):
        '''
        Search for a Firm in the IAPD database by name

        q -- the firm name to query; also used as the IapdFirm.query_name key.
        '''
        def select_form(form):
            # The IAPD page is a single ASP.NET form with id 'aspnetForm'.
            return form.attrs.get('id', None) == 'aspnetForm'

        # Reuse an existing query record if present; otherwise create one.
        try:
            firm = IapdFirm.objects.get(query_name=q)
        except ObjectDoesNotExist:
            firm = IapdFirm(query_name=q)
            firm.save()
        else:
            if firm.checked:  # already scraped on a previous run
                return

        # Two submissions: first pick the "search by organization" radio
        # button, then submit the actual firm-name query.
        self.br.open(IAPD_URL)
        self.br.select_form(predicate=select_form)
        self.br.form['ctl00$cphMainContent$ucUnifiedSearch$rdoSearchBy'] = ['rdoOrg']
        self.br.submit()
        self.br.select_form(predicate=select_form)
        self.br.form['ctl00$cphMainContent$ucUnifiedSearch$txtFirm'] = q.encode('utf8')
        self.br.submit()

        s = BeautifulSoup(self.br.response().read())
        # The results table id carries an ASP.NET-generated control prefix,
        # e.g. ctl00_cphMainContent_grOrgResults.
        r = re.compile(r'^ctl\d+_cphMainContent_grOrgResults$')
        t = s.find('table', id=r)
        if not t:  # Not found
            print 'Not Found'
            firm.checked = True
            firm.save()
            return

        tr = t.findAll('tr', recursive=False)
        tr = tr[2:-1]  # Skip records-per-page header/footer and title header
        for row in tr:
            # NOTE(review): each iteration overwrites the same `firm` record,
            # so only the LAST result row ends up persisted -- confirm this
            # is the intended behavior for multi-row result sets.
            td = row.findAll('td', recursive=False)
            firm.legal_name = td[0].b.text.strip()
            firm.other_name = td[1].text.strip()
            firm.sec_number = td[2].text.strip()
            firm.address = td[3].text.strip()
            firm.checked = True
            firm.save()
if __name__ == '__main__':
scraper = IapdScraper()
for name in legal_names():
print '\nq=%s' % name
scraper.scrape(q=name)
When I run the script in my venv, however, I get the following error:
c:\Users\bal2155\venv\Scripts>scraper.py
Traceback (most recent call last):
File "C:\Users\bal2155\venv\Scripts\scraper.py", line 26, in <module>
django.setup()
File "C:\Python27\lib\site-packages\django\__init__.py", line 20, in setup
configure_logging(settings.LOGGING_CONFIG, settings.LOGGING)
File "C:\Python27\lib\site-packages\django\conf\__init__.py", line 46, in __ge
tattr__
self._setup(name)
File "C:\Python27\lib\site-packages\django\conf\__init__.py", line 42, in _set
up
self._wrapped = Settings(settings_module)
File "C:\Python27\lib\site-packages\django\conf\__init__.py", line 98, in __in
it__
% (self.SETTINGS_MODULE, e)
ImportError: Could not import settings 'settings' (Is it on sys.path? Is there a
n import error in the settings file?): No module named settings
I have an inkling that this has to do either with how I've set up the venv or with how I installed Django. I've checked the documentation for the latter, and I'm not sure what the myproject.settings notation means. Any help here would be greatly appreciated.
Related
First of all, I'm getting too many Python errors right now, and this situation has affected my motivation quite negatively. I'm trying to run an expert system that I found on GitHub, but I get the following error. I looked at other solutions, but they didn't work for me either.
Drug Store Expert System:
https://github.com/enzoftware/pyswipl_drugstore
Code:
from flask import Flask, render_template, flash, request
from wtforms import Form, TextField, TextAreaField, validators, StringField, SubmitField
from pyswip import Prolog
import os
import time
# App config.
DEBUG = True
app = Flask(__name__)
# from_object(__name__) pulls every UPPERCASE name in this module (DEBUG)
# into the Flask config.
app.config.from_object(__name__)
app.config['SECRET_KEY'] = '7d441f27d441f27567d441f2b6176a'  # needed by flash()/sessions
class ReusableForm(Form):
    # Single required text input for the user's name.
    name = TextField('Name:', validators=[validators.required()])
# NOTE: the pasted source had '#app.route(...)' -- a garbled '@'. Without the
# decorator Flask never registers this view, so restore it.
@app.route("/", methods=['GET', 'POST'])
def hello():
    """Render the symptom form; on POST, run the Prolog expert system and
    flash the deduced disease."""
    form = ReusableForm(request.form)
    if request.method == 'POST':
        name = request.form['name']
        # Checkbox inputs arrive as 'on' only when ticked.
        fiebre = request.form.get('fiebre') == 'on'
        nausea = request.form.get('nausea') == 'on'
        diarrea = request.form.get('diarrea') == 'on'
        headache = request.form.get('headache') == 'on'
        print(fiebre, nausea, diarrea, headache)
        # The Prolog program writes its diagnosis to file.txt.
        os.system('swipl -q -t "program" console.pl')
        if form.validate():
            # `with` guarantees the file handle is closed (original leaked it).
            with open("file.txt", "r") as f:
                disease = f.read()
            print(disease)
            flash('Hola ' + name + ', por tus sintomas podemos deducir que tienes ' + disease)
        else:
            flash('Error: Debes ingresar tu nombre. ')
    return render_template('hello.html', form=form)
if __name__ == "__main__":
    # Development server only; use a proper WSGI server in production.
    app.run()
Error:
ERROR: The system was unable to find the specified registry key or value.
Traceback (most recent call last):
File "c:/Users/BAUM-PC/Desktop/Yeni klasör/pyswipl_drugstore-master/main.py", line 3, in <module>
from pyswip import Prolog
File "c:\Users\BAUM-PC\Desktop\Yeni klasör\pyswipl_drugstore-master\pyswip\__init__.py", line 29, in <module>
from pyswip.prolog import Prolog
File "c:\Users\BAUM-PC\Desktop\Yeni klasör\pyswipl_drugstore-master\pyswip\prolog.py", line 28, in <module>
from pyswip.core import *
File "c:\Users\BAUM-PC\Desktop\Yeni klasör\pyswipl_drugstore-master\pyswip\core.py", line 568, in <module>
(_path, SWI_HOME_DIR) = _findSwipl()
File "c:\Users\BAUM-PC\Desktop\Yeni klasör\pyswipl_drugstore-master\pyswip\core.py", line 411, in _findSwipl
(path, swiHome) = _findSwiplWin()
File "c:\Users\BAUM-PC\Desktop\Yeni klasör\pyswipl_drugstore-master\pyswip\core.py", line 208, in _findSwiplWin
match = pattern.match(ret[-1])
IndexError: list index out of range
core.py
(related section)
try:
cmd = Popen(['reg', 'query',
r'HKEY_LOCAL_MACHINE\Software\SWI\Prolog',
'/v', 'home'], stdout=PIPE)
ret = cmd.communicate()
# Result is like:
# ! REG.EXE VERSION 3.0
#
# HKEY_LOCAL_MACHINE\Software\SWI\Prolog
# home REG_SZ C:\Program Files\pl
# (Note: spaces may be \t or spaces in the output)
ret = ret[0].splitlines()
ret = [line.decode("utf-8") for line in ret if len(line) > 0]
pattern = re.compile('[^h]*home[^R]*REG_SZ( |\t)*(.*)$')
match = pattern.match(ret[-1])
if match is not None:
path = match.group(2)
paths = [os.path.join(path, 'bin', dllName)
for dllName in dllNames]
for path in paths:
if os.path.exists(path):
return (path, None)
Windows Solution:
Make sure your Python and SWI Prolog are both 32- or 64-bit.
Go to line 180 of core.py and change the line to this:
paths = [os.path.join(programFiles, r'swipl\bin', dllName)
For first error:
As per official documentation of Pyswip you must take care of version of python and SWI-Prolog.
Make sure the SWI-Prolog architecture is the same as the Python architecture. If you are using a 64bit build of Python, use a 64bit build of SWI-Prolog.
For second error:
When you access ret[-1] in match = pattern.match(ret[-1]), the list ret is empty (the registry query returned no lines), so index -1 is out of range.
Below is my Python code. I read the keys from a CSV file and delete them in the database. It runs fine for a while and then throws this timeout error. I don't see any GC issue, and the health of the node is fine.
Traceback (most recent call last):
File "/Users/XXX/Downloads/XXX/XXX", line 65, in <module>
parse_file(datafile)
File "/Users/XXX/Downloads/XXX/XXX", line 49, in parse_file
session = cluster.connect('XXX')
File "cassandra/cluster.py", line 1193, in cassandra.cluster.Cluster.connect (cassandra/cluster.c:17796)
File "cassandra/cluster.py", line 1240, in cassandra.cluster.Cluster._new_session (cassandra/cluster.c:18952)
File "cassandra/cluster.py", line 1980, in cassandra.cluster.Session.__init__ (cassandra/cluster.c:35191)
cassandra.cluster.NoHostAvailable: ("Unable to connect to any servers using keyspace 'qualys_ioc'", ['127.0.0.1'])
Python Code:
import argparse
import sys
import itertools
import codecs
import uuid
import os
import subprocess
try:
import cassandra
import cassandra.concurrent
except ImportError:
sys.exit('Python Cassandra driver not installed. You might try \"pip install cassandra-driver\".')
from cassandra.cluster import Cluster, ResultSet, Session
from cassandra.policies import DCAwareRoundRobinPolicy
from cassandra.auth import PlainTextAuthProvider
from cassandra.cluster import ConsistencyLevel
from cassandra import ReadTimeout
# Path of the cqlsh COPY export; remove any stale copy before regenerating it.
datafile = "/Users/XXX/adf.csv"
if os.path.exists(datafile):
    os.remove(datafile)
def dumptableascsv():
    # Export XXX.agent_delta_fragment to /Users/XXX/adf.csv via cqlsh COPY.
    # NOTE(review): host, credentials and paths are hard-coded -- consider
    # parameterizing before wider use.
    os.system(
        "sh /Users/XXX/Documents/dse-5.0.14/bin/cqlsh 127.0.0.1 9042 -u cassandra -p cassandra -e \" COPY XXX.agent_delta_fragment(agent_id,delta_id ,last_fragment_id ,processed) TO \'/Users/XXX/adf.csv\' WITH HEADER = true;\"\n"
        " ")
#print datafile
def parse_file(datafile):
global fields
data = []
with open(datafile, "rb") as f:
header = f.readline().split(",")
# Loop through remaining lines in file object f
for line in f:
fields = line.split(",") # Split line into list
#print fields[3]
if fields[3]:
print "connect"
print fields[0],fields[1],fields[2],fields[3]
auth_provider = PlainTextAuthProvider(username='cassandra', password='cassandra')
cluster = Cluster(['127.0.0.1'],
load_balancing_policy=DCAwareRoundRobinPolicy(local_dc='Cassandra'),
port=9042, auth_provider=auth_provider, connect_timeout=10000,)
session = cluster.connect('XXX')
#session = cluster.connect('XXX')
# session.execute("select * from XXX.agent_delta_fragment LIMIT 1")
#rows = session.execute('select agent_id from XXX.agent_delta_fragment LIMIT 1')
#for row in rows:
# print row.agent_id
#batch = BatchStatement("DELETE FROM XXX.agent_delta_fragment_detail_test WHERE agent_id=%s and delta_id=%s and fragment_id=%s", (uuid.UUID(fields[0]), uuid.UUID(fields[1]), int(fields[3])))
session.execute("DELETE FROM XXX.agent_delta_fragment_detail WHERE agent_id=%s and delta_id=%s and fragment_id=%s", (uuid.UUID(fields[0]), uuid.UUID(fields[1]), int(fields[2])), timeout=1000000)
#session.execute(batch)
else:
print fields[3]
print "connect-False"
# print fields[3]
# Export the table to CSV, then purge the detail rows it references.
dumptableascsv()
parse_file(datafile)
I have lightly edited the WebImporter class from this site.
import imp
import sys
import http.client
from urllib.parse import urlparse
def register_domain(name, url):
    """Map top-level pseudo-package *name* to base *url* for web imports."""
    WebImporter.registered_domains[name] = url
class WebImporter:
    """PEP 302 finder/loader that imports modules over HTTP.

    Top-level names registered via register_domain() become fake package
    modules; dotted names under them (e.g. 'dev.foo') are fetched as
    <base-url>/foo.py and exec'd into a fresh module.
    """

    # Maps a registered top-level name to its base URL.
    registered_domains = {}

    def find_module(self, fullname, path=None):
        # PEP 302 finder: return self (acting as loader) or None.
        print('FIND', fullname, path)
        if fullname in self.registered_domains:
            return self
        if fullname.rsplit('.')[0] not in self.registered_domains:
            return None
        # Probe with HEAD so we only claim modules that actually exist.
        try:
            r = self._do_request(fullname, method="HEAD")
        except ValueError:
            # _get_host_and_path raises ValueError for undotted names.
            return None
        else:
            r.close()
            if r.status == 200:
                return self
        return None

    def load_module(self, fullname):
        if fullname in sys.modules:
            return sys.modules[fullname]
        mod = imp.new_module(fullname)
        mod.__loader__ = self
        mod.__name__ = fullname
        # NOTE(review): per PEP 302, __package__ should be a string and
        # __path__ a list of strings; these two look swapped/wrong -- confirm.
        mod.__package__ = ['/'.join(fullname.split('.'))]
        mod.__path__ = '/'.join(fullname.split('.'))
        sys.modules[fullname] = mod
        if fullname not in self.registered_domains:
            # Real module: fetch the source and exec it into the module dict.
            # NOTE(review): imports inside the fetched source are resolved
            # normally, so `from foo import foo` in a fetched module fails
            # unless written as `from <domain>.foo import foo` -- this is the
            # failure shown in the question's traceback.
            url = self._get_host_and_path(fullname).geturl()
            mod.__file__ = url
            r = self._do_request(fullname)
            code = r.read()
            assert r.status == 200
            exec(code, mod.__dict__)
        else:
            # Registered top-level name: acts as a fake package placeholder.
            mod.__file__ = "[fake module %r]" % fullname
            mod.__path__ = []
        return mod

    def _do_request(self, fullname, method="GET"):
        # Issue an HTTP request for the module's source; caller reads/closes.
        url = self._get_host_and_path(fullname)
        c = http.client.HTTPConnection(url.netloc)
        c.request(method, url.path)
        return c.getresponse()

    def _get_host_and_path(self, fullname):
        # 'dev.foo' -> parsed URL of <base url of 'dev'>/foo.py.
        # Raises ValueError for undotted names (single rsplit element);
        # find_module relies on that to reject bare registered names.
        tld, rest = fullname.rsplit('.', 1)
        path = "/%s.py" % rest.replace('.', '/')
        url = self.registered_domains[fullname.split('.')[0]] + path
        url = urlparse(url)
        return url
sys.meta_path.append(WebImporter())
I have two modules on my web server on a local network:
foo.py (file at: http://192.168.122.2:80/foo.py):
class foo:
    """Demo class served over HTTP; prints 'FOO' when instantiated."""
    def __init__(self):
        print('FOO')
bar.py (file at: http://192.168.122.2:80/bar.py):
from foo import foo
class bar(foo):
    """Demo subclass; chains to foo.__init__ and then prints 'BAR'."""
    def __init__(self):
        foo.__init__(self)
        print('BAR')
when I run:
import webimport
webimport.register_domain('dev', 'http://192.168.122.2:80')
from dev.foo import foo
if __name__ == '__main__':
foo()
it imports and runs the code
but when I want to import and run the bar:
import webimport
webimport.register_domain('dev', 'http://192.168.122.2:80')
from dev.bar import bar
if __name__ == '__main__':
bar()
the output is:
FIND foo None
Traceback (most recent call last):
File "test.py", line 4, in <module>
from dev.bar import bar
File "/home/user/tests/webimport/webimport.py", line 44, in load_module
exec(code, mod.__dict__)
File "<string>", line 1, in <module>
ImportError: No module named 'foo'
where is the problem?
You need to add path of foo module to PYTHONPATH. If you're using bash (on a Mac or GNU/Linux distro), add this to your ~/.bashrc
export PYTHONPATH="${PYTHONPATH}:/my/other/path"
EDITED
try to change in bar.py
from foo import foo
to
from dev.foo import foo
I need to create some files from a template. I'm using psycopg2 to fetch from a database. Then I loop through. Now I need to write to file.
Thanks!
import sys
from HTMLParser import HTMLParser
from xml.etree import cElementTree as etree
import psycopg2
import psycopg2.extras
class LinksParser(HTMLParser):
    """HTMLParser that rebuilds the parsed document as an ElementTree.

    Parser events are forwarded to a cElementTree TreeBuilder; close()
    returns the root Element of the finished tree.
    """
    def __init__(self):
        HTMLParser.__init__(self)
        self.tb = etree.TreeBuilder()

    def handle_starttag(self, tag, attributes):
        self.tb.start(tag, dict(attributes))

    def handle_endtag(self, tag):
        self.tb.end(tag)

    def handle_data(self, data):
        self.tb.data(data)

    def close(self):
        # Flush HTMLParser's internal buffers, then return the tree root.
        HTMLParser.close(self)
        return self.tb.close()
conn = psycopg2.connect(dbname="**", user="**", password="**", host="/tmp/", port="**")
# A cursor was never created in the original (cur was undefined).  DictCursor
# lets rows be indexed by column name, which row['title'] below requires.
cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
cur.execute("SELECT * FROM landingpagedata;")
rows = cur.fetchall()

template = 'template.html'
parser = LinksParser()
# parser.feed(open('landingIndex.html').read()) #for testing
# root = parser.close()
for row in rows:
    parser.feed(open(template).read())
    root = parser.close()
    # title
    title = root.find(".//title")
    title.text = str(row['title'])
    # An Element has no .write() (the AttributeError in the traceback);
    # wrap the root in an ElementTree and serialize it to the file.
    f = open(row['page_name'], 'w')
    etree.ElementTree(root).write(f)
    f.close()
    parser = LinksParser()  # fresh parser for the next page
The error is:
Traceback (most recent call last):
File "/Users/Morgan13/Programming/LandingPageBuilder/landingPages/landingBuilderTest.py", line 87, in <module>
main()
File "/Users/Morgan13/Programming/LandingPageBuilder/landingPages/landingBuilderTest.py", line 75, in main
root.write('page_name')
AttributeError: write
Oh, and I'm using open('page', 'w') because these pages already exist.
I think you want f.write(root), not root.write(f). (Assuming that str(root) gives you the HTML you want to write out.)
A number of people in my organization have different email names from perforce names, so I need to create an IEmailLookup derivation that overrides getAddress to do my evil bidding:
(From my master.cfg)
class MyIEmailLookup:
    """Email lookup for MailNotifier: maps perforce usernames to addresses."""
    from buildbot import interfaces
    __implements__ = interfaces.IEmailLookup

    # IEmailLookup requires getAddress(self, user); the original defined
    # getAddresses without self, so the interface assert failed.
    def getAddress(self, user):
        # The original `{"user1", "user_one#..."}` was a SET literal, not a
        # dict -- use key: value.  '#' was a garbled '@' in the paste.
        address_dict = {"user1": "user_one@our_domain.com"}
        try:
            address = address_dict[user]
        except KeyError:
            # Default: treat the username as the local part of the address.
            address = user + "@our_domain.com"
        return address
maillookup = MyIEmailLookup()
from buildbot.status import mail
c['status'].append(mail.MailNotifier(....
....
lookup=maillookup
))
I've tried any number of permutations, but I either get:
Traceback (most recent call last):
File "/Library/Python/2.6/site-packages/buildbot-0.8.3p1-py2.6.egg/buildbot/scripts/runner.py", line 1071, in doCheckConfig
ConfigLoader(configFileName=configFileName)
File "/Library/Python/2.6/site-packages/buildbot-0.8.3p1-py2.6.egg/buildbot/scripts/checkconfig.py", line 46, in __init__
self.loadConfig(configFile, check_synchronously_only=True)
File "/Library/Python/2.6/site-packages/buildbot-0.8.3p1-py2.6.egg/buildbot/master.py", line 727, in loadConfig
exec f in localDict
File "/Users/playbuilder/buildbot/master.cfg", line 207, in <module>
lookup=maillookup
File "/Library/Python/2.6/site-packages/buildbot-0.8.3p1-py2.6.egg/buildbot/status/mail.py", line 293, in __init__
assert interfaces.IEmailLookup.providedBy(lookup)
AssertionError
...or any other number of issues, dependant upon how I try to implement the IEmailLookup interface.
I'm using buildbot 0.8.3p1 and python 2.6.1.
I see precious few examples of how to do this, and every one of them fails in my context. What am I missing here?
I just solved this problem myself.
First you need to add (somewhere at the top of the file)
from zope.interface import implements
and then change
__implements__ = interfaces.IEmailLookup
to
if implements:
implements( interfaces.IEmailLookup )
else:
__implements__ = interfaces.IEmailLookup
If you want to fetch email from perforce user, you can use this class:
# .-----------------------.
# | Perforce Email Lookup |
# `-----------------------'
from twisted.internet import defer, utils
from buildbot.util import ComparableMixin
from buildbot.interfaces import IEmailLookup
from zope.interface import implements
import os
import re
class PerforceEmailLookup(ComparableMixin):
    """IEmailLookup that resolves a perforce username to the Email: field
    of `p4 user -o <name>` output, falling back to the raw name."""
    implements(IEmailLookup)

    compare_attrs = ["p4port", "p4user", "p4passwd", "p4bin"]
    # Only these environment variables are forwarded to the p4 subprocess.
    env_vars = ["P4CLIENT", "P4PORT", "P4PASSWD", "P4USER",
                "P4CHARSET"]

    def __init__(self,
                 p4port=None,
                 p4user=None,
                 p4passwd=None,
                 p4bin='p4'):
        self.p4port = p4port
        self.p4user = p4user
        self.p4passwd = p4passwd
        self.p4bin = p4bin
        # '#' in the pasted source was a garbled '@': match "Email: user@host".
        self.email_re = re.compile(r"Email:\s+(?P<email>\S+@\S+)\s*$")

    def _get_process_output(self, args):
        # Run p4 with only the P4* environment variables that are set.
        env = dict([(e, os.environ.get(e)) for e in self.env_vars if os.environ.get(e)])
        d = utils.getProcessOutput(self.p4bin, args, env)
        return d

    # '#defer...' in the paste was a garbled '@' -- the decorator is required
    # for the waitForDeferred/yield style below to work.
    @defer.deferredGenerator
    def getAddress(self, name):
        # Already an email address?  Return it unchanged.
        if '@' in name:
            yield name
            return
        args = []
        if self.p4port:
            args.extend(['-p', self.p4port])
        if self.p4user:
            args.extend(['-u', self.p4user])
        if self.p4passwd:
            args.extend(['-P', self.p4passwd])
        args.extend(['user', '-o', name])
        wfd = defer.waitForDeferred(self._get_process_output(args))
        yield wfd
        result = wfd.getResult()
        for line in result.split('\n'):
            line = line.strip()
            if not line:
                continue
            m = self.email_re.match(line)
            if m:
                yield m.group('email')
                return
        # No Email: line found -- fall back to the raw username.
        yield name
usage would look like:
c['status'].append(
MailNotifier(
sendToInterestedUsers = True,
mode = 'failing',
lookup = PerforceEmailLookup(
p4port = "perforce:1666",
p4user = "buildbot",
p4passwd = "buildbot")))
Try this:
from buildbot.interfaces import IEmailLookup
from buildbot.util import ComparableMixin
from zope.interface import implements
class lookup_example_email(ComparableMixin):
    """IEmailLookup that appends a fixed domain to the username."""
    implements(IEmailLookup)

    def getAddress(self, user):
        # '#' in the pasted source was a garbled '@'.
        return "%s@example.com" % (user)
...
mn = MailNotifier(..., lookup=lookup_example_email(), extraRecipients=m)
Here's the piece of code I use which works with buildbot 2.3.1 in python3.6.
from buildbot.interfaces import IEmailLookup
from buildbot.util import ComparableMixin
from zope.interface import implementer
# '#implementer' in the pasted source was a garbled '@implementer'; without
# the decorator the class never declares IEmailLookup and MailNotifier's
# interface assertion fails.
@implementer(IEmailLookup)
class EmailMap(ComparableMixin):
    """Email lookup for buildbot 2.3.1 on Python 3.6: username -> fixed domain."""
    def getAddress(self, name):
        # '#' in the original f-string was a garbled '@'.
        return f'{name}@xxxxx'