I have about 100 spiders on a server. Every morning all of the spiders start scraping and writing to their log files. Sometimes a couple of them give me an error. When a spider fails I have to go to the server and read its log file, but I would rather receive the logs by email.
I have already set up a dynamic mail sender as follows:
class FirstBotSpiderMiddleware:
def __init__(self, stats):
self.stats = stats
@classmethod
def from_crawler(cls, crawler):
s = cls(crawler.stats)
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
return s
def process_spider_input(self, response, spider):
return None
def process_spider_output(self, response, result, spider):
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
pass
def process_start_requests(self, start_requests, spider):
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
def spider_closed(self, spider,reason):
error_count = self.stats.get_value('log_count/ERROR')
counts = self.stats.get_value('item_scraped_count')
count_403 = self.stats.get_value('downloader/response_status_count/403')
count_404 = self.stats.get_value('downloader/response_status_count/404')
robots_404 = self.stats.get_value('robotstxt/response_status_count/404')
robots_403 = self.stats.get_value('robotstxt/response_status_count/403')
duplicate_count = self.stats.get_value('item_dropped_count')
#I want to read all logs here
content = "some stat string"
self.mailSender(spider.name,content,logs)
def mailSender(self,spider,content,logs):
send_mail(
"Scrapy "+spider+" done",
content,
djsettings.EMAIL_HOST_USER,
['xxx@xxx.com'],
)
I couldn't figure out how to read the error log dynamically at spider_closed in the middleware. Do you have any suggestions?
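One thing I have considered (a minimal sketch, assuming each spider writes its own log file and that its path is available through Scrapy's LOG_FILE setting; adjust if you configure logging differently) is to read that file right where the "#I want to read all logs here" comment is:
import os
# Sketch for the body of spider_closed(), at the "#I want to read all logs here" spot.
log_path = spider.settings.get('LOG_FILE')  # assumption: one log file per spider
logs = ''
if log_path and os.path.exists(log_path):
    with open(log_path, 'r', errors='replace') as f:
        # Keep only error-looking lines so the mail stays small.
        logs = ''.join(line for line in f
                       if ' ERROR ' in line or 'Traceback' in line)
content = "some stat string"
self.mailSender(spider.name, content, logs)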
I have implemented a similar approach in my own web-scraping module.
Below is the implementation; you can use it as a reference.
import gzip
import datetime
from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured
from scrapy.utils.serialize import ScrapyJSONEncoder
from collections import defaultdict
try:
from cStringIO import cStringIO as StringIO
except ImportError:
from io import StringIO
def format_size(size):
for x in ['bytes', 'KB', 'MB', 'GB']:
if size < 1024.0:
return "%3.1f %s" % (size, x)
size /= 1024.0
class GzipCompressor(gzip.GzipFile):
extension = '.gz'
mimetype = 'application/gzip'
def __init__(self):
super(GzipCompressor, self).__init__(
fileobj=PlainCompressor(), mode='w')
self.read = self.fileobj.read
class PlainCompressor(StringIO):
extension = ''
mimetype = 'text/plain'
def read(self, *args, **kwargs):
self.seek(0)
return StringIO.read(self, *args, **kwargs)
@property
def size(self):
return len(self.getvalue())
class StatusMailer(object):
def __init__(self, recipients, mail, compressor, crawler):
self.recipients = recipients
self.mail = mail
self.encoder = ScrapyJSONEncoder()
self.files = defaultdict(compressor)
self.num_items = 0
self.num_errors = 0
self.start_time = datetime.datetime.now()
@classmethod
def from_crawler(cls, crawler):
recipients = crawler.settings.getlist('STATUSMAILER_RECIPIENTS')
compression = crawler.settings.get('STATUSMAILER_COMPRESSION')
if not compression:
compressor = PlainCompressor
elif compression.lower().startswith('gz'):
compressor = GzipCompressor
else:
raise NotConfigured
if not recipients:
raise NotConfigured
mail = MailSender.from_settings(crawler.settings)
instance = cls(recipients, mail, compressor, crawler)
crawler.signals.connect(instance.item_scraped,
signal=signals.item_scraped)
crawler.signals.connect(instance.spider_error,
signal=signals.spider_error)
crawler.signals.connect(instance.spider_closed,
signal=signals.spider_closed)
return instance
def item_scraped(self, item, response, spider):
self.num_items += 1
self.files[spider.name + '.log'].write(str(self.num_items)+ " " + str(response.url) + '\n')
self.files[spider.name +
'-items.json'].write(self.encoder.encode(item))
def spider_error(self, failure, response, spider):
self.files[spider.name + '.log'].write(failure.getTraceback())
self.num_errors += 1
def spider_closed(self, spider, reason):
files = []
for name, compressed in self.files.items():
files.append((name + compressed.extension,
compressed.mimetype, compressed))
try:
size = self.files[spider.name + '-items.json'].size
except KeyError:
size = 0
body = '''Crawl statistics:
- Spider name: {0}
- Spider started at: {1}
- Spider finished at: {2}
- Number of items scraped: {3}
- Number of errors: {4}
- Size of scraped items: {5}'''.format(
spider.name,
self.start_time,
datetime.datetime.now(),
self.num_items,
self.num_errors,
format_size(size)
)
return self.mail.send(
to=self.recipients,
subject='Crawler for %s: %s' % (spider.name, reason),
body=body,
attachs=files
)
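To enable it, register the class as an extension and point it at your recipients. The module path below is an assumption; use whichever package the class actually lives in. MailSender.from_settings() picks up the standard MAIL_* settings.
# settings.py -- 'your_project.extensions' is a placeholder for wherever StatusMailer lives
EXTENSIONS = {
    'your_project.extensions.StatusMailer': 80,
}
STATUSMAILER_RECIPIENTS = ['you@example.com']  # example address
STATUSMAILER_COMPRESSION = 'gzip'              # leave unset for plain-text attachments
# MailSender.from_settings() reads the usual Scrapy mail settings:
MAIL_HOST = 'smtp.example.com'
MAIL_PORT = 587
MAIL_USER = ''
MAIL_PASS = ''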
Related
I'm trying to edit this project in Python to build an HP iLO exporter for Prometheus. So far I have read a few articles here on Stack Overflow and tried to implement some functionality. I eventually came up with a partially working script, but the hostname does not change after the first request. Is there a way to dump the collector?
I have tried it with try/except, but it just does not work.
The goal is to use curl like this:
curl localhost:9116/metrics?hostname=ip
And what will happen if there are 10 requests at the same time with different hostnames? Should it create some kind of queue?
Can someone help me? Thanks.
Original project: https://github.com/JackWindows/ilo-exporter
My code:
#!/usr/bin/env python
import collections
import os
import time
import flask
import redfish
import waitress
from flask import Flask
from prometheus_client import make_wsgi_app
from prometheus_client.core import GaugeMetricFamily, REGISTRY
from werkzeug.middleware.dispatcher import DispatcherMiddleware
from flask import request
from time import sleep
from flask import Flask, Response, request
import traceback
from werkzeug.wsgi import ClosingIterator
class AfterResponse:
def __init__(self, app=None):
self.callbacks = []
if app:
self.init_app(app)
def __call__(self, callback):
self.callbacks.append(callback)
return callback
def init_app(self, app):
# install extension
app.after_response = self
# install middleware
app.wsgi_app = AfterResponseMiddleware(app.wsgi_app, self)
def flush(self):
for fn in self.callbacks:
try:
fn()
except Exception:
traceback.print_exc()
class AfterResponseMiddleware:
def __init__(self, application, after_response_ext):
self.application = application
self.after_response_ext = after_response_ext
def __call__(self, environ, after_response):
iterator = self.application(environ, after_response)
try:
return ClosingIterator(iterator, [self.after_response_ext.flush])
except Exception:
traceback.print_exc()
return iterator
class ILOCollector(object):
def __init__(self, hostname: str, port: int = 443, user: str = 'admin', password: str = 'password') -> None:
self.ilo = redfish.LegacyRestClient(base_url=hostname, username=user, password=password)
self.ilo.login()
system = self.ilo.get('/redfish/v1/Systems/1/').obj
self.label_names = ('hostname', 'product_name', 'sn')
self.label_values = (hostname, system.Model, system.SerialNumber.strip())
def collect(self):
embedded_media = self.ilo.get('/redfish/v1/Managers/1/EmbeddedMedia/').obj
smart_storage = self.ilo.get('/redfish/v1/Systems/1/SmartStorage/').obj
thermal = self.ilo.get('/redfish/v1/Chassis/1/Thermal/').obj
power = self.ilo.get('/redfish/v1/Chassis/1/Power/').obj
g = GaugeMetricFamily('hpilo_health',
'iLO health status, -1: Unknown, 0: OK, 1: Degraded, 2: Failed.',
labels=self.label_names + ('component',))
def status_to_code(status: str) -> int:
status = status.lower()
ret = -1
if status == 'ok':
ret = 0
elif status == 'warning':
ret = 1
elif status == 'failed':
ret = 2
return ret
g.add_metric(self.label_values + ('embedded_media',), status_to_code(embedded_media.Controller.Status.Health))
g.add_metric(self.label_values + ('smart_storage',), status_to_code(smart_storage.Status.Health))
for fan in thermal.Fans:
g.add_metric(self.label_values + (fan.FanName,), status_to_code(fan.Status.Health))
yield g
g = GaugeMetricFamily('hpilo_fan_speed', 'Fan speed in percentage.',
labels=self.label_names + ('fan',), unit='percentage')
for fan in thermal.Fans:
g.add_metric(self.label_values + (fan.FanName,), fan.CurrentReading)
yield g
sensors_by_unit = collections.defaultdict(list)
for sensor in thermal.Temperatures:
if sensor.Status.State.lower() != 'enabled':
continue
reading = sensor.CurrentReading
unit = sensor.Units
sensors_by_unit[unit].append((sensor.Name, reading))
for unit in sensors_by_unit:
g = GaugeMetricFamily('hpilo_temperature', 'Temperature sensors reading.',
labels=self.label_names + ('sensor',), unit=unit.lower())
for sensor_name, sensor_reading in sensors_by_unit[unit]:
g.add_metric(self.label_values + (sensor_name,), sensor_reading)
yield g
g = GaugeMetricFamily('hpilo_power_current', 'Current power consumption in Watts.', labels=self.label_names,
unit='watts')
g.add_metric(self.label_values, power.PowerConsumedWatts)
yield g
label_values = self.label_values + (str(power.PowerMetrics.IntervalInMin),)
g = GaugeMetricFamily('hpilo_power_average', 'Average power consumption in Watts.',
labels=self.label_names + ('IntervalInMin',), unit='watts')
g.add_metric(label_values, power.PowerMetrics.AverageConsumedWatts)
yield g
g = GaugeMetricFamily('hpilo_power_min', 'Min power consumption in Watts.',
labels=self.label_names + ('IntervalInMin',), unit='watts')
g.add_metric(label_values, power.PowerMetrics.MinConsumedWatts)
yield g
g = GaugeMetricFamily('hpilo_power_max', 'Max power consumption in Watts.',
labels=self.label_names + ('IntervalInMin',), unit='watts')
g.add_metric(label_values, power.PowerMetrics.MaxConsumedWatts)
yield g
# Create Flask app
app = Flask('iLO Exporter')
@app.route('/')
def root():
return '''<html>
<head><title>iLO Exporter</title></head>
<body>
<h1>iLO Exporter</h1>
<p><a href='/metrics'>Metrics</a></p>
</body>
</html>'''
AfterResponse(app)
@app.after_response
def say_hi():
print("hi")
#app.route("/metrics")
def home():
try:
REGISTRY.unregister(collector)
except:
print("An exception occurred")
pass
port = int(os.getenv('ILO_PORT', 443))
user = os.getenv('ILO_USER', 'admin')
password = os.getenv('ILO_PASSWORD', 'password')
hostname = request.args.get('hostname')
app.wsgi_app = DispatcherMiddleware(app.wsgi_app, {
'/metrics': make_wsgi_app()
})
collector = ILOCollector(hostname, port, user, password)
REGISTRY.register(collector)
if __name__ == '__main__':
exporter_port = int(os.getenv('LISTEN_PORT', 9116))
waitress.serve(app, host='0.0.0.0', port=exporter_port)
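One idea I'm toying with (a sketch of my own, not how the original project does it) is to stop registering the collector in the global REGISTRY and instead build a fresh CollectorRegistry inside the /metrics view. That way the hostname comes from the current request every time, and 10 simultaneous requests with different hostnames each get their own registry instead of fighting over a shared one:
import os
from flask import Flask, Response, request
from prometheus_client import CollectorRegistry, generate_latest, CONTENT_TYPE_LATEST
app = Flask('iLO Exporter')
@app.route('/metrics')
def metrics():
    hostname = request.args.get('hostname')
    if not hostname:
        return Response('missing ?hostname= parameter', status=400)
    port = int(os.getenv('ILO_PORT', 443))
    user = os.getenv('ILO_USER', 'admin')
    password = os.getenv('ILO_PASSWORD', 'password')
    # A per-request registry: nothing global to unregister, nothing shared
    # between concurrent requests.
    registry = CollectorRegistry()
    registry.register(ILOCollector(hostname, port, user, password))
    return Response(generate_latest(registry), mimetype=CONTENT_TYPE_LATEST)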
I'm trying to index PDF text with a Python library called Scout. I have tried doing the same thing with Elasticsearch too. In both cases I can't figure out how to post text to an index in bulk using Python.
After a lot of research, I believe I need to use asynchronous HTTP requests. The only problem is that I don't understand async calls, nor do I understand what a Scout Python 'client' really is. I'm a self-taught programmer and still have many things I don't understand. My thought is that the client can't stay open for a loop to keep reusing the connection. I have seen concepts like "await" and "sessions" in many programming books, but I don't know how to implement them. Can someone help me write some Python code that will successfully post new documents to a running Scout server and explain how it's done?
Here is my attempt:
from scout_client import Scout
# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
import os
from flask import Flask, jsonify, request, render_template, json
# before you start, Run the Server.py file and create a Sqlite DB
# Step one loop though PDF in 'books' folder
for k in range(14,15):
# open the pdf file
read_pdf = PyPDF2.PdfFileReader("books/%s.pdf"%(k))
# Test to see if Step One is complete and successful
#print (read_pdf)
# Step Two Gain intel on how many Pages are in the Document
# get the page numbers
num = read_pdf.getNumPages()
print ("PDF pages:", num)
# Step Three understand the data by page
# create a dictionary object for page data
all_pages = []
# Step Four: Create a new index on the Scout server
# client.create_index('test3')
# iterate the page numbers
for page in range(num):
data = read_pdf.getPage(page)
#page_mode = read_pdf.getPageMode()
# extract the page's text
page_text = data.extractText()
# put the text data into the dict
all_pages.append(page_text)
# initiate the Client from scout_client.py
client = Scout('http://localhost:8000')
# The issue: I tried for loops and while loops but can't get past: urllib.error.HTTPError: HTTP Error 400: BAD REQUEST
i = 1
while i <= num:
client.create_document(all_pages[i],['test3'])
print(i,"....done")
i += 1
I get an error:
Traceback (most recent call last):
File "test.py", line 37, in <module>
client.create_document(all_pages[i],['test3'])
File "../Searchtest4/scout/scout_client.py", line 149, in create_document
return self.post('/documents/', post_data, attachments)
File "../Searchtest4/scout/scout_client.py", line 53, in post
return self.post_json(url, data)
File "../Searchtest4/scout/scout_client.py", line 63, in post_json
return json.loads(urlopen(request).read().decode('utf8'))
File "../lib/python3.7/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "../lib/python3.7/urllib/request.py", line 531, in open
response = meth(req, response)
File "../lib/python3.7/urllib/request.py", line 641, in http_response
'http', request, response, code, msg, hdrs)
File "../lib/python3.7/urllib/request.py", line 569, in error
return self._call_chain(*args)
File "../lib/python3.7/urllib/request.py", line 503, in _call_chain
result = func(*args)
File "../lib/python3.7/urllib/request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 400: BAD REQUEST
Here is the server that runs fine (server.py):
import logging
import optparse
import os
import sys
from flask import Flask
from werkzeug.serving import run_simple
from scout.exceptions import InvalidRequestException
from scout.models import database
from scout.models import Attachment
from scout.models import BlobData
from scout.models import Document
from scout.models import Index
from scout.models import IndexDocument
from scout.models import Metadata
from scout.views import register_views
logger = logging.getLogger('scout')
def create_server(config=None, config_file=None):
app = Flask(__name__)
# Configure application using a config file.
if config_file is not None:
app.config.from_pyfile(config_file)
# (Re-)Configure application using command-line switches/environment flags.
if config is not None:
app.config.update(config)
# Initialize the SQLite database.
initialize_database(app.config.get('DATABASE') or 'scout.db',
pragmas=app.config.get('SQLITE_PRAGMAS') or None)
register_views(app)
@app.errorhandler(InvalidRequestException)
def handle_invalid_request(exc):
return exc.response()
@app.before_request
def connect_database():
if database.database != ':memory:':
database.connect()
@app.teardown_request
def close_database(exc):
if database.database != ':memory:' and not database.is_closed():
database.close()
return app
def initialize_database(database_file, pragmas=None):
database.init(database_file, pragmas=pragmas)
try:
meth = database.execution_context
except AttributeError:
meth = database
with meth:
database.create_tables([
Attachment,
BlobData,
Document,
Index,
IndexDocument,
Metadata])
def run(app):
if app.config['DEBUG']:
app.run(host=app.config['HOST'], port=app.config['PORT'], debug=True)
else:
run_simple(
hostname=app.config['HOST'],
port=app.config['PORT'],
application=app,
threaded=True)
def panic(s, exit_code=1):
sys.stderr.write('\033[91m%s\033[0m\n' % s)
sys.stderr.flush()
sys.exit(exit_code)
def get_option_parser():
parser = optparse.OptionParser()
parser.add_option(
'-H',
'--host',
default='127.0.0.1',
dest='host',
help='The hostname to listen on. Defaults to 127.0.0.1.')
parser.add_option(
'-p',
'--port',
default=8000,
dest='port',
help='The port to listen on. Defaults to 8000.',
type='int')
parser.add_option(
'-u',
'--url-prefix',
dest='url_prefix',
help='URL path to prefix Scout API.')
parser.add_option(
'-s',
'--stem',
dest='stem',
help='Specify stemming algorithm for content.')
parser.add_option(
'-d',
'--debug',
action='store_true',
dest='debug',
help='Run Flask app in debug mode.')
parser.add_option(
'-c',
'--config',
dest='config',
help='Configuration module (python file).')
parser.add_option(
'--paginate-by',
default=50,
dest='paginate_by',
help='Number of documents displayed per page of results, default=50',
type='int')
parser.add_option(
'-k',
'--api-key',
dest='api_key',
help='Set the API key required to access Scout.')
parser.add_option(
'-C',
'--cache-size',
default=64,
dest='cache_size',
help='SQLite page-cache size (MB). Defaults to 64MB.',
type='int')
parser.add_option(
'-f',
'--fsync',
action='store_true',
dest='fsync',
help='Synchronize database to disk on every write.')
parser.add_option(
'-j',
'--journal-mode',
default='wal',
dest='journal_mode',
help='SQLite journal mode. Defaults to WAL (recommended).')
parser.add_option(
'-l',
'--logfile',
dest='logfile',
help='Log file')
return parser
def parse_options():
option_parser = get_option_parser()
options, args = option_parser.parse_args()
if options.logfile:
handler = logging.FileHandler(options.logfile)
logger.addHandler(handler)
config_file = os.environ.get('SCOUT_CONFIG') or options.config
config = {'DATABASE': os.environ.get('SCOUT_DATABASE')}
if len(args) == 0 and not config['DATABASE']:
panic('Error: missing required path to database file.')
elif len(args) > 1:
panic('Error: [%s] only accepts one argument, which is the path '
'to the database file.' % __file__)
elif args:
config['DATABASE'] = args[0]
pragmas = [('journal_mode', options.journal_mode)]
if options.cache_size:
pragmas.append(('cache_size', -1024 * options.cache_size))
if not options.fsync:
pragmas.append(('synchronous', 0))
config['SQLITE_PRAGMAS'] = pragmas
# Handle command-line options. These values will override any values
# that may have been specified in the config file.
if options.api_key:
config['AUTHENTICATION'] = options.api_key
if options.debug:
config['DEBUG'] = True
config['HOST'] = options.host or '127.0.0.1'
config['PORT'] = options.port or 8000
config['URL_PREFIX'] = options.url_prefix or ''
if options.paginate_by:
if options.paginate_by < 1 or options.paginate_by > 1000:
panic('paginate-by must be between 1 and 1000')
config['PAGINATE_BY'] = options.paginate_by
if options.stem:
if options.stem not in ('simple', 'porter'):
panic('Unrecognized stemmer. Must be "porter" or "simple".')
config['STEM'] = options.stem
return create_server(config, config_file)
def main():
app = parse_options()
run(app)
if __name__ == '__main__':
main()
and the so-called client (scout_client.py):
import base64
import json
try:
from email.generator import _make_boundary as choose_boundary
except ImportError:
from mimetools import choose_boundary
import mimetypes
import os
try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
try:
from urllib.request import Request
from urllib.request import urlopen
except ImportError:
from urllib2 import Request
from urllib2 import urlopen
import zlib
ENDPOINT = None
KEY = None
class Scout(object):
def __init__(self, endpoint=ENDPOINT, key=KEY):
self.endpoint = endpoint.rstrip('/')
self.key = key
def get_full_url(self, url):
return self.endpoint + url
def get_raw(self, url, **kwargs):
headers = {'Content-Type': 'application/json'}
if self.key:
headers['key'] = self.key
if kwargs:
if '?' not in url:
url += '?'
url += urlencode(kwargs, True)
request = Request(self.get_full_url(url), headers=headers)
fh = urlopen(request)
return fh.read()
def get(self, url, **kwargs):
return json.loads(self.get_raw(url, **kwargs))
def post(self, url, data=None, files=None):
if files:
return self.post_files(url, data, files)
else:
return self.post_json(url, data)
def post_json(self, url, data=None):
headers = {'Content-Type': 'application/json'}
if self.key:
headers['key'] = self.key
data = json.dumps(data or {})
if not isinstance(data, bytes):
data = data.encode('utf-8')
request = Request(self.get_full_url(url), data=data, headers=headers)
return json.loads(urlopen(request).read().decode('utf8'))
def post_files(self, url, json_data, files=None):
if not files or not isinstance(files, dict):
raise ValueError('One or more files is required. Files should be '
'passed as a dictionary of filename: file-like-'
'object.')
boundary = choose_boundary()
form_files = []
for i, (filename, file_obj) in enumerate(files.items()):
try:
data = file_obj.read()
except AttributeError:
data = bytes(file_obj)
mimetype = mimetypes.guess_type(filename)[0]
form_files.append((
'file_%s' % i,
filename,
mimetype or 'application/octet-stream',
data))
part_boundary = '--' + boundary
parts = [
part_boundary,
'Content-Disposition: form-data; name="data"',
'',
json.dumps(json_data)]
for field_name, filename, mimetype, data in form_files:
parts.extend((
part_boundary,
'Content-Disposition: file; name="%s"; filename="%s"' % (
field_name, filename),
'Content-Type: %s' % mimetype,
'',
data))
parts.append('--' + boundary + '--')
parts.append('')
headers = {'Content-Type': 'multipart/form-data; boundary=%s' %
boundary}
if self.key:
headers['key'] = self.key
data = '\r\n'.join(parts)
if not isinstance(data, bytes):
data = data.encode('utf-8')
request = Request(self.get_full_url(url), data=data, headers=headers)
return json.loads(urlopen(request).read())
def delete(self, url):
headers = {}
if self.key:
headers['key'] = self.key
request = Request(self.get_full_url(url), headers=headers)
request.get_method = lambda: 'DELETE'
fh = urlopen(request)
return json.loads(fh.read())
def get_indexes(self, **kwargs):
return self.get('/', **kwargs)['indexes']
def create_index(self, name):
return self.post('/', {'name': name})
def rename_index(self, old_name, new_name):
return self.post('/%s/' % old_name, {'name': new_name})
def delete_index(self, name):
return self.delete('/%s/' % name)
def get_index(self, name, **kwargs):
return self.get('/%s/' % name, **kwargs)
def get_documents(self, **kwargs):
return self.get('/documents/', **kwargs)
def create_document(self, content, indexes, identifier=None,
attachments=None, **metadata):
if not isinstance(indexes, (list, tuple)):
indexes = [indexes]
post_data = {
'content': content,
'identifier': identifier,
'indexes': indexes,
'metadata': metadata}
return self.post('/documents/', post_data, attachments)
def update_document(self, document_id=None, content=None, indexes=None,
metadata=None, identifier=None, attachments=None):
if not document_id and not identifier:
raise ValueError('`document_id` must be provided.')
data = {}
if content is not None:
data['content'] = content
if indexes is not None:
if not isinstance(indexes, (list, tuple)):
indexes = [indexes]
data['indexes'] = indexes
if metadata is not None:
data['metadata'] = metadata
if not data and not attachments:
raise ValueError('Nothing to update.')
return self.post('/documents/%s/' % document_id, data, attachments)
def delete_document(self, document_id=None):
if not document_id:
raise ValueError('`document_id` must be provided.')
return self.delete('/documents/%s/' % document_id)
def get_document(self, document_id=None):
if not document_id:
raise ValueError('`document_id` must be provided.')
return self.get('/documents/%s/' % document_id)
def attach_files(self, document_id, attachments):
return self.post_files('/documents/%s/attachments/' % document_id,
{}, attachments)
def detach_file(self, document_id, filename):
return self.delete('/documents/%s/attachments/%s/' %
(document_id, filename))
def update_file(self, document_id, filename, file_object):
return self.post_files('/documents/%s/attachments/%s/' %
(document_id, filename),
{}, {filename: file_object})
def get_attachments(self, document_id, **kwargs):
return self.get('/documents/%s/attachments/' % document_id, **kwargs)
def get_attachment(self, document_id, filename):
return self.get('/documents/%s/attachments/%s/' %
(document_id, filename))
def download_attachment(self, document_id, filename):
return self.get_raw('/documents/%s/attachments/%s/download/' %
(document_id, filename))
def search_attachments(self, **kwargs):
return self.get('/documents/attachments/', **kwargs)
class SearchProvider(object):
def content(self, obj):
raise NotImplementedError
def identifier(self, obj):
raise NotImplementedError
def metadata(self, obj):
raise NotImplementedError
class SearchSite(object):
def __init__(self, client, index):
self.client = client
self.index = index
self.registry = {}
def register(self, model_class, search_provider):
self.registry.setdefault(model_class, [])
self.registry[model_class].append(search_provider())
def unregister(self, model_class, search_provider=None):
if search_provider is None:
self.registry.pop(model_class, None)
elif model_class in self.registry:
self.registry[model_class] = [
sp for sp in self.registry[model_class]
if not isinstance(sp, search_provider)]
def store(self, obj):
if type(obj) not in self.registry:
return False
for provider in self.registry[type(obj)]:
content = provider.content(obj)
try:
metadata = provider.metadata(obj)
except NotImplementedError:
metadata = {}
try:
identifier = provider.identifier(obj)
except NotImplementedError:
pass
else:
metadata['identifier'] = identifier
self.client.create_document(content, self.index, **metadata)
return True
def remove(self, obj):
if type(obj) not in self.registry:
return False
for provider in self.registry[type(obj)]:
self.client.delete_document(provider.identifier(obj))
return True
Finally, the documentation for Scout:
https://scout.readthedocs.io/en/latest/server.html#index-detail-index-name
https://charlesleifer.com/blog/meet-scout-a-search-server-powered-by-sqlite/
Any detailed help is much appreciated. :)
So I found a lib called Scout and... got it to work!
from scout_client import Scout
# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
import os
from flask import Flask, jsonify, request, render_template, json
client = Scout('http://localhost:8000')
for k in range(7,18):
read_pdf = PyPDF2.PdfFileReader("books/%s.pdf"%(k))
num = read_pdf.getNumPages()
print ("PDF pages:", num)
all_pages = []
for page in range(num):
data = read_pdf.getPage(page)
page_text = data.extractText()
all_pages.append(page_text)
import requests
for z in all_pages:
url = 'http://localhost:8000/documents/'
data = {'content': z, 'indexes': ['test13']}
headers = {
'Content-Type': 'application/json',
}
response = requests.post(url, data=json.dumps(data), headers=headers)
print(response)
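On my earlier worry that "the client can't stay open for a loop": it turns out no async is needed here; a requests.Session keeps the connection alive across the loop. The same loop with a session (same URL and index name as above) looks like this:
import json
import requests
session = requests.Session()
session.headers.update({'Content-Type': 'application/json'})
url = 'http://localhost:8000/documents/'
for z in all_pages:
    payload = {'content': z, 'indexes': ['test13']}
    response = session.post(url, data=json.dumps(payload))
    response.raise_for_status()  # surface a 400 immediately instead of silently continuing
    print(response.json())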
I can now loop through as many PDFs as I want locally, post them to the server for indexing, and search for keywords.
Now I just need help with making a basic front end with a search bar that calls data from a JSON response in Python and Flask; a sketch of where I'm starting is below.
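Here is that minimal front-end sketch (assumptions on my part: the Scout server from above is on localhost:8000, the 'test13' index from the loop exists, the index detail endpoint accepts a q query parameter as in the linked docs, and the JSON response exposes a 'documents' list; check the docs if your response shape differs):
import requests
from flask import Flask, request, render_template_string
app = Flask(__name__)
TEMPLATE = '''
<form method="get">
  <input type="text" name="q" value="{{ q or '' }}">
  <button type="submit">Search</button>
</form>
<ul>
{% for doc in documents %}
  <li>{{ doc['content']|truncate(200) }}</li>
{% endfor %}
</ul>
'''
@app.route('/')
def search():
    q = request.args.get('q')
    documents = []
    if q:
        # Search the 'test13' index created by the indexing loop above.
        resp = requests.get('http://localhost:8000/test13/', params={'q': q})
        documents = resp.json().get('documents', [])
    return render_template_string(TEMPLATE, q=q, documents=documents)
if __name__ == '__main__':
    app.run(port=5000)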
I have this code that receives all of the networking resources of a web page.
I took this code from this site, so I don't know exactly how it works, but I do know that it captures all of the network requests made by a web page, which is what I need.
This is my code:
import sys, time
from PySide.QtCore import QUrl, SIGNAL
from PySide.QtGui import QApplication
from PySide.QtWebKit import QWebPage, QWebView, QWebSettings
from PySide.QtNetwork import QNetworkAccessManager, QNetworkRequest
#reload(sys)
#sys.setdefaultencoding('utf-8')
fn_log = 'url_dd.txt'
fp_log = open(fn_log, 'ab+')
class WebPage(QWebPage):
def __init__(self, logger=None, parent=None):
super(WebPage, self).__init__(parent)
def javaScriptConsoleMessage(self, message, lineNumber, sourceID):
sys.stderr.write('JavaScript error at line number %d\n' % (lineNumber))
sys.stderr.write('%s\n' % (message, ))
sys.stderr.write('Source ID: %s\n' % (sourceID, ))
class Crawler(QApplication):
def __init__(self, url):
super(Crawler, self).__init__(sys.argv)
self.url = url
self.web_view = QWebView()
self.web_page = WebPage()
self.web_view.setPage(self.web_page)
self.web_frame = self.web_page.mainFrame()
self.network = NetworkAccessManager()
self.web_page.setNetworkAccessManager(self.network)
self.settings = self.web_page.settings().globalSettings()
self.settings.setAttribute(QWebSettings.PluginsEnabled, False)
QWebSettings.clearMemoryCaches()
self.web_view.resize(1024, 9000)
self.connect(self.web_page, SIGNAL('loadFinished(bool)'), self.loadFinished)
print('Before loading')
self.web_view.load(QUrl(self.url))
print('After loading')
def loadFinished(self, ok):
print('Start loadFinished()')
print('Start writing')
#with open('content_dd.txt', 'ab+') as fp:
#fp.write(self.web_frame.toHtml().toUtf8())
print('End writing')
print('End loadFinished()')
try:
self.quit()
except Exception as e:
print('FATAL ERROR: %s' % (str(e)))
class NetworkAccessManager(QNetworkAccessManager):
def __init__(self):
super(NetworkAccessManager, self).__init__()
# QNetworkAccessManager.__init__(self)
self.connect(self, SIGNAL('finished (QNetworkReply *)'), self.finishd)
def createRequest(self, operation, request, data):
# url = request.url().toString()
self.setNetworkAccessible(self.Accessible)
return QNetworkAccessManager.createRequest(self, operation, request, data)
def finishd(self, reply):
print('In NetworkAccessManager finishd')
url = str(reply.url().toString())
log = '%s: %s\n' % (time.ctime(), url)
#fp_log.write(log)
print(reply)
print(reply.request())
print(log)
print(url)
if __name__ == '__main__':
url = 'http://need4bit.com'
crawler = Crawler(url)
sys.exit(crawler.exec_())
How should I modify this code so that it saves all of the resources into a directory?
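The direction I'm leaning towards (a sketch; since WebKit consumes the reply body, I would collect the URLs in finishd() and re-download each one into a directory rather than trying to read the data out of the finished reply):
import os
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib2 import urlopen         # Python 2, to match the PySide-era code above
SAVE_DIR = 'resources'
def save_resource(url):
    """Re-download one resource URL and write it under SAVE_DIR."""
    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)
    # Derive a file name from the last URL path segment; fall back to 'index'.
    name = url.rstrip('/').split('/')[-1].split('?')[0] or 'index'
    path = os.path.join(SAVE_DIR, name)
    try:
        data = urlopen(url).read()
    except Exception as e:
        print('Could not fetch %s: %s' % (url, e))
        return
    with open(path, 'wb') as fp:
        fp.write(data)
# Then, inside NetworkAccessManager.finishd(), after `url` is computed:
#     save_resource(url)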
I wrote a script that uploads a large file to a third-party website. I use the poster module.
Here is the code:
class upload_in_chunks(object):
def __init__(self, filename, chunksize):
self.filename = filename
self.chunksize = chunksize
self.totalsize = os.path.getsize(filename)
self.readsofar = 0
def __iter__(self):
with open(self.filename, 'rb') as file:
while True:
data = file.read(self.chunksize)
if not data:
sys.stderr.write("\n")
break
self.readsofar += len(data)
percent = self.readsofar * 1e2 / self.totalsize
sys.stderr.write("\r{percent:3.0f}%".format(percent=percent))
yield data
def __len__(self):
return self.totalsize
class IterableToFileAdapter(object):
def __init__(self, iterable):
self.iterator = iter(iterable)
self.length = len(iterable)
def read(self, size=-1): # TBD: add buffer for `len(data) > size` case
return next(self.iterator, b'')
def __len__(self):
return self.length
def upload_file(s, data, files, url):
class FuncThread(threading.Thread):
def __init__(self, s, data, files, url):
threading.Thread.__init__(self)
self.result = False
self.error = None
self.s = s
self.data = data
self.files = files
self.url = url
def run(self):
try:
register_openers()
items = []
for name, value in self.data.items():
items.append(MultipartParam(name, value))
#add file
for name, value in self.files.items():
items.append(MultipartParam.from_file(name, value))
datagen, h = multipart_encode(items)
#h.update(headers)
# Create the Request object
#self.result = self.s.post(self.url, data = datagen.read(), headers = h)
for item, value in headers.iteritems():
h[item] = value
#for item, value in headers.iteritems():
#h[item] = value
h["Cookie"] = ""
for item, value in requests.utils.dict_from_cookiejar(self.s.cookies).iteritems():
h["Cookie"] += item + "=" + value + "; "
print h["Cookie"]
#h[item] = value
print h
request = urllib2.Request(self.url, datagen, h)
#Actually do the request, and get the response
self.result = urllib2.urlopen(request).read()
except Exception as error:
self.error = error
self.result = False
def _stop(self):
if self.isAlive():
threading.Thread._Thread__stop(self)
try:
it = FuncThread(s, data, files, url)
it.start()
it.join(upload_timeout)
if it.isAlive():
it._stop()
return (False, "[-] Time expired.")
else:
return (it.result, it.error)
except Exception as error:
return (False, error)
And in my upload.py I use
try:
payload = {'Filename': f_path,
'tags': 'tag1',
'keywords': 'keyword1',
'cookie': cookie,
'title': 'Large file',
'categories': '["35","13","37"]',
'privacy': 'community',
'source': 5,
'production': "professional"}
(r, error) = upload_file(self.s, payload, {'Filedata': f_path}, url)
I have a dedicated server with a 100 Mbit/s internet connection, but my script uploads files at only 1-2 Mbit/s.
The recipient website has no limitations and it doesn't throttle me.
How can I force my code to use the full 100 Mbit/s?
I'm trying to get Scrapy to send me an email when a crawler finishes or breaks. There's already a built-in extension for sending stats, but I'd like to attach the spider's errors as <spidername>-errors.log and the scraped items as <spidername>-items.json.
I've connected callbacks to each of the signals, but for some reason only the last one is firing:
from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured
from scrapy.utils.serialize import ScrapyJSONEncoder
from collections import defaultdict
try:
from cStringIO import cStringIO as StringIO
except ImportError:
from StringIO import StringIO
class StatusMailer(object):
def __init__(self, recipients, mail, crawler):
self.recipients = recipients
self.mail = mail
self.files = defaultdict(StringIO)
self.encoder = ScrapyJSONEncoder(crawler=crawler)
@classmethod
def from_crawler(cls, crawler):
recipients = crawler.settings.getlist("STATUSMAILER_RCPTS")
if not recipients:
raise NotConfigured
mail = MailSender.from_settings(crawler.settings)
instance = cls(recipients, mail, crawler)
crawler.signals.connect(instance.item_scraped, signal=signals.item_scraped)
crawler.signals.connect(instance.spider_error, signal=signals.spider_error)
crawler.signals.connect(instance.spider_closed, signal=signals.spider_closed)
return instance
def item_scraped(self, item, response, spider):
self.files[spider.name + '.json'].write(self.encoder.encode(item) + '\n')
def spider_error(self, failure, response, spider):
self.files[spider.name + '-errors.log'].write(failure.getTraceback() + '\n')
def spider_closed(self, spider):
return self.mail.send(
to=self.recipients,
subject="Crawler for %s finished" % spider.name,
body="",
attachs=[(name, 'text/plain', contents) for name, contents in self.files.items()]
)
Is there any way to access the exported items and the spider's errors from within Scrapy (possibly making some kind of hook to intercept those messages before they're printed to the console)?
Well, it looks like the problem was much simpler than I had thought. You have to "rewind" StringIO instances after you're completely done writing to them:
def spider_closed(self, spider):
files = []
for name, contents in self.files.items():
contents.seek(0)
files.append((name, 'text/plain', contents))
return self.mail.send(
to=self.recipients,
subject="Crawler for %s finished" % spider.name,
body="",
attachs=files
)
For anyone that's interested, here's my email extension:
import gzip
import datetime
from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured
from scrapy.utils.serialize import ScrapyJSONEncoder
from collections import defaultdict
try:
from cStringIO import cStringIO as StringIO
except ImportError:
from StringIO import StringIO
def format_size(size):
for x in ['bytes', 'KB', 'MB', 'GB']:
if size < 1024.0:
return "%3.1f %s" % (size, x)
size /= 1024.0
class GzipCompressor(gzip.GzipFile):
extension = '.gz'
mimetype = 'application/gzip'
def __init__(self):
super(GzipCompressor, self).__init__(fileobj=PlainCompressor(), mode='w')
self.read = self.fileobj.read
class PlainCompressor(StringIO):
extension = ''
mimetype = 'text/plain'
def read(self, *args, **kwargs):
self.seek(0)
return StringIO.read(self, *args, **kwargs)
@property
def size(self):
return len(self.getvalue())
class StatusMailer(object):
def __init__(self, recipients, mail, compressor, crawler):
self.recipients = recipients
self.mail = mail
self.encoder = ScrapyJSONEncoder(crawler=crawler)
self.files = defaultdict(compressor)
self.num_items = 0
self.num_errors = 0
@classmethod
def from_crawler(cls, crawler):
recipients = crawler.settings.getlist('STATUSMAILER_RECIPIENTS')
compression = crawler.settings.get('STATUSMAILER_COMPRESSION')
if not compression:
compressor = PlainCompressor
elif compression.lower().startswith('gz'):
compressor = GzipCompressor
else:
raise NotConfigured
if not recipients:
raise NotConfigured
mail = MailSender.from_settings(crawler.settings)
instance = cls(recipients, mail, compressor, crawler)
crawler.signals.connect(instance.item_scraped, signal=signals.item_scraped)
crawler.signals.connect(instance.spider_error, signal=signals.spider_error)
crawler.signals.connect(instance.spider_closed, signal=signals.spider_closed)
crawler.signals.connect(instance.request_received, signal=signals.request_received)
return instance
def item_scraped(self, item, response, spider):
self.files[spider.name + '-items.json'].write(self.encoder.encode(item))
self.num_items += 1
def spider_error(self, failure, response, spider):
self.files[spider.name + '.log'].write(failure.getTraceback())
self.num_errors += 1
def request_received(self, request, spider):
self.files[spider.name + '.log'].write(str(request) + '\n')
def spider_closed(self, spider, reason):
files = []
for name, compressed in self.files.items():
files.append((name + compressed.extension, compressed.mimetype, compressed))
try:
size = self.files[spider.name + '-items.json'].size
except KeyError:
size = 0
body='''Crawl statistics:
- Spider name: {0}
- Spider finished at: {1}
- Number of items scraped: {2}
- Number of errors: {3}
- Size of scraped items: {4}'''.format(
spider.name,
datetime.datetime.now(),
self.num_items,
self.num_errors,
format_size(size)
)
return self.mail.send(
to=self.recipients,
subject='Crawler for %s: %s' % (spider.name, reason),
body=body,
attachs=files
)
Add it to your settings.py:
EXTENSIONS = {
'your_package.extensions.StatusMailer': 80
}
And configure it:
STATUSMAILER_RECIPIENTS = []
STATUSMAILER_COMPRESSION = 'gzip'
#STATUSMAILER_COMPRESSION = None
MAIL_HOST = 'smtp.gmail.com'
MAIL_PORT = 587
MAIL_USER = ''
MAIL_PASS = ''
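Note that STATUSMAILER_RECIPIENTS must be non-empty, otherwise the extension raises NotConfigured and never loads, so put a real address in there, for example:
STATUSMAILER_RECIPIENTS = ['you@example.com']  # example address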