Error 401 when searching tweets with Python-Twitter

This is an app to search tweets and save the results to MongoDB. This line is problematic:
search_results = twitter_api.search.tweets(q=q, count=100, **kw)
I'm getting this error:
Traceback (most recent call last):
  File "mongo.py", line 89, in <module>
    results = twitter_search(twitter_api, q, max_results=10)
  File "mongo.py", line 21, in twitter_search
    search_results = twitter_api.search.tweets(q=q, count=100, **kw)
  File "/usr/local/lib/python2.7/dist-packages/twitter/api.py", line 312, in __call__
    return self._handle_response(req, uri, arg_data, _timeout)
  File "/usr/local/lib/python2.7/dist-packages/twitter/api.py", line 345, in _handle_response
    raise TwitterHTTPError(e, uri, self.format, arg_data)
twitter.api.TwitterHTTPError: Twitter sent status 401 for URL: 1.1/search/tweets.json using parameters: (count=100&oauth_consumer_key= &oauth_nonce= &oauth_signature_method=HMAC-SHA1&oauth_timestamp=1457104474&oauth_token= &oauth_version=1.0&q=CrossFit&oauth_signature=6SMuzJBWfFUEmZHlu%2F4Hmt6Zu8k%3D)
details: {u'errors': [{u'message': u'Could not authenticate you.', u'code': 32}]}
Here is my code:
import json
import pymongo
import twitter

def oauth_login():
    CONSUMER_KEY = ''
    CONSUMER_SECRET = ''
    OAUTH_TOKEN = ''
    OAUTH_TOKEN_SECRET = ''
    auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                               CONSUMER_KEY, CONSUMER_SECRET)
    twitter_api = twitter.Twitter(auth=auth)
    return twitter_api

def twitter_search(twitter_api, q, max_results=200, **kw):
    search_results = twitter_api.search.tweets(q=q, count=100, **kw)
    statuses = search_results['statuses']
    max_results = min(1000, max_results)
    for _ in range(10):  # 10 * 100 = 1000
        try:
            next_results = search_results['search_metadata']['next_results']
        except KeyError, e:
            break
        kwargs = dict([kv.split('=') for kv in next_results[1:].split("&")])
        search_results = twitter_api.search.tweets(**kwargs)
        statuses += search_results['statuses']
        if len(statuses) > max_results:
            break
    return statuses

def save_to_mongo(data, mongo_db, mongo_db_coll, **mongo_conn_kw):
    client = pymongo.MongoClient(**mongo_conn_kw)
    db = client[mongo_db]
    coll = db[mongo_db_coll]
    return coll.insert(data)

def load_from_mongo(mongo_db, mongo_db_coll, return_cursor=False,
                    criteria=None, projection=None, **mongo_conn_kw):
    client = pymongo.MongoClient(**mongo_conn_kw)
    db = client[mongo_db]
    coll = db[mongo_db_coll]
    if criteria is None:
        criteria = {}
    if projection is None:
        cursor = coll.find(criteria)
    else:
        cursor = coll.find(criteria, projection)
    # Returning a cursor is recommended for large amounts of data
    if return_cursor:
        return cursor
    else:
        return [item for item in cursor]

# Sample usage
q = 'CrossFit'
twitter_api = oauth_login()
results = twitter_search(twitter_api, q, max_results=10)
save_to_mongo(results, 'search_results', q)
load_from_mongo('search_results', q)
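
The parameters echoed back in the 401 show oauth_consumer_key and oauth_token as blank (possibly redacted), so the first thing to rule out is empty or mispasted credentials. A minimal sanity check, offered as a sketch separate from the script above; in this client, attribute access maps to REST paths, so account.verify_credentials() hits the account/verify_credentials endpoint and raises the same TwitterHTTPError if the tokens themselves are bad:

import twitter

CONSUMER_KEY = ''        # fill in all four before running
CONSUMER_SECRET = ''
OAUTH_TOKEN = ''
OAUTH_TOKEN_SECRET = ''

# Fail fast on blank credentials instead of letting the search call 401.
if not all([CONSUMER_KEY, CONSUMER_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET]):
    raise SystemExit('One or more Twitter credentials is empty.')

auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                           CONSUMER_KEY, CONSUMER_SECRET)
twitter_api = twitter.Twitter(auth=auth)

# A 401 raised here means the tokens are wrong, which separates an auth
# problem from a problem with the search call itself.
print twitter_api.account.verify_credentials()['screen_name']

If this check passes but the search still fails, the other classic cause of error code 32 is a system clock far enough off to invalidate the oauth_timestamp.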


Flask-SQLAlchemy Queue Pool limit reached

I have an application where a service makes a POST request to a Flask app, which then inserts the body of the request into a PostgreSQL table, as well as inserting data into several other tables.
The problem is that once a certain number of requests have been made, the queue pool limit is reached and the app throws an error. I have isolated it down to the extra functions called to do the inserting into the other tables. I don't understand what is happening.
Each service is a Docker container.
172.30.0.17 - - [16/Jun/2022 06:25:07] "GET /api/orgs HTTP/1.1" 500 -
Traceback (most recent call last):
  File "/root/.local/lib/python3.9/site-packages/flask/app.py", line 2095, in __call__
    return self.wsgi_app(environ, start_response)
  File "/root/.local/lib/python3.9/site-packages/flask/app.py", line 2080, in wsgi_app
    response = self.handle_exception(e)
  File "/root/.local/lib/python3.9/site-packages/flask/app.py", line 2077, in wsgi_app
    response = self.full_dispatch_request()
  File "/root/.local/lib/python3.9/site-packages/flask/app.py", line 1525, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/root/.local/lib/python3.9/site-packages/flask/app.py", line 1523, in full_dispatch_request
    rv = self.dispatch_request()
  File "/root/.local/lib/python3.9/site-packages/flask/app.py", line 1509, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**req.view_args)
  File "/service/routes/get/get_org.py", line 17, in base_route
    query = [convert_class(item) for item in Organization.query.all()]
  File "/root/.local/lib/python3.9/site-packages/sqlalchemy/orm/query.py", line 2768, in all
    return self._iter().all()
  File "/root/.local/lib/python3.9/site-packages/sqlalchemy/orm/query.py", line 2903, in _iter
    result = self.session.execute(
  File "/root/.local/lib/python3.9/site-packages/sqlalchemy/orm/session.py", line 1711, in execute
    conn = self._connection_for_bind(bind)
  File "/root/.local/lib/python3.9/site-packages/sqlalchemy/orm/session.py", line 1552, in _connection_for_bind
    return self._transaction._connection_for_bind(
  File "/root/.local/lib/python3.9/site-packages/sqlalchemy/orm/session.py", line 747, in _connection_for_bind
    conn = bind.connect()
  File "/root/.local/lib/python3.9/site-packages/sqlalchemy/engine/base.py", line 3234, in connect
    return self._connection_cls(self, close_with_result=close_with_result)
  File "/root/.local/lib/python3.9/site-packages/sqlalchemy/engine/base.py", line 96, in __init__
    else engine.raw_connection()
  File "/root/.local/lib/python3.9/site-packages/sqlalchemy/engine/base.py", line 3313, in raw_connection
    return self._wrap_pool_connect(self.pool.connect, _connection)
  File "/root/.local/lib/python3.9/site-packages/sqlalchemy/engine/base.py", line 3280, in _wrap_pool_connect
    return fn()
  File "/root/.local/lib/python3.9/site-packages/sqlalchemy/pool/base.py", line 310, in connect
    return _ConnectionFairy._checkout(self)
  File "/root/.local/lib/python3.9/site-packages/sqlalchemy/pool/base.py", line 868, in _checkout
    fairy = _ConnectionRecord.checkout(pool)
  File "/root/.local/lib/python3.9/site-packages/sqlalchemy/pool/base.py", line 476, in checkout
    rec = pool._do_get()
  File "/root/.local/lib/python3.9/site-packages/sqlalchemy/pool/impl.py", line 134, in _do_get
    raise exc.TimeoutError(
sqlalchemy.exc.TimeoutError: QueuePool limit of size 20 overflow 20 reached, connection timed out, timeout 30.00 (Background on this error at: https://sqlalche.me/e/14/3o7r)
Config
class Config:
    # Database
    SQLALCHEMY_DATABASE_URI = environ.get("DB_URI")
    SQLALCHEMY_ECHO = False
    SQLALCHEMY_TRACK_MODIFICATIONS = False
    SQLALCHEMY_ENGINE_OPTIONS = {
        "pool_size": 1,
        "max_overflow": 0,
    }
App init
db = SQLAlchemy()

# create flask App
def create_app():
    app = Flask(__name__, instance_relative_config=False)
    # DB config - PostgreSQL
    app.config['SQLALCHEMY_DATABASE_URI'] = getenv("DB_URI")
    app.config.from_object('config.Config')
    # config server with CORS
    # cors = CORS(app, resources={r"/*": {"origins": "*"}})
    with app.app_context():
        db.init_app(app)
        from service.routes.post import post_event
    return app

app = create_app()
POST request
@app.route('/api/orgs/<org_id>/sites/<site_id>/sessions/<session_id>/events', methods=["POST"])
def edger_post_request(org_id, site_id, session_id):
    print("URL params: {}, {}, {}".format(
        org_id, site_id, session_id
    ))
    # parse body of post request
    try:
        request_body = request.json
        event_data = json.loads(request_body["data"])
    except:
        return "error in body of request", 400
    print("Body of request: ", request_body)
    # check if body of request is valid
    errors_body, has_error = validate_input(request_body)
    print(errors_body, has_error)
    if has_error:
        return Response(errors_body, 400)
    # insert event
    event_dict = dict()
    try:
        event_dict = {
            "id": request_body["id"],
            "timestamp": request_body["timestamp"],
            "type": request_body["type"],
            "event_type": event_data["type"],
            "data": request_body["data"],
            "org_id": org_id,
            "site_id": site_id,
            "device_id": request_body["device_id"],
            "session_id": request_body["session_id"]
        }
        if "frame_url" in request_body.keys():
            event_dict["frame_url"] = request_body["frame_url"]
        else:
            event_dict["frame_url"] = None
        insert_event = Event(**event_dict)
        db.session.add(insert_event)
        # commit event to DB
        db.session.commit()
        # Insert extra tables
        insert_threads(org_id, site_id, event_dict, event_data)
        db.session.commit()
        db.session.close()
        return {"result": "success"}, 200
    except Exception as e:
        print("- - - - - - - - - - - - - - - - -")
        db.session.rollback()
        print("Error inserting Event", e)
        print("- - - - - - - - - - - - - - - - -")
        db.session.close()
        return {"result": "unsuccessful"}, 409
insert_threads function
def insert_threads(org_id, site_id, body, data):
    event_data_insert(body, data)
    event_type_insert(data["type"])
    insert_org(org_id)
    insert_site(org_id, site_id)
    insert_device(org_id, site_id, body['device_id'])
    insert_session(org_id, site_id, body['device_id'], body['session_id'], body['timestamp'])

def insert_org(org_id):
    org_list = getattr(g, "org_list", None)
    # query organization, insert if it doesn't exist
    if org_list is None:
        check = [org.id for org in Organization.query.all()]
        g.org_list = check
        org_list = check
    if org_id not in org_list:
        insert_org = Organization(id=org_id, name="temp_name")
        db.session.add(insert_org)
        org_list.append(insert_org.id)
        g.org_list = org_list

def insert_site(org_id, site_id):
    site_list = getattr(g, "site_list", None)
    # query Site, insert if it doesn't exist
    if site_list is None:
        check = [site.id for site in Site.query.all()]
        g.site_list = check
        site_list = check
    if site_id not in site_list:
        insert_site = Site(id=site_id, org_id=org_id, name="temp_name")
        db.session.add(insert_site)
        site_list.append(insert_site.id)
        g.site_list = site_list

def insert_device(org_id, site_id, device_id):
    device_list = getattr(g, "device_list", None)
    # query device, insert if it doesn't exist
    if device_list is None:
        check = [device.id for device in Device.query.all()]
        g.device_list = check
        device_list = check
    if device_id not in device_list:
        insert_device = Device(id=device_id, site_id=site_id, org_id=org_id)
        db.session.add(insert_device)
        device_list.append(insert_device.id)
        g.device_list = device_list

def insert_session(org_id, site_id, device_id, session_id, timestamp):
    session_list = getattr(g, "session_list", None)
    # query session and insert if it doesn't already exist
    if session_list is None:
        check = [session.id for session in Session.query.all()]
        g.session_list = check
        session_list = check
    if session_id not in session_list:
        insert_session = Session(
            id=session_id,
            timestamp=timestamp,
            org_id=org_id,
            site_id=site_id,
            device_id=device_id
        )
        db.session.add(insert_session)
        session_list.append(insert_session.id)
        g.session_list = session_list

def event_data_insert(event, event_data):
    # get white list from global var, else None
    query_list = getattr(g, "query_list", None)
    # if query_list is None, get query_list from DB
    if query_list is None:
        check = [convert_class(item) for item in Event_data_query_list.query.all()]
        g.query_list = check
        query_list = check
    # loop through query_list and insert key/value pairs into the event_data table
    for key in query_list:
        print("the key: ", key)
        key_path = key["key_name"].split(".")
        # check if the keys exist in the data object and walk down the levels until the value is reached
        if key_path[0] in event_data["data"] and key["type"] == event_data["type"]:
            print("inside first if")
            value = event_data['data']
            for sub_key in key_path:
                print("inside loop")
                if sub_key in value.keys():
                    print("in subkey", sub_key)
                    value = value[sub_key]
                # handle keys and values that don't exist
                else:
                    value = None
                    break
            if value is None:
                continue
            # determine whether value is an Integer or a String and insert into the correct column
            val_int = None
            val_str = None
            if type(value) == str:
                val_str = value
            elif type(value) == int:
                val_int = value
            # add event_data row to session
            insert = Event_data(event_id=event["id"], type=event_data["type"],
                                key=key["key_name"], value_int=val_int, value_str=val_str)
            print("insert object", insert)
            db.session.add(insert)

def event_type_insert(event_type):
    event_type_list = getattr(g, "event_type_list", None)
    if event_type_list is None:
        check = [item.event_type for item in Event_type.query.all()]
        g.event_type_list = check
        event_type_list = check
    if event_type not in event_type_list:
        insert = Event_type(event_type=event_type)
        db.session.add(insert)
        g.event_type_list.append(event_type)
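
One detail worth noticing: the traceback reports QueuePool limit of size 20 overflow 20, while Config asks for pool_size 1 and max_overflow 0, which suggests SQLALCHEMY_ENGINE_OPTIONS is not being applied at all. A minimal sketch (hypothetical, not the code above) of two checks: print the live pool settings, and make sure every request hands its connection back, which recent Flask-SQLAlchemy versions already do via their own teardown hook:

from flask import Flask
from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()

def create_app():
    app = Flask(__name__)
    app.config.from_object('config.Config')
    db.init_app(app)

    @app.teardown_appcontext
    def shutdown_session(exception=None):
        # Returns the scoped session's connection to the pool even when a
        # view raises; leaked checkouts are what eventually time the pool out.
        db.session.remove()

    with app.app_context():
        # If this does not report the size/overflow from Config, the
        # engine options are not being picked up.
        print(db.engine.pool.status())
    return app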

How do you send many documents to a Scout Server in Python using the Python Scout Client?

I'm trying to index PDF text with a Python library called Scout. I have tried doing the same thing with Elasticsearch too. In both cases I can't figure out how to post text to an index in bulk using Python.
After a lot of research, I believe I need to use async HTTP requests. The only problem is, I don't understand async calls, nor do I understand what a Scout Python 'client' really is. I'm a self-taught programmer and still have many things I don't understand. My thought is that the client can't stay open for a loop to keep using the connection. I have seen concepts like "await" and "sessions" in many books on programming, but I don't know how to implement them. Can someone help me write some Python code that will successfully post new documents to a running Scout server and explain how it's done?
Here is my attempt:
from scout_client import Scout
# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
import os
from flask import Flask, jsonify, request, render_template, json

# before you start, run the Server.py file and create a SQLite DB
# Step One: loop through the PDFs in the 'books' folder
for k in range(14, 15):
    # open the pdf file
    read_pdf = PyPDF2.PdfFileReader("books/%s.pdf" % (k))
    # Test to see if Step One is complete and successful
    # print(read_pdf)
    # Step Two: find out how many pages are in the document
    num = read_pdf.getNumPages()
    print("PDF pages:", num)
    # Step Three: understand the data by page
    # create a list object for page data
    all_pages = []
    # Step Four: create a new index on the Scout server
    # client.create_index('test3')
    # iterate the page numbers
    for page in range(num):
        data = read_pdf.getPage(page)
        # page_mode = read_pdf.getPageMode()
        # extract the page's text
        page_text = data.extractText()
        # put the text data into the list
        all_pages.append(page_text)

    # initiate the Client from scout_client.py
    client = Scout('http://localhost:8000')
    # The issue: I tried for loops and while loops but can't get past:
    # urllib.error.HTTPError: HTTP Error 400: BAD REQUEST
    i = 1
    while i <= num:
        client.create_document(all_pages[i], ['test3'])
        print(i, "....done")
        i += 1
I get an error:
Traceback (most recent call last):
  File "test.py", line 37, in <module>
    client.create_document(all_pages[i], ['test3'])
  File "../Searchtest4/scout/scout_client.py", line 149, in create_document
    return self.post('/documents/', post_data, attachments)
  File "../Searchtest4/scout/scout_client.py", line 53, in post
    return self.post_json(url, data)
  File "../Searchtest4/scout/scout_client.py", line 63, in post_json
    return json.loads(urlopen(request).read().decode('utf8'))
  File "../lib/python3.7/urllib/request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
  File "../lib/python3.7/urllib/request.py", line 531, in open
    response = meth(req, response)
  File "../lib/python3.7/urllib/request.py", line 641, in http_response
    'http', request, response, code, msg, hdrs)
  File "../lib/python3.7/urllib/request.py", line 569, in error
    return self._call_chain(*args)
  File "../lib/python3.7/urllib/request.py", line 503, in _call_chain
    result = func(*args)
  File "../lib/python3.7/urllib/request.py", line 649, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 400: BAD REQUEST
Here is the server that runs fine (server.py):
import logging
import optparse
import os
import sys

from flask import Flask
from werkzeug.serving import run_simple

from scout.exceptions import InvalidRequestException
from scout.models import database
from scout.models import Attachment
from scout.models import BlobData
from scout.models import Document
from scout.models import Index
from scout.models import IndexDocument
from scout.models import Metadata
from scout.views import register_views

logger = logging.getLogger('scout')

def create_server(config=None, config_file=None):
    app = Flask(__name__)
    # Configure application using a config file.
    if config_file is not None:
        app.config.from_pyfile(config_file)
    # (Re-)Configure application using command-line switches/environment flags.
    if config is not None:
        app.config.update(config)
    # Initialize the SQLite database.
    initialize_database(app.config.get('DATABASE') or 'scout.db',
                        pragmas=app.config.get('SQLITE_PRAGMAS') or None)
    register_views(app)

    @app.errorhandler(InvalidRequestException)
    def handle_invalid_request(exc):
        return exc.response()

    @app.before_request
    def connect_database():
        if database.database != ':memory:':
            database.connect()

    @app.teardown_request
    def close_database(exc):
        if database.database != ':memory:' and not database.is_closed():
            database.close()

    return app

def initialize_database(database_file, pragmas=None):
    database.init(database_file, pragmas=pragmas)
    try:
        meth = database.execution_context
    except AttributeError:
        meth = database
    with meth:
        database.create_tables([
            Attachment,
            BlobData,
            Document,
            Index,
            IndexDocument,
            Metadata])

def run(app):
    if app.config['DEBUG']:
        app.run(host=app.config['HOST'], port=app.config['PORT'], debug=True)
    else:
        run_simple(
            hostname=app.config['HOST'],
            port=app.config['PORT'],
            application=app,
            threaded=True)

def panic(s, exit_code=1):
    sys.stderr.write('\033[91m%s\033[0m\n' % s)
    sys.stderr.flush()
    sys.exit(exit_code)

def get_option_parser():
    parser = optparse.OptionParser()
    parser.add_option(
        '-H',
        '--host',
        default='127.0.0.1',
        dest='host',
        help='The hostname to listen on. Defaults to 127.0.0.1.')
    parser.add_option(
        '-p',
        '--port',
        default=8000,
        dest='port',
        help='The port to listen on. Defaults to 8000.',
        type='int')
    parser.add_option(
        '-u',
        '--url-prefix',
        dest='url_prefix',
        help='URL path to prefix Scout API.')
    parser.add_option(
        '-s',
        '--stem',
        dest='stem',
        help='Specify stemming algorithm for content.')
    parser.add_option(
        '-d',
        '--debug',
        action='store_true',
        dest='debug',
        help='Run Flask app in debug mode.')
    parser.add_option(
        '-c',
        '--config',
        dest='config',
        help='Configuration module (python file).')
    parser.add_option(
        '--paginate-by',
        default=50,
        dest='paginate_by',
        help='Number of documents displayed per page of results, default=50',
        type='int')
    parser.add_option(
        '-k',
        '--api-key',
        dest='api_key',
        help='Set the API key required to access Scout.')
    parser.add_option(
        '-C',
        '--cache-size',
        default=64,
        dest='cache_size',
        help='SQLite page-cache size (MB). Defaults to 64MB.',
        type='int')
    parser.add_option(
        '-f',
        '--fsync',
        action='store_true',
        dest='fsync',
        help='Synchronize database to disk on every write.')
    parser.add_option(
        '-j',
        '--journal-mode',
        default='wal',
        dest='journal_mode',
        help='SQLite journal mode. Defaults to WAL (recommended).')
    parser.add_option(
        '-l',
        '--logfile',
        dest='logfile',
        help='Log file')
    return parser

def parse_options():
    option_parser = get_option_parser()
    options, args = option_parser.parse_args()
    if options.logfile:
        handler = logging.FileHandler(options.logfile)
        logger.addHandler(handler)
    config_file = os.environ.get('SCOUT_CONFIG') or options.config
    config = {'DATABASE': os.environ.get('SCOUT_DATABASE')}
    if len(args) == 0 and not config['DATABASE']:
        panic('Error: missing required path to database file.')
    elif len(args) > 1:
        panic('Error: [%s] only accepts one argument, which is the path '
              'to the database file.' % __file__)
    elif args:
        config['DATABASE'] = args[0]
    pragmas = [('journal_mode', options.journal_mode)]
    if options.cache_size:
        pragmas.append(('cache_size', -1024 * options.cache_size))
    if not options.fsync:
        pragmas.append(('synchronous', 0))
    config['SQLITE_PRAGMAS'] = pragmas
    # Handle command-line options. These values will override any values
    # that may have been specified in the config file.
    if options.api_key:
        config['AUTHENTICATION'] = options.api_key
    if options.debug:
        config['DEBUG'] = True
    config['HOST'] = options.host or '127.0.0.1'
    config['PORT'] = options.port or 8000
    config['URL_PREFIX'] = options.url_prefix or ''
    if options.paginate_by:
        if options.paginate_by < 1 or options.paginate_by > 1000:
            panic('paginate-by must be between 1 and 1000')
        config['PAGINATE_BY'] = options.paginate_by
    if options.stem:
        if options.stem not in ('simple', 'porter'):
            panic('Unrecognized stemmer. Must be "porter" or "simple".')
        config['STEM'] = options.stem
    return create_server(config, config_file)

def main():
    app = parse_options()
    run(app)

if __name__ == '__main__':
    main()
and the so-called client (scout_client.py):
import base64
import json
try:
    from email.generator import _make_boundary as choose_boundary
except ImportError:
    from mimetools import choose_boundary
import mimetypes
import os
try:
    from urllib.parse import urlencode
except ImportError:
    from urllib import urlencode
try:
    from urllib.request import Request
    from urllib.request import urlopen
except ImportError:
    from urllib2 import Request
    from urllib2 import urlopen
import zlib

ENDPOINT = None
KEY = None

class Scout(object):
    def __init__(self, endpoint=ENDPOINT, key=KEY):
        self.endpoint = endpoint.rstrip('/')
        self.key = key

    def get_full_url(self, url):
        return self.endpoint + url

    def get_raw(self, url, **kwargs):
        headers = {'Content-Type': 'application/json'}
        if self.key:
            headers['key'] = self.key
        if kwargs:
            if '?' not in url:
                url += '?'
            url += urlencode(kwargs, True)
        request = Request(self.get_full_url(url), headers=headers)
        fh = urlopen(request)
        return fh.read()

    def get(self, url, **kwargs):
        return json.loads(self.get_raw(url, **kwargs))

    def post(self, url, data=None, files=None):
        if files:
            return self.post_files(url, data, files)
        else:
            return self.post_json(url, data)

    def post_json(self, url, data=None):
        headers = {'Content-Type': 'application/json'}
        if self.key:
            headers['key'] = self.key
        data = json.dumps(data or {})
        if not isinstance(data, bytes):
            data = data.encode('utf-8')
        request = Request(self.get_full_url(url), data=data, headers=headers)
        return json.loads(urlopen(request).read().decode('utf8'))

    def post_files(self, url, json_data, files=None):
        if not files or not isinstance(files, dict):
            raise ValueError('One or more files is required. Files should be '
                             'passed as a dictionary of filename: file-like-'
                             'object.')
        boundary = choose_boundary()
        form_files = []
        for i, (filename, file_obj) in enumerate(files.items()):
            try:
                data = file_obj.read()
            except AttributeError:
                data = bytes(file_obj)
            mimetype = mimetypes.guess_type(filename)[0]
            form_files.append((
                'file_%s' % i,
                filename,
                mimetype or 'application/octet-stream',
                data))
        part_boundary = '--' + boundary
        parts = [
            part_boundary,
            'Content-Disposition: form-data; name="data"',
            '',
            json.dumps(json_data)]
        for field_name, filename, mimetype, data in form_files:
            parts.extend((
                part_boundary,
                'Content-Disposition: file; name="%s"; filename="%s"' % (
                    field_name, filename),
                'Content-Type: %s' % mimetype,
                '',
                data))
        parts.append('--' + boundary + '--')
        parts.append('')
        headers = {'Content-Type': 'multipart/form-data; boundary=%s' %
                   boundary}
        if self.key:
            headers['key'] = self.key
        data = '\r\n'.join(parts)
        if not isinstance(data, bytes):
            data = data.encode('utf-8')
        request = Request(self.get_full_url(url), data=data, headers=headers)
        return json.loads(urlopen(request).read())

    def delete(self, url):
        headers = {}
        if self.key:
            headers['key'] = self.key
        request = Request(self.get_full_url(url), headers=headers)
        request.get_method = lambda: 'DELETE'
        fh = urlopen(request)
        return json.loads(fh.read())

    def get_indexes(self, **kwargs):
        return self.get('/', **kwargs)['indexes']

    def create_index(self, name):
        return self.post('/', {'name': name})

    def rename_index(self, old_name, new_name):
        return self.post('/%s/' % old_name, {'name': new_name})

    def delete_index(self, name):
        return self.delete('/%s/' % name)

    def get_index(self, name, **kwargs):
        return self.get('/%s/' % name, **kwargs)

    def get_documents(self, **kwargs):
        return self.get('/documents/', **kwargs)

    def create_document(self, content, indexes, identifier=None,
                        attachments=None, **metadata):
        if not isinstance(indexes, (list, tuple)):
            indexes = [indexes]
        post_data = {
            'content': content,
            'identifier': identifier,
            'indexes': indexes,
            'metadata': metadata}
        return self.post('/documents/', post_data, attachments)

    def update_document(self, document_id=None, content=None, indexes=None,
                        metadata=None, identifier=None, attachments=None):
        if not document_id and not identifier:
            raise ValueError('`document_id` must be provided.')
        data = {}
        if content is not None:
            data['content'] = content
        if indexes is not None:
            if not isinstance(indexes, (list, tuple)):
                indexes = [indexes]
            data['indexes'] = indexes
        if metadata is not None:
            data['metadata'] = metadata
        if not data and not attachments:
            raise ValueError('Nothing to update.')
        return self.post('/documents/%s/' % document_id, data, attachments)

    def delete_document(self, document_id=None):
        if not document_id:
            raise ValueError('`document_id` must be provided.')
        return self.delete('/documents/%s/' % document_id)

    def get_document(self, document_id=None):
        if not document_id:
            raise ValueError('`document_id` must be provided.')
        return self.get('/documents/%s/' % document_id)

    def attach_files(self, document_id, attachments):
        return self.post_files('/documents/%s/attachments/' % document_id,
                               {}, attachments)

    def detach_file(self, document_id, filename):
        return self.delete('/documents/%s/attachments/%s/' %
                           (document_id, filename))

    def update_file(self, document_id, filename, file_object):
        return self.post_files('/documents/%s/attachments/%s/' %
                               (document_id, filename),
                               {}, {filename: file_object})

    def get_attachments(self, document_id, **kwargs):
        return self.get('/documents/%s/attachments/' % document_id, **kwargs)

    def get_attachment(self, document_id, filename):
        return self.get('/documents/%s/attachments/%s/' %
                        (document_id, filename))

    def download_attachment(self, document_id, filename):
        return self.get_raw('/documents/%s/attachments/%s/download/' %
                            (document_id, filename))

    def search_attachments(self, **kwargs):
        return self.get('/documents/attachments/', **kwargs)

class SearchProvider(object):
    def content(self, obj):
        raise NotImplementedError

    def identifier(self, obj):
        raise NotImplementedError

    def metadata(self, obj):
        raise NotImplementedError

class SearchSite(object):
    def __init__(self, client, index):
        self.client = client
        self.index = index
        self.registry = {}

    def register(self, model_class, search_provider):
        self.registry.setdefault(model_class, [])
        self.registry[model_class].append(search_provider())

    def unregister(self, model_class, search_provider=None):
        if search_provider is None:
            self.registry.pop(model_class, None)
        elif model_class in self.registry:
            self.registry[model_class] = [
                sp for sp in self.registry[model_class]
                if not isinstance(sp, search_provider)]

    def store(self, obj):
        if type(obj) not in self.registry:
            return False
        for provider in self.registry[type(obj)]:
            content = provider.content(obj)
            try:
                metadata = provider.metadata(obj)
            except NotImplementedError:
                metadata = {}
            try:
                identifier = provider.identifier(obj)
            except NotImplementedError:
                pass
            else:
                metadata['identifier'] = identifier
            self.client.create_document(content, self.index, **metadata)
        return True

    def remove(self, obj):
        if type(obj) not in self.registry:
            return False
        for provider in self.registry[type(obj)]:
            self.client.delete_document(provider.identifier(obj))
        return True
Finally, the documentation for Scout:
https://scout.readthedocs.io/en/latest/server.html#index-detail-index-name
https://charlesleifer.com/blog/meet-scout-a-search-server-powered-by-sqlite/
Any detailed help is much appreciated :)
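
Two things stand out in the attempt above, offered as a hedged guess rather than a confirmed diagnosis: the client.create_index('test3') call is commented out, so the documents may be posted to an index the server has never seen, and the while loop both skips all_pages[0] and runs i up to num, one past the last valid index. A sketch of the same loop with both repaired (still using the test3 index name from the attempt):

client = Scout('http://localhost:8000')
client.create_index('test3')      # make sure the index exists before posting

for page_text in all_pages:       # iterate directly: no off-by-one
    # extractText() can return an empty string for image-only pages, and
    # blank content is a plausible source of a 400, so skip those pages.
    if page_text.strip():
        client.create_document(page_text, ['test3'])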
So I found a lib called Scout and... got it to work!
from scout_client import Scout
# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
import os
from flask import Flask, jsonify, request, render_template, json
import requests

client = Scout('http://localhost:8000')

for k in range(7, 18):
    read_pdf = PyPDF2.PdfFileReader("books/%s.pdf" % (k))
    num = read_pdf.getNumPages()
    print("PDF pages:", num)
    all_pages = []
    for page in range(num):
        data = read_pdf.getPage(page)
        page_text = data.extractText()
        all_pages.append(page_text)
    for z in all_pages:
        url = 'http://localhost:8000/documents/'
        data = {'content': z, 'indexes': ['test13']}
        headers = {
            'Content-Type': 'application/json',
        }
        response = requests.post(url, data=json.dumps(data), headers=headers)
        print(response)
I can now loop through as many PDFs as I want locally, post them to the server for indexing, and search for keywords.
Now I just need help making a basic front end with a search bar that calls data from a JSON response in Python and Flask.
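
For bulk posting, the practical win behind the "sessions" idea is simply reusing one HTTP connection and inspecting each response. A sketch building on the working snippet above (all_pages and the test13 index name are taken from it):

import json
import requests

url = 'http://localhost:8000/documents/'
headers = {'Content-Type': 'application/json'}

# A requests.Session keeps the underlying TCP connection alive between
# posts instead of reconnecting for every document.
with requests.Session() as session:
    for page_text in all_pages:
        payload = {'content': page_text, 'indexes': ['test13']}
        response = session.post(url, data=json.dumps(payload), headers=headers)
        if response.status_code != 200:
            # Surface the server's explanation instead of a bare status code.
            print(response.status_code, response.text)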

Why is there an imaplib.abort EOF error after 10 correctly crawled messages?

I need to copy every mail from my 'inbox' and its subfolders to two new folders, one for short subjects and another for the long ones.
The crawl always aborts after the 10th mail, and I need to repeat the process for 10,000 of them. If someone has a solution I would be very grateful. Here is my code:
import imaplib, getpass, re, email

pattern_uid = re.compile(b'\d+ \(UID (?P<uid>\d+)\)')

def parse_mailbox(data):
    flags, b, c = data.partition(' ')
    separator, b, name = c.partition(' ')
    return (flags, separator.replace('"', ''), name.replace('"', ''))

def connect(email):
    imap = imaplib.IMAP4_SSL("outlook.office365.com", 993)
    password = "*******"
    imap.login(email, password)
    return imap

def disconnect(imap):
    imap.logout()

def parse_uid(data):
    match = pattern_uid.match(data)
    return match.group('uid')

if __name__ == '__main__':
    imap = connect('************')
    rv, data = imap.list()
    # print(data)
    if rv == 'OK':
        for mbox in data:
            flags, separator, name = parse_mailbox(bytes.decode(mbox))
            print(name)
            if 'HasNoChildren' in flags and '2' in name:
                name = name.replace('/ Inbox', 'Inbox')
                rv2, data2 = imap.select('"' + name + '"')
                print(rv2)
                resp, items = imap.search(None, 'All')
                email_ids = items[0].split()
                # print(items)
                print(email_ids)
                # Assuming that you are moving the latest email.
                # for mail in email_ids:
                #     if
                print(len(email_ids))
                for i in range(0, len(email_ids)):
                    print('NEW MESSAGE \n')
                    print(email_ids[i])
                    resp, data = imap.fetch(email_ids[i], "(UID)")
                    rv, sujet = imap.fetch(email_ids[i], "(RFC822)")
                    varSubject = ""
                    for response_part in sujet:
                        if isinstance(response_part, tuple):
                            msg = email.message_from_bytes(response_part[1])
                            varSubject = msg['subject']
                            break
                    print(varSubject)
                    print(i)
                    print(type(data[0]))
                    print(data[0])
                    try:
                        msg_uid = parse_uid(data[0])
                    except AttributeError:
                        continue
                    num = ""
                    dossier = ""
                    if len(varSubject) <= 20:
                        dossier = 'INBOX/sujets_courts'
                        imap.create(dossier)
                    elif len(varSubject) > 20:
                        dossier = 'INBOX/sujets_longs'
                        imap.create(dossier)
                    print(varSubject + '******' + dossier)
                    print(msg_uid)
                    result = imap.uid('COPY', msg_uid, dossier)
                    # if result[0] == 'OK':
                    #     mov, data = imap.uid('STORE', msg_uid, '+FLAGS', '(\Deleted)')
                    #     imap.expunge()
    disconnect(imap)
The email address and the password are substituted with '****'.
Thanks a lot :)
Console output:
Traceback (most recent call last):
  File "C:\Users\lschaub\AppData\Local\Programs\Python\Python36-32\lib\imaplib.py", line 1011, in _command_complete
    typ, data = self._get_tagged_response(tag)
  File "C:\Users\lschaub\AppData\Local\Programs\Python\Python36-32\lib\imaplib.py", line 1131, in _get_tagged_response
    self._get_response()
  File "C:\Users\lschaub\AppData\Local\Programs\Python\Python36-32\lib\imaplib.py", line 1039, in _get_response
    resp = self._get_line()
  File "C:\Users\lschaub\AppData\Local\Programs\Python\Python36-32\lib\imaplib.py", line 1143, in _get_line
    raise self.abort('socket error: EOF')
imaplib.abort: socket error: EOF

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "movemail.py", line 44, in <module>
    resp, data = imap.fetch(email_ids[i], "(UID)")
  File "C:\Users\lschaub\AppData\Local\Programs\Python\Python36-32\lib\imaplib.py", line 526, in fetch
    typ, dat = self._simple_command(name, message_set, message_parts)
  File "C:\Users\lschaub\AppData\Local\Programs\Python\Python36-32\lib\imaplib.py", line 1188, in _simple_command
    return self._command_complete(name, self._command(name, *args))
  File "C:\Users\lschaub\AppData\Local\Programs\Python\Python36-32\lib\imaplib.py", line 1013, in _command_complete
    raise self.abort('command: %s => %s' % (name, val))
imaplib.abort: command: FETCH => socket error: EOF
OK, I figured out that the problem comes from trying to create an already-created folder, but I could not find any folder.exists() look-alike method in the imaplib docs...
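
imaplib really has no folder.exists() equivalent, but CREATE is cheap to probe: it answers 'OK' on success and 'NO' (with a server-dependent explanation such as ALREADYEXISTS) when the mailbox is already there. A sketch of creating the two target folders once, before the message loop, instead of calling imap.create() for every mail:

def ensure_mailbox(imap, name):
    # 'NO' here almost always just means the mailbox already exists,
    # which is fine for our purposes; anything else is worth printing.
    status, response = imap.create(name)
    if status != 'OK':
        print('CREATE %s -> %s %s' % (name, status, response))

ensure_mailbox(imap, 'INBOX/sujets_courts')
ensure_mailbox(imap, 'INBOX/sujets_longs')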

Python script to harvest tweets to a MongoDB works with users but not hashtags. Any ideas why not?

I'm playing around with the Twitter API and am in the process of developing a script to pull all tweets with a certain hashtag down to a local MongoDB. I have it working fine when I'm downloading tweets from users, but when downloading tweets for a hashtag I get:
return loads(fp.read(),
AttributeError: 'int' object has no attribute 'read'
Can anyone offer their infinite wisdom on how I could get this script to work?
To run, save it as a .py file, cd to the folder and run:
python twitter.py
Code:
__author__ = 'Tom Cusack'

import pymongo
import oauth2 as oauth
import urllib2, json
import sys, argparse, time

def oauth_header(url, consumer, token):
    params = {'oauth_version': '1.0',
              'oauth_nonce': oauth.generate_nonce(),
              'oauth_timestamp': int(time.time()),
              }
    req = oauth.Request(method='GET', url=url, parameters=params)
    req.sign_request(oauth.SignatureMethod_HMAC_SHA1(), consumer, token)
    return req.to_header()['Authorization'].encode('utf-8')

def main():
    ### Twitter Settings
    numtweets = '32000'
    verbose = 'store_true'
    retweet = 'store_false'
    CONSUMER_KEY = 'M7Xu9Wte0eIZvqhb4G9HnIn3G'
    CONSUMER_SECRET = 'c8hB4Qwps2aODQUx7UsyzQuCRifEp3PKu6hPQll8wnJGIhbKgZ'
    ACCESS_TOKEN = '3213221313-APuXuNjVMbRbZpu6sVbETbgqkponGsZJVT53QmG'
    ACCESS_SECRET = 'BJHrqWC9ed3pA5oDstSMCYcUcz2pYF3DmJ7jcuDe7yxvi'
    base_url = url = 'https://api.twitter.com/1.1/search/tweets.json?include_entities=true&count=200&q=#mongodb&include_rts=%s' % (retweet)
    oauth_consumer = oauth.Consumer(key=CONSUMER_KEY, secret=CONSUMER_SECRET)
    oauth_token = oauth.Token(key=ACCESS_TOKEN, secret=ACCESS_SECRET)

    ### Mongodb Settings
    uri = 'mongodb://127.0.0.1:27017/SARKY'
    if uri != None:
        try:
            conn = pymongo.MongoClient(uri)
            print 'Pulling Tweets..'
        except:
            print 'Error: Unable to connect to DB. Check uri variable.'
            return
        uri_parts = pymongo.uri_parser.parse_uri(uri)
        db = conn[uri_parts['database']]
        db['twitter-harvest'].ensure_index('id_str')

    ### Helper Variables for Harvest
    max_id = -1
    tweet_count = 0
    stream = 0

    ### Begin Harvesting
    while True:
        auth = oauth_header(url, oauth_consumer, oauth_token)
        headers = {"Authorization": auth}
        request = urllib2.Request(url, headers=headers)
        try:
            stream = urllib2.urlopen(request)
        except urllib2.HTTPError, err:
            if err.code == 404:
                print 'Error: Unknown user. Check --user arg'
                return
            if err.code == 401:
                print 'Error: Unauthorized. Check Twitter credentials'
                return
        tweet_list = json.load(stream)
        if len(tweet_list) == 0:
            print 'No tweets to harvest!'
            return
        if 'errors' in tweet_list:
            print 'Hit rate limit, code: %s, message: %s' % (tweets['errors']['code'], tweets['errors']['message'])
            return
        if max_id == -1:
            tweets = tweet_list
        else:
            tweets = tweet_list[1:]
        if len(tweets) == 0:
            print 'Finished Harvest!'
            return
        for tweet in tweets:
            max_id = id_str = tweet['id_str']
            try:
                if tweet_count == numtweets:
                    print 'Finished Harvest- hit numtweets!'
                    return
                if uri != None:
                    db[user].update({'id_str': id_str}, tweet, upsert=True)
                else:
                    print tweet['text']
                tweet_count += 1
                if verbose == True and uri != None:
                    print tweet['text']
            except Exception, err:
                print 'Unexpected error encountered: %s' % (err)
                return
        url = base_url + '&max_id=' + max_id

if __name__ == '__main__':
    try:
        main()
    except SystemExit as e:
        if e.code == 0:
            pass
You initially set stream = 0. When your try...except block catches an HTTP response with a code that isn't 404 or 401, your except block doesn't return from the function, so execution falls through to json.load(stream) with stream still bound to the integer 0, which is exactly what produces AttributeError: 'int' object has no attribute 'read'.
I'd look more closely at what that response says.
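
Two hedged follow-ups to that, sketched in the same Python 2 style as the script (not tested against it): stop the harvest on any unexpected HTTP error so json.load() is never handed that 0, and percent-encode the hashtag, since a raw '#' in a URL starts the fragment, which urllib2 strips, silently emptying the q parameter:

import urllib
import urllib2

def fetch(request):
    # Return None on any HTTP error instead of falling through with
    # stream still bound to 0.
    try:
        return urllib2.urlopen(request)
    except urllib2.HTTPError, err:
        print 'HTTP %s from Twitter: %s' % (err.code, err.read())
        return None

# '%23mongodb' survives the trip; a raw '#mongodb' does not.
query = urllib.quote_plus('#mongodb')
base_url = ('https://api.twitter.com/1.1/search/tweets.json'
            '?include_entities=true&count=200&q=%s' % query)

In the main loop that becomes stream = fetch(request), followed by an if stream is None: return check before calling json.load(stream).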

Tweepy not working

So I was just trying to run a Tweepy script to collect tweets.
I've set up the database but now I'm running into an error.
Starting...
Started user: user1
Exception in thread Thread-1:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 763, in run
    self.__target(*self.__args, **self.__kwargs)
  File "build/bdist.macosx-10.6-intel/egg/tweepy/streaming.py", line 414, in filter
    self.body['follow'] = u','.join(follow).encode(encoding)
TypeError: sequence item 0: expected string or Unicode, int found
EDIT: The script was actually using from urllib import urlencode_noplus, but _noplus does not exist in urllib, which is why I simply deleted it from the code, although I suspect this of causing the error.
import tweepy
import threading
import logging
from tweepy.models import Status
from tweepy.utils import import_simplejson
from urllib import urlencode
import json
import re

json = import_simplejson()

class Stream:
    def __init__(self, consumer_key, consumer_secret, key, secret, name):
        self.auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        self.auth.set_access_token(key, secret)
        self.tweetsBuffer = TweetsBuffer()
        self.name = name
        self.logger = logging.getLogger('TwitterCollector')
        # check credentials
        if not tweepy.API(self.auth).verify_credentials():
            print "Invalid credentials for user: ", self.name, ".\nExiting..."
            logging.error("Invalid credentials for user: " + self.name + ".\nExiting...")
            exit(0)

    def run(self, users_list=None):
        sl = StreamListener()
        sl.init(self.tweetsBuffer)
        try:
            streamer = tweepy.Stream(auth=self.auth,
                                     listener=sl,
                                     timeout=3000000000,
                                     include_entities=1,
                                     include_rts=1)
            # load friends
            filter = []
            if users_list is None:
                filter = tweepy.API(self.auth).friends_ids()
            else:
                for subList in users_list:
                    for user in subList['users']:
                        filter.append(user.id)
            # remove duplicates
            filter = list(set(filter))
            sThread = threading.Thread(target=streamer.filter, args=(filter,))
            sThread.start()
            return sThread
        except Exception, e:
            print e

    def getTweetsBuffer(self):
        return self.tweetsBuffer

    def getUserList(self, lists):
        if lists is None:
            return None
        api = tweepy.API(self.auth)
        users_list = []
        for list in lists:
            users = []
            members = tweepy.Cursor(
                api.list_members,
                list['owner'],
                list['slug']
            ).items()
            for member in members:
                users.append(member)
            users_list.append(
                {
                    'owner': list['owner'],
                    'slug': list['slug'],
                    'users': users
                })
        return users_list

class StreamListener(tweepy.StreamListener):
    def init(self, tweetsBuffer):
        # set buffer
        self.tweetsBuffer = tweetsBuffer

    def parse_status(self, status, retweet=False):
        tweet = {
            'tweet_id': status.id,
            'tweet_text': status.text,
            'created_at': status.created_at,
            'geo_lat': status.coordinates['coordinates'][0]
                       if not status.coordinates is None
                       else 0,
            'geo_long': status.coordinates['coordinates'][1]
                        if not status.coordinates is None
                        else 0,
            'user_id': status.user.id,
            'tweet_url': "http://twitter.com/" + status.user.id_str + "/status/" + status.id_str,
            'retweet_count': status.retweet_count,
            'original_tweet_id': status.retweeted_status.id
                                 if not retweet and (status.retweet_count > 0)
                                 else 0,
            'urls': status.entities['urls'],
            'hashtags': status.entities['hashtags'],
            'mentions': status.entities['user_mentions']
        }
        # parse user object
        user = {
            'user_id': status.user.id,
            'screen_name': status.user.screen_name,
            'name': status.user.name,
            'followers_count': status.user.followers_count,
            'friends_count': status.user.friends_count,
            'description': status.user.description
                           if not status.user.description is None
                           else "N/A",
            'image_url': status.user.profile_image_url,
            'location': status.user.location
                        if not status.user.location is None
                        else "N/A",
            'created_at': status.user.created_at
        }
        return {'tweet': tweet, 'user': user}

    def on_data(self, data):
        if 'in_reply_to_status_id' in data:
            status = Status.parse(self.api, json.loads(data))
            if self.on_status(status, data) is False:
                return False
        elif 'delete' in data:
            delete = json.loads(data)['delete']['status']
            if self.on_delete(delete['id'], delete['user_id']) is False:
                return False
        elif 'limit' in data:
            if self.on_limit(json.loads(data)['limit']['track']) is False:
                return False

    def on_status(self, status, rawJsonData):
        try:
            # parse tweet
            tweet = self.parse_status(status)
            tweet['raw_json'] = rawJsonData
            self.tweetsBuffer.insert(tweet)
            # parse retweet
            if tweet['tweet']['retweet_count'] > 0:
                retweet = self.parse_status(status.retweeted_status, True)
                retweet['raw_json'] = None
                self.tweetsBuffer.insert(retweet)
        except Exception:
            # Catch any unicode errors while printing to console
            # and just ignore them to avoid breaking application.
            pass

class TweetsBuffer():
    tweetsBuffer = []

    def __init__(self):
        self.lock = threading.Lock()

    def insert(self, tweet):
        self.lock.acquire()
        self.tweetsBuffer.append(tweet)
        self.lock.release()

    def pop(self):
        self.lock.acquire()
        tweet = self.tweetsBuffer.pop() if len(self.tweetsBuffer) > 0 else None
        self.lock.release()
        return tweet
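
The traceback points at u','.join(follow) inside tweepy's streaming.py, which means every element of the follow list has to be a string, while friends_ids() and user.id both return ints. A minimal sketch of the fix inside Stream.run(), just before the thread is started:

# Stream.filter() joins the follow list with ','.join(...), so the ids
# must be strings; friends_ids() and user.id hand back ints.
filter = [str(user_id) for user_id in set(filter)]
sThread = threading.Thread(target=streamer.filter, args=(filter,))
sThread.start()

The urlencode_noplus edit mentioned above is unrelated to this particular TypeError: the join fails before anything is URL-encoded.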
