Tweepy not working - python

So I was just trying to run a tweepy script to collect tweets.
I've setup the database but now I'm running into an error.
Starting...
Started user: user1
Exception in thread Thread-1:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 810, in __bootstrap_inner
self.run()
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 763, in run
self.__target(*self.__args, **self.__kwargs)
File "build/bdist.macosx-10.6-intel/egg/tweepy/streaming.py", line 414, in filter
self.body['follow'] = u','.join(follow).encode(encoding)
TypeError: sequence item 0: expected string or Unicode, int found
EDIT: The script is actually using: from urllib import urlencode_noplus but the _noplus is not in the urllib which is why I simply deleted it from the code. Although I suspect this for causing the error..
import tweepy
import threading
import logging
from tweepy.models import Status
from tweepy.utils import import_simplejson
from urllib import urlencode
import json
import re
json = import_simplejson()
class Stream:
def __init__(self, consumer_key, consumer_secret,
key, secret, name):
self.auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
self.auth.set_access_token(key, secret)
self.tweetsBuffer = TweetsBuffer()
self.name = name
self.logger = logging.getLogger('TwitterCollector')
#check credentials
if not tweepy.API(self.auth).verify_credentials():
print "Invalid credentials for user: ",self.name,".\nExiting..."
logging.error("Invalid credentials for user: "+self.name+".\nExiting...")
exit(0)
def run(self, users_list = None):
sl = StreamListener()
sl.init(self.tweetsBuffer)
try:
streamer = tweepy.Stream(auth=self.auth,
listener=sl,
timeout=3000000000,
include_entities=1,
include_rts=1)
#load friends
filter = []
if users_list is None:
filter = tweepy.API(self.auth).friends_ids()
else:
for subList in users_list:
for user in subList['users']:
filter.append(user.id)
#remove duplicates
filter = list(set(filter))
sThread = threading.Thread(target=streamer.filter, args=(filter,))
sThread.start()
return sThread
except Exception, e:
print e
def getTweetsBuffer(self):
return self.tweetsBuffer
def getUserList(self, lists):
if lists is None:
return None
api = tweepy.API(self.auth)
users_list = []
for list in lists:
users = []
members = tweepy.Cursor(
api.list_members,
list['owner'],
list['slug']
).items()
for member in members:
users.append(member)
users_list.append(
{
'owner' : list['owner'],
'slug' : list['slug'],
'users' : users
})
return users_list
class StreamListener(tweepy.StreamListener):
def init(self, tweetsBuffer):
#set buffer
self.tweetsBuffer = tweetsBuffer
def parse_status(self, status, retweet = False):
tweet = {
'tweet_id':status.id,
'tweet_text':status.text,
'created_at':status.created_at,
'geo_lat':status.coordinates['coordinates'][0]
if not status.coordinates is None
else 0,
'geo_long': status.coordinates['coordinates'][1]
if not status.coordinates is None
else 0,
'user_id':status.user.id,
'tweet_url':"http://twitter.com/"+status.user.id_str+"/status/"+status.id_str,
'retweet_count':status.retweet_count,
'original_tweet_id':status.retweeted_status.id
if not retweet and (status.retweet_count > 0)
else 0,
'urls': status.entities['urls'],
'hashtags':status.entities['hashtags'],
'mentions': status.entities['user_mentions']
}
#parse user object
user = {
'user_id':status.user.id,
'screen_name': status.user.screen_name,
'name': status.user.name,
'followers_count': status.user.followers_count,
'friends_count': status.user.friends_count,
'description': status.user.description
if not status.user.description is None
else "N/A",
'image_url': status.user.profile_image_url,
'location': status.user.location
if not status.user.location is None
else "N/A",
'created_at': status.user.created_at
}
return {'tweet':tweet, 'user':user}
def on_data(self, data):
if 'in_reply_to_status_id' in data:
status = Status.parse(self.api, json.loads(data))
if self.on_status(status, data) is False:
return False
elif 'delete' in data:
delete = json.loads(data)['delete']['status']
if self.on_delete(delete['id'], delete['user_id']) is False:
return False
elif 'limit' in data:
if self.on_limit(json.loads(data)['limit']['track']) is False:
return False
def on_status(self, status, rawJsonData):
try:
#parse tweet
tweet = self.parse_status(status)
tweet['raw_json'] = rawJsonData
self.tweetsBuffer.insert(tweet)
#parse retweet
if tweet['tweet']['retweet_count'] > 0:
retweet = self.parse_status(status.retweeted_status, True)
retweet['raw_json'] = None
self.tweetsBuffer.insert(retweet)
except Exception:
# Catch any unicode errors while printing to console
# and just ignore them to avoid breaking application.
pass
class TweetsBuffer():
tweetsBuffer = []
def __init__(self):
self.lock = threading.Lock()
def insert(self, tweet):
self.lock.acquire()
self.tweetsBuffer.append(tweet)
self.lock.release()
def pop(self):
self.lock.acquire()
tweet = self.tweetsBuffer.pop() if len(self.tweetsBuffer) > 0 else None
self.lock.release()
return tweet

Related

How to remove the error "TypeError: can't concat str to bytes"

I tried the GitHub code to make a USB connection with an iOS device and Python on a PC.
The above code was created in Python2, so I ran 2to3 -w usbmux.py to convert it to Python3 code.
The Python3 code is as follows.
import socket, struct, select, sys
try:
import plistlib
haveplist = True
except:
haveplist = False
class MuxError(Exception):
pass
class MuxVersionError(MuxError):
pass
class SafeStreamSocket:
def __init__(self, address, family):
self.sock = socket.socket(family, socket.SOCK_STREAM)
self.sock.connect(address)
def send(self, msg):
totalsent = 0
while totalsent < len(msg):
sent = self.sock.send(msg[totalsent:])
if sent == 0:
raise MuxError("socket connection broken")
totalsent = totalsent + sent
def recv(self, size):
msg = ''
while len(msg) < size:
chunk = self.sock.recv(size-len(msg))
if chunk == '':
raise MuxError("socket connection broken")
msg = msg + chunk
return msg
class MuxDevice(object):
def __init__(self, devid, usbprod, serial, location):
self.devid = devid
self.usbprod = usbprod
self.serial = serial
self.location = location
def __str__(self):
return "<MuxDevice: ID %d ProdID 0x%04x Serial '%s' Location 0x%x>"%(self.devid, self.usbprod, self.serial, self.location)
class BinaryProtocol(object):
TYPE_RESULT = 1
TYPE_CONNECT = 2
TYPE_LISTEN = 3
TYPE_DEVICE_ADD = 4
TYPE_DEVICE_REMOVE = 5
VERSION = 0
def __init__(self, socket):
self.socket = socket
self.connected = False
def _pack(self, req, payload):
if req == self.TYPE_CONNECT:
return struct.pack("IH", payload['DeviceID'], payload['PortNumber']) + "\x00\x00"
elif req == self.TYPE_LISTEN:
return ""
else:
raise ValueError("Invalid outgoing request type %d"%req)
def _unpack(self, resp, payload):
if resp == self.TYPE_RESULT:
return {'Number':struct.unpack("I", payload)[0]}
elif resp == self.TYPE_DEVICE_ADD:
devid, usbpid, serial, pad, location = struct.unpack("IH256sHI", payload)
serial = serial.split("\0")[0]
return {'DeviceID': devid, 'Properties': {'LocationID': location, 'SerialNumber': serial, 'ProductID': usbpid}}
elif resp == self.TYPE_DEVICE_REMOVE:
devid = struct.unpack("I", payload)[0]
return {'DeviceID': devid}
else:
raise MuxError("Invalid incoming request type %d"%req)
def sendpacket(self, req, tag, payload={}):
payload = self._pack(req, payload)
if self.connected:
raise MuxError("Mux is connected, cannot issue control packets")
length = 16 + len(payload)
data = struct.pack("IIII", length, self.VERSION, req, tag) + payload
self.socket.send(data)
def getpacket(self):
if self.connected:
raise MuxError("Mux is connected, cannot issue control packets")
dlen = self.socket.recv(4)
dlen = struct.unpack("I", dlen)[0]
body = self.socket.recv(dlen - 4)
version, resp, tag = struct.unpack("III",body[:0xc])
if version != self.VERSION:
raise MuxVersionError("Version mismatch: expected %d, got %d"%(self.VERSION,version))
payload = self._unpack(resp, body[0xc:])
return (resp, tag, payload)
class PlistProtocol(BinaryProtocol):
TYPE_RESULT = "Result"
TYPE_CONNECT = "Connect"
TYPE_LISTEN = "Listen"
TYPE_DEVICE_ADD = "Attached"
TYPE_DEVICE_REMOVE = "Detached" #???
TYPE_PLIST = 8
VERSION = 1
def __init__(self, socket):
if not haveplist:
raise Exception("You need the plistlib module")
BinaryProtocol.__init__(self, socket)
def _pack(self, req, payload):
return payload
def _unpack(self, resp, payload):
return payload
def sendpacket(self, req, tag, payload={}):
payload['ClientVersionString'] = 'usbmux.py by marcan'
if isinstance(req, int):
req = [self.TYPE_CONNECT, self.TYPE_LISTEN][req-2]
payload['MessageType'] = req
payload['ProgName'] = 'tcprelay'
BinaryProtocol.sendpacket(self, self.TYPE_PLIST, tag, plistlib.writePlistToString(payload))
def getpacket(self):
resp, tag, payload = BinaryProtocol.getpacket(self)
if resp != self.TYPE_PLIST:
raise MuxError("Received non-plist type %d"%resp)
payload = plistlib.readPlistFromString(payload)
return payload['MessageType'], tag, payload
class MuxConnection(object):
def __init__(self, socketpath, protoclass):
self.socketpath = socketpath
if sys.platform in ['win32', 'cygwin']:
family = socket.AF_INET
address = ('127.0.0.1', 27015)
else:
family = socket.AF_UNIX
address = self.socketpath
self.socket = SafeStreamSocket(address, family)
self.proto = protoclass(self.socket)
self.pkttag = 1
self.devices = []
def _getreply(self):
while True:
resp, tag, data = self.proto.getpacket()
if resp == self.proto.TYPE_RESULT:
return tag, data
else:
raise MuxError("Invalid packet type received: %d"%resp)
def _processpacket(self):
resp, tag, data = self.proto.getpacket()
if resp == self.proto.TYPE_DEVICE_ADD:
self.devices.append(MuxDevice(data['DeviceID'], data['Properties']['ProductID'], data['Properties']['SerialNumber'], data['Properties']['LocationID']))
elif resp == self.proto.TYPE_DEVICE_REMOVE:
for dev in self.devices:
if dev.devid == data['DeviceID']:
self.devices.remove(dev)
elif resp == self.proto.TYPE_RESULT:
raise MuxError("Unexpected result: %d"%resp)
else:
raise MuxError("Invalid packet type received: %d"%resp)
def _exchange(self, req, payload={}):
mytag = self.pkttag
self.pkttag += 1
self.proto.sendpacket(req, mytag, payload)
recvtag, data = self._getreply()
if recvtag != mytag:
raise MuxError("Reply tag mismatch: expected %d, got %d"%(mytag, recvtag))
return data['Number']
def listen(self):
ret = self._exchange(self.proto.TYPE_LISTEN)
if ret != 0:
raise MuxError("Listen failed: error %d"%ret)
def process(self, timeout=None):
if self.proto.connected:
raise MuxError("Socket is connected, cannot process listener events")
rlo, wlo, xlo = select.select([self.socket.sock], [], [self.socket.sock], timeout)
if xlo:
self.socket.sock.close()
raise MuxError("Exception in listener socket")
if rlo:
self._processpacket()
def connect(self, device, port):
ret = self._exchange(self.proto.TYPE_CONNECT, {'DeviceID':device.devid, 'PortNumber':((port<<8) & 0xFF00) | (port>>8)})
if ret != 0:
raise MuxError("Connect failed: error %d"%ret)
self.proto.connected = True
return self.socket.sock
def close(self):
self.socket.sock.close()
class USBMux(object):
def __init__(self, socketpath=None):
if socketpath is None:
if sys.platform == 'darwin':
socketpath = "/var/run/usbmuxd"
else:
socketpath = "/var/run/usbmuxd"
self.socketpath = socketpath
self.listener = MuxConnection(socketpath, BinaryProtocol)
try:
self.listener.listen()
self.version = 0
self.protoclass = BinaryProtocol
except MuxVersionError:
self.listener = MuxConnection(socketpath, PlistProtocol)
self.listener.listen()
self.protoclass = PlistProtocol
self.version = 1
self.devices = self.listener.devices
def process(self, timeout=None):
self.listener.process(timeout)
def connect(self, device, port):
connector = MuxConnection(self.socketpath, self.protoclass)
return connector.connect(device, port)
if __name__ == "__main__":
mux = USBMux()
print("Waiting for devices...")
if not mux.devices:
mux.process(0.1)
while True:
print("Devices:")
for dev in mux.devices:
print(dev)
mux.process()
I ran the above code in Python 3.7 and got the error.
The details of the error are as follows.
File "C:\Users\taichi\Documents\usbmux.py", line 81, in sendpacket
data = struct.pack("IIII", length, self.VERSION, req, tag) + payload
TypeError: can't concat str to bytes
How can I rewrite this code to get rid of the error?
struct.pack returns bytes in python 3, not a string as it did in python 2. Have a look at this answer to see one way to emulate the python 2 behavior.
Edit: actually you may need to do something slightly different, since it looks like socket.send wants bytes as input. So you'd need to convert the combined payload to bytes.
You could try changing the following code that seems to be populating the payload to return bytes in both cases. Notice the "b" added in the return values to specify bytes instead of str. You are hitting the TYPE_LISTEN case but should test the other case as well, as it looks like it had the same bug trying to concat str and bytes:
def _pack(self, req, payload):
if req == self.TYPE_CONNECT:
return struct.pack("IH", payload['DeviceID'], payload['PortNumber']) + b"\x00\x00"
elif req == self.TYPE_LISTEN:
return b""
else:
raise ValueError("Invalid outgoing request type %d"%req)
You'll likely need to edit _pack in PlistProtocol as well.
The line:
data = struct.pack("IIII", length, self.VERSION, req, tag) + payload
is causing the trouble. According to the struct.pack() docs, the method returns bytes. You need to convert your payload also into bytes so that the two can be concatenated, making the error go away.
Not sure what your payload is, but encoding the payload using something like 'utf-8' might do the trick. I believe a simple payload.encode() will work, since 'utf-8' is the default encoding. So, try:
data = struct.pack("IIII", length, self.VERSION, req, tag) + payload.encode()
# add this ^^^^^^^^^

How do you send many documents to a Scout Server in Python using the Python Scout Client?

Im trying to index PDF text to a python lib called Scout. I have tried doing the same thing with elasticsearch too. In both cases I can't figure out how to post text to an index in bulk, using python.
After a lot of research, I believe I need to use async http request. The only problem is, I don't understand async calls nor do I understand what a Scout python 'client' really is. I'm a self-taught programmer and still have many things I don't understand. my thought is the client cant stay open for a loop to keep using the connection. I have seen coding concepts like "await" and "sessions" in many books on programming. However, I don't know how to implement these concepts. Can someone help me write some python code that will successfully post new documents to a running scout server and explain how it's done?
Here is My attempt:
from scout_client import Scout
# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
import os
from flask import Flask, jsonify, request, render_template, json
# before you start, Run the Server.py file and create a Sqlite DB
# Step one loop though PDF in 'books' folder
for k in range(14,15):
# open the pdf file
read_pdf = PyPDF2.PdfFileReader("books/%s.pdf"%(k))
# Test to see if Step one is complete and succesful
#print (read_pdf)
# Step Two Gain intel on how many Pages are in the Document
# get the page numbers
num = read_pdf.getNumPages()
print ("PDF pages:", num)
# Step Three understand the data by page
# create a dictionary object for page data
all_pages = []
# Step For Create a new index in Scout Server
# client.create_index('test3')
# iterate the page numbers
for page in range(num):
data = read_pdf.getPage(page)
#page_mode = read_pdf.getPageMode()
# extract the page's text
page_text = data.extractText()
# put the text data into the dict
all_pages.append(page_text)
# initiate the Client from scout_client.py
client = Scout('http://localhost:8000')
# THe issue: I tryed for loops, and while loops but cant get past: urllib.error.HTTPError: HTTP Error 400: BAD REQUEST
i = 1
while i <= num:
client.create_document(all_pages[i],['test3'])
print(i,"....done")
i += 1
I get an error:
Traceback (most recent call last):
File "test.py", line 37, in <module>
client.create_document(all_pages[i],['test3'])
File "../Searchtest4/scout/scout_client.py", line 149, in create_document
return self.post('/documents/', post_data, attachments)
File "../Searchtest4/scout/scout_client.py", line 53, in post
return self.post_json(url, data)
File "../Searchtest4/scout/scout_client.py", line 63, in post_json
return json.loads(urlopen(request).read().decode('utf8'))
File "../lib/python3.7/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "../lib/python3.7/urllib/request.py", line 531, in open
response = meth(req, response)
File "../lib/python3.7/urllib/request.py", line 641, in http_response
'http', request, response, code, msg, hdrs)
File "../lib/python3.7/urllib/request.py", line 569, in error
return self._call_chain(*args)
File "../lib/python3.7/urllib/request.py", line 503, in _call_chain
result = func(*args)
File "../lib/python3.7/urllib/request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
**urllib.error.HTTPError: HTTP Error 400: BAD REQUEST**
Here is the server that runs fine (server.py):
import logging
import optparse
import os
import sys
from flask import Flask
from werkzeug.serving import run_simple
from scout.exceptions import InvalidRequestException
from scout.models import database
from scout.models import Attachment
from scout.models import BlobData
from scout.models import Document
from scout.models import Index
from scout.models import IndexDocument
from scout.models import Metadata
from scout.views import register_views
logger = logging.getLogger('scout')
def create_server(config=None, config_file=None):
app = Flask(__name__)
# Configure application using a config file.
if config_file is not None:
app.config.from_pyfile(config_file)
# (Re-)Configure application using command-line switches/environment flags.
if config is not None:
app.config.update(config)
# Initialize the SQLite database.
initialize_database(app.config.get('DATABASE') or 'scout.db',
pragmas=app.config.get('SQLITE_PRAGMAS') or None)
register_views(app)
#app.errorhandler(InvalidRequestException)
def handle_invalid_request(exc):
return exc.response()
#app.before_request
def connect_database():
if database.database != ':memory:':
database.connect()
#app.teardown_request
def close_database(exc):
if database.database != ':memory:' and not database.is_closed():
database.close()
return app
def initialize_database(database_file, pragmas=None):
database.init(database_file, pragmas=pragmas)
try:
meth = database.execution_context
except AttributeError:
meth = database
with meth:
database.create_tables([
Attachment,
BlobData,
Document,
Index,
IndexDocument,
Metadata])
def run(app):
if app.config['DEBUG']:
app.run(host=app.config['HOST'], port=app.config['PORT'], debug=True)
else:
run_simple(
hostname=app.config['HOST'],
port=app.config['PORT'],
application=app,
threaded=True)
def panic(s, exit_code=1):
sys.stderr.write('\033[91m%s\033[0m\n' % s)
sys.stderr.flush()
sys.exit(exit_code)
def get_option_parser():
parser = optparse.OptionParser()
parser.add_option(
'-H',
'--host',
default='127.0.0.1',
dest='host',
help='The hostname to listen on. Defaults to 127.0.0.1.')
parser.add_option(
'-p',
'--port',
default=8000,
dest='port',
help='The port to listen on. Defaults to 8000.',
type='int')
parser.add_option(
'-u',
'--url-prefix',
dest='url_prefix',
help='URL path to prefix Scout API.')
parser.add_option(
'-s',
'--stem',
dest='stem',
help='Specify stemming algorithm for content.')
parser.add_option(
'-d',
'--debug',
action='store_true',
dest='debug',
help='Run Flask app in debug mode.')
parser.add_option(
'-c',
'--config',
dest='config',
help='Configuration module (python file).')
parser.add_option(
'--paginate-by',
default=50,
dest='paginate_by',
help='Number of documents displayed per page of results, default=50',
type='int')
parser.add_option(
'-k',
'--api-key',
dest='api_key',
help='Set the API key required to access Scout.')
parser.add_option(
'-C',
'--cache-size',
default=64,
dest='cache_size',
help='SQLite page-cache size (MB). Defaults to 64MB.',
type='int')
parser.add_option(
'-f',
'--fsync',
action='store_true',
dest='fsync',
help='Synchronize database to disk on every write.')
parser.add_option(
'-j',
'--journal-mode',
default='wal',
dest='journal_mode',
help='SQLite journal mode. Defaults to WAL (recommended).')
parser.add_option(
'-l',
'--logfile',
dest='logfile',
help='Log file')
return parser
def parse_options():
option_parser = get_option_parser()
options, args = option_parser.parse_args()
if options.logfile:
handler = logging.FileHandler(options.logfile)
logger.addHandler(handler)
config_file = os.environ.get('SCOUT_CONFIG') or options.config
config = {'DATABASE': os.environ.get('SCOUT_DATABASE')}
if len(args) == 0 and not config['DATABASE']:
panic('Error: missing required path to database file.')
elif len(args) > 1:
panic('Error: [%s] only accepts one argument, which is the path '
'to the database file.' % __file__)
elif args:
config['DATABASE'] = args[0]
pragmas = [('journal_mode', options.journal_mode)]
if options.cache_size:
pragmas.append(('cache_size', -1024 * options.cache_size))
if not options.fsync:
pragmas.append(('synchronous', 0))
config['SQLITE_PRAGMAS'] = pragmas
# Handle command-line options. These values will override any values
# that may have been specified in the config file.
if options.api_key:
config['AUTHENTICATION'] = options.api_key
if options.debug:
config['DEBUG'] = True
config['HOST'] = options.host or '127.0.0.1'
config['PORT'] = options.port or 8000
config['URL_PREFIX'] = options.url_prefix or ''
if options.paginate_by:
if options.paginate_by < 1 or options.paginate_by > 1000:
panic('paginate-by must be between 1 and 1000')
config['PAGINATE_BY'] = options.paginate_by
if options.stem:
if options.stem not in ('simple', 'porter'):
panic('Unrecognized stemmer. Must be "porter" or "simple".')
config['STEM'] = options.stem
return create_server(config, config_file)
def main():
app = parse_options()
run(app)
if __name__ == '__main__':
main()
and the so-called client (scout_client.py):
import base64
import json
try:
from email.generator import _make_boundary as choose_boundary
except ImportError:
from mimetools import choose_boundary
import mimetypes
import os
try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
try:
from urllib.request import Request
from urllib.request import urlopen
except ImportError:
from urllib2 import Request
from urllib2 import urlopen
import zlib
ENDPOINT = None
KEY = None
class Scout(object):
def __init__(self, endpoint=ENDPOINT, key=KEY):
self.endpoint = endpoint.rstrip('/')
self.key = key
def get_full_url(self, url):
return self.endpoint + url
def get_raw(self, url, **kwargs):
headers = {'Content-Type': 'application/json'}
if self.key:
headers['key'] = self.key
if kwargs:
if '?' not in url:
url += '?'
url += urlencode(kwargs, True)
request = Request(self.get_full_url(url), headers=headers)
fh = urlopen(request)
return fh.read()
def get(self, url, **kwargs):
return json.loads(self.get_raw(url, **kwargs))
def post(self, url, data=None, files=None):
if files:
return self.post_files(url, data, files)
else:
return self.post_json(url, data)
def post_json(self, url, data=None):
headers = {'Content-Type': 'application/json'}
if self.key:
headers['key'] = self.key
data = json.dumps(data or {})
if not isinstance(data, bytes):
data = data.encode('utf-8')
request = Request(self.get_full_url(url), data=data, headers=headers)
return json.loads(urlopen(request).read().decode('utf8'))
def post_files(self, url, json_data, files=None):
if not files or not isinstance(files, dict):
raise ValueError('One or more files is required. Files should be '
'passed as a dictionary of filename: file-like-'
'object.')
boundary = choose_boundary()
form_files = []
for i, (filename, file_obj) in enumerate(files.items()):
try:
data = file_obj.read()
except AttributeError:
data = bytes(file_obj)
mimetype = mimetypes.guess_type(filename)[0]
form_files.append((
'file_%s' % i,
filename,
mimetype or 'application/octet-stream',
data))
part_boundary = '--' + boundary
parts = [
part_boundary,
'Content-Disposition: form-data; name="data"',
'',
json.dumps(json_data)]
for field_name, filename, mimetype, data in form_files:
parts.extend((
part_boundary,
'Content-Disposition: file; name="%s"; filename="%s"' % (
field_name, filename),
'Content-Type: %s' % mimetype,
'',
data))
parts.append('--' + boundary + '--')
parts.append('')
headers = {'Content-Type': 'multipart/form-data; boundary=%s' %
boundary}
if self.key:
headers['key'] = self.key
data = '\r\n'.join(parts)
if not isinstance(data, bytes):
data = data.encode('utf-8')
request = Request(self.get_full_url(url), data=data, headers=headers)
return json.loads(urlopen(request).read())
def delete(self, url):
headers = {}
if self.key:
headers['key'] = self.key
request = Request(self.get_full_url(url), headers=headers)
request.get_method = lambda: 'DELETE'
fh = urlopen(request)
return json.loads(fh.read())
def get_indexes(self, **kwargs):
return self.get('/', **kwargs)['indexes']
def create_index(self, name):
return self.post('/', {'name': name})
def rename_index(self, old_name, new_name):
return self.post('/%s/' % old_name, {'name': new_name})
def delete_index(self, name):
return self.delete('/%s/' % name)
def get_index(self, name, **kwargs):
return self.get('/%s/' % name, **kwargs)
def get_documents(self, **kwargs):
return self.get('/documents/', **kwargs)
def create_document(self, content, indexes, identifier=None,
attachments=None, **metadata):
if not isinstance(indexes, (list, tuple)):
indexes = [indexes]
post_data = {
'content': content,
'identifier': identifier,
'indexes': indexes,
'metadata': metadata}
return self.post('/documents/', post_data, attachments)
def update_document(self, document_id=None, content=None, indexes=None,
metadata=None, identifier=None, attachments=None):
if not document_id and not identifier:
raise ValueError('`document_id` must be provided.')
data = {}
if content is not None:
data['content'] = content
if indexes is not None:
if not isinstance(indexes, (list, tuple)):
indexes = [indexes]
data['indexes'] = indexes
if metadata is not None:
data['metadata'] = metadata
if not data and not attachments:
raise ValueError('Nothing to update.')
return self.post('/documents/%s/' % document_id, data, attachments)
def delete_document(self, document_id=None):
if not document_id:
raise ValueError('`document_id` must be provided.')
return self.delete('/documents/%s/' % document_id)
def get_document(self, document_id=None):
if not document_id:
raise ValueError('`document_id` must be provided.')
return self.get('/documents/%s/' % document_id)
def attach_files(self, document_id, attachments):
return self.post_files('/documents/%s/attachments/' % document_id,
{}, attachments)
def detach_file(self, document_id, filename):
return self.delete('/documents/%s/attachments/%s/' %
(document_id, filename))
def update_file(self, document_id, filename, file_object):
return self.post_files('/documents/%s/attachments/%s/' %
(document_id, filename),
{}, {filename: file_object})
def get_attachments(self, document_id, **kwargs):
return self.get('/documents/%s/attachments/' % document_id, **kwargs)
def get_attachment(self, document_id, filename):
return self.get('/documents/%s/attachments/%s/' %
(document_id, filename))
def download_attachment(self, document_id, filename):
return self.get_raw('/documents/%s/attachments/%s/download/' %
(document_id, filename))
def search_attachments(self, **kwargs):
return self.get('/documents/attachments/', **kwargs)
class SearchProvider(object):
def content(self, obj):
raise NotImplementedError
def identifier(self, obj):
raise NotImplementedError
def metadata(self, obj):
raise NotImplementedError
class SearchSite(object):
def __init__(self, client, index):
self.client = client
self.index = index
self.registry = {}
def register(self, model_class, search_provider):
self.registry.setdefault(model_class, [])
self.registry[model_class].append(search_provider())
def unregister(self, model_class, search_provider=None):
if search_provider is None:
self.registry.pop(model_class, None)
elif model_class in self.registry:
self.registry[model_class] = [
sp for sp in self.registry[model_class]
if not isinstance(sp, search_provider)]
def store(self, obj):
if type(obj) not in self.registry:
return False
for provider in self.registry[type(obj)]:
content = provider.content(obj)
try:
metadata = provider.metadata(obj)
except NotImplementedError:
metadata = {}
try:
identifier = provider.identifier(obj)
except NotImplementedError:
pass
else:
metadata['identifier'] = identifier
self.client.create_document(content, self.index, **metadata)
return True
def remove(self, obj):
if type(obj) not in self.registry:
return False
for provider in self.registry[type(obj)]:
self.client.delete_document(provider.identifier(obj))
return True
Finally the Documentation for Scout:
https://scout.readthedocs.io/en/latest/server.html#index-detail-index-name
https://charlesleifer.com/blog/meet-scout-a-search-server-powered-by-sqlite/
Any Detailed Help is much appreciated:)
So i find a lib called scout and...got it to work!
from scout_client import Scout
# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
import os
from flask import Flask, jsonify, request, render_template, json
client = Scout('http://localhost:8000')
for k in range(7,18):
read_pdf = PyPDF2.PdfFileReader("books/%s.pdf"%(k))
num = read_pdf.getNumPages()
print ("PDF pages:", num)
all_pages = []
for page in range(num):
data = read_pdf.getPage(page)
page_text = data.extractText()
all_pages.append(page_text)
import requests
for z in all_pages:
url = 'http://localhost:8000/documents/'
data = {'content': z, 'indexes': ['test13']}
headers = {
'Content-Type': 'application/json',
}
response = requests.post(url, data=json.dumps(data), headers=headers)
print(response)
I can now loop though as many PDF's as I want locally
Post to the server for indexing
and search for keywords
Now I just need help with Making a basic front end with a search bar that calls data from a JSON response in python and flask.

Python Ivona (Pyvona) script issue (local variable unbound)

I'm using the pyvona package (from July 3rd 2016). I have all dependencies installed. It works correctly when I call it to speak for the first time. But if I run the command again, it gives me the local unbound error:
>>> import pyvona
>>> v = pyvona.create_voice('<key>', '<secret>')
>>> v.speak('hello')
>>> v.speak('hello')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Python34\lib\site-packages\pyvona.py", line 159, in speak
channel.play(sound)
UnboundLocalError: local variable 'channel' referenced before assignment
>>>
Here's the pyvona.py script:
#!/usr/bin/env python
# encoding: utf-8
"""Pyvona : an IVONA python library
Author: Zachary Bears
Contact Email: bears.zachary#gmail.com
Note: Full operation of this library requires the requests and pygame libraries
"""
import datetime
import hashlib
import hmac
import json
import tempfile
import contextlib
import os
class PyvonaException(Exception):
pass
try:
import pygame
except ImportError:
pygame_available = False
else:
pygame_available = True
try:
import requests
requests.packages.urllib3.disable_warnings()
except ImportError:
msg = 'The requests library is essential for Pyvona operation. '
msg += 'Without it, Pyvona will not function correctly.'
raise PyvonaException(msg)
_amazon_date_format = '%Y%m%dT%H%M%SZ'
_date_format = '%Y%m%d'
def create_voice(access_key, secret_key):
"""Creates and returns a voice object to interact with
"""
return Voice(access_key, secret_key)
class Voice(object):
"""An object that contains all the required methods for interacting
with the IVONA text-to-speech system
"""
voice_name = None
language = None
gender = None
speech_rate = None
sentence_break = None
paragraph_break = None
_codec = "ogg"
region_options = {
'us-east': 'us-east-1',
'us-west': 'us-west-2',
'eu-west': 'eu-west-1',
}
access_key = None
secret_key = None
algorithm = 'AWS4-HMAC-SHA256'
signed_headers = 'content-type;host;x-amz-content-sha256;x-amz-date'
_region = None
_host = None
#property
def region(self):
return self._region
#region.setter
def region(self, region_name):
self._region = self.region_options.get(region_name, 'us-east-1')
self._host = 'tts.{}.ivonacloud.com'.format(self._region)
#property
def codec(self):
return self._codec
#codec.setter
def codec(self, codec):
if codec not in ["mp3", "ogg"]:
raise PyvonaException(
"Invalid codec specified. Please choose 'mp3' or 'ogg'")
self._codec = codec
#contextlib.contextmanager
def use_ogg_codec(self):
current_codec = self.codec
self.codec = "ogg"
try:
yield
finally:
self.codec = current_codec
def fetch_voice_ogg(self, text_to_speak, filename):
"""Fetch an ogg file for given text and save it to the given file name
"""
with self.use_ogg_codec():
self.fetch_voice(text_to_speak, filename)
def fetch_voice(self, text_to_speak, filename):
"""Fetch a voice file for given text and save it to the given file name
"""
file_extension = ".{codec}".format(codec=self.codec)
filename += file_extension if not filename.endswith(
file_extension) else ""
with open(filename, 'wb') as f:
self.fetch_voice_fp(text_to_speak, f)
def fetch_voice_fp(self, text_to_speak, fp):
"""Fetch a voice file for given text and save it to the given file pointer
"""
r = self._send_amazon_auth_packet_v4(
'POST', 'tts', 'application/json', '/CreateSpeech', '',
self._generate_payload(text_to_speak), self._region, self._host)
if r.content.startswith(b'{'):
raise PyvonaException('Error fetching voice: {}'.format(r.content))
else:
fp.write(r.content)
def speak(self, text_to_speak, use_cache=False):
"""Speak a given text
"""
if not pygame_available:
raise PyvonaException(
"Pygame not installed. Please install to use speech.")
if not pygame.mixer.get_init():
pygame.mixer.init()
channel = pygame.mixer.Channel(5)
if use_cache is False:
with tempfile.SpooledTemporaryFile() as f:
with self.use_ogg_codec():
self.fetch_voice_fp(text_to_speak, f)
f.seek(0)
sound = pygame.mixer.Sound(f)
else:
cache_f = hashlib.md5(text_to_speak).hexdigest() + '.ogg'
speech_cache_dir = os.getcwd() + '/speech_cache/'
if not os.path.isdir(speech_cache_dir):
os.makedirs(speech_cache_dir)
if not os.path.isfile(speech_cache_dir + cache_f):
with self.use_ogg_codec():
self.fetch_voice(text_to_speak, 'speech_cache/' + cache_f)
f = speech_cache_dir + cache_f
sound = pygame.mixer.Sound(f)
channel.play(sound)
while channel.get_busy():
pass
def list_voices(self):
"""Returns all the possible voices
"""
r = self._send_amazon_auth_packet_v4(
'POST', 'tts', 'application/json', '/ListVoices', '', '',
self._region, self._host)
return r.json()
def _generate_payload(self, text_to_speak):
return json.dumps({
'Input': {
"Type":"application/ssml+xml",
'Data': text_to_speak
},
'OutputFormat': {
'Codec': self.codec.upper()
},
'Parameters': {
'Rate': self.speech_rate,
'SentenceBreak': self.sentence_break,
'ParagraphBreak': self.paragraph_break
},
'Voice': {
'Name': self.voice_name,
'Language': self.language,
'Gender': self.gender
}
})
def _send_amazon_auth_packet_v4(self, method, service, content_type,
canonical_uri, canonical_querystring,
request_parameters, region, host):
"""Send a packet to a given amazon server using Amazon's signature Version 4,
Returns the resulting response object
"""
# Create date for headers and the credential string
t = datetime.datetime.utcnow()
amazon_date = t.strftime(_amazon_date_format)
date_stamp = t.strftime(_date_format)
# Step 1: Create canonical request
payload_hash = self._sha_hash(request_parameters)
canonical_headers = 'content-type:{}\n'.format(content_type)
canonical_headers += 'host:{}\n'.format(host)
canonical_headers += 'x-amz-content-sha256:{}\n'.format(payload_hash)
canonical_headers += 'x-amz-date:{}\n'.format(amazon_date)
canonical_request = '\n'.join([
method, canonical_uri, canonical_querystring, canonical_headers,
self.signed_headers, payload_hash])
# Step 2: Create the string to sign
credential_scope = '{}/{}/{}/aws4_request'.format(
date_stamp, region, service)
string_to_sign = '\n'.join([
self.algorithm, amazon_date, credential_scope,
self._sha_hash(canonical_request)])
# Step 3: Calculate the signature
signing_key = self._get_signature_key(
self.secret_key, date_stamp, region, service)
signature = hmac.new(
signing_key, string_to_sign.encode('utf-8'),
hashlib.sha256).hexdigest()
# Step 4: Create the signed packet
endpoint = 'https://{}{}'.format(host, canonical_uri)
authorization_header = '{} Credential={}/{}, ' +\
'SignedHeaders={}, Signature={}'
authorization_header = authorization_header.format(
self.algorithm, self.access_key, credential_scope,
self.signed_headers, signature)
headers = {
'Host': host,
'Content-type': content_type,
'X-Amz-Date': amazon_date,
'Authorization': authorization_header,
'x-amz-content-sha256': payload_hash,
'Content-Length': len(request_parameters)
}
# Send the packet and return the response
return requests.post(endpoint, data=request_parameters,
headers=headers)
def _sha_hash(self, to_hash):
return hashlib.sha256(to_hash.encode('utf-8')).hexdigest()
def _sign(self, key, msg):
return hmac.new(key, msg.encode('utf-8'), hashlib.sha256).digest()
def _get_signature_key(self, key, date_stamp, region_name, service_name):
k_date = self._sign(('AWS4{}'.format(key)).encode('utf-8'), date_stamp)
k_region = self._sign(k_date, region_name)
k_service = self._sign(k_region, service_name)
k_signing = self._sign(k_service, 'aws4_request')
return k_signing
def __init__(self, access_key, secret_key):
"""Set initial voice object parameters
"""
self.region = 'us-east'
self.voice_name = 'Brian'
self.access_key = access_key
self.secret_key = secret_key
self.speech_rate = 'medium'
self.sentence_break = 400
self.paragraph_break = 650
Any help is highly appreciated.
Was able to find a workaround by adding:
pygame.mixer.init()
channel = pygame.mixer.Channel(5)
immediately after defining the speak function:
def speak(self, text_to_speak, use_cache=False):
def speak(self, text_to_speak, use_cache=False):
"""Speak a given text
"""
pygame.mixer.init()
channel = pygame.mixer.Channel(5)
Usually UnboundLocalError relates to Scopes and Namespaces in Python Scopes and NameSpaces ,
but in Your case:
In function speak() You create channel in code
if not pygame.mixer.get_init():
pygame.mixer.init()
channel = pygame.mixer.Channel(5)
In case this code does not execute channel does not bound to any object.
For example, You can check this situation by this code sample:
def test_if():
if True:
# Bound
channel_1 = "Channel_1"
if False:
# Not bound
channel_2 = "Channel_2"
print(channel_1)
print(channel_2)
test_if()

Error 401 when searching tweets with Python-Twitter

This is an app to search tweets and save the results to mongdb. This line is problematic
search_results = twitter_api.search.tweets(q=q, count=100, **kw)
I'm getting this error:
Traceback (most recent call last):
File "mongo.py", line 89, in
results = twitter_search(twitter_api,q,max_results=10)
File "mongo.py", line 21, in twitter_search
search_results = twitter_api.search.tweets(q=q, count=100, **kw)
File "/usr/local/lib/python2.7/dist-packages/twitter/api.py", line 312, in __call__
return self._handle_response(req, uri, arg_data, _timeout)
File "/usr/local/lib/python2.7/dist-packages/twitter/api.py", line 345, in _handle_response
raise TwitterHTTPError(e, uri, self.format, arg_data)
twitter.api.TwitterHTTPError: Twitter sent status 401 for URL: 1.1/search/tweets.json using parameters: (count=100&oauth_consumer_key= &oauth_nonce= &oauth_signature_method=HMAC-SHA1&oauth_timestamp=1457104474&oauth_token= &oauth_version=1.0&q=CrossFit&oauth_signature=6SMuzJBWfFUEmZHlu%2F4Hmt6Zu8k%3D)
details: {u'errors': [{u'message': u'Could not authenticate you.', u'code': 32}]}
Here is my code:
import json
import pymongo
import twitter
def oauth_login():
CONSUMER_KEY = ''
CONSUMER_SECRET = ''
OAUTH_TOKEN = ''
OAUTH_TOKEN_SECRET = ''
auth = twitter.oauth.OAuth(OAUTH_TOKEN,OAUTH_TOKEN_SECRET,CONSUMER_KEY,CONSUMER_SECRET)
twitter_api = twitter.Twitter(auth=auth)
return twitter_api
def twitter_search(twitter_api, q, max_results=200, **kw):
search_results = twitter_api.search.tweets(q=q, count=100, **kw)
statuses = search_results['statuses']
max_results = min(1000, max_results)
for _ in range(10): # 10*100 = 1000
try:
next_results = search_results['search_metadata']['next_results']
except KeyError, e:
break
kwargs = dict([kv.split('=')
for kv in next_results[1:].split("&") ])
search_results = twitter_api.search.tweets(**kwargs)
statuses += search_results['statuses']
if len(statuses) > max_results:
break
return statuses
def save_to_mongo(data,mongo_db,mongo_db_coll,**mongo_conn_kw):
client = pymongo.MongoClient(**mongo_conn_kw)
db = client[mongo_db]
coll = db[mongo_db_coll]
return coll.insert(data)
def load_from_mongo(mongo_db,mongo_db_coll,return_cursor=False,criteria=None,projection=None,**mongo_conn_kw):
client = pymongo.MongoClient(**mongo_conn_kw)
db = client[mongo_db]
coll = db[mongo_db_coll]
if criteria is None:
criteria = {}
if projection is None:
cursor = coll.find(criteria)
else:
cursor = coll.find(criteria,projection)
#Returning a cursor is recommended for large amounts of data
if return_cursor:
return cursor
else:
return [item for item in cursor]
#Sample usage
q = 'CrossFit'
twitter_api = oauth_login()
results = twitter_search(twitter_api,q,max_results=10)
save_to_mongo(results,'search_results',q)
load_from_mongo('search_results',q)

Python - how to upload large file

I write script, which uploads large file to third party website. I use poster module
Here code
class upload_in_chunks(object):
def __init__(self, filename, chunksize):
self.filename = filename
self.chunksize = chunksize
self.totalsize = os.path.getsize(filename)
self.readsofar = 0
def __iter__(self):
with open(self.filename, 'rb') as file:
while True:
data = file.read(self.chunksize)
if not data:
sys.stderr.write("\n")
break
self.readsofar += len(data)
percent = self.readsofar * 1e2 / self.totalsize
sys.stderr.write("\r{percent:3.0f}%".format(percent=percent))
yield data
def __len__(self):
return self.totalsize
class IterableToFileAdapter(object):
def __init__(self, iterable):
self.iterator = iter(iterable)
self.length = len(iterable)
def read(self, size=-1): # TBD: add buffer for `len(data) > size` case
return next(self.iterator, b'')
def __len__(self):
return self.length
def upload_file(s, data, files, url):
class FuncThread(threading.Thread):
def __init__(self, s, data, files, url):
threading.Thread.__init__(self)
self.result = False
self.error = None
self.s = s
self.data = data
self.files = files
self.url = url
def run(self):
try:
register_openers()
items = []
for name, value in self.data.items():
items.append(MultipartParam(name, value))
#add file
for name, value in self.files.items():
items.append(MultipartParam.from_file(name, value))
datagen, h = multipart_encode(items)
#h.update(headers)
# Create the Request object
#self.result = self.s.post(self.url, data = datagen.read(), headers = h)
for item, value in headers.iteritems():
h[item] = value
#for item, value in headers.iteritems():
#h[item] = value
h["Cookie"] = ""
for item, value in requests.utils.dict_from_cookiejar(self.s.cookies).iteritems():
h["Cookie"] += item + "=" + value + "; "
print h["Cookie"]
#h[item] = value
print h
request = urllib2.Request(self.url, datagen, h)
#Actually do the request, and get the response
self.result = urllib2.urlopen(request).read()
except Exception as error:
self.error = error
self.result = False
def _stop(self):
if self.isAlive():
threading.Thread._Thread__stop(self)
try:
it = FuncThread(s, data, files, url)
it.start()
it.join(upload_timeout)
if it.isAlive():
it._stop()
return (False, "[-] Time expired.")
else:
return (it.result, it.error)
except Exception as error:
return (False, error)
And in my upload.py I use
try:
payload = {'Filename': f_path,
'tags': 'tag1',
'keywords': 'keyword1',
'cookie': cookie,
'title': 'Large file',
'categories': '["35","13","37"]',
'privacy': 'community',
'source': 5,
'production': "professional"}
(r, error) = upload_file(self.s, payload, {'Filedata': f_path}, url)
I have dedicated server with 100m\bit internet channel.
But my script upload files using only 1-2 m\bit
The website recipient has not any limitations and it doesn't throttle me.
How can I force my code to use full 100 m\bit?

Categories

Resources