Alexa Skill To Download File From Google Drive - python

I am trying to download file from google drive using Alexa Skill and Amazon Echo that I made with raspberry Pi 3. My lambda function is as follows: -
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import print_function
import xml.etree.ElementTree as etree
from datetime import datetime as dt
import os
import urllib
import requests
from urllib.parse import urlparse
def lambda_handler(event, context):
""" Route the incoming request based on type (LaunchRequest, IntentRequest,
etc.) The JSON body of the request is provided in the event parameter.
"""
print('event.session.application.applicationId=' + event['session'
]['application']['applicationId'])
# if (event['session']['application']['applicationId'] !=
# "amzn1.echo-sdk-ams.app.[unique-value-here]"):
# raise ValueError("Invalid Application ID")
if event['session']['new']:
on_session_started({'requestId': event['request']['requestId'
]}, event['session'])
if event['request']['type'] == 'LaunchRequest':
return on_launch(event['request'], event['session'])
elif event['request']['type'] == 'IntentRequest':
return on_intent(event['request'], event['session'])
elif event['request']['type'] == 'SessionEndedRequest':
return on_session_ended(event['request'], event['session'])
def on_session_started(session_started_request, session):
""" Called when the session starts """
print('on_session_started requestId='
+ session_started_request['requestId'] + ', sessionId='
+ session['sessionId'])
def on_launch(launch_request, session):
""" Called when the user launches the skill without specifying what they
want
"""
print('on_launch requestId=' + launch_request['requestId']
+ ', sessionId=' + session['sessionId'])
# Dispatch to your skill's launch
return get_welcome_response()
def on_intent(intent_request, session):
""" Called when the user specifies an intent for this skill """
print('on_intent requestId=' + intent_request['requestId']
+ ', sessionId=' + session['sessionId'])
intent = intent_request['intent']
intent_name = intent_request['intent']['name']
# Dispatch to your skill's intent handlers
if intent_name == 'DownloadFiles':
return get_file(intent, session)
elif intent_name == 'AMAZON.HelpIntent':
return get_welcome_response()
else:
raise ValueError('Invalid intent')
def on_session_ended(session_ended_request, session):
""" Called when the user ends the session.Is not called when the skill returns should_end_session=true """
print('on_session_ended requestId='
+ session_ended_request['requestId'] + ', sessionId='
+ session['sessionId'])
# add cleanup logic here
# --------------- Functions that control the skill's behavior ------------------
def get_welcome_response():
""" If we wanted to initialize the session to have some attributes we could add those here """
session_attributes = {}
card_title = 'Welcome'
speech_output = \
"Welcome to file download Application. Please ask me to download files by saying, Ask auto downloader for download"
# If the user either does not reply to the welcome message or says something
# that is not understood, they will be prompted again with this text.
reprompt_text = \
"Please ask me to download files by saying, Ask auto downloader for download"
should_end_session = False
return build_response(session_attributes,
build_speechlet_response(card_title,
speech_output, reprompt_text,
should_end_session))
def get_file(intent, session):
""" Grabs the files from the path that have to be downloaded """
card_title = intent['name']
session_attributes = {}
should_end_session = True
username = '*******'
password = '*******'
url = 'https://drive.google.com/drive/my-drive/abc.pdf'
filename = os.path.basename(urlparse(url).path)
# urllib.urlretrieve(url, "code.zip")
r = requests.get(url, auth=(username, password))
if r.status_code == 200:
with open(filename, 'wb') as out:
for bits in r.iter_content():
out.write(bits)
speech_output = 'The file filename has been downloaded'
return build_response(session_attributes,
build_speechlet_response(card_title,
speech_output, reprompt_text,
should_end_session))
# --------------- Helpers that build all of the responses ----------------------
def build_speechlet_response(
title,
output,
reprompt_text,
should_end_session,
):
return {
'outputSpeech': {'type': 'PlainText', 'text': output},
'card': {'type': 'Simple', 'title': 'SessionSpeechlet - ' \
+ title, 'content': 'SessionSpeechlet - ' + output},
'reprompt': {'outputSpeech': {'type': 'PlainText',
'text': reprompt_text}},
'shouldEndSession': should_end_session,
}
def build_response(session_attributes, speechlet_response):
return {'version': '1.0', 'sessionAttributes': session_attributes,
'response': speechlet_response}
After I run this lambda_function I get the below error: -
[Errno 30] Read-only file system: 'abc.pdf': OSError
Traceback (most recent call last):
File "/var/task/lambda_function.py", line 32, in lambda_handler
return on_intent(event['request'], event['session'])
File "/var/task/lambda_function.py", line 70, in on_intent
return get_file(intent, session)
File "/var/task/lambda_function.py", line 126, in get_file
with open(filename, 'wb') as out:
OSError: [Errno 30] Read-only file system: 'abc.pdf'
Can anyone suggest how to go about this error! What I want is Alexa would be prompted to download files from a fixed url using amazon echo.

Make sure to write to /tmp as it's the only writable path in a lambda.
with open("/tmp/" + filename, 'wb') as out:
for bits in r.iter_content():
out.write(bits)

Related

Python SIEM log collector hangs randomly

I am trying to troubleshoot a script for Mimecast's API. The script runs fine for the most part, but a few times, I have noticed that it stops pulling logs and generally appears to be a hung process. After restarting the script and manually pushing logs to the syslog server, it starts working again without issue. I am not able to reproduce this issue at will.
The script is supposed to do the following:
Authenticate against Mimecast's API
Sign responses
download, extract and save log files to log dir
utilize a tokenized header to determine which file was downloaded in the last request. Should save the token ID within a file in the checkpoint directory
Push files to remote syslog server
Output any errors and info to console
Below is the sample code from Mimecast.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging.handlers
import json
import os
import requests
import base64
import uuid
import datetime
import hashlib
import shutil
import hmac
import time
from zipfile import ZipFile
import io
# Set up variables
APP_ID = "YOUR DEVELOPER APPLICATION ID"
APP_KEY = "YOUR DEVELOPER APPLICATION KEY"
URI = "/api/audit/get-siem-logs"
EMAIL_ADDRESS = 'EMAIL ADDRESS OF YOUR ADMINISTRATOR'
ACCESS_KEY = 'ACCESS KEY FOR YOUR ADMINISTRATOR'
SECRET_KEY = 'SECRET KEY FOR YOUR ADMINISTRATOR'
LOG_FILE_PATH = "FULLY QUALIFIED PATH TO FOLDER TO WRITE LOGS"
CHK_POINT_DIR = 'FULLY QUALIFIED PATH TO FOLDER TO WRITE PAGE TOKEN'
# Set True to output to syslog, false to only save to file
syslog_output = False
# Enter the IP address or hostname of your syslog server
syslog_server = 'localhost'
# Change this to override default port
syslog_port = 514
# delete files after fetching
delete_files = True
# Set threshold in number of files in log file directory
log_file_threshold = 10000
# Set up logging (in this case to terminal)
log = logging.getLogger(__name__)
log.root.setLevel(logging.DEBUG)
log_formatter = logging.Formatter('%(levelname)s %(message)s')
log_handler = logging.StreamHandler()
log_handler.setFormatter(log_formatter)
log.addHandler(log_handler)
# Set up syslog output
syslog_handler = logging.handlers.SysLogHandler(address=(syslog_server, syslog_port))
syslog_formatter = logging.Formatter('%(message)s')
syslog_handler.setFormatter(syslog_formatter)
syslogger = logging.getLogger(__name__)
syslogger = logging.getLogger('SysLogger')
syslogger.addHandler(syslog_handler)
# Supporting methods
def get_hdr_date():
return datetime.datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S UTC")
def read_file(file_name):
try:
with open(file_name, 'r') as f:
data = f.read()
return data
except Exception as e:
log.error('Error reading file ' + file_name + '. Cannot continue. Exception: ' + str(e))
quit()
def write_file(file_name, data_to_write):
if '.zip' in file_name:
try:
byte_content = io.BytesIO(data_to_write)
zip_file = ZipFile(byte_content)
zip_file.extractall(LOG_FILE_PATH)
except Exception as e:
log.error('Error writing file ' + file_name + '. Cannot continue. Exception: ' + str(e))
quit()
else:
try:
with open(file_name, 'w') as f:
f.write(data_to_write)
except Exception as e:
log.error('Error writing file ' + file_name + '. Cannot continue. Exception: ' + str(e))
quit()
def get_base_url(email_address):
# Create post body for request
post_body = dict()
post_body['data'] = [{}]
post_body['data'][0]['emailAddress'] = email_address
# Create variables required for request headers
request_id = str(uuid.uuid4())
request_date = get_hdr_date()
headers = {'x-mc-app-id': APP_ID, 'x-mc-req-id': request_id, 'x-mc-date': request_date}
# Send request to API
log.debug('Sending request to https://api.mimecast.com/api/discover-authentication with request Id: ' +
request_id)
try:
r = requests.post(url='https://api.mimecast.com/api/login/discover-authentication',
data=json.dumps(post_body), headers=headers)
# Handle Rate Limiting
if r.status_code == 429:
log.warning('Rate limit hit. sleeping for ' + str(r.headers['X-RateLimit-Reset'] * 1000))
time.sleep(r.headers['X-RateLimit-Reset'] * 1000)
except Exception as e:
log.error('Unexpected error getting base url. Cannot continue.' + str(e))
quit()
# Handle error from API
if r.status_code != 200:
log.error('Request returned with status code: ' + str(r.status_code) + ', response body: ' +
r.text + '. Cannot continue.')
quit()
# Load response body as JSON
resp_data = json.loads(r.text)
# Look for api key in region region object to get base url
if 'region' in resp_data["data"][0]:
base_url = resp_data["data"][0]["region"]["api"].split('//')
base_url = base_url[1]
else:
# Handle no region found, likely the email address was entered incorrectly
log.error(
'No region information returned from API, please check the email address.'
'Cannot continue')
quit()
return base_url
def post_request(base_url, uri, post_body, access_key, secret_key):
# Create variables required for request headers
request_id = str(uuid.uuid4())
request_date = get_hdr_date()
unsigned_auth_header = '{date}:{req_id}:{uri}:{app_key}'.format(
date=request_date,
req_id=request_id,
uri=uri,
app_key=APP_KEY
)
hmac_sha1 = hmac.new(
base64.b64decode(secret_key),
unsigned_auth_header.encode(),
digestmod=hashlib.sha1).digest()
sig = base64.encodebytes(hmac_sha1).rstrip()
headers = {
'Authorization': 'MC ' + access_key + ':' + sig.decode(),
'x-mc-app-id': APP_ID,
'x-mc-date': request_date,
'x-mc-req-id': request_id,
'Content-Type': 'application/json'
}
try:
# Send request to API
log.debug('Sending request to https://' + base_url + uri + ' with request Id: ' + request_id)
r = requests.post(url='https://' + base_url + uri, data=json.dumps(post_body), headers=headers)
# Handle Rate Limiting
if r.status_code == 429:
log.warning('Rate limit hit. sleeping for ' + str(r.headers['X-RateLimit-Reset'] * 1000))
time.sleep(r.headers['X-RateLimit-Reset'] * 1000)
r = requests.post(url='https://' + base_url + uri, data=json.dumps(post_body), headers=headers)
# Handle errors
except Exception as e:
log.error('Unexpected error connecting to API. Exception: ' + str(e))
return 'error'
# Handle errors from API
if r.status_code != 200:
log.error('Request to ' + uri + ' with , request id: ' + request_id + ' returned with status code: ' +
str(r.status_code) + ', response body: ' + r.text)
return 'error'
# Return response body and response headers
return r.content, r.headers
def get_mta_siem_logs(checkpoint_dir, base_url, access_key, secret_key):
uri = "/api/audit/get-siem-logs"
# Set checkpoint file name to store page token
checkpoint_filename = os.path.join(checkpoint_dir, 'get_mta_siem_logs_checkpoint')
# Build post body for request
post_body = dict()
post_body['data'] = [{}]
post_body['data'][0]['type'] = 'MTA'
post_body['data'][0]['compress'] = True
if os.path.exists(checkpoint_filename):
post_body['data'][0]['token'] = read_file(checkpoint_filename)
# Send request to API
resp = post_request(base_url, uri, post_body, access_key, secret_key)
now = datetime.datetime.now().strftime("%a %b %d %H:%M:%S %Y")
# Process response
if resp != 'error':
resp_body = resp[0]
resp_headers = resp[1]
content_type = resp_headers['Content-Type']
# End if response is JSON as there is no log file to download
if content_type == 'application/json':
log.info('No more logs available')
return False
# Process log file
elif content_type == 'application/octet-stream':
file_name = resp_headers['Content-Disposition'].split('=\"')
file_name = file_name[1][:-1]
# Save files to LOG_FILE_PATH
write_file(os.path.join(LOG_FILE_PATH, file_name), resp_body)
# Save mc-siem-token page token to check point directory
write_file(checkpoint_filename, resp_headers['mc-siem-token'])
try:
if syslog_output is True:
for filename in os.listdir(LOG_FILE_PATH):
file_creation_time = time.ctime(os.path.getctime(LOG_FILE_PATH + "/" + filename))
if now < file_creation_time or now == file_creation_time:
log.info('Loading file: ' + filename + ' to output to ' + syslog_server + ':' + str(syslog_port))
with open(file=os.path.join(LOG_FILE_PATH, filename), mode='r', encoding='utf-8') as log_file:
lines = log_file.read().splitlines()
for line in lines:
syslogger.info(line)
log.info('Syslog output completed for file ' + filename)
except Exception as e:
log.error('Unexpected error writing to syslog. Exception: ' + str(e))
# return true to continue loop
return True
else:
# Handle errors
log.error('Unexpected response')
for header in resp_headers:
log.error(header)
return False
def run_script():
# discover base URL
try:
base_url = get_base_url(email_address=EMAIL_ADDRESS)
except Exception as e:
log.error('Error discovering base url for ' + EMAIL_ADDRESS + ' . Exception: ' + str(e))
quit()
# Request log data in a loop until there are no more logs to collect
try:
log.info('Getting MTA log data')
while get_mta_siem_logs(checkpoint_dir=CHK_POINT_DIR, base_url=base_url, access_key=ACCESS_KEY,
secret_key=SECRET_KEY) is True:
log.info('Getting more MTA log files')
except Exception as e:
log.error('Unexpected error getting MTA logs ' + (str(e)))
file_number = len([name for name in os.listdir(LOG_FILE_PATH) if os.path.isfile(name)])
if delete_files or file_number >= log_file_threshold:
for filename in os.listdir(LOG_FILE_PATH):
file_path = os.path.join(LOG_FILE_PATH, filename)
try:
if os.path.isfile(file_path) or os.path.islink(file_path):
os.unlink(file_path)
elif os.path.isdir(file_path):
shutil.rmtree(file_path)
except Exception as e:
print('Failed to delete %s. Reason: %s' % (file_path, e))
quit()
# Run script
run_script()
It seems like it may potentially be a race condition but I am not sure how to confirm since I can't reproduce it. I notice that SumoLogic has a modified version of this script as well with a different methodology for managing the files/paths. If this script works better than the main sample script above, would anybody be able to explain WHY? I haven't had any issues with it yet.
https://github.com/SumoLogic/sumologic-content/blob/master/MimeCast/SumoLogic-Mimecast-Data-Collection/siem_collection.py

WaitForSingleObject not working - invalid handle

I'm currently righting a code for client that can download files from a cloud. The client downloads the file temporarily and when the file is closed (terminated) it gets sent back to the cloud.
Here is the full code:
import socket
import os
from OpenFile import *
from Registry_Change import *
import win32con
import win32api
import win32event
from win32com.shell import shellcon
from win32com.shell.shell import ShellExecuteEx
class Application(object):
def __init__(self):
"""
:param request: the request of the file to receive from server
"""
# initiates request (clicked file path)
self.request = sys.argv[SECOND]
reg = Read_Registry()
ip, port = reg.get_ip_port()
self.ip = ip
self.port = port
# initiates socket
try:
self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
self.sock.connect((self.ip, self.port))
# sends the request to the server
Application.send_request_to_server(self.sock, "DOWNLOAD_FILE " + self.request)
format = Application.read_server_response(self.sock).decode()
self.path_file = self.download(format)
self.start_file_and_wait()
self.sock.close()
except Exception as msg:
print(win32api.GetLastError())
print("connection error:", msg)
#staticmethod
def valid_file(path):
"""
checks if the path is a file that exists
"""
if os.path.isfile(path):
return True
return False
#staticmethod
def delete_file(file_path):
"""
deletes file
"""
os.remove(file_path)
#staticmethod
def read_server_response(server_socket_choice):
"""
reads the length and according to that, it reads the rest
of the message
"""
try:
length_of_message = server_socket_choice.recv(BYTE).decode()
if length_of_message.isdigit():
return server_socket_choice.recv(int(length_of_message))
except Exception as msg:
print("at read_server_response:", msg)
return SERVER_FELL
#staticmethod
def send_request_to_server(server_socket, request):
"""
Send the request to the server.
First the length of the request (2 digits), then the request itself
Example: '04EXIT'
Example: '12DIR c:\cyber'
"""
server_socket. \
send((str(len(request)).zfill(MSG_FILL) + request).encode())
def download(self, format):
"""
saves the given chunks to a file in the client
"""
try:
file = Application.new_format_path(self.request, format)
new_location = Application.make_new_file_path(TEMPORARY_FILES, file)
# check if the file is valid
check_len = self.sock.recv(BYTE).decode()
check = self.sock.recv(int(check_len))
if check != FILE_DOESNT_EXIST:
client_file = open(new_location, 'wb')
# write what we took out
client_file.write(check)
done = False
while not done:
byte_message_len = self.sock.recv(BYTE)
length = byte_message_len.decode()
if length.isdigit():
real_len = int(length)
data = self.sock.recv(real_len)
if data == FILE_END:
done = True
else:
client_file.write(data)
client_file.close()
return new_location
else:
return 'nothing'
except Exception as msg:
print("at download:", msg)
def upload(self, file_path):
"""
Sends a file from the server to the client
"""
if Application.valid_file(file_path):
client_file = open(file_path, 'rb')
content = client_file.read(BYTE_NUMBER)
while content != b'':
Application.send_binary_response_to_server(content, self.sock)
content = client_file.read(BYTE_NUMBER)
client_file.close()
Application.send_binary_response_to_server(FILE_END, self.sock)
Application.delete_file(file_path)
Application.make_imaginary_file(file_path)
return Application.read_server_response(self.my_socket)
else:
Application.send_response_to_server(FILE_DOESNT_EXIST, self.sock)
return FILE_DOESNT_EXIST
#staticmethod
def make_new_file_path(new_path, folder):
"""
:param new_path: new path of file to merge
:return: the new path
"""
comp = folder.split("\\")
return new_path + "\\" + comp[END]
#staticmethod
def new_format_path(path, format):
"""
:param format: the new format
:return: the same path but with new format
"""
path_format = path.split('.')
path_format[SECOND] = format
return ".".join(path_format)
def start_file_and_wait(self):
"""
:param fname: the file path - temporary
:return: opens and continues when closed
"""
rc = ShellExecuteEx(
fMask=shellcon.SEE_MASK_NOCLOSEPROCESS,
nShow=win32con.SW_SHOW,
lpFile=self.path_file)
hproc = rc['hProcess']
win32event.WaitForSingleObject(hproc, win32event.INFINITE)
win32api.CloseHandle(hproc)
app = Application()
Now, the win32event.WaitForSingleObject(hproc, win32event.INFINITE) command does not work, and returns the error: "(6, 'WaitForSingleObject', 'The handle is invalid.')".
When I tried to use the start_file_and_wait(self) function in a separate file, it worked:
def start_file_wait(fname):
import win32con
import win32api
import win32event
from win32com.shell import shellcon
from win32com.shell.shell import ShellExecuteEx
rc = ShellExecuteEx(
fMask=shellcon.SEE_MASK_NOCLOSEPROCESS,
nShow=win32con.SW_SHOW,
lpFile=fname)
hproc = rc['hProcess']
win32event.WaitForSingleObject(hproc, win32event.INFINITE)
win32api.CloseHandle(hproc)
def main():
start_file_wait("E:\\12\\alice-chapter-1.txt")
print("hi")
if __name__ == '__main__':
main()
I don't know how to fix it. can someone help me?
Thanks!
It doesn't work on files that aren't text files
As you said, You did not specify a suitable verb for ShellExecuteEx, according to the parameter:
lpVerb :
... This parameter can be NULL, in which case the default verb is
used if available. If not, the "open" verb is used. If neither verb is
available, the system uses the first verb listed in the registry.
It may fail and return an invalid hProcess handle if it cannot find the associated application to open the file. You should specify a valid verb according to the file you open. Or specify the executable file of the app corresponding to the file as lpFile, and specify the file as lpParameters.
You could refer to the following document:
Launching Applications

How Do You Write Files to IPFS via the HTTP API in Python

Could someone demonstrate writing a file to IPFS via the HTTP API (/files/write) and Python?
My code is getting messier every time I modify it.
https://pastebin.com/W9eNz1Pb
def api(*argv, **kwargs):
url = "http://127.0.0.1:5001/api/v0/"
for arg in argv:
arg = arg.replace(" ", "/")
if arg[:-1] != "/":
arg += "/"
url += arg
url = url[0:-1]
if kwargs:
url+="?"
for val in kwargs:
if val != "post":
url = url + val + "=" + kwargs[val] + "&"
url = url[0:-1]
print(url)
try:
if "post" in kwargs:
print("POST DATA")
with urllib.request.urlopen(url=url, data=urllib.parse.urlencode(kwargs["post"]).encode("ascii")) as response:
return response.read()
else:
with urllib.request.urlopen(url, timeout=300) as response:
return response.read()
except:
return b"""{"ERROR": "CANNOT CONNECT TO IPFS!"}"""
class file(object):
def __init__(self, p):
self.p = p
if self.p[0] != "/":
self.p = "/" + self.p
def read(self):
return api("files", "read", arg=self.p).decode()
def write(self, s, *argv):
if argv:
return api("files", "write", arg=self.p, offset=str(argv[0]), create="True", parents="True", post={"Data": s})
else:
return api("files", "write", arg=self.p, truncate="True", create="True", parents="True", post={"Data": s})
file.read() works perfectly. But file.write() is being a pain in the rear.
Here's a minimal example to write a file via the /files/write HTTP API in Python:
import requests, urllib
NODE = "http://localhost:5001"
FILE_PATH = "./example" # path to file you're trying to add
MFS_PATH = "/example" # mfs path you're trying to write to
response = requests.post(NODE+"/api/v0/files/write?arg=%s&create=true" % urllib.parse.quote(MFS_PATH), files={FILE_PATH:open(FILE_PATH, 'rb')})
make sure ipfs daemon is running
ipfs init
ipfs daemon
your url endpoint is wrong. If you check documentaion, for adding file, url should be
url = "http://127.0.0.1:5001/api/v0/add"
create a function to upload so u can use this logic in other parts of your project:
def add_to_ipfs(filepath):
from pathlib import Path
import requests
# rb means open in binary. read binary
with Path(filepath).open("rb") as fp:
image_binary=fp.read()
# we need to make post request to this endpoint.
url = "http://127.0.0.1:5001/api/v0/add"
# check the response object
response = requests.post(url, files={"file": image_binary})
ipfs_hash=response.json()["Hash"]
# "./img/myImage.png" -> "myImage.png" split by "/" into array, take the last element
filename=filepath.split("/")[-1:][0]
image_uri=f"https://ipfs.io/ipfs/{ipfs_hash}?filename={filename}"
print("image uri on ipfs",image_uri)
return image_uri
this is the Response type from ipfs
{
"Bytes": "<int64>",
"Hash": "<string>",
"Name": "<string>",
"Size": "<string>"
}

How do you send many documents to a Scout Server in Python using the Python Scout Client?

Im trying to index PDF text to a python lib called Scout. I have tried doing the same thing with elasticsearch too. In both cases I can't figure out how to post text to an index in bulk, using python.
After a lot of research, I believe I need to use async http request. The only problem is, I don't understand async calls nor do I understand what a Scout python 'client' really is. I'm a self-taught programmer and still have many things I don't understand. my thought is the client cant stay open for a loop to keep using the connection. I have seen coding concepts like "await" and "sessions" in many books on programming. However, I don't know how to implement these concepts. Can someone help me write some python code that will successfully post new documents to a running scout server and explain how it's done?
Here is My attempt:
from scout_client import Scout
# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
import os
from flask import Flask, jsonify, request, render_template, json
# before you start, Run the Server.py file and create a Sqlite DB
# Step one loop though PDF in 'books' folder
for k in range(14,15):
# open the pdf file
read_pdf = PyPDF2.PdfFileReader("books/%s.pdf"%(k))
# Test to see if Step one is complete and succesful
#print (read_pdf)
# Step Two Gain intel on how many Pages are in the Document
# get the page numbers
num = read_pdf.getNumPages()
print ("PDF pages:", num)
# Step Three understand the data by page
# create a dictionary object for page data
all_pages = []
# Step For Create a new index in Scout Server
# client.create_index('test3')
# iterate the page numbers
for page in range(num):
data = read_pdf.getPage(page)
#page_mode = read_pdf.getPageMode()
# extract the page's text
page_text = data.extractText()
# put the text data into the dict
all_pages.append(page_text)
# initiate the Client from scout_client.py
client = Scout('http://localhost:8000')
# THe issue: I tryed for loops, and while loops but cant get past: urllib.error.HTTPError: HTTP Error 400: BAD REQUEST
i = 1
while i <= num:
client.create_document(all_pages[i],['test3'])
print(i,"....done")
i += 1
I get an error:
Traceback (most recent call last):
File "test.py", line 37, in <module>
client.create_document(all_pages[i],['test3'])
File "../Searchtest4/scout/scout_client.py", line 149, in create_document
return self.post('/documents/', post_data, attachments)
File "../Searchtest4/scout/scout_client.py", line 53, in post
return self.post_json(url, data)
File "../Searchtest4/scout/scout_client.py", line 63, in post_json
return json.loads(urlopen(request).read().decode('utf8'))
File "../lib/python3.7/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "../lib/python3.7/urllib/request.py", line 531, in open
response = meth(req, response)
File "../lib/python3.7/urllib/request.py", line 641, in http_response
'http', request, response, code, msg, hdrs)
File "../lib/python3.7/urllib/request.py", line 569, in error
return self._call_chain(*args)
File "../lib/python3.7/urllib/request.py", line 503, in _call_chain
result = func(*args)
File "../lib/python3.7/urllib/request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
**urllib.error.HTTPError: HTTP Error 400: BAD REQUEST**
Here is the server that runs fine (server.py):
import logging
import optparse
import os
import sys
from flask import Flask
from werkzeug.serving import run_simple
from scout.exceptions import InvalidRequestException
from scout.models import database
from scout.models import Attachment
from scout.models import BlobData
from scout.models import Document
from scout.models import Index
from scout.models import IndexDocument
from scout.models import Metadata
from scout.views import register_views
logger = logging.getLogger('scout')
def create_server(config=None, config_file=None):
app = Flask(__name__)
# Configure application using a config file.
if config_file is not None:
app.config.from_pyfile(config_file)
# (Re-)Configure application using command-line switches/environment flags.
if config is not None:
app.config.update(config)
# Initialize the SQLite database.
initialize_database(app.config.get('DATABASE') or 'scout.db',
pragmas=app.config.get('SQLITE_PRAGMAS') or None)
register_views(app)
#app.errorhandler(InvalidRequestException)
def handle_invalid_request(exc):
return exc.response()
#app.before_request
def connect_database():
if database.database != ':memory:':
database.connect()
#app.teardown_request
def close_database(exc):
if database.database != ':memory:' and not database.is_closed():
database.close()
return app
def initialize_database(database_file, pragmas=None):
database.init(database_file, pragmas=pragmas)
try:
meth = database.execution_context
except AttributeError:
meth = database
with meth:
database.create_tables([
Attachment,
BlobData,
Document,
Index,
IndexDocument,
Metadata])
def run(app):
if app.config['DEBUG']:
app.run(host=app.config['HOST'], port=app.config['PORT'], debug=True)
else:
run_simple(
hostname=app.config['HOST'],
port=app.config['PORT'],
application=app,
threaded=True)
def panic(s, exit_code=1):
sys.stderr.write('\033[91m%s\033[0m\n' % s)
sys.stderr.flush()
sys.exit(exit_code)
def get_option_parser():
parser = optparse.OptionParser()
parser.add_option(
'-H',
'--host',
default='127.0.0.1',
dest='host',
help='The hostname to listen on. Defaults to 127.0.0.1.')
parser.add_option(
'-p',
'--port',
default=8000,
dest='port',
help='The port to listen on. Defaults to 8000.',
type='int')
parser.add_option(
'-u',
'--url-prefix',
dest='url_prefix',
help='URL path to prefix Scout API.')
parser.add_option(
'-s',
'--stem',
dest='stem',
help='Specify stemming algorithm for content.')
parser.add_option(
'-d',
'--debug',
action='store_true',
dest='debug',
help='Run Flask app in debug mode.')
parser.add_option(
'-c',
'--config',
dest='config',
help='Configuration module (python file).')
parser.add_option(
'--paginate-by',
default=50,
dest='paginate_by',
help='Number of documents displayed per page of results, default=50',
type='int')
parser.add_option(
'-k',
'--api-key',
dest='api_key',
help='Set the API key required to access Scout.')
parser.add_option(
'-C',
'--cache-size',
default=64,
dest='cache_size',
help='SQLite page-cache size (MB). Defaults to 64MB.',
type='int')
parser.add_option(
'-f',
'--fsync',
action='store_true',
dest='fsync',
help='Synchronize database to disk on every write.')
parser.add_option(
'-j',
'--journal-mode',
default='wal',
dest='journal_mode',
help='SQLite journal mode. Defaults to WAL (recommended).')
parser.add_option(
'-l',
'--logfile',
dest='logfile',
help='Log file')
return parser
def parse_options():
option_parser = get_option_parser()
options, args = option_parser.parse_args()
if options.logfile:
handler = logging.FileHandler(options.logfile)
logger.addHandler(handler)
config_file = os.environ.get('SCOUT_CONFIG') or options.config
config = {'DATABASE': os.environ.get('SCOUT_DATABASE')}
if len(args) == 0 and not config['DATABASE']:
panic('Error: missing required path to database file.')
elif len(args) > 1:
panic('Error: [%s] only accepts one argument, which is the path '
'to the database file.' % __file__)
elif args:
config['DATABASE'] = args[0]
pragmas = [('journal_mode', options.journal_mode)]
if options.cache_size:
pragmas.append(('cache_size', -1024 * options.cache_size))
if not options.fsync:
pragmas.append(('synchronous', 0))
config['SQLITE_PRAGMAS'] = pragmas
# Handle command-line options. These values will override any values
# that may have been specified in the config file.
if options.api_key:
config['AUTHENTICATION'] = options.api_key
if options.debug:
config['DEBUG'] = True
config['HOST'] = options.host or '127.0.0.1'
config['PORT'] = options.port or 8000
config['URL_PREFIX'] = options.url_prefix or ''
if options.paginate_by:
if options.paginate_by < 1 or options.paginate_by > 1000:
panic('paginate-by must be between 1 and 1000')
config['PAGINATE_BY'] = options.paginate_by
if options.stem:
if options.stem not in ('simple', 'porter'):
panic('Unrecognized stemmer. Must be "porter" or "simple".')
config['STEM'] = options.stem
return create_server(config, config_file)
def main():
app = parse_options()
run(app)
if __name__ == '__main__':
main()
and the so-called client (scout_client.py):
import base64
import json
try:
from email.generator import _make_boundary as choose_boundary
except ImportError:
from mimetools import choose_boundary
import mimetypes
import os
try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
try:
from urllib.request import Request
from urllib.request import urlopen
except ImportError:
from urllib2 import Request
from urllib2 import urlopen
import zlib
ENDPOINT = None
KEY = None
class Scout(object):
def __init__(self, endpoint=ENDPOINT, key=KEY):
self.endpoint = endpoint.rstrip('/')
self.key = key
def get_full_url(self, url):
return self.endpoint + url
def get_raw(self, url, **kwargs):
headers = {'Content-Type': 'application/json'}
if self.key:
headers['key'] = self.key
if kwargs:
if '?' not in url:
url += '?'
url += urlencode(kwargs, True)
request = Request(self.get_full_url(url), headers=headers)
fh = urlopen(request)
return fh.read()
def get(self, url, **kwargs):
return json.loads(self.get_raw(url, **kwargs))
def post(self, url, data=None, files=None):
if files:
return self.post_files(url, data, files)
else:
return self.post_json(url, data)
def post_json(self, url, data=None):
headers = {'Content-Type': 'application/json'}
if self.key:
headers['key'] = self.key
data = json.dumps(data or {})
if not isinstance(data, bytes):
data = data.encode('utf-8')
request = Request(self.get_full_url(url), data=data, headers=headers)
return json.loads(urlopen(request).read().decode('utf8'))
def post_files(self, url, json_data, files=None):
if not files or not isinstance(files, dict):
raise ValueError('One or more files is required. Files should be '
'passed as a dictionary of filename: file-like-'
'object.')
boundary = choose_boundary()
form_files = []
for i, (filename, file_obj) in enumerate(files.items()):
try:
data = file_obj.read()
except AttributeError:
data = bytes(file_obj)
mimetype = mimetypes.guess_type(filename)[0]
form_files.append((
'file_%s' % i,
filename,
mimetype or 'application/octet-stream',
data))
part_boundary = '--' + boundary
parts = [
part_boundary,
'Content-Disposition: form-data; name="data"',
'',
json.dumps(json_data)]
for field_name, filename, mimetype, data in form_files:
parts.extend((
part_boundary,
'Content-Disposition: file; name="%s"; filename="%s"' % (
field_name, filename),
'Content-Type: %s' % mimetype,
'',
data))
parts.append('--' + boundary + '--')
parts.append('')
headers = {'Content-Type': 'multipart/form-data; boundary=%s' %
boundary}
if self.key:
headers['key'] = self.key
data = '\r\n'.join(parts)
if not isinstance(data, bytes):
data = data.encode('utf-8')
request = Request(self.get_full_url(url), data=data, headers=headers)
return json.loads(urlopen(request).read())
def delete(self, url):
headers = {}
if self.key:
headers['key'] = self.key
request = Request(self.get_full_url(url), headers=headers)
request.get_method = lambda: 'DELETE'
fh = urlopen(request)
return json.loads(fh.read())
def get_indexes(self, **kwargs):
return self.get('/', **kwargs)['indexes']
def create_index(self, name):
return self.post('/', {'name': name})
def rename_index(self, old_name, new_name):
return self.post('/%s/' % old_name, {'name': new_name})
def delete_index(self, name):
return self.delete('/%s/' % name)
def get_index(self, name, **kwargs):
return self.get('/%s/' % name, **kwargs)
def get_documents(self, **kwargs):
return self.get('/documents/', **kwargs)
def create_document(self, content, indexes, identifier=None,
attachments=None, **metadata):
if not isinstance(indexes, (list, tuple)):
indexes = [indexes]
post_data = {
'content': content,
'identifier': identifier,
'indexes': indexes,
'metadata': metadata}
return self.post('/documents/', post_data, attachments)
def update_document(self, document_id=None, content=None, indexes=None,
metadata=None, identifier=None, attachments=None):
if not document_id and not identifier:
raise ValueError('`document_id` must be provided.')
data = {}
if content is not None:
data['content'] = content
if indexes is not None:
if not isinstance(indexes, (list, tuple)):
indexes = [indexes]
data['indexes'] = indexes
if metadata is not None:
data['metadata'] = metadata
if not data and not attachments:
raise ValueError('Nothing to update.')
return self.post('/documents/%s/' % document_id, data, attachments)
def delete_document(self, document_id=None):
if not document_id:
raise ValueError('`document_id` must be provided.')
return self.delete('/documents/%s/' % document_id)
def get_document(self, document_id=None):
if not document_id:
raise ValueError('`document_id` must be provided.')
return self.get('/documents/%s/' % document_id)
def attach_files(self, document_id, attachments):
return self.post_files('/documents/%s/attachments/' % document_id,
{}, attachments)
def detach_file(self, document_id, filename):
return self.delete('/documents/%s/attachments/%s/' %
(document_id, filename))
def update_file(self, document_id, filename, file_object):
return self.post_files('/documents/%s/attachments/%s/' %
(document_id, filename),
{}, {filename: file_object})
def get_attachments(self, document_id, **kwargs):
return self.get('/documents/%s/attachments/' % document_id, **kwargs)
def get_attachment(self, document_id, filename):
return self.get('/documents/%s/attachments/%s/' %
(document_id, filename))
def download_attachment(self, document_id, filename):
return self.get_raw('/documents/%s/attachments/%s/download/' %
(document_id, filename))
def search_attachments(self, **kwargs):
return self.get('/documents/attachments/', **kwargs)
class SearchProvider(object):
def content(self, obj):
raise NotImplementedError
def identifier(self, obj):
raise NotImplementedError
def metadata(self, obj):
raise NotImplementedError
class SearchSite(object):
def __init__(self, client, index):
self.client = client
self.index = index
self.registry = {}
def register(self, model_class, search_provider):
self.registry.setdefault(model_class, [])
self.registry[model_class].append(search_provider())
def unregister(self, model_class, search_provider=None):
if search_provider is None:
self.registry.pop(model_class, None)
elif model_class in self.registry:
self.registry[model_class] = [
sp for sp in self.registry[model_class]
if not isinstance(sp, search_provider)]
def store(self, obj):
if type(obj) not in self.registry:
return False
for provider in self.registry[type(obj)]:
content = provider.content(obj)
try:
metadata = provider.metadata(obj)
except NotImplementedError:
metadata = {}
try:
identifier = provider.identifier(obj)
except NotImplementedError:
pass
else:
metadata['identifier'] = identifier
self.client.create_document(content, self.index, **metadata)
return True
def remove(self, obj):
if type(obj) not in self.registry:
return False
for provider in self.registry[type(obj)]:
self.client.delete_document(provider.identifier(obj))
return True
Finally the Documentation for Scout:
https://scout.readthedocs.io/en/latest/server.html#index-detail-index-name
https://charlesleifer.com/blog/meet-scout-a-search-server-powered-by-sqlite/
Any Detailed Help is much appreciated:)
So i find a lib called scout and...got it to work!
from scout_client import Scout
# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
import os
from flask import Flask, jsonify, request, render_template, json
client = Scout('http://localhost:8000')
for k in range(7,18):
read_pdf = PyPDF2.PdfFileReader("books/%s.pdf"%(k))
num = read_pdf.getNumPages()
print ("PDF pages:", num)
all_pages = []
for page in range(num):
data = read_pdf.getPage(page)
page_text = data.extractText()
all_pages.append(page_text)
import requests
for z in all_pages:
url = 'http://localhost:8000/documents/'
data = {'content': z, 'indexes': ['test13']}
headers = {
'Content-Type': 'application/json',
}
response = requests.post(url, data=json.dumps(data), headers=headers)
print(response)
I can now loop though as many PDF's as I want locally
Post to the server for indexing
and search for keywords
Now I just need help with Making a basic front end with a search bar that calls data from a JSON response in python and flask.

Python Ivona (Pyvona) script issue (local variable unbound)

I'm using the pyvona package (from July 3rd 2016). I have all dependencies installed. It works correctly when I call it to speak for the first time. But if I run the command again, it gives me the local unbound error:
>>> import pyvona
>>> v = pyvona.create_voice('<key>', '<secret>')
>>> v.speak('hello')
>>> v.speak('hello')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Python34\lib\site-packages\pyvona.py", line 159, in speak
channel.play(sound)
UnboundLocalError: local variable 'channel' referenced before assignment
>>>
Here's the pyvona.py script:
#!/usr/bin/env python
# encoding: utf-8
"""Pyvona : an IVONA python library
Author: Zachary Bears
Contact Email: bears.zachary#gmail.com
Note: Full operation of this library requires the requests and pygame libraries
"""
import datetime
import hashlib
import hmac
import json
import tempfile
import contextlib
import os
class PyvonaException(Exception):
pass
try:
import pygame
except ImportError:
pygame_available = False
else:
pygame_available = True
try:
import requests
requests.packages.urllib3.disable_warnings()
except ImportError:
msg = 'The requests library is essential for Pyvona operation. '
msg += 'Without it, Pyvona will not function correctly.'
raise PyvonaException(msg)
_amazon_date_format = '%Y%m%dT%H%M%SZ'
_date_format = '%Y%m%d'
def create_voice(access_key, secret_key):
"""Creates and returns a voice object to interact with
"""
return Voice(access_key, secret_key)
class Voice(object):
"""An object that contains all the required methods for interacting
with the IVONA text-to-speech system
"""
voice_name = None
language = None
gender = None
speech_rate = None
sentence_break = None
paragraph_break = None
_codec = "ogg"
region_options = {
'us-east': 'us-east-1',
'us-west': 'us-west-2',
'eu-west': 'eu-west-1',
}
access_key = None
secret_key = None
algorithm = 'AWS4-HMAC-SHA256'
signed_headers = 'content-type;host;x-amz-content-sha256;x-amz-date'
_region = None
_host = None
#property
def region(self):
return self._region
#region.setter
def region(self, region_name):
self._region = self.region_options.get(region_name, 'us-east-1')
self._host = 'tts.{}.ivonacloud.com'.format(self._region)
#property
def codec(self):
return self._codec
#codec.setter
def codec(self, codec):
if codec not in ["mp3", "ogg"]:
raise PyvonaException(
"Invalid codec specified. Please choose 'mp3' or 'ogg'")
self._codec = codec
#contextlib.contextmanager
def use_ogg_codec(self):
current_codec = self.codec
self.codec = "ogg"
try:
yield
finally:
self.codec = current_codec
def fetch_voice_ogg(self, text_to_speak, filename):
"""Fetch an ogg file for given text and save it to the given file name
"""
with self.use_ogg_codec():
self.fetch_voice(text_to_speak, filename)
def fetch_voice(self, text_to_speak, filename):
"""Fetch a voice file for given text and save it to the given file name
"""
file_extension = ".{codec}".format(codec=self.codec)
filename += file_extension if not filename.endswith(
file_extension) else ""
with open(filename, 'wb') as f:
self.fetch_voice_fp(text_to_speak, f)
def fetch_voice_fp(self, text_to_speak, fp):
"""Fetch a voice file for given text and save it to the given file pointer
"""
r = self._send_amazon_auth_packet_v4(
'POST', 'tts', 'application/json', '/CreateSpeech', '',
self._generate_payload(text_to_speak), self._region, self._host)
if r.content.startswith(b'{'):
raise PyvonaException('Error fetching voice: {}'.format(r.content))
else:
fp.write(r.content)
def speak(self, text_to_speak, use_cache=False):
"""Speak a given text
"""
if not pygame_available:
raise PyvonaException(
"Pygame not installed. Please install to use speech.")
if not pygame.mixer.get_init():
pygame.mixer.init()
channel = pygame.mixer.Channel(5)
if use_cache is False:
with tempfile.SpooledTemporaryFile() as f:
with self.use_ogg_codec():
self.fetch_voice_fp(text_to_speak, f)
f.seek(0)
sound = pygame.mixer.Sound(f)
else:
cache_f = hashlib.md5(text_to_speak).hexdigest() + '.ogg'
speech_cache_dir = os.getcwd() + '/speech_cache/'
if not os.path.isdir(speech_cache_dir):
os.makedirs(speech_cache_dir)
if not os.path.isfile(speech_cache_dir + cache_f):
with self.use_ogg_codec():
self.fetch_voice(text_to_speak, 'speech_cache/' + cache_f)
f = speech_cache_dir + cache_f
sound = pygame.mixer.Sound(f)
channel.play(sound)
while channel.get_busy():
pass
def list_voices(self):
"""Returns all the possible voices
"""
r = self._send_amazon_auth_packet_v4(
'POST', 'tts', 'application/json', '/ListVoices', '', '',
self._region, self._host)
return r.json()
def _generate_payload(self, text_to_speak):
return json.dumps({
'Input': {
"Type":"application/ssml+xml",
'Data': text_to_speak
},
'OutputFormat': {
'Codec': self.codec.upper()
},
'Parameters': {
'Rate': self.speech_rate,
'SentenceBreak': self.sentence_break,
'ParagraphBreak': self.paragraph_break
},
'Voice': {
'Name': self.voice_name,
'Language': self.language,
'Gender': self.gender
}
})
def _send_amazon_auth_packet_v4(self, method, service, content_type,
canonical_uri, canonical_querystring,
request_parameters, region, host):
"""Send a packet to a given amazon server using Amazon's signature Version 4,
Returns the resulting response object
"""
# Create date for headers and the credential string
t = datetime.datetime.utcnow()
amazon_date = t.strftime(_amazon_date_format)
date_stamp = t.strftime(_date_format)
# Step 1: Create canonical request
payload_hash = self._sha_hash(request_parameters)
canonical_headers = 'content-type:{}\n'.format(content_type)
canonical_headers += 'host:{}\n'.format(host)
canonical_headers += 'x-amz-content-sha256:{}\n'.format(payload_hash)
canonical_headers += 'x-amz-date:{}\n'.format(amazon_date)
canonical_request = '\n'.join([
method, canonical_uri, canonical_querystring, canonical_headers,
self.signed_headers, payload_hash])
# Step 2: Create the string to sign
credential_scope = '{}/{}/{}/aws4_request'.format(
date_stamp, region, service)
string_to_sign = '\n'.join([
self.algorithm, amazon_date, credential_scope,
self._sha_hash(canonical_request)])
# Step 3: Calculate the signature
signing_key = self._get_signature_key(
self.secret_key, date_stamp, region, service)
signature = hmac.new(
signing_key, string_to_sign.encode('utf-8'),
hashlib.sha256).hexdigest()
# Step 4: Create the signed packet
endpoint = 'https://{}{}'.format(host, canonical_uri)
authorization_header = '{} Credential={}/{}, ' +\
'SignedHeaders={}, Signature={}'
authorization_header = authorization_header.format(
self.algorithm, self.access_key, credential_scope,
self.signed_headers, signature)
headers = {
'Host': host,
'Content-type': content_type,
'X-Amz-Date': amazon_date,
'Authorization': authorization_header,
'x-amz-content-sha256': payload_hash,
'Content-Length': len(request_parameters)
}
# Send the packet and return the response
return requests.post(endpoint, data=request_parameters,
headers=headers)
def _sha_hash(self, to_hash):
return hashlib.sha256(to_hash.encode('utf-8')).hexdigest()
def _sign(self, key, msg):
return hmac.new(key, msg.encode('utf-8'), hashlib.sha256).digest()
def _get_signature_key(self, key, date_stamp, region_name, service_name):
k_date = self._sign(('AWS4{}'.format(key)).encode('utf-8'), date_stamp)
k_region = self._sign(k_date, region_name)
k_service = self._sign(k_region, service_name)
k_signing = self._sign(k_service, 'aws4_request')
return k_signing
def __init__(self, access_key, secret_key):
"""Set initial voice object parameters
"""
self.region = 'us-east'
self.voice_name = 'Brian'
self.access_key = access_key
self.secret_key = secret_key
self.speech_rate = 'medium'
self.sentence_break = 400
self.paragraph_break = 650
Any help is highly appreciated.
Was able to find a workaround by adding:
pygame.mixer.init()
channel = pygame.mixer.Channel(5)
immediately after defining the speak function:
def speak(self, text_to_speak, use_cache=False):
def speak(self, text_to_speak, use_cache=False):
"""Speak a given text
"""
pygame.mixer.init()
channel = pygame.mixer.Channel(5)
Usually UnboundLocalError relates to Scopes and Namespaces in Python Scopes and NameSpaces ,
but in Your case:
In function speak() You create channel in code
if not pygame.mixer.get_init():
pygame.mixer.init()
channel = pygame.mixer.Channel(5)
In case this code does not execute channel does not bound to any object.
For example, You can check this situation by this code sample:
def test_if():
if True:
# Bound
channel_1 = "Channel_1"
if False:
# Not bound
channel_2 = "Channel_2"
print(channel_1)
print(channel_2)
test_if()

Categories

Resources