JSON body is truncated when using non-English characters in AWS Lambda function - Python

I am using an API Gateway and an AWS Lambda function as a proxy to my company's API (C# Web API 2.0).
The Lambda function is written in Python 2.7 and I am using Python's urllib2 to pass the HTTP request to the API.
I encountered a strange issue when sending a JSON body containing Hebrew characters: the JSON is being cut off in the middle. I have verified that the JSON sent from the Lambda is complete, but the body received by the API is truncated somewhere along the way.
This is the Lambda function:
from __future__ import print_function

import json
import urllib2
import HTMLParser

base = "http://xxxxxx/api"
hparser = HTMLParser.HTMLParser()

def lambda_handler(event, context):
    print("Got event\n" + json.dumps(event, indent=2))

    # Form URL
    url = base + event['queryStringParameters']['rmt']
    print('URL = %s' % url)

    req = urllib2.Request(url)

    if 'body' in event:
        if event['body']:
            print('BODY = %s' % json.dumps(event['body'], ensure_ascii=False, encoding='utf8'))
            req.add_data(json.dumps(event['body'], ensure_ascii=False, encoding='utf8'))

    # Copy only some headers
    if 'headers' in event:
        if event['headers']:
            copy_headers = ('Accept', 'Content-Type', 'content-type')
            for h in copy_headers:
                if h in event['headers']:
                    print('header added = %s' % event['headers'][h])
                    req.add_header(h, event['headers'][h])

    # Build response
    out = {}
    headersjsonstr = ('Access-Control-Allow-Origin', '')
    response_header = {}

    try:
        print('Trying here...')
        resp = urllib2.urlopen(req)
        out['statusCode'] = resp.getcode()
        out['body'] = resp.read()
        for head in resp.info().headers:
            keyval = head.split(':')
            if any(keyval[0] in h for h in headersjsonstr):
                response_header[keyval[0]] = keyval[1].replace('\r', '').replace('\n', '').strip()
        print('response_header = %s' % response_header)
        out['headers'] = response_header
        print('status = %s' % out['statusCode'])
    except urllib2.HTTPError as e:
        out['statusCode'] = e.getcode()
        out['body'] = e.read()
        out['headers'] = e.headers
        print('status = %s' % out['statusCode'])

    return out
This is the POST request's raw JSON body:
{"company":"שלום","guests":[{"fullname":"אבי","carno":"67"}],"fromdate":"2018-10-10","todate":"2018-10-10","fromtime":"07:31","totime":"07:31","comments":null,"Employee":{"UserId":"ink1445"}}
And this is what I am getting on the API:
"{\"company\":\"שלום\",\"guests\":[{\"fullname\":\"אבי\",\"carno\":\"67\"}],\"fromdate\":\"2018-10-10\",\"todate\":\"2018-10-10\",\"fromtime\":\"07:31\",\"totime\":\"07:31\",\"comments\":null,\"Employee\":{\"UserId\":\"ink1
Again, when I send only English letters, everything is fine.
Please help!
Thanks

Very likely your JSON buffer is too small and you are getting overflow truncation.
The size was probably set assuming ASCII or UTF-8 encoding, and your Unicode characters are wider (consume more bytes each).
Depending on which JSON package you are using, you may be able to set an option for Unicode, or you might need to adjust the buffer size manually.
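One thing worth trying on the sending side (a minimal sketch, assuming the truncation comes from a byte/character length mismatch rather than anything inside the API itself): encode the body to UTF-8 bytes before attaching it to the request, so urllib2 computes Content-Length from the byte count rather than the character count. Hebrew characters take two bytes each in UTF-8, so a Content-Length derived from character count would cut the body short by exactly that difference. The build_request helper below is hypothetical, not part of the original function:
# -*- coding: utf-8 -*-
# Sketch (Python 2.7): attach the body as UTF-8 bytes so Content-Length
# reflects the byte length, not the unicode character count.
import json
import urllib2

def build_request(url, body_obj):  # hypothetical helper
    payload = json.dumps(body_obj, ensure_ascii=False)
    if isinstance(payload, unicode):
        payload = payload.encode('utf-8')  # bytes, so len() counts bytes
    req = urllib2.Request(url, data=payload)
    req.add_header('Content-Type', 'application/json; charset=utf-8')
    return req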

Related

Python SIEM log collector hangs randomly

I am trying to troubleshoot a script for Mimecast's API. The script runs fine for the most part, but a few times, I have noticed that it stops pulling logs and generally appears to be a hung process. After restarting the script and manually pushing logs to the syslog server, it starts working again without issue. I am not able to reproduce this issue at will.
The script is supposed to do the following:
Authenticate against Mimecast's API
Sign responses
Download, extract, and save log files to the log directory
Utilize a tokenized header to determine which file was downloaded in the last request, saving the token ID within a file in the checkpoint directory
Push files to the remote syslog server
Output any errors and info to the console
Below is the sample code from Mimecast.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging.handlers
import json
import os
import requests
import base64
import uuid
import datetime
import hashlib
import shutil
import hmac
import time
from zipfile import ZipFile
import io

# Set up variables
APP_ID = "YOUR DEVELOPER APPLICATION ID"
APP_KEY = "YOUR DEVELOPER APPLICATION KEY"
URI = "/api/audit/get-siem-logs"
EMAIL_ADDRESS = 'EMAIL ADDRESS OF YOUR ADMINISTRATOR'
ACCESS_KEY = 'ACCESS KEY FOR YOUR ADMINISTRATOR'
SECRET_KEY = 'SECRET KEY FOR YOUR ADMINISTRATOR'
LOG_FILE_PATH = "FULLY QUALIFIED PATH TO FOLDER TO WRITE LOGS"
CHK_POINT_DIR = 'FULLY QUALIFIED PATH TO FOLDER TO WRITE PAGE TOKEN'

# Set True to output to syslog, false to only save to file
syslog_output = False
# Enter the IP address or hostname of your syslog server
syslog_server = 'localhost'
# Change this to override default port
syslog_port = 514
# delete files after fetching
delete_files = True
# Set threshold in number of files in log file directory
log_file_threshold = 10000

# Set up logging (in this case to terminal)
log = logging.getLogger(__name__)
log.root.setLevel(logging.DEBUG)
log_formatter = logging.Formatter('%(levelname)s %(message)s')
log_handler = logging.StreamHandler()
log_handler.setFormatter(log_formatter)
log.addHandler(log_handler)

# Set up syslog output
syslog_handler = logging.handlers.SysLogHandler(address=(syslog_server, syslog_port))
syslog_formatter = logging.Formatter('%(message)s')
syslog_handler.setFormatter(syslog_formatter)
syslogger = logging.getLogger(__name__)
syslogger = logging.getLogger('SysLogger')
syslogger.addHandler(syslog_handler)

# Supporting methods
def get_hdr_date():
    return datetime.datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S UTC")

def read_file(file_name):
    try:
        with open(file_name, 'r') as f:
            data = f.read()
        return data
    except Exception as e:
        log.error('Error reading file ' + file_name + '. Cannot continue. Exception: ' + str(e))
        quit()

def write_file(file_name, data_to_write):
    if '.zip' in file_name:
        try:
            byte_content = io.BytesIO(data_to_write)
            zip_file = ZipFile(byte_content)
            zip_file.extractall(LOG_FILE_PATH)
        except Exception as e:
            log.error('Error writing file ' + file_name + '. Cannot continue. Exception: ' + str(e))
            quit()
    else:
        try:
            with open(file_name, 'w') as f:
                f.write(data_to_write)
        except Exception as e:
            log.error('Error writing file ' + file_name + '. Cannot continue. Exception: ' + str(e))
            quit()

def get_base_url(email_address):
    # Create post body for request
    post_body = dict()
    post_body['data'] = [{}]
    post_body['data'][0]['emailAddress'] = email_address
    # Create variables required for request headers
    request_id = str(uuid.uuid4())
    request_date = get_hdr_date()
    headers = {'x-mc-app-id': APP_ID, 'x-mc-req-id': request_id, 'x-mc-date': request_date}
    # Send request to API
    log.debug('Sending request to https://api.mimecast.com/api/discover-authentication with request Id: ' +
              request_id)
    try:
        r = requests.post(url='https://api.mimecast.com/api/login/discover-authentication',
                          data=json.dumps(post_body), headers=headers)
        # Handle Rate Limiting
        if r.status_code == 429:
            log.warning('Rate limit hit. sleeping for ' + str(r.headers['X-RateLimit-Reset'] * 1000))
            time.sleep(r.headers['X-RateLimit-Reset'] * 1000)
    except Exception as e:
        log.error('Unexpected error getting base url. Cannot continue. ' + str(e))
        quit()
    # Handle error from API
    if r.status_code != 200:
        log.error('Request returned with status code: ' + str(r.status_code) + ', response body: ' +
                  r.text + '. Cannot continue.')
        quit()
    # Load response body as JSON
    resp_data = json.loads(r.text)
    # Look for api key in region object to get base url
    if 'region' in resp_data["data"][0]:
        base_url = resp_data["data"][0]["region"]["api"].split('//')
        base_url = base_url[1]
    else:
        # Handle no region found, likely the email address was entered incorrectly
        log.error(
            'No region information returned from API, please check the email address. '
            'Cannot continue')
        quit()
    return base_url

def post_request(base_url, uri, post_body, access_key, secret_key):
    # Create variables required for request headers
    request_id = str(uuid.uuid4())
    request_date = get_hdr_date()
    unsigned_auth_header = '{date}:{req_id}:{uri}:{app_key}'.format(
        date=request_date,
        req_id=request_id,
        uri=uri,
        app_key=APP_KEY
    )
    hmac_sha1 = hmac.new(
        base64.b64decode(secret_key),
        unsigned_auth_header.encode(),
        digestmod=hashlib.sha1).digest()
    sig = base64.encodebytes(hmac_sha1).rstrip()
    headers = {
        'Authorization': 'MC ' + access_key + ':' + sig.decode(),
        'x-mc-app-id': APP_ID,
        'x-mc-date': request_date,
        'x-mc-req-id': request_id,
        'Content-Type': 'application/json'
    }
    try:
        # Send request to API
        log.debug('Sending request to https://' + base_url + uri + ' with request Id: ' + request_id)
        r = requests.post(url='https://' + base_url + uri, data=json.dumps(post_body), headers=headers)
        # Handle Rate Limiting
        if r.status_code == 429:
            log.warning('Rate limit hit. sleeping for ' + str(r.headers['X-RateLimit-Reset'] * 1000))
            time.sleep(r.headers['X-RateLimit-Reset'] * 1000)
            r = requests.post(url='https://' + base_url + uri, data=json.dumps(post_body), headers=headers)
    # Handle errors
    except Exception as e:
        log.error('Unexpected error connecting to API. Exception: ' + str(e))
        return 'error'
    # Handle errors from API
    if r.status_code != 200:
        log.error('Request to ' + uri + ' with request id: ' + request_id + ' returned with status code: ' +
                  str(r.status_code) + ', response body: ' + r.text)
        return 'error'
    # Return response body and response headers
    return r.content, r.headers

def get_mta_siem_logs(checkpoint_dir, base_url, access_key, secret_key):
    uri = "/api/audit/get-siem-logs"
    # Set checkpoint file name to store page token
    checkpoint_filename = os.path.join(checkpoint_dir, 'get_mta_siem_logs_checkpoint')
    # Build post body for request
    post_body = dict()
    post_body['data'] = [{}]
    post_body['data'][0]['type'] = 'MTA'
    post_body['data'][0]['compress'] = True
    if os.path.exists(checkpoint_filename):
        post_body['data'][0]['token'] = read_file(checkpoint_filename)
    # Send request to API
    resp = post_request(base_url, uri, post_body, access_key, secret_key)
    now = datetime.datetime.now().strftime("%a %b %d %H:%M:%S %Y")
    # Process response
    if resp != 'error':
        resp_body = resp[0]
        resp_headers = resp[1]
        content_type = resp_headers['Content-Type']
        # End if response is JSON as there is no log file to download
        if content_type == 'application/json':
            log.info('No more logs available')
            return False
        # Process log file
        elif content_type == 'application/octet-stream':
            file_name = resp_headers['Content-Disposition'].split('=\"')
            file_name = file_name[1][:-1]
            # Save files to LOG_FILE_PATH
            write_file(os.path.join(LOG_FILE_PATH, file_name), resp_body)
            # Save mc-siem-token page token to check point directory
            write_file(checkpoint_filename, resp_headers['mc-siem-token'])
            try:
                if syslog_output is True:
                    for filename in os.listdir(LOG_FILE_PATH):
                        file_creation_time = time.ctime(os.path.getctime(LOG_FILE_PATH + "/" + filename))
                        if now < file_creation_time or now == file_creation_time:
                            log.info('Loading file: ' + filename + ' to output to ' + syslog_server + ':' + str(syslog_port))
                            with open(file=os.path.join(LOG_FILE_PATH, filename), mode='r', encoding='utf-8') as log_file:
                                lines = log_file.read().splitlines()
                                for line in lines:
                                    syslogger.info(line)
                            log.info('Syslog output completed for file ' + filename)
            except Exception as e:
                log.error('Unexpected error writing to syslog. Exception: ' + str(e))
            # return true to continue loop
            return True
        else:
            # Handle errors
            log.error('Unexpected response')
            for header in resp_headers:
                log.error(header)
            return False

def run_script():
    # discover base URL
    try:
        base_url = get_base_url(email_address=EMAIL_ADDRESS)
    except Exception as e:
        log.error('Error discovering base url for ' + EMAIL_ADDRESS + ' . Exception: ' + str(e))
        quit()
    # Request log data in a loop until there are no more logs to collect
    try:
        log.info('Getting MTA log data')
        while get_mta_siem_logs(checkpoint_dir=CHK_POINT_DIR, base_url=base_url, access_key=ACCESS_KEY,
                                secret_key=SECRET_KEY) is True:
            log.info('Getting more MTA log files')
    except Exception as e:
        log.error('Unexpected error getting MTA logs ' + (str(e)))
    file_number = len([name for name in os.listdir(LOG_FILE_PATH) if os.path.isfile(name)])
    if delete_files or file_number >= log_file_threshold:
        for filename in os.listdir(LOG_FILE_PATH):
            file_path = os.path.join(LOG_FILE_PATH, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print('Failed to delete %s. Reason: %s' % (file_path, e))
                quit()

# Run script
run_script()
It seems like it may potentially be a race condition, but I am not sure how to confirm that since I can't reproduce it. I notice that SumoLogic has a modified version of this script as well, with a different methodology for managing the files/paths. If this script works better than the main sample script above, would anybody be able to explain WHY? I haven't had any issues with it yet.
https://github.com/SumoLogic/sumologic-content/blob/master/MimeCast/SumoLogic-Mimecast-Data-Collection/siem_collection.py
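One thing that stands out in the sample (an observation of mine, not something confirmed in the thread): every requests.post call is made without a timeout, and requests waits indefinitely by default, so a single stalled connection would look exactly like a hung process. Separately, r.headers['X-RateLimit-Reset'] is a string, so multiplying it by 1000 repeats the text rather than scaling a number. A minimal sketch of a guarded call, with the hypothetical helper name post_with_timeout:
import requests

def post_with_timeout(url, payload, headers):
    # Sketch: 10s to establish the connection, 300s to read the response.
    # Without a timeout, a stalled socket blocks this thread forever.
    try:
        return requests.post(url=url, data=payload, headers=headers,
                             timeout=(10, 300))
    except requests.exceptions.Timeout:
        return 'error'  # mirrors the script's existing error convention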

Cannot Process decoded Image files, Flask, OpenCV

I am receiving a bunch of images in the Flask app, sent via the client file below.
client.py
# Generate the parallel requests based on the ThreadPool Executor
from concurrent.futures import ThreadPoolExecutor as PoolExecutor
import sys
import time
import glob
import requests
import threading
import uuid
import base64
import json
import os

# send http request
def call_object_detection_service(image):
    try:
        url = str(sys.argv[2])
        data = {}
        # generate uuid for image
        id = uuid.uuid5(uuid.NAMESPACE_OID, image)
        # Encode image into base64 string
        with open(image, 'rb') as image_file:
            data['image'] = base64.b64encode(image_file.read()).decode('utf-8')
        data['id'] = str(id)
        headers = {'Content-Type': 'application/json'}
        response = requests.post(url, json=json.dumps(data), headers=headers)
        if response.ok:
            output = "Thread : {}, input image: {}, output:{}".format(threading.current_thread().getName(),
                                                                      image, response.text)
            print(output)
        else:
            print("Error, response status:{}".format(response))
    except Exception as e:
        print("Exception in webservice call: {}".format(e))

# gets list of all images path from the input folder
def get_images_to_be_processed(input_folder):
    images = []
    for image_file in glob.iglob(input_folder + "*.jpg"):
        images.append(image_file)
    return images

def main():
    # provide arguments -> input folder, url, number of workers
    if len(sys.argv) != 4:
        raise ValueError("Arguments list is wrong. Please use the following format: {} {} {} {}".
                         format("python iWebLens_client.py", "<input_folder>", "<URL>", "<number_of_workers>"))
    input_folder = os.path.join(sys.argv[1], "")
    images = get_images_to_be_processed(input_folder)
    num_images = len(images)
    num_workers = int(sys.argv[3])
    start_time = time.time()
    # create a worker thread to invoke the requests in parallel
    with PoolExecutor(max_workers=num_workers) as executor:
        for _ in executor.map(call_object_detection_service, images):
            pass
    # elapsed_time = time.time() - start_time
    # print("Total time spent: {} average response time: {}".format(elapsed_time, elapsed_time/num_images))

if __name__ == "__main__":
    main()
I decode them like so
Flask App
# imports implied by the snippet
from flask import Flask, request, jsonify
import base64
import cv2

app = Flask(__name__)
c = 1

@app.route('/api/object_detection', methods=['POST'])
def main():
    global c
    try:
        data = request.get_json(force=True)
        uid = data.get('id')
        image = data.get('image')
        print(image)
        im = base64.decodebytes(image)
        with open("image{}".format(c), 'wb') as f:
            f.write(im)
        c += 1
        for l in range(128):
            img = cv2.imread("image{}".format(l), cv2.IMREAD_ANYCOLOR)
            # load the neural net. Should be local to this method as it's a multi-threaded endpoint
            # (load_model, do_prediction, CFG, Weights and Lables come from the asker's YOLO code, not shown)
            nets = load_model(CFG, Weights)
            s = do_prediction(img, nets, Lables)
        return jsonify(s)
    except Exception as e:
        print(e)

if __name__ == '__main__':
    app.run(host='0.0.0.0', debug=True, threaded=True)
This creates the image files with different sizes, but I cannot view them in an image viewer. The files being received are jpg files. Ignoring that, I went ahead with the processing and I get:
TypeError: The view function for 'main' did not return a valid response. The function either returned None or ended without a return statement.
Incorrect padding
Incorrect padding
[INFO] loading YOLO from disk...
'NoneType' object has no attribute 'shape'
Images are being sent like this.
python iWebLens_client.py inputfolder/ http://192.168.29.75:5000/api/object_detection 4
The images are being received like this.
b'"{\\"image\\": \\"/9j/4AAQSkZJRgABAQEASABIAAD/4gxYSUNDX1BST0ZJTEUAAQEAAAxITGlubwIQAABtbnRyUkdCIFhZWiAHzgACAAkABgAxAABhY3NwTVNGVAAAAABJRUMgc1JHQgAAAAAAAAAAAAAAAAAA......fiU05tQopHNf//Z\\", \\"id\\": \\"e3ad9809-b84c-57f1-bd03-a54e25c59bcc\\"}"'
I am thinking I need to decode('utf-8') this, but don't know how.
Currently, you are double-encoding the data on the client side: the value you pass to requests' json parameter is converted to JSON again, so your already-serialized string gets serialized a second time.
Just pass the dict as the json parameter.
def call_object_detection_service(image):
    try:
        url = str(sys.argv[2])
        data = {}
        # generate uuid for image
        id = uuid.uuid5(uuid.NAMESPACE_OID, image)
        # Encode image into base64 string
        with open(image, 'rb') as image_file:
            data['image'] = base64.b64encode(image_file.read()).decode('utf-8')
        data['id'] = str(id)
        headers = {'Content-Type': 'application/json'}
        # HERE IS THE CHANGE !!!
        response = requests.post(url, json=data, headers=headers)
        if response.ok:
            output = "Thread : {}, input image: {}, output:{}".format(
                threading.current_thread().getName(),
                image,
                response.text
            )
            print(output)
        else:
            print("Error, response status:{}".format(response))
    except Exception as e:
        print("Exception in webservice call: {}".format(e))
The data can now be received on the server as JSON and extracted into a dict.
@app.route('/api/object_detection', methods=['POST'])
def main():
    data = request.get_json(force=True)
    uid = data.get('id')
    image = data.get('image')
    # ... decode the base64 data here ...
    return jsonify(message='done')
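For the decoding step left open above, a minimal sketch (my suggestion, not part of the original answer): once the double-encoding is fixed, the base64 string decodes cleanly, and OpenCV can read the image straight from memory instead of from extension-less files on disk. The decode_image helper is hypothetical:
import base64
import cv2
import numpy as np

def decode_image(b64_string):  # hypothetical helper
    raw = base64.b64decode(b64_string)           # base64 str -> JPEG bytes
    buf = np.frombuffer(raw, dtype=np.uint8)     # bytes -> numpy buffer
    return cv2.imdecode(buf, cv2.IMREAD_ANYCOLOR)  # buffer -> image array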

ERROR uploading: 'latin-1' codec can't encode character '\u2019' with JSON data upload

I am using Python to upload some JSON data to the application UI, but I am getting the following error:
ERROR uploading: 'latin-1' codec can't encode character '\u2019' in position 5735: Body ('’') is not valid Latin-1. Use body.encode('utf-8') if you want to send it encoded in UTF-8.
The program takes its input from a sample.json file which includes a special character ( ' ) and that's what is causing the error.
Value: amex%?'
My code looks like:
# imports implied by the snippet
import json
import sys
import requests

def read_from_file(file_path, target_path=None):
    try:
        f = open(file_path, "r")
        data = json.load(f)
        f.close()
        if target_path:
            result_obj = []
            for obj in data:
                if target_path in obj['Key']:
                    result_obj.append(obj)
            data = result_obj
    except Exception as e:
        print("ERROR reading file:", e, file=sys.stderr)
        exit(1)
    return data

def upload(server, token, data):
    params = {"token": token}
    for obj in data:
        try:
            payload = obj['Value']
            url = server + obj['Key']
            response = requests.put(url, data=payload, params=params)
            if response.status_code != 200:
                raise Exception("HTTP code %s on PUT %s" % (response.status_code, url))
        except Exception as e:
            print("ERROR uploading:", e, file=sys.stderr)
            exit(1)
Can somebody please advise where I need to change my code so that the special character ( ' ) is included in the upload?
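No answer is recorded here, but the error message itself suggests the fix: requests tries to encode a str body as Latin-1, and '\u2019' (a curly apostrophe) has no Latin-1 representation. Encoding the payload to UTF-8 bytes before the PUT sidesteps this. A minimal sketch of the changed lines in upload(), assuming the server expects UTF-8:
payload = obj['Value']
# Sketch: encode to UTF-8 bytes so requests sends the body verbatim
# instead of trying (and failing) to encode it as Latin-1
if isinstance(payload, str):
    payload = payload.encode('utf-8')
response = requests.put(url, data=payload, params=params)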

Python script stops working when I put it inside a function

Little bit of background: I'm using Python 2.7.12 on a Windows 10 computer.
This is by far one of the oddest problems I have ever encountered with Python.
I have written a script that makes a GET request to an API, with the correct headers, and gets some XML data back. For the record, when I paste the script as-is into a Python file and run it via CMD, it works perfectly fine.
But..
It stops working as soon as I wrap this inside a function. Nothing else, just wrap it inside a function, and use
if __name__ == '__main__':
    my_new_function()
to run it from CMD, and it won't work anymore. The script still runs, but the API says I have the wrong auth credentials, and thus I don't get any data back.
I went over every string in this code, and it is all ASCII encoded. I also checked the timestamps, and they are all correct.
This is my script:
# imports implied by the snippet
import base64
import hashlib
import hmac
import time
import requests

SECRET_KEY = 'YYY'
PUBLIC_KEY = 'XXX'

content_type = 'application/xml'
date = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime())
method = 'GET'
uri = '/uri'

msg = """{method}
{content_type}
{date}
x-bol-date:{date}
{uri}""".format(content_type=content_type,
                date=date,
                method=method,
                uri=uri)

h = hmac.new(
    SECRET_KEY,
    msg, hashlib.sha256)
b64 = base64.b64encode(h.digest())
signature = PUBLIC_KEY + b':' + b64

headers = {'Content-Type': content_type,
           'X-BOL-Date': date,
           'X-BOL-Authorization': signature}

r = requests.get('example.com/uri', headers=headers)
the same code inside a function:
def get_orders():
    SECRET_KEY = 'XXX'
    PUBLIC_KEY = 'YYY'
    content_type = 'application/xml'
    date = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime())
    method = 'GET'
    uri = '/uri'
    msg = """{method}
    {content_type}
    {date}
    x-bol-date:{date}
    {uri}""".format(content_type=content_type,
                    date=date,
                    method=method,
                    uri=uri)
    h = hmac.new(
        SECRET_KEY,
        msg, hashlib.sha256)
    b64 = base64.b64encode(h.digest())
    signature = PUBLIC_KEY + b':' + b64
    headers = {'Content-Type': content_type,
               'X-BOL-Date': date,
               'X-BOL-Authorization': signature}
    r = requests.get('example.com/uri', headers=headers)

if __name__ == '__main__':
    get_orders()
I think your multi-line string is getting spaces in it when you indent it in a function. Concatenate it on each line instead and it should work.
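A minimal sketch of that fix, building the message with explicit newline concatenation so indentation inside get_orders() cannot leak spaces into the signed message:
# Sketch: explicit '\n' concatenation keeps leading whitespace out of the
# HMAC input no matter how deeply this code is indented inside a function.
msg = (method + '\n' +
       content_type + '\n' +
       date + '\n' +
       'x-bol-date:' + date + '\n' +
       uri)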

How to force http.client to send chunked-encoding HTTP body in python?

I want to send a chunked HTTP body to test my own HTTP server.
So I wrote this Python code:
import http.client
body = 'Hello World!' * 80
conn = http.client.HTTPConnection("some.domain.com")
url = "/some_path?arg=true_arg"
conn.request("POST", url, body, {"Transfer-Encoding":"chunked"})
resp = conn.getresponse()
print(resp.status, resp.reason)
I expect the HTTP request's body to be transferred chunked,
but when I capture the network packets with Wireshark, the body is not transferred chunked.
How can I transfer a chunked body with the http.client lib in Python?
OK, I got it.
First, write my own chunked-encoding function.
Then use putrequest(), putheader(), endheaders() and send() instead of request():
import http.client

def chunk_data(data, chunk_size):
    # each chunk is: <size in hex>\r\n<data>\r\n, ended by a zero-size chunk
    dl = len(data)
    ret = ""
    for i in range(dl // chunk_size):
        ret += "%s\r\n" % (hex(chunk_size)[2:])
        ret += "%s\r\n" % (data[i * chunk_size : (i + 1) * chunk_size])
    if len(data) % chunk_size != 0:
        ret += "%s\r\n" % (hex(len(data) % chunk_size)[2:])
        ret += "%s\r\n" % (data[-(len(data) % chunk_size):])
    ret += "0\r\n\r\n"
    return ret

host = "some.domain.com"    # as in the question
body = 'Hello World!' * 80  # as in the question
size_per_chunk = 64

conn = http.client.HTTPConnection(host)
url = "/some_path"
conn.putrequest('POST', url)
conn.putheader('Transfer-Encoding', 'chunked')
conn.endheaders()
conn.send(chunk_data(body, size_per_chunk).encode('utf-8'))
resp = conn.getresponse()
print(resp.status, resp.reason)
conn.close()
I'd suggest that if you already know the size of your data, as in the answer above, you could just set Content-Length and send it all in one hit, which is effectively what the single call to conn.send does anyway.
Chunked transfer encoding is most useful when you don't know up front how big the data is, e.g. dynamically generated content. I've modified your code to illustrate:
import httplib

def write_chunk(conn, data):
    conn.send("%s\r\n" % hex(len(data))[2:])
    conn.send("%s\r\n" % data)

def dynamically_generate_data():
    for i in range(80):
        yield "hello world"

conn = httplib.HTTPConnection("localhost")
url = "/some_path"
conn.putrequest('POST', url)
conn.putheader('Transfer-Encoding', 'chunked')
conn.endheaders()
for new_chunk in dynamically_generate_data():
    write_chunk(conn, new_chunk)
# terminating chunk: zero size, then a final blank line
conn.send('0\r\n\r\n')
resp = conn.getresponse()
print(resp.status, resp.reason)
conn.close()
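As a side note (not part of the original answers): on Python 3.6 and later, http.client.HTTPConnection.request() accepts encode_chunked=True together with an iterable body, in which case the library applies the chunk framing itself. A minimal sketch, assuming Python 3.6+:
import http.client

body = 'Hello World!' * 80

def gen_chunks(data, chunk_size=64):
    # yield the body piece by piece; http.client adds the chunk framing
    for i in range(0, len(data), chunk_size):
        yield data[i:i + chunk_size].encode('utf-8')

conn = http.client.HTTPConnection("some.domain.com")
conn.request("POST", "/some_path?arg=true_arg", body=gen_chunks(body),
             headers={"Transfer-Encoding": "chunked"}, encode_chunked=True)
resp = conn.getresponse()
print(resp.status, resp.reason)
conn.close()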
