I am able to generate and stream text on the fly, but unable to generate and stream a compressed file on the fly.
from flask import Flask, request, Response,stream_with_context
import zlib
import gzip
app = Flask(__name__)
def generate_text():
    """Yield 10000 sample lines, each encoded to UTF-8 bytes.

    Every item has the form ``b"this is my line: <x>\\n"`` for ``x`` counting
    up from 0. The lines are produced lazily, one per iteration.
    """
    for x in range(10000):
        yield f"this is my line: {x}\n".encode()
@app.route('/stream_text')
def stream_text():
    """Stream the lines produced by ``generate_text()`` as the response body."""
    # stream_with_context keeps the request context available while the
    # generator is being consumed.
    response = Response(stream_with_context(generate_text()))
    return response
# Generator from the question: compresses each generated line on its own.
# NOTE: every zlib.compress() call returns a complete, self-contained zlib
# document, so the concatenation of the yielded items is a series of separate
# documents rather than one continuous compressed stream.
def generate_zip():
for x in range(10000):
yield zlib.compress(f"this is my line: {x}\n".encode())
@app.route('/stream_zip')
def stream_zip():
    """Stream the output of ``generate_zip()`` as a file download named data.gz."""
    # NOTE(review): the payload comes from zlib, yet it is labelled
    # 'application/zip' and saved with a .gz extension; the label and file
    # name are kept exactly as posted.
    response = Response(stream_with_context(generate_zip()), mimetype='application/zip')
    response.headers['Content-Disposition'] = 'attachment; filename=data.gz'
    return response
if __name__ == '__main__':
    # NOTE(review): debug=True combined with host='0.0.0.0' makes the
    # development server (and its debugger) reachable from other machines;
    # acceptable for a local demo only.
    app.run(host='0.0.0.0', port=8000, debug=True)
Then, using curl and gunzip:
curl http://127.0.0.1:8000/stream_zip > data.gz
gunzip data.gz
gunzip: data.gz: not in gzip format
I don't care if it is zip, gzip, or any other type of compression.
generate_text in my real code generates over 4 GB of data so I would like to compress on the fly.
Saving the text to a file, zipping it, returning the zip file, and then deleting it is not the solution I'm after.
I need to be in a loop generating some text -> compress that text -> streaming compressed data until I'm done.
zip/gzip ... anything is fine as long as it works.
You are yielding a series of compressed documents, not a single compressed stream. Don't use zlib.compress(), it includes the header and forms a single document.
You need to create a zlib.compressobj() object instead, and use the Compress.compress() method on that object to produce a stream of data (followed by a final call to Compress.flush()):
def generate_zip():
    """Yield one continuous zlib-compressed stream of the sample lines.

    A single ``zlib.compressobj()`` is fed line by line, so all yielded
    chunks together form one zlib document.
    """
    compressor = zlib.compressobj()
    for x in range(10000):
        chunk = compressor.compress(f"this is my line: {x}\n".encode())
        # compress() returns b"" while the compressor is still buffering;
        # only forward chunks that actually contain data.
        if chunk:
            yield chunk
    # flush() emits whatever is still buffered and terminates the stream.
    yield compressor.flush()
The compressor can produce empty blocks when there is not enough data yet to produce a full compressed-data chunk, the above only yields if there is actually anything to send. Because your input data is so highly repetitive and thus the data can be efficiently compressed, this yields only 3 times (once with 2-byte header, once with about 21kb of compressed data covering the first 8288 iterations over range(), and finally with the remaining 4kb for the rest of the loop).
In aggregate, this produces the same data as a single zlib.compress() call with all inputs concatenated. The correct mime-type for this data format is application/zlib, not application/zip.
This format is not readily decompressible with gzip however, not without some trickery. That's because the above doesn't yet produce a GZIP file, it just produces a raw zlib-compressed stream. To make it GZIP compatible, you need to configure the compression correctly, send a header first, and add a CRC checksum and data length value at the end:
import zlib
import struct
import time
def generate_gzip():
    """Yield a gzip-format stream of the sample lines, chunk by chunk.

    The output consists of a 10-byte gzip header, a raw deflate stream, and
    an 8-byte trailer holding the CRC-32 and the length (mod 2**32) of the
    uncompressed data, all produced incrementally.
    """
    # Yield a gzip file header first.
    yield bytes([
        0x1F, 0x8B, 0x08, 0x00,  # Gzip file, deflate, no filename
        # compression start time; masked so it always fits the 32-bit field
        *struct.pack('<L', int(time.time()) & 0xFFFFFFFF),
        0x02, 0xFF,  # maximum compression, no OS specified
    ])

    # bookkeeping: the compression state, running CRC and total length
    # Negative wbits selects a raw deflate stream (no zlib header/trailer),
    # because the gzip framing is written by hand here.
    compressor = zlib.compressobj(
        9, zlib.DEFLATED, -zlib.MAX_WBITS, zlib.DEF_MEM_LEVEL, 0)
    crc = zlib.crc32(b"")
    length = 0

    for x in range(10000):
        data = f"this is my line: {x}\n".encode()
        chunk = compressor.compress(data)
        if chunk:
            yield chunk
        crc = zlib.crc32(data, crc) & 0xFFFFFFFF
        length += len(data)

    # Finishing off, send remainder of the compressed data, and CRC and length
    yield compressor.flush()
    yield struct.pack("<2L", crc, length & 0xFFFFFFFF)
Serve this as application/gzip:
@app.route('/stream_gzip')
def stream_gzip():
    """Stream the output of ``generate_gzip()`` as a gzip download named data.gz."""
    response = Response(stream_with_context(generate_gzip()), mimetype='application/gzip')
    response.headers['Content-Disposition'] = 'attachment; filename=data.gz'
    return response
and the result can be decompressed on the fly:
curl http://127.0.0.1:8000/stream_gzip | gunzip -c | less
While I was extremely impressed by Martijn's solution, I decided to roll my own one that uses pigz for better performance:
def yield_pigz(results, compresslevel=1):
    """Compress ``results`` through an external ``pigz`` process and yield its output.

    Args:
        results: iterable of ``bytes`` chunks (``str`` chunks are encoded as
            UTF-8 before being written to pigz).
        compresslevel: value passed to pigz as ``-<level>``.

    Yields:
        Blocks of up to 4096 bytes read from pigz's standard output.

    A helper thread feeds pigz's stdin while this generator drains its
    stdout, so neither pipe can fill up and stall the other side.
    """
    cmd = ['pigz', '-%d' % compresslevel]
    pigz_proc = subprocess.Popen(cmd, bufsize=0,
                                 stdin=subprocess.PIPE, stdout=subprocess.PIPE)

    def feed():
        try:
            for result in results:
                if isinstance(result, str):
                    # stdin is a binary pipe; text must be encoded first.
                    result = result.encode()
                pigz_proc.stdin.write(result)
        except BrokenPipeError:
            # pigz has gone away (e.g. it was killed because the consumer
            # abandoned the stream); nothing more can be written.
            pass
        finally:
            # Always signal EOF, even if the producer raised, so pigz can
            # finish and the reader loop below terminates.
            try:
                pigz_proc.stdin.close()
            except OSError:
                pass

    writer = threading.Thread(target=feed)
    completed = False
    writer.start()
    try:
        while True:
            buf = pigz_proc.stdout.read(4096)
            if not buf:
                break
            yield buf
        completed = True
    finally:
        if not completed:
            # The consumer stopped early (or reading failed). Nobody drains
            # stdout any more, so pigz and the writer thread could block
            # forever; kill pigz so that both unblock.
            pigz_proc.kill()
        pigz_proc.stdout.close()
        writer.join()
        pigz_proc.wait()
Keep in mind that you'll need to import subprocess and threading for this to work. You will also need to install pigz program (already in repositories of most Linux distributions -- on Ubuntu, just use sudo apt install pigz -y).
Example usage:
from flask import Flask, Response
import subprocess
import threading
import random
app = Flask(__name__)
def yield_something_random():
    """Yield 10000 strings of 1000 random uppercase ASCII letters each."""
    for _ in range(10000):
        seq = [chr(random.randint(ord('A'), ord('Z'))) for _ in range(1000)]
        yield ''.join(seq)
@app.route('/')
def index():
    """Serve the random sample data, compressed on the fly by pigz."""
    # pigz reads from a binary pipe, so the text chunks are encoded first.
    chunks = (text.encode() for text in yield_something_random())
    return Response(yield_pigz(chunks), mimetype='application/gzip')
I think that currently you are just sending the generator instead of the data!
You may want to do something like this (I haven't tested it, so may need some change):
def generate_zip():
    """Return the sample lines as one gzip-compressed ``bytes`` object.

    The whole payload is built in an in-memory buffer before it is returned,
    so this variant does not stream.
    """
    import io
    # Keep a reference to the buffer: the compressed bytes live there, not in
    # the GzipFile wrapper.
    buffer = io.BytesIO()
    with gzip.GzipFile(fileobj=buffer, mode='w') as gfile:
        for x in range(10000):
            gfile.write("this is my line: {}\n".format(x).encode())
    # Leaving the with-block closes the GzipFile, which writes the trailer.
    return buffer.getvalue()
Working generate_zip() with low memory consumption :) :
def generate_zip():
    """Yield a gzip stream of the sample lines while keeping the buffer small.

    The GzipFile writes into an in-memory buffer; after every line, whatever
    compressed bytes have reached the buffer are yielded and the buffer is
    emptied, so it never holds more than the not-yet-yielded output.
    """
    import io
    buff = io.BytesIO()
    with gzip.GzipFile(mode='w', fileobj=buff) as gz:
        for x in range(10000):
            gz.write("this is my line: {}\n".format(x).encode())
            # After a write the buffer position is at its end, so read()
            # would return nothing; take the contents with getvalue().
            data = buff.getvalue()
            if data:
                yield data
                # Rewind before truncating so the buffer is really emptied.
                buff.seek(0)
                buff.truncate()
    # Closing the GzipFile flushes the remaining data and the gzip trailer.
    yield buff.getvalue()
Related
I am trying to read a huge binary file.
I have used chunks method and i am breaking my stream in chunks like:
with open("testUniform.bin", 'rb') as f:
for chunk in iter(lambda: f.read(4096), b''):
dostuff(chunk)
All I am getting in chunk is this:
.y.$.!...
like stream,which I have attached a screenshot of.
I don't get the way out to convert it back to binary stream and I still don't know that why is this stream being converted into something like this.
Please help:
I am trying to convert a binary stream to a decimal values, but since being a huge file I cannot apply
f.read()
Here is my code attached:
from math import log,ceil,pow
from flask import Flask, request
from flask_restful import Resource, Api
import struct
app = Flask(__name__)
api = Api(app)
def binaryToDecimal(n):
    """Parse ``n`` as a base-2 numeral (e.g. ``"101"``) and return the integer."""
    return int(n, 2)
# Build a list of numbers from `inputarray` that fall inside the range given
# by the request's query parameters.
# Reads 'lower_end_range', 'higher_end_range' and 'amount' from request.args
# (converted to int) and returns {'finalrandomarray': [...]}.
# NOTE(review): the indentation of this snippet was lost, so the intended
# nesting of the loops and of the final return cannot be confirmed from here.
def dostuff(inputarray):
args = request.args
lower_end_range = args['lower_end_range']
higher_end_range = args['higher_end_range']
amount = args['amount']
lower_end_range =int(lower_end_range)
higher_end_range=int(higher_end_range)
amount =int(amount)
#range_input is the range
range_input=higher_end_range-lower_end_range+1
#taking the log of the range to generate offset
log_of_range=log(range_input,2)
log_of_range=int(ceil(log_of_range))
higher_end_range_represented_by_bits = 0
lower_end_range_represented_by_bits = 0
lst = []
FinalRandomArray = []
#creating the maximum of numbers which it can go to by saving,for ex: 2^3+2^2+2^1+2^0
for i in range(0,(log_of_range)):
higher_end_range_represented_by_bits+=pow(2,i)
# Collect the remainders of repeatedly halving range_input; len(lst) is used
# below as the window size.
# NOTE(review): under Python 3 `/` is true division, so range_input becomes a
# float here — confirm whether integer division (`//`) was intended.
while True:
i=range_input%2
range_input=range_input/2
lst.append(i)
if range_input==0:
break
length = len(lst)
#where length is equal to the window size
# NOTE(review): the loop steps by `length` but always combines exactly three
# elements (file, file+1, file+2); the callers in this file pass a bytes chunk
# read in 'rb' mode — confirm the elements are the '0'/'1' strings that
# binaryToDecimal expects.
for file in range(0,len(inputarray),length):
print(inputarray[0])
number=binaryToDecimal((inputarray[file]+inputarray[file+1]+inputarray[file+2]))+lower_end_range
if(number>=lower_end_range and number<=higher_end_range):
if(amount!=0):
FinalRandomArray.append(number)
amount-=1
return {'finalrandomarray':FinalRandomArray}
# REST resource whose GET handler reads testUniform.bin in 4096-byte chunks
# and passes every chunk to dostuff().
# NOTE(review): the value returned by dostuff() is not used and get() has no
# return statement — confirm what the endpoint is meant to respond with.
class ReturnMainModule(Resource):
def get(self):
with open("testUniform.bin", 'rb') as f:
# iter(callable, sentinel) keeps calling f.read(4096) until it returns b''.
for chunk in iter(lambda: f.read(4096), b''):
dostuff(chunk)
api.add_resource(ReturnMainModule, '/main')
# Driver code
if __name__ == '__main__':
app.run(port='5004')
I've read the Google documentation and looked at their examples; however, I have not managed to get this working correctly in my particular use case. The problem is that the packets of the audio stream are broken up into smaller chunks (frame size), base64 encoded and sent over MQTT - meaning that the generator approach is likely to stop part way through despite the stream not being fully completed by the sender. My MicrophoneSender component will send the final part of the message with a segment_key = -1, so this is the flag that the complete message has been sent and that a full/final process of the stream can be completed. Prior to that point the buffer may not have all of the complete stream, so it's difficult to get either a) the generator to stop yielding or b) Google to return a partial transcription. A partial transcription is required once every 10 or so frames.
To illustrate this better here is my code.
inside receiver:
STREAMFRAMETHRESHOLD = 10
# MQTT message callback for the audio receiver.
# For topics starting with "MicSender/stream" the JSON payload is decoded, its
# base64 'audio_data' is turned back into bytes and 'segment_num' is read as
# the frame number. Frame 0 records the start time; non-negative frames are
# stored on self.asr; the else branch treats the message as the end of the
# stream, finishes it and sends the transcript.
# NOTE(review): the indentation of this snippet was lost, so which `if` the
# `else` belongs to cannot be confirmed from here.
def mqttMsgCallback(self, client, userData, msg):
if msg.topic.startswith("MicSender/stream"):
msgDict = json.loads(msg.payload)
streamBytes = b64decode(msgDict['audio_data'].encode('utf-8'))
frameNum = int(msgDict['segment_num'])
if frameNum == 0:
self.asr_time_start = time.time()
self.asr.endOfStream = False
if frameNum >= 0:
self.asr.store_stream_bytes(streamBytes)
self.asr.endOfStream = False
# Request an intermediate transcript every STREAMFRAMETHRESHOLD frames.
if frameNum % STREAMFRAMETHRESHOLD == 0:
self.asr.get_intermediate_and_print()
else:
#FINAL, received -1
trans = self.asr.finish_stream()
self.send_message(trans)
self.frameCount=0
inside Google Speech Class implementation:
# ASR implementation that feeds buffered audio chunks to a streaming
# recognition client and keeps the latest transcript in
# self.current_transcript.
# NOTE(review): the indentation of this snippet was lost, so the nesting shown
# here is not authoritative.
class GoogleASR(ASR):
def __init__(self, name):
super().__init__(name)
# STREAMING
self.stream_buf = queue.Queue()
self.stream_gen = self.getGenerator(self.stream_buf)
self.endOfStream = True
# NOTE: self.stream_gen and self.requests are generators created once here;
# a generator can be iterated only once and stays exhausted afterwards.
self.requests = (types.StreamingRecognizeRequest(audio_content=chunk) for chunk in self.stream_gen)
self.streaming_config = types.StreamingRecognitionConfig(config=self.config)
self.current_transcript = ''
self.numCharsPrinted = 0
# Generator that drains `buff` and yields the joined chunks for as long as
# self.endOfStream is false.
def getGenerator(self, buff):
while not self.endOfStream:
# Use a blocking get() to ensure there's at least one chunk of
# data, and stop iteration if the chunk is None, indicating the
# end of the audio stream.
chunk = buff.get()
if chunk is None:
return
data = [chunk]
# Now consume whatever other data's still buffered.
while True:
try:
chunk = buff.get(block=False)
data.append(chunk)
except queue.Empty:
self.endOfStream = True
yield b''.join(data)
break
# NOTE(review): second yield of the same joined data; depending on the
# original nesting this may send the data twice.
yield b''.join(data)
# Queue one chunk of audio for the generator to pick up.
# NOTE: the parameter name `bytes` shadows the builtin inside this method.
def store_stream_bytes(self, bytes):
self.stream_buf.put(bytes)
def get_intermediate_and_print(self):
self.get_intermediate()
# Run streaming recognition over self.requests and store the top alternative
# of the first result in self.current_transcript, echoing it to stdout.
def get_intermediate(self):
if self.stream_buf.qsize() > 1:
print("stream buf size: {}".format(self.stream_buf.qsize()))
responses = self.client.streaming_recognize(self.streaming_config, self.requests)
# print(responses)
# NOTE(review): no except/finally clause for this try is visible in the
# snippet; it appears to have been cut off.
try:
# Now, put the transcription responses to use.
if not self.numCharsPrinted:
self.numCharsPrinted = 0
for response in responses:
if not response.results:
continue
# The `results` list is consecutive. For streaming, we only care about
# the first result being considered, since once it's `is_final`, it
# moves on to considering the next utterance.
result = response.results[0]
if not result.alternatives:
continue
# Display the transcription of the top alternative.
self.current_transcript = result.alternatives[0].transcript
# Display interim results, but with a carriage return at the end of the
# line, so subsequent lines will overwrite them.
#
# If the previous result was longer than this one, we need to print
# some extra spaces to overwrite the previous result
overwrite_chars = ' ' * (self.numCharsPrinted - len(self.current_transcript))
sys.stdout.write(self.current_transcript + overwrite_chars + '\r')
sys.stdout.flush()
self.numCharsPrinted = len(self.current_transcript)
# Run one last recognition pass, then reset the buffer, transcript and request
# generator, and return the final transcript.
# NOTE(review): self.requests is rebuilt from the same self.stream_gen object;
# self.stream_gen itself is not recreated here.
def finish_stream(self):
self.endOfStream = False
self.get_intermediate()
self.endOfStream = True
final_result = self.current_transcript
self.stream_buf= queue.Queue()
self.allBytes = bytearray()
self.current_transcript = ''
self.requests = (types.StreamingRecognizeRequest(audio_content=chunk) for chunk in self.stream_gen)
self.streaming_config = types.StreamingRecognitionConfig(config=self.config)
return final_result
Currently what this does is output nothing from the transcriptions side.
stream buf size: 21
stream buf size: 41
stream buf size: 61
stream buf size: 81
stream buf size: 101
stream buf size: 121
stream buf size: 141
stream buf size: 159
But the response/transcript is empty. If I put a breakpoint on the for response in responses loop inside the get_intermediate function, it is never hit, which means that for some reason responses is empty (nothing is returned from Google). However, if I put a breakpoint on the generator and take too long (> 5 seconds) to continue to yield the data, it (Google) tells me that the data is probably being sent to the server too slowly. google.api_core.exceptions.OutOfRange: 400 Audio data is being streamed too slow. Please stream audio data approximately at real time.
Maybe someone can spot the obvious here...
The way you have organized your code, the generator you give to the Google API is initialized exactly once - on line 10, using a generator expression: self.requests = (...). As constructed, this generator will also run exactly once and become 'exhausted'. The same applies to the generator function that the (for ...) generator itself calls (self.getGenerator()). It will run once only and stop when it has retrieved 10 chunks of data (which are very small, from what I can see). Then, the outer generator (what you assigned to self.requests) will also stop forever - giving the ASR only a short bit of data (10 times 20 bytes, looking at the printed debug output). There's nothing recognizable in that, most likely.
BTW, note you have a redundant yield b''.join(data) in your function, the data will be sent twice.
You will need to redo the (outer) generator so it does not return until all data is received. If you want to use another generator as you do to gather each bigger chunk for the 'outer' generator from which the Google API is reading, you will need to re-make it every time you begin a new loop with it.
I'm running a Python script on a Sun Grid Engine supercompute cluster that reads in a list of file ids, sends each to a worker process for analysis, and writes one output per input file to disk.
The trouble is I'm getting IOError(110, 'Connection timed out') somewhere inside the worker function, and I'm not sure why. I've received this error in the past when making network requests that were severely delayed, but in this case the worker is only trying to read data from disk.
My question is: What would cause a Connection timed out error when reading from disk, and how can one resolve this error? Any help others can offer would be very appreciated.
Full script (the IOError crops up in minhash_text()):
from datasketch import MinHash
from multiprocessing import Pool
from collections import defaultdict
from nltk import ngrams
import json
import sys
import codecs
import config
cores = 24
window_len = 12
step = 4
worker_files = 50
permutations = 256
hashband_len = 4
def minhash_text(args):
    '''Return a list of hashband strings for an input doc

    ``args`` is a ``(file_id, path)`` tuple. The file at ``path`` is read as
    UTF-8 and split on whitespace; every ``step``-th window of ``window_len``
    words is minhashed over its character 3-grams, and the hash values are
    grouped into '.'-joined bands of ``hashband_len`` values.

    Returns ``{'file_id': ..., 'hashbands': [...]}`` with one list of band
    strings per processed window. If anything fails while reading or hashing,
    the error is printed and an empty ``hashbands`` list is returned so the
    pool keeps going.
    '''
    # Unpack outside the try so file_id is always bound in the handler below.
    file_id, path = args
    try:
        with codecs.open(path, 'r', 'utf8') as f:
            text = f.read()
        all_hashbands = []
        for window_idx, window in enumerate(ngrams(text.split(), window_len)):
            if window_idx % step != 0:
                continue
            window_hashbands = []
            minhash = MinHash(num_perm=permutations, seed=1)
            for ngram in set(ngrams(' '.join(window), 3)):
                minhash.update(''.join(ngram).encode('utf8'))
            hashband_vals = []
            for i in minhash.hashvalues:
                hashband_vals.append(i)
                if len(hashband_vals) == hashband_len:
                    window_hashbands.append('.'.join([str(j) for j in hashband_vals]))
                    hashband_vals = []
            all_hashbands.append(window_hashbands)
        return {'file_id': file_id, 'hashbands': all_hashbands}
    except Exception as exc:
        # Deliberate best-effort: one unreadable file must not abort the run.
        print(' ! error occurred while processing', file_id, exc)
        return {'file_id': file_id, 'hashbands': []}
if __name__ == '__main__':
    # Load the {file_id: path} mapping, closing the file once it is parsed.
    with open('file_ids.json') as ids_file:
        file_ids = json.load(ids_file)
    file_id_path_tuples = list(file_ids.items())

    worker_id = int(sys.argv[1])
    # NOTE(review): ngrams() is used here to pick this worker's batch of
    # worker_files entries; presumably it yields overlapping sliding windows
    # rather than disjoint chunks — confirm that is intended.
    worker_ids = list(ngrams(file_id_path_tuples, worker_files))[worker_id]

    hashband_to_ids = defaultdict(list)
    # The with-block shuts the pool down once all results have been consumed.
    with Pool(cores) as pool:
        for idx, result in enumerate(pool.imap(minhash_text, worker_ids)):
            print(' * processed', idx, 'results')
            file_id = result['file_id']
            hashbands = result['hashbands']
            for window_idx, window_hashbands in enumerate(hashbands):
                for hashband in window_hashbands:
                    hashband_to_ids[hashband].append(file_id + '.' + str(window_idx))

    with open(config.out_dir + 'minhashes-' + str(worker_id) + '.json', 'w') as out:
        json.dump(dict(hashband_to_ids), out)
It turned out I was hammering the filesystem too hard, making too many concurrent read requests for files on the same server. That server could only allow a fixed number of reads in a given period, so any requests over that limit received a Connection Timed Out response.
The solution was to wrap each file read request in a while loop. Inside that while loop, try to read the appropriate file from disk. If the Connection timed out error springs, sleep for a second and try again. Only once the file has been read may the while loop be broken.
I have a non-seekable file-like object. In particular it is a file of indeterminate size coming from an HTTP request.
import requests
fileobj = requests.get(url, stream=True)
I am streaming this file to a call to an Amazon AWS SDK function which is writing the contents to Amazon S3. This is working fine.
import boto3
s3 = boto3.resource('s3')
s3.bucket('my-bucket').upload_fileobj(fileobj, 'target-file-name')
However, at the same time as streaming it to S3 I want to also stream the data to another process. This other process may not need the entire stream and might stop listening at some point; this is fine and should not affect the stream to S3.
It's important I don't use too much memory, since some of these files could be enormous. I don't want to write anything to disk for the same reason.
I don't mind if either sink is slowed down due to the other being slow, as long as S3 eventually gets the entire file, and the data goes to both sinks (rather, to each one which still wants it).
What's the best way to go about this in Python (3)? I know I can't just pass the same file object to both sinks, such as
s3.bucket('my-bucket').upload_fileobj(fileobj, 'target-file-name')
# At the same time somehow as
process = subprocess.Popen(['myapp'], stdin=fileobj)
I think I could write a wrapper for the file-like object which passes any data read not only to the caller (which would be the S3 sink) but also to the other process. Something like
class MyFilewrapper(object):
    """File-like wrapper that copies everything read from it to a subprocess.

    Each ``read()`` returns the data from the wrapped file object and also
    writes that data to the stdin of a ``myapp`` child process. If the child
    stops reading, the copy is dropped and reads continue unaffected.
    """

    def __init__(self, fileobj):
        self._fileobj = fileobj
        self._process = subprocess.Popen(['myapp'], stdin=subprocess.PIPE)

    def read(self, size=-1):
        """Read up to ``size`` bytes from the wrapped object and mirror them.

        Returns exactly what the wrapped object's ``read(size)`` returned.
        """
        data = self._fileobj.read(size)
        sink = self._process.stdin
        if sink is None or sink.closed:
            # The secondary consumer is gone; only serve the primary reader.
            return data
        try:
            if data:
                sink.write(data)
            elif size != 0:
                # An empty read (other than read(0)) means end of input:
                # close the pipe so the child sees EOF.
                sink.close()
        except BrokenPipeError:
            # The child stopped listening. That must not disturb the primary
            # reader, so give up on the pipe and carry on.
            try:
                sink.close()
            except OSError:
                pass
        return data
filewrapper = MyFilewrapper(fileobj)
s3.bucket('my-bucket').upload_fileobj(filewrapper, 'target-file-name')
But is there a better way to do it? Perhaps something like
streams = StreamDuplicator(fileobj, streams=2)
s3.bucket('my-bucket').upload_fileobj(streams[0], 'target-file-name')
# At the same time somehow as
process = subprocess.Popen(['myapp'], stdin=streams[1])
The discomfort regarding your MyFilewrapper solution arises, because the IO loop inside upload_fileobj is now in control of feeding the data to a subprocess that is strictly speaking unrelated to the upload.
A "proper" solution would involve an upload API that provides a file-like object for writing the upload stream with an outside loop. That would allow you to feed the data to both target streams "cleanly".
The following example shows the basic concept. The fictional startupload method provides the file-like object for uploading. Of course, you would need to add proper error handling etc.
fileobj = requests.get(url, stream=True)
upload_fd = s3.bucket('my-bucket').startupload('target-file-name')
other_fd = ... # Popen or whatever
buf = memoryview(bytearray(4046))
while True:
r = fileobj.read_into(buf)
if r == 0:
break
read_slice = buf[:r]
upload_fd.write(read_slice)
other_fd.write(read_slice)
Here is an implementation of StreamDuplicator with requested functionality and use model. I verified that it handles correctly the case when one of the sinks stops consuming the respective stream half-way.
Usage:
./streamduplicator.py <sink1_command> <sink2_command> ...
Example:
$ seq 100000 | ./streamduplicator.py "sed -n '/0000/ {s/^/sed: /;p}'" "grep 1234"
Output:
sed: 10000
1234
11234
12340
12341
12342
12343
12344
12345
12346
12347
12348
12349
21234
sed: 20000
31234
sed: 30000
41234
sed: 40000
51234
sed: 50000
61234
sed: 60000
71234
sed: 70000
81234
sed: 80000
91234
sed: 90000
sed: 100000
streamduplicator.py:
#!/usr/bin/env python3
import sys
import os
from subprocess import Popen
from threading import Thread
from time import sleep
import shlex
import fcntl
WRITE_TIMEOUT=0.1
def write_or_timeout(stream, data, timeout):
    """Write ``data`` to a non-blocking ``stream``, retrying until ``timeout``.

    Args:
        stream: object whose ``write()`` may raise ``BlockingIOError``.
        data: bytes-like payload to write.
        timeout: seconds to keep retrying without any progress.

    Returns:
        True once all of ``data`` has been accepted, False if the stream made
        no progress for ``timeout`` seconds.
    """
    data_to_write = data[:]
    time_to_sleep = 1e-6
    time_remaining = 1.0 * timeout
    # The loop ends when the remaining budget (and with it the next sleep)
    # has dropped to zero.
    while time_to_sleep != 0:
        try:
            stream.write(data_to_write)
            return True
        except BlockingIOError as ex:
            # Keep only what the stream has not accepted yet.
            data_to_write = data_to_write[ex.characters_written:]
            if ex.characters_written == 0:
                # No progress: back off exponentially.
                time_to_sleep *= 2
            else:
                # Progress was made: restart the back-off and the timeout.
                time_to_sleep = 1e-6
                time_remaining = timeout
            # Never sleep past the budget, and never pass a negative value
            # to sleep() if a negative timeout was supplied.
            time_to_sleep = max(0.0, min(time_remaining, time_to_sleep))
            sleep(time_to_sleep)
            time_remaining -= time_to_sleep
    return False
class StreamDuplicator(object):
    """Copy one input stream into ``n`` pipes that can be read independently.

    ``streams[i]`` is the read end of the i-th pipe. A background thread,
    started by the constructor, reads the input in 16 KiB blocks and writes
    each block to every pipe that is still accepting data.
    """

    def __init__(self, stream, n, timeout=WRITE_TIMEOUT):
        """Create ``n`` pipes for ``stream`` and start the copying thread.

        ``timeout`` is the number of seconds a pipe may refuse data before
        it is dropped.
        """
        self.stream = stream
        self.write_timeout = timeout
        self.pipereadstreams = []
        self.pipewritestreams = []
        for _ in range(n):
            (r, w) = os.pipe()
            self.pipereadstreams.append(open(r, 'rb'))
            # The write end is non-blocking so that one stalled reader cannot
            # hold up the others.
            old_flags = fcntl.fcntl(w, fcntl.F_GETFL)
            fcntl.fcntl(w, fcntl.F_SETFL, old_flags | os.O_NONBLOCK)
            self.pipewritestreams.append(os.fdopen(w, 'wb'))
        Thread(target=self).start()

    def __call__(self):
        """Thread body: pump the input into all surviving pipes until EOF."""
        # NOTE(review): the write ends are not closed here when the input
        # ends or when a pipe is dropped — confirm how readers are expected
        # to see EOF.
        while True:
            data = self.stream.read(1024*16)
            if len(data) == 0:
                break
            # Keep only the pipes that accepted this block in time.
            self.pipewritestreams = [
                p for p in self.pipewritestreams
                if write_or_timeout(p, data, self.write_timeout)
            ]

    def __getitem__(self, i):
        """Return the read end of the i-th pipe."""
        return self.pipereadstreams[i]
if __name__ == '__main__':
    # Every command-line argument is the command line of one sink process.
    sink_commands = sys.argv[1:]
    streams = StreamDuplicator(sys.stdin.buffer, len(sink_commands), 3)
    for i, cmd in enumerate(sink_commands):
        # Each sink reads its own copy of stdin from the matching pipe.
        Popen(shlex.split(cmd), stdin=streams[i])
Implementation limitations:
usage of fcntl to set a pipe writing file descriptor to non-blocking mode probably makes it unusable under Windows.
a closed/unsubscribed sink is detected through a write timeout.
Consider the following program:
#!/usr/bin/env pypy
import json
import cStringIO
import sys
# Read stdin in BUFSIZE-character blocks, accumulate them in a StringIO buffer
# and print a JSON value decoded from the start of the buffer whenever
# raw_decode() succeeds.
# NOTE: this is Python 2 code (cStringIO, `except ValueError, e` syntax).
# NOTE(review): the indentation of this snippet was lost, so the intended
# nesting of the try/except and of the final break cannot be confirmed.
def main():
BUFSIZE = 10240
f = sys.stdin
decoder = json.JSONDecoder()
io = cStringIO.StringIO()
do_continue = True
while True:
read = f.read(BUFSIZE)
# A short read is taken to mean that the input is exhausted.
if len(read) < BUFSIZE:
do_continue = False
io.write(read)
try:
# getvalue() returns the entire accumulated buffer as a string.
data, offset = decoder.raw_decode(io.getvalue())
print(data)
rest = io.getvalue()[offset:]
if rest.startswith('\n'):
rest = rest[1:]
# Start over with a fresh buffer that holds only the undecoded remainder.
decoder = json.JSONDecoder()
io = cStringIO.StringIO()
io.write(rest)
except ValueError, e:
#print(e)
#print(repr(io.getvalue()))
continue
if not do_continue:
break
if __name__ == '__main__':
main()
And here's a test case:
$ yes '{}' | pv | pypy parser-test.py >/dev/null
As you can see, the following script slows down when you add more input to it. This also happens with cPython. I tried to profile the script using mprof and cProfile, but I found no hint on why is that. Does anybody have a clue?
Apparently the string operations slowed it down. Instead of:
data, offset = decoder.raw_decode(io.getvalue())
print(data)
rest = io.getvalue()[offset:]
if rest.startswith('\n'):
rest = rest[1:]
It is better to do:
data, offset = decoder.raw_decode(io.read())
print(data)
rest = io.getvalue()[offset:]
io.truncate()
io.write(rest)
if rest.startswith('\n'):
io.seek(1)
You may want to close your StringIO at the end of the iteration (after writing).
io.close()
The memory buffer for a StringIO will free once it is closed, but will stay open otherwise. This would explain why each additional input is slowing your script down.